From 351f94d981f363909ae6e76ed57cd0a75c3f5688 Mon Sep 17 00:00:00 2001
From: Rageking8 <106309953+Rageking8@users.noreply.github.com>
Date: Thu, 8 Feb 2024 13:05:53 +0800
Subject: [clang][NFC] resolve redundant predicates (#79701)

Fixes #79686
---
 clang/lib/Sema/SemaChecking.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index b071a02..c775ff2 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -17183,7 +17183,7 @@ public:
     // evaluates to true.
     bool EvalResult = false;
     bool EvalOK = Eval.evaluate(BO->getLHS(), EvalResult);
-    bool ShouldVisitRHS = !EvalOK || (EvalOK && !EvalResult);
+    bool ShouldVisitRHS = !EvalOK || !EvalResult;
     if (ShouldVisitRHS) {
       Region = RHSRegion;
       Visit(BO->getRHS());
@@ -17215,7 +17215,7 @@ public:
     // [...] the second operand is not evaluated if the first operand is false.
     bool EvalResult = false;
     bool EvalOK = Eval.evaluate(BO->getLHS(), EvalResult);
-    bool ShouldVisitRHS = !EvalOK || (EvalOK && EvalResult);
+    bool ShouldVisitRHS = !EvalOK || EvalResult;
     if (ShouldVisitRHS) {
       Region = RHSRegion;
       Visit(BO->getRHS());
@@ -17266,8 +17266,8 @@ public:
     // evaluated. [...]
     bool EvalResult = false;
     bool EvalOK = Eval.evaluate(CO->getCond(), EvalResult);
-    bool ShouldVisitTrueExpr = !EvalOK || (EvalOK && EvalResult);
-    bool ShouldVisitFalseExpr = !EvalOK || (EvalOK && !EvalResult);
+    bool ShouldVisitTrueExpr = !EvalOK || EvalResult;
+    bool ShouldVisitFalseExpr = !EvalOK || !EvalResult;
     if (ShouldVisitTrueExpr) {
       Region = TrueRegion;
       Visit(CO->getTrueExpr());
-- 
cgit v1.1

From 8f6e13e6da84510c8321717860fd506e12118514 Mon Sep 17 00:00:00 2001
From: Owen Pan
Date: Wed, 7 Feb 2024 21:35:43 -0800
Subject: [clang-format] Fix a regression in dumping the config (#80628)

Commit d813af73f70f addressed a regression introduced by commit 3791b3fca6ea
but caused `clang-format -dump-config` to "hang". This patch reverts changes
to ClangFormat.cpp by both commits and reworks the cleanup.

Fixes #80621.
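A minimal sketch of the reworked file-name selection, for illustration only
(the helper `styleSearchFile` is hypothetical and not part of this patch):

```cpp
#include <string>
#include <vector>

// Mirrors the new dumpConfig() behavior: with no input file, or when the
// input is stdin ("-"), style/language detection falls back to the
// -assume-filename value instead of requiring FileNames[0] to name a file.
static std::string styleSearchFile(const std::vector<std::string> &FileNames,
                                   const std::string &AssumeFileName) {
  if (FileNames.empty() || FileNames[0] == "-")
    return AssumeFileName;
  return FileNames[0];
}
```

This is why `clang-format -assume-filename=foo.m -dump-config` can now report
`Language: ObjC` without any input file, as the new test below checks.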
--- clang/test/Format/dump-config-objc-stdin.m | 3 ++ clang/test/Format/verbose.cpp | 10 ++---- clang/tools/clang-format/ClangFormat.cpp | 49 +++++++++++++++--------------- 3 files changed, 30 insertions(+), 32 deletions(-) diff --git a/clang/test/Format/dump-config-objc-stdin.m b/clang/test/Format/dump-config-objc-stdin.m index b22ff7b..d81711a 100644 --- a/clang/test/Format/dump-config-objc-stdin.m +++ b/clang/test/Format/dump-config-objc-stdin.m @@ -1,5 +1,8 @@ +// RUN: clang-format -assume-filename=foo.m -dump-config | FileCheck %s + // RUN: clang-format -dump-config - < %s | FileCheck %s // CHECK: Language: ObjC + @interface Foo @end diff --git a/clang/test/Format/verbose.cpp b/clang/test/Format/verbose.cpp index dd625e3..4ab03d8 100644 --- a/clang/test/Format/verbose.cpp +++ b/clang/test/Format/verbose.cpp @@ -1,12 +1,6 @@ -// RUN: clang-format %s 2> %t.stderr +// RUN: clang-format -verbose 2> %t.stderr // RUN: not grep "Formatting" %t.stderr -// RUN: clang-format %s -verbose 2> %t.stderr -// RUN: grep -E "Formatting (.*)verbose.cpp(.*)" %t.stderr -// RUN: clang-format %s -verbose=false 2> %t.stderr -// RUN: not grep "Formatting" %t.stderr - -int a; -// RUN: clang-format %s 2> %t.stderr +// RUN: clang-format %s 2> %t.stderr // RUN: not grep "Formatting" %t.stderr // RUN: clang-format %s -verbose 2> %t.stderr // RUN: grep -E "Formatting (.*)verbose.cpp(.*)" %t.stderr diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp index 5ee6092..e122cea 100644 --- a/clang/tools/clang-format/ClangFormat.cpp +++ b/clang/tools/clang-format/ClangFormat.cpp @@ -399,7 +399,8 @@ class ClangFormatDiagConsumer : public DiagnosticConsumer { }; // Returns true on error. -static bool format(StringRef FileName, bool IsSTDIN) { +static bool format(StringRef FileName) { + const bool IsSTDIN = FileName == "-"; if (!OutputXML && Inplace && IsSTDIN) { errs() << "error: cannot use -i when reading from stdin.\n"; return false; @@ -545,24 +546,25 @@ static void PrintVersion(raw_ostream &OS) { } // Dump the configuration. -static int dumpConfig(bool IsSTDIN) { +static int dumpConfig() { std::unique_ptr Code; - - // `FileNames` must have at least "-" in it even if no file was specified. - assert(!FileNames.empty()); - - // Read in the code in case the filename alone isn't enough to detect the - // language. - ErrorOr> CodeOrErr = - MemoryBuffer::getFileOrSTDIN(FileNames[0]); - if (std::error_code EC = CodeOrErr.getError()) { - llvm::errs() << EC.message() << "\n"; - return 1; + // We can't read the code to detect the language if there's no file name. + if (!FileNames.empty()) { + // Read in the code in case the filename alone isn't enough to detect the + // language. + ErrorOr> CodeOrErr = + MemoryBuffer::getFileOrSTDIN(FileNames[0]); + if (std::error_code EC = CodeOrErr.getError()) { + llvm::errs() << EC.message() << "\n"; + return 1; + } + Code = std::move(CodeOrErr.get()); } - Code = std::move(CodeOrErr.get()); - llvm::Expected FormatStyle = - clang::format::getStyle(Style, IsSTDIN ? AssumeFileName : FileNames[0], + clang::format::getStyle(Style, + FileNames.empty() || FileNames[0] == "-" + ? AssumeFileName + : FileNames[0], FallbackStyle, Code ? 
Code->getBuffer() : ""); if (!FormatStyle) { llvm::errs() << llvm::toString(FormatStyle.takeError()) << "\n"; @@ -682,11 +684,8 @@ int main(int argc, const char **argv) { return 0; } - if (FileNames.empty()) - FileNames.push_back("-"); - if (DumpConfig) - return dumpConfig(FileNames[0] == "-"); + return dumpConfig(); if (!Files.empty()) { std::ifstream ExternalFileOfFiles{std::string(Files)}; @@ -699,7 +698,10 @@ int main(int argc, const char **argv) { errs() << "Clang-formating " << LineNo << " files\n"; } - if (FileNames.size() != 1 && + if (FileNames.empty()) + return clang::format::format("-"); + + if (FileNames.size() > 1 && (!Offsets.empty() || !Lengths.empty() || !LineRanges.empty())) { errs() << "error: -offset, -length and -lines can only be used for " "single file.\n"; @@ -709,14 +711,13 @@ int main(int argc, const char **argv) { unsigned FileNo = 1; bool Error = false; for (const auto &FileName : FileNames) { - const bool IsSTDIN = FileName == "-"; - if (!IsSTDIN && isIgnored(FileName)) + if (isIgnored(FileName)) continue; if (Verbose) { errs() << "Formatting [" << FileNo++ << "/" << FileNames.size() << "] " << FileName << "\n"; } - Error |= clang::format::format(FileName, IsSTDIN); + Error |= clang::format::format(FileName); } return Error ? 1 : 0; } -- cgit v1.1 From c8ca98a2a9796797f2eab00cc6516610c133633a Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Thu, 8 Feb 2024 13:45:27 +0800 Subject: [InstCombine] Handle IsInf/IsZero idioms (#80607) This patch does the following folds: ``` icmp eq/ne (bitcast X to int), (bitcast +/-inf to int) -> llvm.is.fpclass(X, (~)fcPosInf/fcNegInf) icmp eq/ne (bitcast X to int), (bitcast +0/-0 to int) -> llvm.is.fpclass(X, (~)fcPosZero/fcNegZero) ``` Alive2: https://alive2.llvm.org/ce/z/JJmEE9 --- .../Transforms/InstCombine/InstCombineCompares.cpp | 30 +++-- .../Transforms/InstCombine/fpclass-check-idioms.ll | 150 ++++++++++++++++++++- 2 files changed, 169 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 7aac13f..cbb6988 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -3229,16 +3229,16 @@ Instruction *InstCombinerImpl::foldICmpBitCast(ICmpInst &Cmp) { if (Cmp.isEquality() && match(Op1, m_Zero())) return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType())); - // If this is a sign-bit test of a bitcast of a casted FP value, eliminate - // the FP extend/truncate because that cast does not change the sign-bit. - // This is true for all standard IEEE-754 types and the X86 80-bit type. - // The sign-bit is always the most significant bit in those types. const APInt *C; bool TrueIfSigned; - if (match(Op1, m_APInt(C)) && Bitcast->hasOneUse() && - isSignBitCheck(Pred, *C, TrueIfSigned)) { - if (match(BCSrcOp, m_FPExt(m_Value(X))) || - match(BCSrcOp, m_FPTrunc(m_Value(X)))) { + if (match(Op1, m_APInt(C)) && Bitcast->hasOneUse()) { + // If this is a sign-bit test of a bitcast of a casted FP value, eliminate + // the FP extend/truncate because that cast does not change the sign-bit. + // This is true for all standard IEEE-754 types and the X86 80-bit type. + // The sign-bit is always the most significant bit in those types. 
+ if (isSignBitCheck(Pred, *C, TrueIfSigned) && + (match(BCSrcOp, m_FPExt(m_Value(X))) || + match(BCSrcOp, m_FPTrunc(m_Value(X))))) { // (bitcast (fpext/fptrunc X)) to iX) < 0 --> (bitcast X to iY) < 0 // (bitcast (fpext/fptrunc X)) to iX) > -1 --> (bitcast X to iY) > -1 Type *XType = X->getType(); @@ -3257,6 +3257,20 @@ Instruction *InstCombinerImpl::foldICmpBitCast(ICmpInst &Cmp) { ConstantInt::getAllOnesValue(NewType)); } } + + // icmp eq/ne (bitcast X to int), special fp -> llvm.is.fpclass(X, class) + Type *FPType = SrcType->getScalarType(); + if (!Cmp.getParent()->getParent()->hasFnAttribute( + Attribute::NoImplicitFloat) && + Cmp.isEquality() && FPType->isIEEELikeFPTy()) { + FPClassTest Mask = APFloat(FPType->getFltSemantics(), *C).classify(); + if (Mask & (fcInf | fcZero)) { + if (Pred == ICmpInst::ICMP_NE) + Mask = ~Mask; + return replaceInstUsesWith(Cmp, + Builder.createIsFPClass(BCSrcOp, Mask)); + } + } } } diff --git a/llvm/test/Transforms/InstCombine/fpclass-check-idioms.ll b/llvm/test/Transforms/InstCombine/fpclass-check-idioms.ll index 019db34..d2b4536 100644 --- a/llvm/test/Transforms/InstCombine/fpclass-check-idioms.ll +++ b/llvm/test/Transforms/InstCombine/fpclass-check-idioms.ll @@ -40,13 +40,11 @@ define i1 @f64_fcnan_fcinf(double %a) { ret i1 %cmp } -; TODO: handle more fpclass check idioms define i1 @f32_fcinf(float %a) { ; CHECK-LABEL: define i1 @f32_fcinf( ; CHECK-SAME: float [[A:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[A]]) -; CHECK-NEXT: [[AND:%.*]] = bitcast float [[TMP1]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 2139095040 +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP1]], 0x7FF0000000000000 ; CHECK-NEXT: ret i1 [[CMP]] ; %i32 = bitcast float %a to i32 @@ -55,6 +53,63 @@ define i1 @f32_fcinf(float %a) { ret i1 %cmp } +define i1 @f32_fcposinf(float %a) { +; CHECK-LABEL: define i1 @f32_fcposinf( +; CHECK-SAME: float [[A:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[A]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + %cmp = icmp eq i32 %i32, 2139095040 + ret i1 %cmp +} + +define i1 @f32_fcneginf(float %a) { +; CHECK-LABEL: define i1 @f32_fcneginf( +; CHECK-SAME: float [[A:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[A]], 0xFFF0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + %cmp = icmp eq i32 %i32, 4286578688 + ret i1 %cmp +} + +define i1 @f32_fcposzero(float %a) { +; CHECK-LABEL: define i1 @f32_fcposzero( +; CHECK-SAME: float [[A:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.is.fpclass.f32(float [[A]], i32 64) +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + %cmp = icmp eq i32 %i32, 0 + ret i1 %cmp +} + +define i1 @f32_fcnegzero(float %a) { +; CHECK-LABEL: define i1 @f32_fcnegzero( +; CHECK-SAME: float [[A:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.is.fpclass.f32(float [[A]], i32 32) +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + %cmp = icmp eq i32 %i32, 2147483648 + ret i1 %cmp +} + +define i1 @f32_fczero(float %a) { +; CHECK-LABEL: define i1 @f32_fczero( +; CHECK-SAME: float [[A:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[A]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + %and = and i32 %i32, 2147483647 + %cmp = icmp eq i32 %and, 0 + ret i1 %cmp +} + +; TODO: handle more fpclass check idioms define i1 @f32_fcnan(float %a) { ; CHECK-LABEL: define i1 @f32_fcnan( ; CHECK-SAME: float [[A:%.*]]) { @@ -101,6 +156,19 @@ define <2 x 
i1> @f32_fcnan_fcinf_vec(<2 x float> %a) { ret <2 x i1> %cmp } +define <2 x i1> @f32_fcinf_vec(<2 x float> %a) { +; CHECK-LABEL: define <2 x i1> @f32_fcinf_vec( +; CHECK-SAME: <2 x float> [[A:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[A]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq <2 x float> [[TMP1]], +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %i32 = bitcast <2 x float> %a to <2 x i32> + %and = and <2 x i32> %i32, + %cmp = icmp eq <2 x i32> %and, + ret <2 x i1> %cmp +} + ; Negative tests define i1 @f32_fcnan_fcinf_wrong_mask1(float %a) { @@ -158,6 +226,18 @@ define i1 @f32_fcnan_fcinf_wrong_pred(float %a) { ret i1 %cmp } +define i1 @f32_fcposzero_wrong_pred(float %a) { +; CHECK-LABEL: define i1 @f32_fcposzero_wrong_pred( +; CHECK-SAME: float [[A:%.*]]) { +; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I32]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + %cmp = icmp slt i32 %i32, 0 + ret i1 %cmp +} + define i1 @f32_fcnan_fcinf_wrong_type1(<2 x float> %a) { ; CHECK-LABEL: define i1 @f32_fcnan_fcinf_wrong_type1( ; CHECK-SAME: <2 x float> [[A:%.*]]) { @@ -172,6 +252,18 @@ define i1 @f32_fcnan_fcinf_wrong_type1(<2 x float> %a) { ret i1 %cmp } +define i1 @f32_fcposinf_wrong_type1(<2 x float> %a) { +; CHECK-LABEL: define i1 @f32_fcposinf_wrong_type1( +; CHECK-SAME: <2 x float> [[A:%.*]]) { +; CHECK-NEXT: [[I64:%.*]] = bitcast <2 x float> [[A]] to i64 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[I64]], 2139095040 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i64 = bitcast <2 x float> %a to i64 + %cmp = icmp eq i64 %i64, 2139095040 + ret i1 %cmp +} + define i1 @f32_fcnan_fcinf_wrong_type2(x86_fp80 %a) { ; CHECK-LABEL: define i1 @f32_fcnan_fcinf_wrong_type2( ; CHECK-SAME: x86_fp80 [[A:%.*]]) { @@ -186,6 +278,18 @@ define i1 @f32_fcnan_fcinf_wrong_type2(x86_fp80 %a) { ret i1 %cmp } +define i1 @f32_fcposzero_wrong_type2(x86_fp80 %a) { +; CHECK-LABEL: define i1 @f32_fcposzero_wrong_type2( +; CHECK-SAME: x86_fp80 [[A:%.*]]) { +; CHECK-NEXT: [[I80:%.*]] = bitcast x86_fp80 [[A]] to i80 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i80 [[I80]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i80 = bitcast x86_fp80 %a to i80 + %cmp = icmp eq i80 %i80, 0 + ret i1 %cmp +} + define i1 @f32_fcnan_fcinf_noimplicitfloat(float %a) #0 { ; CHECK-LABEL: define i1 @f32_fcnan_fcinf_noimplicitfloat( ; CHECK-SAME: float [[A:%.*]]) #[[ATTR1:[0-9]+]] { @@ -200,4 +304,44 @@ define i1 @f32_fcnan_fcinf_noimplicitfloat(float %a) #0 { ret i1 %cmp } +define i1 @f32_fcposinf_noimplicitfloat(float %a) #0 { +; CHECK-LABEL: define i1 @f32_fcposinf_noimplicitfloat( +; CHECK-SAME: float [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I32]], 2139095040 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + %cmp = icmp eq i32 %i32, 2139095040 + ret i1 %cmp +} + +define i1 @f32_fcposnan(float %a) { +; CHECK-LABEL: define i1 @f32_fcposnan( +; CHECK-SAME: float [[A:%.*]]) { +; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I32]], 2139095041 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + %cmp = icmp eq i32 %i32, 2139095041 + ret i1 %cmp +} + +define i1 @f32_fcposinf_multiuse(float %a) { +; CHECK-LABEL: define i1 @f32_fcposinf_multiuse( +; CHECK-SAME: float [[A:%.*]]) { +; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32 +; CHECK-NEXT: call void @usei32(i32 [[I32]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp 
eq i32 [[I32]], 2139095040 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + call void @usei32(i32 %i32) + %cmp = icmp eq i32 %i32, 2139095040 + ret i1 %cmp +} + +declare void @usei32(i32) + attributes #0 = { noimplicitfloat } -- cgit v1.1 From e17dded8d712fb13c30fd88f7810edaa0ee3e60d Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Thu, 8 Feb 2024 15:07:35 +0800 Subject: [InstSimplify] Generalize `simplifyAndOrOfFCmps` (#81027) This patch generalizes `simplifyAndOrOfFCmps` to simplify patterns like: ``` define i1 @src(float %x, float %y) { %or.cond.i = fcmp ord float %x, 0.000000e+00 %cmp.i.i34 = fcmp olt float %x, %y %cmp.i2.sink.i = and i1 %or.cond.i, %cmp.i.i34 ret i1 %cmp.i2.sink.i } define i1 @tgt(float %x, float %y) { %cmp.i.i34 = fcmp olt float %x, %y ret i1 %cmp.i.i34 } ``` Alive2: https://alive2.llvm.org/ce/z/9rydcx This patch and #80986 will fix the regression introduced by #80941. See also the IR diff https://github.com/dtcxzyw/llvm-opt-benchmark/pull/199#discussion_r1480974120. --- llvm/lib/Analysis/InstructionSimplify.cpp | 43 +++--- .../InstCombine/create-class-from-logic-fcmp.ll | 20 +-- .../test/Transforms/InstSimplify/logic-of-fcmps.ll | 167 +++++++++++++++++++++ 3 files changed, 199 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 01b0171..51e258d 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -1853,35 +1853,36 @@ static Value *simplifyAndOrOfFCmps(const SimplifyQuery &Q, FCmpInst *LHS, return nullptr; FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); - if ((PredL == FCmpInst::FCMP_ORD && PredR == FCmpInst::FCMP_ORD && IsAnd) || - (PredL == FCmpInst::FCMP_UNO && PredR == FCmpInst::FCMP_UNO && !IsAnd)) { - // (fcmp ord NNAN, X) & (fcmp ord X, Y) --> fcmp ord X, Y - // (fcmp ord NNAN, X) & (fcmp ord Y, X) --> fcmp ord Y, X - // (fcmp ord X, NNAN) & (fcmp ord X, Y) --> fcmp ord X, Y - // (fcmp ord X, NNAN) & (fcmp ord Y, X) --> fcmp ord Y, X - // (fcmp uno NNAN, X) | (fcmp uno X, Y) --> fcmp uno X, Y - // (fcmp uno NNAN, X) | (fcmp uno Y, X) --> fcmp uno Y, X - // (fcmp uno X, NNAN) | (fcmp uno X, Y) --> fcmp uno X, Y - // (fcmp uno X, NNAN) | (fcmp uno Y, X) --> fcmp uno Y, X + if ((PredL == FCmpInst::FCMP_ORD || PredL == FCmpInst::FCMP_UNO) && + ((FCmpInst::isOrdered(PredR) && IsAnd) || + (FCmpInst::isUnordered(PredR) && !IsAnd))) { + // (fcmp ord X, NNAN) & (fcmp o** X, Y) --> fcmp o** X, Y + // (fcmp uno X, NNAN) & (fcmp o** X, Y) --> false + // (fcmp uno X, NNAN) | (fcmp u** X, Y) --> fcmp u** X, Y + // (fcmp ord X, NNAN) | (fcmp u** X, Y) --> true if (((LHS1 == RHS0 || LHS1 == RHS1) && isKnownNeverNaN(LHS0, /*Depth=*/0, Q)) || ((LHS0 == RHS0 || LHS0 == RHS1) && isKnownNeverNaN(LHS1, /*Depth=*/0, Q))) - return RHS; - - // (fcmp ord X, Y) & (fcmp ord NNAN, X) --> fcmp ord X, Y - // (fcmp ord Y, X) & (fcmp ord NNAN, X) --> fcmp ord Y, X - // (fcmp ord X, Y) & (fcmp ord X, NNAN) --> fcmp ord X, Y - // (fcmp ord Y, X) & (fcmp ord X, NNAN) --> fcmp ord Y, X - // (fcmp uno X, Y) | (fcmp uno NNAN, X) --> fcmp uno X, Y - // (fcmp uno Y, X) | (fcmp uno NNAN, X) --> fcmp uno Y, X - // (fcmp uno X, Y) | (fcmp uno X, NNAN) --> fcmp uno X, Y - // (fcmp uno Y, X) | (fcmp uno X, NNAN) --> fcmp uno Y, X + return FCmpInst::isOrdered(PredL) == FCmpInst::isOrdered(PredR) + ? 
static_cast(RHS) + : ConstantInt::getBool(LHS->getType(), !IsAnd); + } + + if ((PredR == FCmpInst::FCMP_ORD || PredR == FCmpInst::FCMP_UNO) && + ((FCmpInst::isOrdered(PredL) && IsAnd) || + (FCmpInst::isUnordered(PredL) && !IsAnd))) { + // (fcmp o** X, Y) & (fcmp ord X, NNAN) --> fcmp o** X, Y + // (fcmp o** X, Y) & (fcmp uno X, NNAN) --> false + // (fcmp u** X, Y) | (fcmp uno X, NNAN) --> fcmp u** X, Y + // (fcmp u** X, Y) | (fcmp ord X, NNAN) --> true if (((RHS1 == LHS0 || RHS1 == LHS1) && isKnownNeverNaN(RHS0, /*Depth=*/0, Q)) || ((RHS0 == LHS0 || RHS0 == LHS1) && isKnownNeverNaN(RHS1, /*Depth=*/0, Q))) - return LHS; + return FCmpInst::isOrdered(PredL) == FCmpInst::isOrdered(PredR) + ? static_cast(LHS) + : ConstantInt::getBool(LHS->getType(), !IsAnd); } return nullptr; diff --git a/llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll b/llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll index 24dac97..12c608c 100644 --- a/llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll +++ b/llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll @@ -1100,8 +1100,8 @@ define i1 @uge_smallest_normal_or_ord(half %x) #0 { ; -> nan | pnormal | pinf define i1 @uge_smallest_normal_or_uno(half %x) #0 { ; CHECK-LABEL: @uge_smallest_normal_or_uno( -; CHECK-NEXT: [[CLASS:%.*]] = call i1 @llvm.is.fpclass.f16(half [[X:%.*]], i32 771) -; CHECK-NEXT: ret i1 [[CLASS]] +; CHECK-NEXT: [[CMP_SMALLEST_NORMAL:%.*]] = fcmp uge half [[X:%.*]], 0xH0400 +; CHECK-NEXT: ret i1 [[CMP_SMALLEST_NORMAL]] ; %uno = fcmp uno half %x, 0.0 %cmp.smallest.normal = fcmp uge half %x, 0xH0400 @@ -1307,8 +1307,8 @@ define i1 @oge_fabs_eq_inf_and_ord(half %x) #0 { define i1 @oge_eq_inf_and_ord(half %x) #0 { ; CHECK-LABEL: @oge_eq_inf_and_ord( -; CHECK-NEXT: [[AND:%.*]] = fcmp oeq half [[X:%.*]], 0xH7C00 -; CHECK-NEXT: ret i1 [[AND]] +; CHECK-NEXT: [[OGE_FABS_INF:%.*]] = fcmp oeq half [[X:%.*]], 0xH7C00 +; CHECK-NEXT: ret i1 [[OGE_FABS_INF]] ; %oge.fabs.inf = fcmp oge half %x, 0xH7C00 %ord = fcmp ord half %x, 0xH0000 @@ -1379,8 +1379,8 @@ define i1 @ult_fabs_eq_inf_or_uno(half %x) #0 { define i1 @ult_eq_inf_or_uno(half %x) #0 { ; CHECK-LABEL: @ult_eq_inf_or_uno( -; CHECK-NEXT: [[OR:%.*]] = fcmp une half [[X:%.*]], 0xH7C00 -; CHECK-NEXT: ret i1 [[OR]] +; CHECK-NEXT: [[ULT_FABS_INF:%.*]] = fcmp une half [[X:%.*]], 0xH7C00 +; CHECK-NEXT: ret i1 [[ULT_FABS_INF]] ; %ult.fabs.inf = fcmp ult half %x, 0xH7C00 %uno = fcmp uno half %x, 0xH0000 @@ -1465,8 +1465,8 @@ define i1 @oeq_neginfinity_or_ord(half %x) #0 { ; -> ninf define i1 @oeq_neginfinity_and_ord(half %x) #0 { ; CHECK-LABEL: @oeq_neginfinity_and_ord( -; CHECK-NEXT: [[CLASS:%.*]] = fcmp oeq half [[X:%.*]], 0xHFC00 -; CHECK-NEXT: ret i1 [[CLASS]] +; CHECK-NEXT: [[OEQ_NEG_INFINITY:%.*]] = fcmp oeq half [[X:%.*]], 0xHFC00 +; CHECK-NEXT: ret i1 [[OEQ_NEG_INFINITY]] ; %oeq.neg.infinity = fcmp oeq half %x, 0xHFC00 %ord = fcmp ord half %x, 0.0 @@ -1597,8 +1597,8 @@ define i1 @ueq_neginfinity_and_olt_smallest_normal(half %x) #0 { ; -> nan|ninf define i1 @ueq_neginfinity_or_uno(half %x) #0 { ; CHECK-LABEL: @ueq_neginfinity_or_uno( -; CHECK-NEXT: [[CLASS:%.*]] = fcmp ueq half [[X:%.*]], 0xHFC00 -; CHECK-NEXT: ret i1 [[CLASS]] +; CHECK-NEXT: [[UEQ_NEG_INFINITY:%.*]] = fcmp ueq half [[X:%.*]], 0xHFC00 +; CHECK-NEXT: ret i1 [[UEQ_NEG_INFINITY]] ; %ueq.neg.infinity = fcmp ueq half %x, 0xHFC00 %uno = fcmp uno half %x, 0.0 diff --git a/llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll b/llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll index d898df0..4b2ff1b 
100644 --- a/llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll +++ b/llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll @@ -259,3 +259,170 @@ define <2 x i1> @uno8(<2 x double> %x, <2 x double> %y) { %r = or <2 x i1> %cmp1, %cmp2 ret <2 x i1> %r } + +define i1 @olt_implies_ord(float %x, float %y) { +; CHECK-LABEL: @olt_implies_ord( +; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[OLT]] +; + %ord = fcmp ord float %x, 0.000000e+00 + %olt = fcmp olt float %x, %y + %ret = and i1 %olt, %ord + ret i1 %ret +} + +define i1 @olt_implies_ord_commuted1(float %x, float %y) { +; CHECK-LABEL: @olt_implies_ord_commuted1( +; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: ret i1 [[OLT]] +; + %ord = fcmp ord float %x, 0.000000e+00 + %olt = fcmp olt float %y, %x + %ret = and i1 %olt, %ord + ret i1 %ret +} + +define i1 @olt_implies_ord_commuted2(float %x, float %y) { +; CHECK-LABEL: @olt_implies_ord_commuted2( +; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[OLT]] +; + %ord = fcmp ord float %x, 0.000000e+00 + %olt = fcmp olt float %x, %y + %ret = and i1 %ord, %olt + ret i1 %ret +} + +define i1 @olt_implies_ord_commuted3(float %x, float %y) { +; CHECK-LABEL: @olt_implies_ord_commuted3( +; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: ret i1 [[OLT]] +; + %ord = fcmp ord float %x, 0.000000e+00 + %olt = fcmp olt float %y, %x + %ret = and i1 %ord, %olt + ret i1 %ret +} + +define <2 x i1> @olt_implies_ord_vec(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @olt_implies_ord_vec( +; CHECK-NEXT: [[OLT:%.*]] = fcmp olt <2 x float> [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret <2 x i1> [[OLT]] +; + %ord = fcmp ord <2 x float> %x, zeroinitializer + %olt = fcmp olt <2 x float> %x, %y + %ret = and <2 x i1> %ord, %olt + ret <2 x i1> %ret +} + +define i1 @ord_implies_ord(float %x, float %y) { +; CHECK-LABEL: @ord_implies_ord( +; CHECK-NEXT: [[ORD2:%.*]] = fcmp ord float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[ORD2]] +; + %ord = fcmp ord float %x, 0.000000e+00 + %ord2 = fcmp ord float %x, %y + %ret = and i1 %ord, %ord2 + ret i1 %ret +} + +define i1 @olt_implies_uno(float %x, float %y) { +; CHECK-LABEL: @olt_implies_uno( +; CHECK-NEXT: ret i1 false +; + %uno = fcmp uno float %x, 0.000000e+00 + %olt = fcmp olt float %x, %y + %ret = and i1 %olt, %uno + ret i1 %ret +} + +define i1 @ult_implies_uno(float %x, float %y) { +; CHECK-LABEL: @ult_implies_uno( +; CHECK-NEXT: [[ULT:%.*]] = fcmp ult float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[ULT]] +; + %uno = fcmp uno float %x, 0.000000e+00 + %ult = fcmp ult float %x, %y + %ret = or i1 %ult, %uno + ret i1 %ret +} + +define i1 @uno_implies_uno(float %x, float %y) { +; CHECK-LABEL: @uno_implies_uno( +; CHECK-NEXT: [[UNO2:%.*]] = fcmp uno float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[UNO2]] +; + %uno = fcmp uno float %x, 0.000000e+00 + %uno2 = fcmp uno float %x, %y + %ret = or i1 %uno, %uno2 + ret i1 %ret +} + +define i1 @ult_implies_ord(float %x, float %y) { +; CHECK-LABEL: @ult_implies_ord( +; CHECK-NEXT: ret i1 true +; + %ord = fcmp ord float %x, 0.000000e+00 + %ult = fcmp ult float %x, %y + %ret = or i1 %ult, %ord + ret i1 %ret +} + +; TODO: %cmp1 is false implies %cmp3 is true +define float @test_ord_implies_uno(float %x) { +; CHECK-LABEL: @test_ord_implies_uno( +; CHECK-NEXT: [[CMP1:%.*]] = fcmp ord float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: [[CMP2:%.*]] = fcmp olt float [[X]], 0.000000e+00 +; CHECK-NEXT: [[CMP3:%.*]] = fcmp uno 
float [[X]], 0.000000e+00 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i1 [[CMP2]], i1 [[CMP3]] +; CHECK-NEXT: [[RET:%.*]] = select i1 [[SEL]], float 0.000000e+00, float [[X]] +; CHECK-NEXT: ret float [[RET]] +; + %cmp1 = fcmp ord float %x, 0.000000e+00 + %cmp2 = fcmp olt float %x, 0.000000e+00 + %cmp3 = fcmp uno float %x, 0.000000e+00 + %sel = select i1 %cmp1, i1 %cmp2, i1 %cmp3 + %ret = select i1 %sel, float 0.000000e+00, float %x + ret float %ret +} + +; Negative tests + +define i1 @olt_implies_ord_fail(float %x, float %y, float %z) { +; CHECK-LABEL: @olt_implies_ord_fail( +; CHECK-NEXT: [[ORD:%.*]] = fcmp ord float [[X:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[X]], [[Y:%.*]] +; CHECK-NEXT: [[RET:%.*]] = and i1 [[OLT]], [[ORD]] +; CHECK-NEXT: ret i1 [[RET]] +; + %ord = fcmp ord float %x, %z + %olt = fcmp olt float %x, %y + %ret = and i1 %olt, %ord + ret i1 %ret +} + +define i1 @ult_implies_uno_and(float %x, float %y) { +; CHECK-LABEL: @ult_implies_uno_and( +; CHECK-NEXT: [[UNO:%.*]] = fcmp uno float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: [[ULT:%.*]] = fcmp ult float [[X]], [[Y:%.*]] +; CHECK-NEXT: [[RET:%.*]] = and i1 [[ULT]], [[UNO]] +; CHECK-NEXT: ret i1 [[RET]] +; + %uno = fcmp uno float %x, 0.000000e+00 + %ult = fcmp ult float %x, %y + %ret = and i1 %ult, %uno + ret i1 %ret +} + +define i1 @olt_implies_olt_fail(float %x, float %y) { +; CHECK-LABEL: @olt_implies_olt_fail( +; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: [[OLT2:%.*]] = fcmp olt float [[X]], [[Y:%.*]] +; CHECK-NEXT: [[RET:%.*]] = and i1 [[OLT]], [[OLT2]] +; CHECK-NEXT: ret i1 [[RET]] +; + %olt = fcmp olt float %x, 0.000000e+00 + %olt2 = fcmp olt float %x, %y + %ret = and i1 %olt, %olt2 + ret i1 %ret +} -- cgit v1.1 From 9ff3b82948c90c54f2f6ec20798c529cb93fab3b Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Thu, 8 Feb 2024 08:30:59 +0100 Subject: [AMDGPU] Revert Metadata Version Upgrade (#80995) Metadata is still 1.2, not 1.3 after V6. I thought that amdhsa.version mapped to the COV version but it's separate, and there are no MD changes in V6, hence it doesn't need to be updated. --- llvm/include/llvm/Support/AMDGPUMetadata.h | 2 +- llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll | 7 +++---- llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll | 11 +++++------ .../CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll | 11 +++++------ llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll | 9 ++++----- llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll | 9 ++++----- llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll | 11 +++++------ llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll | 11 +++++------ llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll | 11 +++++------ llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll | 11 +++++------ llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll | 11 +++++------ llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll | 11 +++++------ llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll | 11 +++++------ 13 files changed, 57 insertions(+), 69 deletions(-) diff --git a/llvm/include/llvm/Support/AMDGPUMetadata.h b/llvm/include/llvm/Support/AMDGPUMetadata.h index d5e0f40..76ac7ab 100644 --- a/llvm/include/llvm/Support/AMDGPUMetadata.h +++ b/llvm/include/llvm/Support/AMDGPUMetadata.h @@ -47,7 +47,7 @@ constexpr uint32_t VersionMinorV5 = 2; /// HSA metadata major version for code object V6. constexpr uint32_t VersionMajorV6 = 1; /// HSA metadata minor version for code object V6. 
-constexpr uint32_t VersionMinorV6 = 3; +constexpr uint32_t VersionMinorV6 = 2; /// Old HSA metadata beginning assembler directive for V2. This is only used for /// diagnostics now. diff --git a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll index 7404015..bc8f3eb 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll @@ -1,8 +1,8 @@ ; REQUIRES: asserts ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV4 %s ; RUN: not llc --crash -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV5,COV56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV6,COV56 %s ; AMDGPUAttributor deletes the function "by accident" so it's never ; codegened with optimizations. @@ -20,8 +20,7 @@ ; OPT-NEXT: amdhsa.version: ; OPT-NEXT: - 1 ; COV4: - 1 -; COV5: - 2 -; COV6: - 3 +; COV56: - 2 ; OPT: ... define internal i32 @func() { ret i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll index 4faaf60..89d89a7 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900" ; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx900 ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll index 2079db7..bc57c99 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx700" ; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx700 ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll index 5fa49c5..51351c3 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-' ; ASM: amdhsa.version: -; ASM: - 1 +; ASM: - 1 ; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll index 0d0a8d8..f408cbe 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 
's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+' ; ASM: amdhsa.version: -; ASM: - 1 +; ASM: - 1 ; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll index c29fb1f..78b3376 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-' ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll index 8f6a4ff..d1c98c7 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-' ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll index f24e0b2..adf84db 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+' ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll index 1493004..210b2e8 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 
's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+' ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll index f0af6ca..44e77a2 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx700" ; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx700 ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll index 5501ce9..3205dbe 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 
's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-' ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll index 4cec639..6e7c575 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+' ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 -- cgit v1.1 From a446c9bf69b4797da329977366ca62e55a429a90 Mon Sep 17 00:00:00 2001 From: martinboehme Date: Thu, 8 Feb 2024 08:38:35 +0100 Subject: [clang][dataflow] Add support for `CXXRewrittenBinaryOperator`. (#81086) This occurs in rewritten candidates for binary operators (a C++20 feature). The patch modifies UncheckedOptionalAccessModelTest to run in C++20 mode (as well as C++17 mode, as before) and to use rewritten candidates. The modified test fails without the newly added support for `CXXRewrittenBinaryOperator`. 
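As a hedged illustration of the AST shape involved (this snippet is not part
of the patch, and the names are invented): under `-std=c++20`, the `!=` below
is resolved through a rewritten candidate, so the expression is modeled as a
`CXXRewrittenBinaryOperator` whose semantic form is `!(L == R)`:

```cpp
struct Box {
  int Value;
  // Only operator== is declared; C++20 rewrites != in terms of it.
  friend bool operator==(const Box &L, const Box &R) {
    return L.Value == R.Value;
  }
};

bool boxesDiffer(const Box &L, const Box &R) {
  // In C++20 this call site is a CXXRewrittenBinaryOperator; the new
  // visitor propagates the value of its semantic form, !(L == R), to
  // the whole expression.
  return L != R;
}
```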
---
 clang/lib/Analysis/FlowSensitive/Transfer.cpp          |  4 ++++
 .../FlowSensitive/UncheckedOptionalAccessModelTest.cpp | 16 +++++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
index bb3aec7..a098471 100644
--- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
@@ -545,6 +545,10 @@ public:
     VisitCallExpr(S);
   }
 
+  void VisitCXXRewrittenBinaryOperator(const CXXRewrittenBinaryOperator *RBO) {
+    propagateValue(*RBO->getSemanticForm(), *RBO, Env);
+  }
+
   void VisitCXXFunctionalCastExpr(const CXXFunctionalCastExpr *S) {
     if (S->getCastKind() == CK_ConstructorConversion) {
       const Expr *SubExpr = S->getSubExpr();
diff --git a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
index 73fb406..b6e4973 100644
--- a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
@@ -770,12 +770,17 @@ constexpr bool operator!=(const optional<T> &lhs, const optional<T> &rhs);
 
 template <typename T>
 constexpr bool operator==(const optional<T> &opt, nullopt_t);
+
+// C++20 and later do not define the following overloads because they are
+// provided by rewritten candidates instead.
+#if __cplusplus < 202002L
 template <typename T>
 constexpr bool operator==(nullopt_t, const optional<T> &opt);
 template <typename T>
 constexpr bool operator!=(const optional<T> &opt, nullopt_t);
 template <typename T>
 constexpr bool operator!=(nullopt_t, const optional<T> &opt);
+#endif // __cplusplus < 202002L
 
 template <typename T, typename U>
 constexpr bool operator==(const optional<T> &opt, const U &value);
@@ -1289,6 +1294,15 @@ protected:
   template <typename FuncDeclMatcher>
   void ExpectDiagnosticsFor(std::string SourceCode,
                             FuncDeclMatcher FuncMatcher) {
+    // Run in C++17 and C++20 mode to cover differences in the AST between modes
+    // (e.g. C++20 can contain `CXXRewrittenBinaryOperator`).
+ for (const char *CxxMode : {"-std=c++17", "-std=c++20"}) + ExpectDiagnosticsFor(SourceCode, FuncMatcher, CxxMode); + } + + template + void ExpectDiagnosticsFor(std::string SourceCode, FuncDeclMatcher FuncMatcher, + const char *CxxMode) { ReplaceAllOccurrences(SourceCode, "$ns", GetParam().NamespaceName); ReplaceAllOccurrences(SourceCode, "$optional", GetParam().TypeName); @@ -1332,7 +1346,7 @@ protected: llvm::move(EltDiagnostics, std::back_inserter(Diagnostics)); }) .withASTBuildArgs( - {"-fsyntax-only", "-std=c++17", "-Wno-undefined-inline"}) + {"-fsyntax-only", CxxMode, "-Wno-undefined-inline"}) .withASTBuildVirtualMappedFiles( tooling::FileContentMappings(Headers.begin(), Headers.end())), /*VerifyResults=*/[&Diagnostics]( -- cgit v1.1 From a24b0c351a75a87410203dd3777c0d8ee87f65c1 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 8 Feb 2024 13:20:34 +0530 Subject: clang/AMDGPU: Regenerate test checks in hip header tests --- clang/test/Headers/__clang_hip_cmath.hip | 16 +- clang/test/Headers/__clang_hip_math.hip | 896 +++++++++++++++---------------- 2 files changed, 456 insertions(+), 456 deletions(-) diff --git a/clang/test/Headers/__clang_hip_cmath.hip b/clang/test/Headers/__clang_hip_cmath.hip index c194f44..cd085fd 100644 --- a/clang/test/Headers/__clang_hip_cmath.hip +++ b/clang/test/Headers/__clang_hip_cmath.hip @@ -61,13 +61,13 @@ extern "C" __device__ float test_fabs_f32(float x) { // DEFAULT-LABEL: @test_sin_f32( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR8:[0-9]+]] -// DEFAULT-NEXT: ret float [[CALL_I_I]] +// DEFAULT-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR8:[0-9]+]] +// DEFAULT-NEXT: ret float [[CALL_I1]] // // FINITEONLY-LABEL: @test_sin_f32( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8:[0-9]+]] -// FINITEONLY-NEXT: ret float [[CALL_I_I]] +// FINITEONLY-NEXT: [[CALL_I1:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8:[0-9]+]] +// FINITEONLY-NEXT: ret float [[CALL_I1]] // extern "C" __device__ float test_sin_f32(float x) { return sin(x); @@ -75,13 +75,13 @@ extern "C" __device__ float test_sin_f32(float x) { // DEFAULT-LABEL: @test_cos_f32( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR8]] -// DEFAULT-NEXT: ret float [[CALL_I_I]] +// DEFAULT-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR8]] +// DEFAULT-NEXT: ret float [[CALL_I1]] // // FINITEONLY-LABEL: @test_cos_f32( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8]] -// FINITEONLY-NEXT: ret float [[CALL_I_I]] +// FINITEONLY-NEXT: [[CALL_I1:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8]] +// FINITEONLY-NEXT: ret float [[CALL_I1]] // extern "C" __device__ float test_cos_f32(float x) { return cos(x); diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip index 5230c36..e9a9cb4 100644 --- 
a/clang/test/Headers/__clang_hip_math.hip +++ b/clang/test/Headers/__clang_hip_math.hip @@ -258,17 +258,17 @@ extern "C" __device__ long long test_llabs(long x) { // DEFAULT-LABEL: @test_acosf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR12:[0-9]+]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_acosf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14:[0-9]+]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12:[0-9]+]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_acosf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR12:[0-9]+]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_acosf(float x) { @@ -277,17 +277,17 @@ extern "C" __device__ float test_acosf(float x) { // DEFAULT-LABEL: @test_acos( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_acos( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_acos( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_acos(double x) { @@ -296,17 +296,17 @@ extern "C" __device__ double test_acos(double x) { // DEFAULT-LABEL: @test_acoshf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR15:[0-9]+]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR13:[0-9]+]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_acoshf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15:[0-9]+]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13:[0-9]+]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // 
APPROX-LABEL: @test_acoshf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR15:[0-9]+]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR13:[0-9]+]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_acoshf(float x) { @@ -315,17 +315,17 @@ extern "C" __device__ float test_acoshf(float x) { // DEFAULT-LABEL: @test_acosh( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_acosh( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_acosh( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_acosh(double x) { @@ -334,17 +334,17 @@ extern "C" __device__ double test_acosh(double x) { // DEFAULT-LABEL: @test_asinf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_asinf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_asinf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_asinf(float x) { @@ -353,17 +353,17 @@ extern "C" __device__ float test_asinf(float x) { // DEFAULT-LABEL: @test_asin( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_asin( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asin_f64(double noundef nofpclass(nan inf) 
[[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asin_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_asin( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_asin(double x) { @@ -373,17 +373,17 @@ extern "C" __device__ double test_asin(double x) { // DEFAULT-LABEL: @test_asinhf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_asinhf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asinh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asinh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_asinhf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_asinhf(float x) { @@ -392,17 +392,17 @@ extern "C" __device__ float test_asinhf(float x) { // DEFAULT-LABEL: @test_asinh( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_asinh( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asinh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asinh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_asinh( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_asinh(double x) { @@ -411,17 +411,17 @@ extern "C" __device__ double test_asinh(double x) { // DEFAULT-LABEL: @test_atan2f( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], 
float noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_atan2f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan2_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan2_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_atan2f( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_atan2f(float x, float y) { @@ -430,17 +430,17 @@ extern "C" __device__ float test_atan2f(float x, float y) { // DEFAULT-LABEL: @test_atan2( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_atan2( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan2_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan2_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_atan2( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_atan2(double x, double y) { @@ -449,17 +449,17 @@ extern "C" __device__ double test_atan2(double x, double y) { // DEFAULT-LABEL: @test_atanf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_atanf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_atanf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float 
@__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_atanf(float x) { @@ -468,17 +468,17 @@ extern "C" __device__ float test_atanf(float x) { // DEFAULT-LABEL: @test_atan( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_atan( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_atan( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_atan(double x) { @@ -487,17 +487,17 @@ extern "C" __device__ double test_atan(double x) { // DEFAULT-LABEL: @test_atanhf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_atanhf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_atanhf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_atanhf(float x) { @@ -506,17 +506,17 @@ extern "C" __device__ float test_atanhf(float x) { // DEFAULT-LABEL: @test_atanh( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_atanh( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double 
@__ocml_atanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_atanh( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_atanh(double x) { @@ -525,17 +525,17 @@ extern "C" __device__ double test_atanh(double x) { // DEFAULT-LABEL: @test_cbrtf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_cbrtf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_cbrtf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_cbrtf(float x) { @@ -544,17 +544,17 @@ extern "C" __device__ float test_cbrtf(float x) { // DEFAULT-LABEL: @test_cbrt( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_cbrt( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_cbrt( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cbrt(double x) { @@ -639,17 +639,17 @@ extern "C" __device__ double test_copysign(double x, double y) { // DEFAULT-LABEL: @test_cosf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR16:[0-9]+]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_cosf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: 
[[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16:[0-9]+]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14:[0-9]+]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_cosf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16:[0-9]+]] +// APPROX-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]] // APPROX-NEXT: ret float [[CALL_I1]] // extern "C" __device__ float test_cosf(float x) { @@ -658,17 +658,17 @@ extern "C" __device__ float test_cosf(float x) { // DEFAULT-LABEL: @test_cos( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_cos( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_cos( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cos(double x) { @@ -677,17 +677,17 @@ extern "C" __device__ double test_cos(double x) { // DEFAULT-LABEL: @test_coshf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_coshf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_coshf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_coshf(float x) { @@ -696,17 +696,17 @@ extern "C" __device__ float test_coshf(float x) { // DEFAULT-LABEL: @test_cosh( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// 
DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_cosh( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_cosh( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cosh(double x) { @@ -715,17 +715,17 @@ extern "C" __device__ double test_cosh(double x) { // DEFAULT-LABEL: @test_cospif( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_cospif( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cospi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cospi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_cospif( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_cospif(float x) { @@ -734,17 +734,17 @@ extern "C" __device__ float test_cospif(float x) { // DEFAULT-LABEL: @test_cospi( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_cospi( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cospi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cospi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_cospi( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cospi(double x) { @@ -753,17 
+753,17 @@ extern "C" __device__ double test_cospi(double x) { // DEFAULT-LABEL: @test_cyl_bessel_i0f( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_cyl_bessel_i0f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_cyl_bessel_i0f( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_cyl_bessel_i0f(float x) { @@ -772,17 +772,17 @@ extern "C" __device__ float test_cyl_bessel_i0f(float x) { // DEFAULT-LABEL: @test_cyl_bessel_i0( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_cyl_bessel_i0( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_cyl_bessel_i0( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cyl_bessel_i0(double x) { @@ -791,17 +791,17 @@ extern "C" __device__ double test_cyl_bessel_i0(double x) { // DEFAULT-LABEL: @test_cyl_bessel_i1f( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_cyl_bessel_i1f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_cyl_bessel_i1f( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract 
noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_cyl_bessel_i1f(float x) { @@ -810,17 +810,17 @@ extern "C" __device__ float test_cyl_bessel_i1f(float x) { // DEFAULT-LABEL: @test_cyl_bessel_i1( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_cyl_bessel_i1( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_cyl_bessel_i1( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cyl_bessel_i1(double x) { @@ -829,17 +829,17 @@ extern "C" __device__ double test_cyl_bessel_i1(double x) { // DEFAULT-LABEL: @test_erfcf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_erfcf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfc_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfc_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_erfcf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_erfcf(float x) { @@ -848,17 +848,17 @@ extern "C" __device__ float test_erfcf(float x) { // DEFAULT-LABEL: @test_erfc( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_erfc( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_erfc_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract 
noundef nofpclass(nan inf) double @__ocml_erfc_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_erfc( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_erfc(double x) { @@ -867,17 +867,17 @@ extern "C" __device__ double test_erfc(double x) { // DEFAULT-LABEL: @test_erfinvf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_erfinvf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_erfinvf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_erfinvf(float x) { @@ -886,17 +886,17 @@ extern "C" __device__ float test_erfinvf(float x) { // DEFAULT-LABEL: @test_erfinv( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_erfinv( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_erfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_erfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_erfinv( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_erfinv(double x) { @@ -905,17 +905,17 @@ extern "C" __device__ double test_erfinv(double x) { // DEFAULT-LABEL: @test_exp10f( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_exp10f( // 
FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_exp10_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_exp10_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_exp10f( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_exp10f(float x) { @@ -924,17 +924,17 @@ extern "C" __device__ float test_exp10f(float x) { // DEFAULT-LABEL: @test_exp10( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_exp10( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_exp10( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_exp10(double x) { @@ -962,17 +962,17 @@ extern "C" __device__ float test_exp2f(float x) { // DEFAULT-LABEL: @test_exp2( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_exp2( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_exp2( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_exp2(double x) { @@ -1000,17 +1000,17 @@ extern "C" __device__ float test_expf(float x) { // DEFAULT-LABEL: @test_exp( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double 
@__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_exp( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_exp( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_exp(double x) { @@ -1019,17 +1019,17 @@ extern "C" __device__ double test_exp(double x) { // DEFAULT-LABEL: @test_expm1f( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_expm1f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_expm1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_expm1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_expm1f( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_expm1f(float x) { @@ -1038,17 +1038,17 @@ extern "C" __device__ float test_expm1f(float x) { // DEFAULT-LABEL: @test_expm1( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_expm1( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_expm1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_expm1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_expm1( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" 
__device__ double test_expm1(double x) { @@ -1095,17 +1095,17 @@ extern "C" __device__ double test_fabs(double x) { // DEFAULT-LABEL: @test_fdimf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_fdimf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_fdim_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_fdim_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_fdimf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_fdimf(float x, float y) { @@ -1114,17 +1114,17 @@ extern "C" __device__ float test_fdimf(float x, float y) { // DEFAULT-LABEL: @test_fdim( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_fdim( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fdim_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fdim_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_fdim( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_fdim(double x, double y) { @@ -1323,17 +1323,17 @@ extern "C" __device__ double test_fmin(double x, double y) { // DEFAULT-LABEL: @test_fmodf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_fmodf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan 
ninf contract noundef nofpclass(nan inf) float @__ocml_fmod_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_fmod_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_fmodf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_fmodf(float x, float y) { @@ -1342,17 +1342,17 @@ extern "C" __device__ float test_fmodf(float x, float y) { // DEFAULT-LABEL: @test_fmod( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_fmod( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fmod_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fmod_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_fmod( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_fmod(double x, double y) { @@ -1385,17 +1385,17 @@ extern "C" __device__ double test_frexp(double x, int* y) { // DEFAULT-LABEL: @test_hypotf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_hypotf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_hypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_hypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_hypotf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) 
#[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_hypotf(float x, float y) { @@ -1404,17 +1404,17 @@ extern "C" __device__ float test_hypotf(float x, float y) { // DEFAULT-LABEL: @test_hypot( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_hypot( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_hypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_hypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_hypot( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_hypot(double x, double y) { @@ -1423,17 +1423,17 @@ extern "C" __device__ double test_hypot(double x, double y) { // DEFAULT-LABEL: @test_ilogbf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret i32 [[CALL_I]] // // FINITEONLY-LABEL: @test_ilogbf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret i32 [[CALL_I]] // // APPROX-LABEL: @test_ilogbf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret i32 [[CALL_I]] // extern "C" __device__ int test_ilogbf(float x) { @@ -1442,17 +1442,17 @@ extern "C" __device__ int test_ilogbf(float x) { // DEFAULT-LABEL: @test_ilogb( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret i32 [[CALL_I]] // // FINITEONLY-LABEL: @test_ilogb( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call noundef i32 
@__ocml_ilogb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret i32 [[CALL_I]]
 //
 // APPROX-LABEL: @test_ilogb(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret i32 [[CALL_I]]
 //
 extern "C" __device__ int test_ilogb(double x) {
@@ -1589,17 +1589,17 @@ extern "C" __device__ BOOL_TYPE test___isnan(double x) {
 // DEFAULT-LABEL: @test_j0f(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_j0f(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_j0f(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_j0f(float x) {
@@ -1608,17 +1608,17 @@ extern "C" __device__ float test_j0f(float x) {
 // DEFAULT-LABEL: @test_j0(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_j0(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_j0(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_j0(double x) {
@@ -1627,17 +1627,17 @@ extern "C" __device__ double test_j0(double x) {
 // DEFAULT-LABEL: @test_j1f(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_j1f(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_j1f(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_j1f(float x) {
@@ -1646,17 +1646,17 @@ extern "C" __device__ float test_j1f(float x) {
 // DEFAULT-LABEL: @test_j1(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_j1(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_j1(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_j1(double x) {
@@ -1670,14 +1670,14 @@ extern "C" __device__ double test_j1(double x) {
 // DEFAULT-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
 // DEFAULT-NEXT: ]
 // DEFAULT: if.then.i:
-// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT: br label [[_ZL3JNFIF_EXIT:%.*]]
 // DEFAULT: if.then2.i:
-// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR14]]
 // DEFAULT-NEXT: br label [[_ZL3JNFIF_EXIT]]
 // DEFAULT: if.end4.i:
-// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR16]]
-// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR14]]
 // DEFAULT-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
 // DEFAULT-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]]
 // DEFAULT: for.body.i:
@@ -1703,14 +1703,14 @@ extern "C" __device__ double test_j1(double x) {
 // FINITEONLY-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
 // FINITEONLY-NEXT: ]
 // FINITEONLY: if.then.i:
-// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT: br label [[_ZL3JNFIF_EXIT:%.*]]
 // FINITEONLY: if.then2.i:
-// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
 // FINITEONLY-NEXT: br label [[_ZL3JNFIF_EXIT]]
 // FINITEONLY: if.end4.i:
-// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
-// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
 // FINITEONLY-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
 // FINITEONLY-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]]
 // FINITEONLY: for.body.i:
@@ -1736,14 +1736,14 @@ extern "C" __device__ double test_j1(double x) {
 // APPROX-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
 // APPROX-NEXT: ]
 // APPROX: if.then.i:
-// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT: br label [[_ZL3JNFIF_EXIT:%.*]]
 // APPROX: if.then2.i:
-// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR14]]
 // APPROX-NEXT: br label [[_ZL3JNFIF_EXIT]]
 // APPROX: if.end4.i:
-// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR16]]
-// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR14]]
 // APPROX-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
 // APPROX-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]]
 // APPROX: for.body.i:
@@ -1773,14 +1773,14 @@ extern "C" __device__ float test_jnf(int x, float y) {
 // DEFAULT-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
 // DEFAULT-NEXT: ]
 // DEFAULT: if.then.i:
-// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT: br label [[_ZL2JNID_EXIT:%.*]]
 // DEFAULT: if.then2.i:
-// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR14]]
 // DEFAULT-NEXT: br label [[_ZL2JNID_EXIT]]
 // DEFAULT: if.end4.i:
-// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR16]]
-// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR14]]
 // DEFAULT-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
 // DEFAULT-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]]
 // DEFAULT: for.body.i:
@@ -1806,14 +1806,14 @@ extern "C" __device__ float test_jnf(int x, float y) {
 // FINITEONLY-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
 // FINITEONLY-NEXT: ]
 // FINITEONLY: if.then.i:
-// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT: br label [[_ZL2JNID_EXIT:%.*]]
 // FINITEONLY: if.then2.i:
-// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
 // FINITEONLY-NEXT: br label [[_ZL2JNID_EXIT]]
 // FINITEONLY: if.end4.i:
-// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
-// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
 // FINITEONLY-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
 // FINITEONLY-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]]
 // FINITEONLY: for.body.i:
@@ -1839,14 +1839,14 @@ extern "C" __device__ float test_jnf(int x, float y) {
 // APPROX-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
 // APPROX-NEXT: ]
 // APPROX: if.then.i:
-// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT: br label [[_ZL2JNID_EXIT:%.*]]
 // APPROX: if.then2.i:
-// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR14]]
 // APPROX-NEXT: br label [[_ZL2JNID_EXIT]]
 // APPROX: if.end4.i:
-// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR16]]
-// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR14]]
 // APPROX-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
 // APPROX-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]]
 // APPROX: for.body.i:
@@ -1909,17 +1909,17 @@ extern "C" __device__ double test_ldexp(double x, int y) {
 // DEFAULT-LABEL: @test_lgammaf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_lgammaf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_lgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_lgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_lgammaf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_lgammaf(float x) {
@@ -1928,17 +1928,17 @@ extern "C" __device__ float test_lgammaf(float x) {
 // DEFAULT-LABEL: @test_lgamma(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_lgamma(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_lgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_lgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_lgamma(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_lgamma(double x) {
@@ -2054,17 +2054,17 @@ extern "C" __device__ float test_log10f(float x) {
 // DEFAULT-LABEL: @test_log10(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_log10(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_log10(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_log10(double x) {
@@ -2073,17 +2073,17 @@ extern "C" __device__ double test_log10(double x) {
 // DEFAULT-LABEL: @test_log1pf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_log1pf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log1p_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log1p_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_log1pf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_log1pf(float x) {
@@ -2092,17 +2092,17 @@ extern "C" __device__ float test_log1pf(float x) {
 // DEFAULT-LABEL: @test_log1p(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_log1p(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log1p_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log1p_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_log1p(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_log1p(double x) {
@@ -2111,12 +2111,12 @@ extern "C" __device__ double test_log1p(double x) {
 // DEFAULT-LABEL: @test_log2f(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log2_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log2_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_log2f(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log2_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log2_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_log2f(
@@ -2130,17 +2130,17 @@ extern "C" __device__ float test_log2f(float x) {
 // DEFAULT-LABEL: @test_log2(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_log2(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_log2(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_log2(double x) {
@@ -2149,17 +2149,17 @@ extern "C" __device__ double test_log2(double x) {
 // DEFAULT-LABEL: @test_logbf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_logbf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_logb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_logb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_logbf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_logbf(float x) {
@@ -2168,17 +2168,17 @@ extern "C" __device__ float test_logbf(float x) {
 // DEFAULT-LABEL: @test_logb(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_logb(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_logb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_logb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_logb(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_logb(double x) {
@@ -2187,12 +2187,12 @@ extern "C" __device__ double test_logb(double x) {
 // DEFAULT-LABEL: @test_logf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_logf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_logf(
@@ -2295,31 +2295,31 @@ extern "C" __device__ long int test_lround(double x) {
 // DEFAULT-LABEL: @test_modff(
 // DEFAULT-NEXT: entry:
 // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
-// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17:[0-9]+]]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15:[0-9]+]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16:![0-9]+]]
 // DEFAULT-NEXT: store float [[TMP0]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
-// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_modff(
 // FINITEONLY-NEXT: entry:
 // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
-// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17:[0-9]+]]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_modf_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15:[0-9]+]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_modf_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16:![0-9]+]]
 // FINITEONLY-NEXT: store float [[TMP0]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
-// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_modff(
 // APPROX-NEXT: entry:
 // APPROX-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
-// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17:[0-9]+]]
-// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15:[0-9]+]]
+// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // APPROX-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16:![0-9]+]]
 // APPROX-NEXT: store float [[TMP0]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
-// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_modff(float x, float* y) {
@@ -2329,31 +2329,31 @@ extern "C" __device__ float test_modff(float x, float* y) {
 // DEFAULT-LABEL: @test_modf(
 // DEFAULT-NEXT: entry:
 // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
-// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18:![0-9]+]]
 // DEFAULT-NEXT: store double [[TMP0]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
-// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_modf(
 // FINITEONLY-NEXT: entry:
 // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
-// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_modf_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_modf_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18:![0-9]+]]
 // FINITEONLY-NEXT: store double [[TMP0]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
-// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_modf(
 // APPROX-NEXT: entry:
 // APPROX-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
-// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // APPROX-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18:![0-9]+]]
 // APPROX-NEXT: store double [[TMP0]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
-// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_modf(double x, double* y) {
@@ -2629,17 +2629,17 @@ extern "C" __device__ double test_nearbyint(double x) {
 // DEFAULT-LABEL: @test_nextafterf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_nextafterf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_nextafter_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_nextafter_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_nextafterf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_nextafterf(float x, float y) {
@@ -2648,17 +2648,17 @@ extern "C" __device__ float test_nextafterf(float x, float y) {
 // DEFAULT-LABEL: @test_nextafter(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_nextafter(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_nextafter_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_nextafter_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_nextafter(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_nextafter(double x, double y) {
@@ -2667,17 +2667,17 @@ extern "C" __device__ double test_nextafter(double x, double y) {
 // DEFAULT-LABEL: @test_norm3df(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_norm3df(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_norm3df(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_norm3df(float x, float y, float z) {
@@ -2686,17 +2686,17 @@ extern "C" __device__ float test_norm3df(float x, float y, float z) {
 // DEFAULT-LABEL: @test_norm3d(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_norm3d(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_norm3d(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_norm3d(double x, double y, double z) {
@@ -2705,17 +2705,17 @@ extern "C" __device__ double test_norm3d(double x, double y, double z) {
 // DEFAULT-LABEL: @test_norm4df(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_norm4df(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_norm4df(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_norm4df(float x, float y, float z, float w) {
@@ -2724,17 +2724,17 @@ extern "C" __device__ float test_norm4df(float x, float y, float z, float w) {
 // DEFAULT-LABEL: @test_norm4d(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_norm4d(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len4_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len4_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_norm4d(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_norm4d(double x, double y, double z, double w) {
@@ -2743,17 +2743,17 @@ extern "C" __device__ double test_norm4d(double x, double y, double z, double w)
 // DEFAULT-LABEL: @test_normcdff(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_normcdff(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdf_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdf_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_normcdff(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_normcdff(float x) {
@@ -2762,17 +2762,17 @@ extern "C" __device__ float test_normcdff(float x) {
 // DEFAULT-LABEL: @test_normcdf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_normcdf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdf_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdf_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_normcdf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_normcdf(double x) {
@@ -2781,17 +2781,17 @@ extern "C" __device__ double test_normcdf(double x) {
 // DEFAULT-LABEL: @test_normcdfinvf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_normcdfinvf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_normcdfinvf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_normcdfinvf(float x) {
@@ -2800,17 +2800,17 @@ extern "C" __device__ float test_normcdfinvf(float x) {
 // DEFAULT-LABEL: @test_normcdfinv(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_normcdfinv(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_normcdfinv(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_normcdfinv(double x) {
@@ -2947,17 +2947,17 @@ extern "C" __device__ double test_norm(int x, const double *y) {
 // DEFAULT-LABEL: @test_powf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_powf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_powf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_powf(float x, float y) {
@@ -2966,17 +2966,17 @@ extern "C" __device__ float test_powf(float x, float y) {
 // DEFAULT-LABEL: @test_pow(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_pow(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pow_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pow_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_pow(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_pow(double x, double y) {
@@ -2985,17 +2985,17 @@ extern "C" __device__ double test_pow(double x, double y) {
 // DEFAULT-LABEL: @test_powif(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_powif(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pown_f32(float noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pown_f32(float noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_powif(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_powif(float x, int y) {
@@ -3004,17 +3004,17 @@ extern "C" __device__ float test_powif(float x, int y) {
 // DEFAULT-LABEL: @test_powi(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_powi(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pown_f64(double noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pown_f64(double noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_powi(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_powi(double x, int y) {
@@ -3023,17 +3023,17 @@ extern "C" __device__ double test_powi(double x, int y) {
 // DEFAULT-LABEL: @test_rcbrtf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_rcbrtf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rcbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rcbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_rcbrtf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_rcbrtf(float x) {
@@ -3042,17 +3042,17 @@ extern "C" __device__ float test_rcbrtf(float x) {
 // DEFAULT-LABEL: @test_rcbrt(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_rcbrt(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rcbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rcbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_rcbrt(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_rcbrt(double x) {
@@ -3061,17 +3061,17 @@ extern "C" __device__ double test_rcbrt(double x) {
 // DEFAULT-LABEL: @test_remainderf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_remainderf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remainder_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remainder_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_remainderf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_remainderf(float x, float y) {
@@ -3080,17 +3080,17 @@ extern "C" __device__ float test_remainderf(float x, float y) {
 // DEFAULT-LABEL: @test_remainder(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_remainder(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remainder_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remainder_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_remainder(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_remainder(double x, double y) {
@@ -3100,31 +3100,31 @@ extern "C" __device__ double test_remainder(double x, double y) {
 // DEFAULT-LABEL: @test_remquof(
 // DEFAULT-NEXT: entry:
 // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
 // DEFAULT-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
-// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_remquof(
 // FINITEONLY-NEXT: entry:
 // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remquo_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remquo_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // FINITEONLY-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
 // FINITEONLY-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
-// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_remquof(
 // APPROX-NEXT: entry:
 // APPROX-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // APPROX-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
 // APPROX-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
-// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_remquof(float x, float y, int* z) {
@@ -3134,31 +3134,31 @@ extern "C" __device__ float test_remquof(float x, float y, int* z) {
 // DEFAULT-LABEL: @test_remquo(
 // DEFAULT-NEXT: entry:
 // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
 // DEFAULT-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
-// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_remquo(
 // FINITEONLY-NEXT: entry:
 // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remquo_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remquo_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // FINITEONLY-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
 // FINITEONLY-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
-// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_remquo(
 // APPROX-NEXT: entry:
 // APPROX-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // APPROX-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
// APPROX-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]] -// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_remquo(double x, double y, int* z) { @@ -3167,17 +3167,17 @@ extern "C" __device__ double test_remquo(double x, double y, int* z) { // DEFAULT-LABEL: @test_rhypotf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_rhypotf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rhypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rhypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_rhypotf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_rhypotf(float x, float y) { @@ -3186,17 +3186,17 @@ extern "C" __device__ float test_rhypotf(float x, float y) { // DEFAULT-LABEL: @test_rhypot( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_rhypot( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rhypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rhypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_rhypot( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_rhypot(double x, double y) { @@ -3258,7 +3258,7 @@ extern "C" __device__ double test_rint(double x) { // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop 
[[LOOP22:![0-9]+]] // DEFAULT: _ZL6rnormfiPKf.exit: // DEFAULT-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_I_LCSSA]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_I_LCSSA]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_rnormf( @@ -3278,7 +3278,7 @@ extern "C" __device__ double test_rint(double x) { // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] // FINITEONLY: _ZL6rnormfiPKf.exit: // FINITEONLY-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[__R_0_I_LCSSA]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[__R_0_I_LCSSA]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_rnormf( @@ -3298,7 +3298,7 @@ extern "C" __device__ double test_rint(double x) { // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] // APPROX: _ZL6rnormfiPKf.exit: // APPROX-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_I_LCSSA]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_I_LCSSA]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_rnormf(int x, const float* y) { @@ -3322,7 +3322,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // DEFAULT: _ZL5rnormiPKd.exit: // DEFAULT-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_I_LCSSA]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_I_LCSSA]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_rnorm( @@ -3342,7 +3342,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // FINITEONLY: _ZL5rnormiPKd.exit: // FINITEONLY-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[__R_0_I_LCSSA]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[__R_0_I_LCSSA]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_rnorm( @@ -3362,7 +3362,7 @@ extern "C" __device__ 
float test_rnormf(int x, const float* y) { // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // APPROX: _ZL5rnormiPKd.exit: // APPROX-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_I_LCSSA]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_I_LCSSA]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_rnorm(int x, const double* y) { @@ -3371,17 +3371,17 @@ extern "C" __device__ double test_rnorm(int x, const double* y) { // DEFAULT-LABEL: @test_rnorm3df( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_rnorm3df( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_rnorm3df( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_rnorm3df(float x, float y, float z) { @@ -3390,17 +3390,17 @@ extern "C" __device__ float test_rnorm3df(float x, float y, float z) { // DEFAULT-LABEL: @test_rnorm3d( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_rnorm3d( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // 
APPROX-LABEL: @test_rnorm3d( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_rnorm3d(double x, double y, double z) { @@ -3409,17 +3409,17 @@ extern "C" __device__ double test_rnorm3d(double x, double y, double z) { // DEFAULT-LABEL: @test_rnorm4df( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_rnorm4df( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_rnorm4df( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_rnorm4df(float x, float y, float z, float w) { @@ -3428,17 +3428,17 @@ extern "C" __device__ float test_rnorm4df(float x, float y, float z, float w) { // DEFAULT-LABEL: @test_rnorm4d( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_rnorm4d( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen4_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen4_f64(double noundef nofpclass(nan inf) [[X:%.*]], 
double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_rnorm4d( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_rnorm4d(double x, double y, double z, double w) { @@ -3485,17 +3485,17 @@ extern "C" __device__ double test_round(double x) { // DEFAULT-LABEL: @test_rsqrtf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_rsqrtf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_rsqrtf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_rsqrtf(float x) { @@ -3504,17 +3504,17 @@ extern "C" __device__ float test_rsqrtf(float x) { // DEFAULT-LABEL: @test_rsqrt( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_rsqrt( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_rsqrt( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_rsqrt(double x) { @@ -3530,7 +3530,7 @@ extern "C" __device__ double test_rsqrt(double x) { // DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[CONV_I]]) // DEFAULT-NEXT: br label [[_ZL8SCALBLNFFL_EXIT:%.*]] // 
DEFAULT: cond.false.i: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_scalb_f32(float noundef [[X]], float noundef 0x43E0000000000000) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_scalb_f32(float noundef [[X]], float noundef 0x43E0000000000000) #[[ATTR12]] // DEFAULT-NEXT: br label [[_ZL8SCALBLNFFL_EXIT]] // DEFAULT: _ZL8scalblnffl.exit: // DEFAULT-NEXT: [[COND_I:%.*]] = phi contract float [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ] @@ -3545,7 +3545,7 @@ extern "C" __device__ double test_rsqrt(double x) { // FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[CONV_I]]) // FINITEONLY-NEXT: br label [[_ZL8SCALBLNFFL_EXIT:%.*]] // FINITEONLY: cond.false.i: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_scalb_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) 0x43E0000000000000) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_scalb_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) 0x43E0000000000000) #[[ATTR12]] // FINITEONLY-NEXT: br label [[_ZL8SCALBLNFFL_EXIT]] // FINITEONLY: _ZL8scalblnffl.exit: // FINITEONLY-NEXT: [[COND_I:%.*]] = phi nnan ninf contract float [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ] @@ -3560,7 +3560,7 @@ extern "C" __device__ double test_rsqrt(double x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[CONV_I]]) // APPROX-NEXT: br label [[_ZL8SCALBLNFFL_EXIT:%.*]] // APPROX: cond.false.i: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_scalb_f32(float noundef [[X]], float noundef 0x43E0000000000000) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_scalb_f32(float noundef [[X]], float noundef 0x43E0000000000000) #[[ATTR12]] // APPROX-NEXT: br label [[_ZL8SCALBLNFFL_EXIT]] // APPROX: _ZL8scalblnffl.exit: // APPROX-NEXT: [[COND_I:%.*]] = phi contract float [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ] @@ -3579,7 +3579,7 @@ extern "C" __device__ float test_scalblnf(float x, long int y) { // DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[CONV_I]]) // DEFAULT-NEXT: br label [[_ZL7SCALBLNDL_EXIT:%.*]] // DEFAULT: cond.false.i: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract double @__ocml_scalb_f64(double noundef [[X]], double noundef 0x43E0000000000000) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract double @__ocml_scalb_f64(double noundef [[X]], double noundef 0x43E0000000000000) #[[ATTR12]] // DEFAULT-NEXT: br label [[_ZL7SCALBLNDL_EXIT]] // DEFAULT: _ZL7scalblndl.exit: // DEFAULT-NEXT: [[COND_I:%.*]] = phi contract double [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ] @@ -3594,7 +3594,7 @@ extern "C" __device__ float test_scalblnf(float x, long int y) { // FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[CONV_I]]) // FINITEONLY-NEXT: br label [[_ZL7SCALBLNDL_EXIT:%.*]] // FINITEONLY: cond.false.i: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_scalb_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) 0x43E0000000000000) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract 
nofpclass(nan inf) double @__ocml_scalb_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) 0x43E0000000000000) #[[ATTR12]] // FINITEONLY-NEXT: br label [[_ZL7SCALBLNDL_EXIT]] // FINITEONLY: _ZL7scalblndl.exit: // FINITEONLY-NEXT: [[COND_I:%.*]] = phi nnan ninf contract double [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ] @@ -3609,7 +3609,7 @@ extern "C" __device__ float test_scalblnf(float x, long int y) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[CONV_I]]) // APPROX-NEXT: br label [[_ZL7SCALBLNDL_EXIT:%.*]] // APPROX: cond.false.i: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract double @__ocml_scalb_f64(double noundef [[X]], double noundef 0x43E0000000000000) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract double @__ocml_scalb_f64(double noundef [[X]], double noundef 0x43E0000000000000) #[[ATTR12]] // APPROX-NEXT: br label [[_ZL7SCALBLNDL_EXIT]] // APPROX: _ZL7scalblndl.exit: // APPROX-NEXT: [[COND_I:%.*]] = phi contract double [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ] @@ -3681,34 +3681,34 @@ extern "C" __device__ BOOL_TYPE test___signbit(double x) { // DEFAULT-LABEL: @test_sincosf( // DEFAULT-NEXT: entry: // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) -// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // DEFAULT-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] -// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // DEFAULT-NEXT: ret void // // FINITEONLY-LABEL: @test_sincosf( // FINITEONLY-NEXT: entry: // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) -// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_sincos_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_sincos_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // FINITEONLY-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] -// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] 
// FINITEONLY-NEXT: ret void // // APPROX-LABEL: @test_sincosf( // APPROX-NEXT: entry: // APPROX-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) -// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// APPROX-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // APPROX-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] -// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret void // extern "C" __device__ void test_sincosf(float x, float *y, float *z) { @@ -3718,34 +3718,34 @@ extern "C" __device__ void test_sincosf(float x, float *y, float *z) { // DEFAULT-LABEL: @test_sincos( // DEFAULT-NEXT: entry: // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) -// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // DEFAULT-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] // DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] // DEFAULT-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] -// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // DEFAULT-NEXT: ret void // // FINITEONLY-LABEL: @test_sincos( // FINITEONLY-NEXT: entry: // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) -// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_sincos_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_sincos_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // FINITEONLY-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] // FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] // FINITEONLY-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] -// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// 
FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // FINITEONLY-NEXT: ret void // // APPROX-LABEL: @test_sincos( // APPROX-NEXT: entry: // APPROX-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) -// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// APPROX-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // APPROX-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] // APPROX-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] // APPROX-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] -// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret void // extern "C" __device__ void test_sincos(double x, double *y, double *z) { @@ -3755,34 +3755,34 @@ extern "C" __device__ void test_sincos(double x, double *y, double *z) { // DEFAULT-LABEL: @test_sincospif( // DEFAULT-NEXT: entry: // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) -// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // DEFAULT-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] -// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // DEFAULT-NEXT: ret void // // FINITEONLY-LABEL: @test_sincospif( // FINITEONLY-NEXT: entry: // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) -// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_sincospi_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_sincospi_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // FINITEONLY-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa 
[[TBAA16]] -// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // FINITEONLY-NEXT: ret void // // APPROX-LABEL: @test_sincospif( // APPROX-NEXT: entry: // APPROX-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) -// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// APPROX-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // APPROX-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] -// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret void // extern "C" __device__ void test_sincospif(float x, float *y, float *z) { @@ -3792,34 +3792,34 @@ extern "C" __device__ void test_sincospif(float x, float *y, float *z) { // DEFAULT-LABEL: @test_sincospi( // DEFAULT-NEXT: entry: // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) -// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // DEFAULT-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] // DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] // DEFAULT-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] -// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // DEFAULT-NEXT: ret void // // FINITEONLY-LABEL: @test_sincospi( // FINITEONLY-NEXT: entry: // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) -// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_sincospi_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_sincospi_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // FINITEONLY-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] // FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr 
addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] // FINITEONLY-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] -// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // FINITEONLY-NEXT: ret void // // APPROX-LABEL: @test_sincospi( // APPROX-NEXT: entry: // APPROX-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) -// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// APPROX-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // APPROX-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] // APPROX-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] // APPROX-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] -// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret void // extern "C" __device__ void test_sincospi(double x, double *y, double *z) { @@ -3828,17 +3828,17 @@ extern "C" __device__ void test_sincospi(double x, double *y, double *z) { // DEFAULT-LABEL: @test_sinf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_sinf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_sinf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I1]] // extern "C" __device__ float test_sinf(float x) { @@ -3847,17 +3847,17 @@ extern "C" __device__ float test_sinf(float x) { // DEFAULT-LABEL: @test_sin( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_sin( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_sin_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf 
contract noundef nofpclass(nan inf) double @__ocml_sin_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_sin( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_sin(double x) { @@ -3866,17 +3866,17 @@ extern "C" __device__ double test_sin(double x) { // DEFAULT-LABEL: @test_sinpif( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_sinpif( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sinpi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sinpi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_sinpif( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_sinpif(float x) { @@ -3885,17 +3885,17 @@ extern "C" __device__ float test_sinpif(float x) { // DEFAULT-LABEL: @test_sinpi( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_sinpi( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_sinpi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_sinpi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_sinpi( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_sinpi(double x) { @@ -3942,17 +3942,17 @@ extern "C" __device__ double test_sqrt(double x) { // DEFAULT-LABEL: @test_tanf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_tanf( // FINITEONLY-NEXT: entry: 
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_tanf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_tanf(float x) { @@ -3961,17 +3961,17 @@ extern "C" __device__ float test_tanf(float x) { // DEFAULT-LABEL: @test_tan( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_tan( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_tan( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_tan(double x) { @@ -3980,17 +3980,17 @@ extern "C" __device__ double test_tan(double x) { // DEFAULT-LABEL: @test_tanhf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_tanhf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_tanhf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_tanhf(float x) { @@ -3999,17 +3999,17 @@ extern "C" __device__ float test_tanhf(float x) { // DEFAULT-LABEL: @test_tanh( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: 
[[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_tanh( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_tanh( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_tanh(double x) { @@ -4018,17 +4018,17 @@ extern "C" __device__ double test_tanh(double x) { // DEFAULT-LABEL: @test_tgammaf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_tgammaf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_tgammaf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_tgammaf(float x) { @@ -4037,17 +4037,17 @@ extern "C" __device__ float test_tgammaf(float x) { // DEFAULT-LABEL: @test_tgamma( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_tgamma( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_tgamma( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_tgamma(double x) { @@ 
-4094,17 +4094,17 @@ extern "C" __device__ double test_trunc(double x) { // DEFAULT-LABEL: @test_y0f( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_y0f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_y0f( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_y0f(float x) { @@ -4113,17 +4113,17 @@ extern "C" __device__ float test_y0f(float x) { // DEFAULT-LABEL: @test_y0( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_y0( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_y0( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_y0(double x) { @@ -4132,17 +4132,17 @@ extern "C" __device__ double test_y0(double x) { // DEFAULT-LABEL: @test_y1f( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_y1f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_y1f( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef 
float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_y1f(float x) { @@ -4151,17 +4151,17 @@ extern "C" __device__ float test_y1f(float x) { // DEFAULT-LABEL: @test_y1( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_y1( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_y1( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_y1(double x) { @@ -4175,14 +4175,14 @@ extern "C" __device__ double test_y1(double x) { // DEFAULT-NEXT: i32 1, label [[IF_THEN2_I:%.*]] // DEFAULT-NEXT: ] // DEFAULT: if.then.i: -// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: br label [[_ZL3YNFIF_EXIT:%.*]] // DEFAULT: if.then2.i: -// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: br label [[_ZL3YNFIF_EXIT]] // DEFAULT: if.end4.i: -// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR16]] -// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 // DEFAULT-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]] // DEFAULT: for.body.i: @@ -4208,14 +4208,14 @@ extern "C" __device__ double test_y1(double x) { // FINITEONLY-NEXT: i32 1, label [[IF_THEN2_I:%.*]] // FINITEONLY-NEXT: ] // FINITEONLY: if.then.i: -// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: br label [[_ZL3YNFIF_EXIT:%.*]] // FINITEONLY: if.then2.i: -// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float 
noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: br label [[_ZL3YNFIF_EXIT]] // FINITEONLY: if.end4.i: -// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] -// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 // FINITEONLY-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]] // FINITEONLY: for.body.i: @@ -4241,14 +4241,14 @@ extern "C" __device__ double test_y1(double x) { // APPROX-NEXT: i32 1, label [[IF_THEN2_I:%.*]] // APPROX-NEXT: ] // APPROX: if.then.i: -// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR14]] // APPROX-NEXT: br label [[_ZL3YNFIF_EXIT:%.*]] // APPROX: if.then2.i: -// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: br label [[_ZL3YNFIF_EXIT]] // APPROX: if.end4.i: -// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR16]] -// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 // APPROX-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]] // APPROX: for.body.i: @@ -4278,14 +4278,14 @@ extern "C" __device__ float test_ynf(int x, float y) { // DEFAULT-NEXT: i32 1, label [[IF_THEN2_I:%.*]] // DEFAULT-NEXT: ] // DEFAULT: if.then.i: -// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: br label [[_ZL2YNID_EXIT:%.*]] // DEFAULT: if.then2.i: -// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: br label [[_ZL2YNID_EXIT]] // DEFAULT: if.end4.i: -// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR16]] -// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract 
noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 // DEFAULT-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]] // DEFAULT: for.body.i: @@ -4311,14 +4311,14 @@ extern "C" __device__ float test_ynf(int x, float y) { // FINITEONLY-NEXT: i32 1, label [[IF_THEN2_I:%.*]] // FINITEONLY-NEXT: ] // FINITEONLY: if.then.i: -// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: br label [[_ZL2YNID_EXIT:%.*]] // FINITEONLY: if.then2.i: -// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: br label [[_ZL2YNID_EXIT]] // FINITEONLY: if.end4.i: -// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] -// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 // FINITEONLY-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]] // FINITEONLY: for.body.i: @@ -4344,14 +4344,14 @@ extern "C" __device__ float test_ynf(int x, float y) { // APPROX-NEXT: i32 1, label [[IF_THEN2_I:%.*]] // APPROX-NEXT: ] // APPROX: if.then.i: -// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR14]] // APPROX-NEXT: br label [[_ZL2YNID_EXIT:%.*]] // APPROX: if.then2.i: -// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: br label [[_ZL2YNID_EXIT]] // APPROX: if.end4.i: -// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR16]] -// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR14]] +// APPROX-NEXT: 
[[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 // APPROX-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]] // APPROX: for.body.i: @@ -4376,17 +4376,17 @@ extern "C" __device__ double test_yn(int x, double y) { // DEFAULT-LABEL: @test___cosf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test___cosf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test___cosf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test___cosf(float x) { @@ -4553,17 +4553,17 @@ extern "C" __device__ float test___frsqrt_rn(float x) { // DEFAULT-LABEL: @test___fsqrt_rn( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test___fsqrt_rn( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test___fsqrt_rn( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test___fsqrt_rn(float x) { @@ -4648,17 +4648,17 @@ extern "C" __device__ float test___logf(float x) { // DEFAULT-LABEL: @test___powf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test___powf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan 
inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test___powf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test___powf(float x, float y) { @@ -4695,25 +4695,25 @@ extern "C" __device__ float test___saturatef(float x) { // DEFAULT-LABEL: @test___sincosf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] -// DEFAULT-NEXT: [[CALL1_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL1_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]] // DEFAULT-NEXT: store float [[CALL1_I]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: ret void // // FINITEONLY-LABEL: @test___sincosf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] -// FINITEONLY-NEXT: [[CALL1_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL1_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR14]] // FINITEONLY-NEXT: store float [[CALL1_I]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: ret void // // APPROX-LABEL: @test___sincosf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] -// APPROX-NEXT: [[CALL1_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL1_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]] // APPROX-NEXT: store float [[CALL1_I]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: ret void // @@ -4723,17 +4723,17 @@ extern "C" __device__ void test___sincosf(float x, float *y, float *z) { // DEFAULT-LABEL: @test___sinf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: 
[[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test___sinf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test___sinf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test___sinf(float x) { @@ -4742,24 +4742,24 @@ extern "C" __device__ float test___sinf(float x) { // DEFAULT-LABEL: @test___tanf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] -// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]] // DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.rcp.f32(float [[CALL_I_I]]) // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[CALL_I3_I]], [[TMP0]] // DEFAULT-NEXT: ret float [[MUL_I]] // // FINITEONLY-LABEL: @test___tanf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I3_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] -// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I3_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR14]] // FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.amdgcn.rcp.f32(float [[CALL_I_I]]) // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[CALL_I3_I]], [[TMP0]] // FINITEONLY-NEXT: ret float [[MUL_I]] // // APPROX-LABEL: @test___tanf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] -// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]] // APPROX-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.rcp.f32(float 
[[CALL_I_I]])
// APPROX-NEXT:    [[MUL_I:%.*]] = fmul contract float [[CALL_I3_I]], [[TMP0]]
// APPROX-NEXT:    ret float [[MUL_I]]
--
cgit v1.1


From decbd29f9e9be50756a083cd677f7fea22cd3c91 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 8 Feb 2024 14:12:39 +0530
Subject: Reapply "InstCombine: Introduce SimplifyDemandedUseFPClass"" (#74056)

This reverts commit ef388334ee5a3584255b9ef5b3fefdb244fa3fd7.

The referenced issue violates the spec for finite-only math only by
using a return value for a constant infinity. If the interpretation is
results and arguments cannot violate nofpclass, then any
std::numeric_limits<T>::infinity() result is invalid under
-ffinite-math-only. Without this interpretation the utility of
nofpclass is slashed.
---
 llvm/include/llvm/Analysis/ValueTracking.h         |   4 +
 .../Transforms/InstCombine/InstCombineInternal.h   |   9 +
 .../InstCombine/InstCombineSimplifyDemanded.cpp    | 136 ++++++++++++++
 .../InstCombine/InstructionCombining.cpp           |  27 ++-
 .../InstCombine/simplify-demanded-fpclass.ll       | 209 ++++++++-------------
 5 files changed, 251 insertions(+), 134 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index d9287ae..06f94f5 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -248,6 +248,10 @@ struct KnownFPClass {
   /// definitely set or false if the sign bit is definitely unset.
   std::optional<bool> SignBit;
 
+  bool operator==(KnownFPClass Other) const {
+    return KnownFPClasses == Other.KnownFPClasses && SignBit == Other.SignBit;
+  }
+
   /// Return true if it's known this can never be one of the mask entries.
   bool isKnownNever(FPClassTest Mask) const {
     return (KnownFPClasses & Mask) == fcNone;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 97459a8..7f6618f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -566,6 +566,15 @@ public:
                                     APInt &PoisonElts, unsigned Depth = 0,
                                     bool AllowMultipleUsers = false) override;
 
+  /// Attempts to replace V with a simpler value based on the demanded
+  /// floating-point classes
+  Value *SimplifyDemandedUseFPClass(Value *V, FPClassTest DemandedMask,
+                                    KnownFPClass &Known, unsigned Depth,
+                                    Instruction *CxtI);
+  bool SimplifyDemandedFPClass(Instruction *I, unsigned Op,
+                               FPClassTest DemandedMask, KnownFPClass &Known,
+                               unsigned Depth = 0);
+
   /// Canonicalize the position of binops relative to shufflevector.
   Instruction *foldVectorBinop(BinaryOperator &Inst);
   Instruction *foldVectorSelect(SelectInst &Sel);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 79873a9..be6ee9d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1877,3 +1877,139 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
 
   return MadeChange ? I : nullptr;
 }
+
+/// For floating-point classes that resolve to a single bit pattern, return that
+/// value.
+static Constant *getFPClassConstant(Type *Ty, FPClassTest Mask) {
+  switch (Mask) {
+  case fcPosZero:
+    return ConstantFP::getZero(Ty);
+  case fcNegZero:
+    return ConstantFP::getZero(Ty, true);
+  case fcPosInf:
+    return ConstantFP::getInfinity(Ty);
+  case fcNegInf:
+    return ConstantFP::getInfinity(Ty, true);
+  case fcNone:
+    return PoisonValue::get(Ty);
+  default:
+    return nullptr;
+  }
+}
+
+Value *InstCombinerImpl::SimplifyDemandedUseFPClass(
+    Value *V, const FPClassTest DemandedMask, KnownFPClass &Known,
+    unsigned Depth, Instruction *CxtI) {
+  assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth");
+  Type *VTy = V->getType();
+
+  assert(Known == KnownFPClass() && "expected uninitialized state");
+
+  if (DemandedMask == fcNone)
+    return isa<UndefValue>(V) ? nullptr : PoisonValue::get(VTy);
+
+  if (Depth == MaxAnalysisRecursionDepth)
+    return nullptr;
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) {
+    // Handle constants and arguments
+    Known = computeKnownFPClass(V, fcAllFlags, CxtI, Depth + 1);
+    Value *FoldedToConst =
+        getFPClassConstant(VTy, DemandedMask & Known.KnownFPClasses);
+    return FoldedToConst == V ? nullptr : FoldedToConst;
+  }
+
+  if (!I->hasOneUse())
+    return nullptr;
+
+  // TODO: Should account for nofpclass/FastMathFlags on current instruction
+  switch (I->getOpcode()) {
+  case Instruction::FNeg: {
+    if (SimplifyDemandedFPClass(I, 0, llvm::fneg(DemandedMask), Known,
+                                Depth + 1))
+      return I;
+    Known.fneg();
+    break;
+  }
+  case Instruction::Call: {
+    CallInst *CI = cast<CallInst>(I);
+    switch (CI->getIntrinsicID()) {
+    case Intrinsic::fabs:
+      if (SimplifyDemandedFPClass(I, 0, llvm::inverse_fabs(DemandedMask), Known,
+                                  Depth + 1))
+        return I;
+      Known.fabs();
+      break;
+    case Intrinsic::arithmetic_fence:
+      if (SimplifyDemandedFPClass(I, 0, DemandedMask, Known, Depth + 1))
+        return I;
+      break;
+    case Intrinsic::copysign: {
+      // Flip on more potentially demanded classes
+      const FPClassTest DemandedMaskAnySign = llvm::unknown_sign(DemandedMask);
+      if (SimplifyDemandedFPClass(I, 0, DemandedMaskAnySign, Known, Depth + 1))
+        return I;
+
+      if ((DemandedMask & fcPositive) == fcNone) {
+        // Roundabout way of replacing with fneg(fabs)
+        I->setOperand(1, ConstantFP::get(VTy, -1.0));
+        return I;
+      }
+
+      if ((DemandedMask & fcNegative) == fcNone) {
+        // Roundabout way of replacing with fabs
+        I->setOperand(1, ConstantFP::getZero(VTy));
+        return I;
+      }
+
+      KnownFPClass KnownSign =
+          computeKnownFPClass(I->getOperand(1), fcAllFlags, CxtI, Depth + 1);
+      Known.copysign(KnownSign);
+      break;
+    }
+    default:
+      Known = computeKnownFPClass(I, ~DemandedMask, CxtI, Depth + 1);
+      break;
+    }
+
+    break;
+  }
+  case Instruction::Select: {
+    KnownFPClass KnownLHS, KnownRHS;
+    if (SimplifyDemandedFPClass(I, 2, DemandedMask, KnownRHS, Depth + 1) ||
+        SimplifyDemandedFPClass(I, 1, DemandedMask, KnownLHS, Depth + 1))
+      return I;
+
+    if (KnownLHS.isKnownNever(DemandedMask))
+      return I->getOperand(2);
+    if (KnownRHS.isKnownNever(DemandedMask))
+      return I->getOperand(1);
+
+    // TODO: Recognize clamping patterns
+    Known = KnownLHS | KnownRHS;
+    break;
+  }
+  default:
+    Known = computeKnownFPClass(I, ~DemandedMask, CxtI, Depth + 1);
+    break;
+  }
+
+  return getFPClassConstant(VTy, DemandedMask & Known.KnownFPClasses);
+}
+
+bool InstCombinerImpl::SimplifyDemandedFPClass(Instruction *I, unsigned OpNo,
+                                               FPClassTest DemandedMask,
+                                               KnownFPClass &Known,
+                                               unsigned Depth) {
+  Use &U = I->getOperandUse(OpNo);
+  Value *NewVal =
+      SimplifyDemandedUseFPClass(U.get(), DemandedMask, Known, Depth, I);
+  if (!NewVal)
+    return false;
+  if (Instruction *OpInst = dyn_cast<Instruction>(U))
+    salvageDebugInfo(*OpInst);
+
+  replaceUse(U, NewVal);
+  return true;
+}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 9e8bcbc..b1e2262 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -142,6 +142,12 @@
 static cl::opt<unsigned> MaxArraySize(
     "instcombine-maxarray-size", cl::init(1024),
     cl::desc("Maximum array size considered when doing a combine"));
+// TODO: Remove this option
+static cl::opt<bool> EnableSimplifyDemandedUseFPClass(
+    "instcombine-simplify-demanded-fp-class",
+    cl::desc("Enable demanded floating-point class optimizations"),
+    cl::init(false));
+
 // FIXME: Remove this flag when it is no longer necessary to convert
 // llvm.dbg.declare to avoid inaccurate debug info. Setting this to false
 // increases variable availability at the cost of accuracy. Variables that
@@ -3105,8 +3111,25 @@ Instruction *InstCombinerImpl::visitFree(CallInst &FI, Value *Op) {
 }
 
 Instruction *InstCombinerImpl::visitReturnInst(ReturnInst &RI) {
-  // Nothing for now.
-  return nullptr;
+  if (!EnableSimplifyDemandedUseFPClass)
+    return nullptr;
+
+  Value *RetVal = RI.getReturnValue();
+  if (!RetVal || !AttributeFuncs::isNoFPClassCompatibleType(RetVal->getType()))
+    return nullptr;
+
+  Function *F = RI.getFunction();
+  FPClassTest ReturnClass = F->getAttributes().getRetNoFPClass();
+  if (ReturnClass == fcNone)
+    return nullptr;
+
+  KnownFPClass KnownClass;
+  Value *Simplified =
+      SimplifyDemandedUseFPClass(RetVal, ~ReturnClass, KnownClass, 0, &RI);
+  if (!Simplified)
+    return nullptr;
+
+  return ReturnInst::Create(RI.getContext(), Simplified);
 }
 
 // WARNING: keep in sync with SimplifyCFGOpt::simplifyUnreachable()!
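As a minimal sketch of the new fold (distilled from the regression tests that follow; the function name @example is illustrative, not part of the patch): with the experimental -instcombine-simplify-demanded-fp-class flag on, visitReturnInst demands every floating-point class except those the function's nofpclass return attribute rules out, so a select arm that can only produce an excluded class is dead and drops away:

; Input: the nofpclass(inf) return attribute promises the +inf arm can
; never be the returned value, so only %x is demanded.
define nofpclass(inf) float @example(i1 %cond, float %x) {
  %select = select i1 %cond, float 0x7FF0000000000000, float %x
  ret float %select
}

; After opt -passes=instcombine -instcombine-simplify-demanded-fp-class,
; the body reduces to:
;   ret float %x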
diff --git a/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll b/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll index 9817b6e..dd9b714 100644 --- a/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll +++ b/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt -S -passes=instcombine < %s | FileCheck %s +; RUN: opt -S -passes=instcombine -instcombine-simplify-demanded-fp-class < %s | FileCheck %s declare float @llvm.fabs.f32(float) declare float @llvm.copysign.f32(float, float) @@ -42,7 +42,7 @@ define nofpclass(inf) float @ret_nofpclass_inf_undef() { define nofpclass(all) float @ret_nofpclass_all_var(float %arg) { ; CHECK-LABEL: define nofpclass(all) float @ret_nofpclass_all_var ; CHECK-SAME: (float [[ARG:%.*]]) { -; CHECK-NEXT: ret float [[ARG]] +; CHECK-NEXT: ret float poison ; ret float %arg } @@ -51,7 +51,7 @@ define nofpclass(all) float @ret_nofpclass_all_var(float %arg) { define nofpclass(all) <2 x float> @ret_nofpclass_all_var_vector(<2 x float> %arg) { ; CHECK-LABEL: define nofpclass(all) <2 x float> @ret_nofpclass_all_var_vector ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) { -; CHECK-NEXT: ret <2 x float> [[ARG]] +; CHECK-NEXT: ret <2 x float> poison ; ret <2 x float> %arg } @@ -65,14 +65,14 @@ define nofpclass(inf) float @ret_nofpclass_inf__0() { define nofpclass(inf) float @ret_nofpclass_inf__pinf() { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__pinf() { -; CHECK-NEXT: ret float 0x7FF0000000000000 +; CHECK-NEXT: ret float poison ; ret float 0x7FF0000000000000 } define nofpclass(pinf) float @ret_nofpclass_pinf__pinf() { ; CHECK-LABEL: define nofpclass(pinf) float @ret_nofpclass_pinf__pinf() { -; CHECK-NEXT: ret float 0x7FF0000000000000 +; CHECK-NEXT: ret float poison ; ret float 0x7FF0000000000000 } @@ -86,7 +86,7 @@ define nofpclass(pinf) float @ret_nofpclass_pinf__ninf() { define nofpclass(inf) float @ret_nofpclass_inf__ninf() { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__ninf() { -; CHECK-NEXT: ret float 0xFFF0000000000000 +; CHECK-NEXT: ret float poison ; ret float 0xFFF0000000000000 } @@ -106,8 +106,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_inf_lhs(i1 %con define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_lhs(i1 %cond, float nofpclass(nan norm zero sub) %x, float %y) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_lhs ; CHECK-SAME: (i1 [[COND:%.*]], float nofpclass(nan zero sub norm) [[X:%.*]], float [[Y:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float [[Y]] -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float [[Y]] ; %select = select i1 %cond, float %x, float %y ret float %select @@ -117,8 +116,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_lh define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_rhs(i1 %cond, float %x, float nofpclass(nan norm zero sub) %y) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float nofpclass(nan zero sub norm) [[Y:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float [[Y]] -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float [[X]] ; %select = select i1 %cond, float %x, float %y ret float %select @@ -128,8 +126,7 @@ 
define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_rh define nofpclass(inf) [3 x [2 x float]] @ret_float_array(i1 %cond, [3 x [2 x float]] nofpclass(nan norm zero sub) %x, [3 x [2 x float]] %y) { ; CHECK-LABEL: define nofpclass(inf) [3 x [2 x float]] @ret_float_array ; CHECK-SAME: (i1 [[COND:%.*]], [3 x [2 x float]] nofpclass(nan zero sub norm) [[X:%.*]], [3 x [2 x float]] [[Y:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], [3 x [2 x float]] [[X]], [3 x [2 x float]] [[Y]] -; CHECK-NEXT: ret [3 x [2 x float]] [[SELECT]] +; CHECK-NEXT: ret [3 x [2 x float]] [[Y]] ; %select = select i1 %cond, [3 x [2 x float]] %x, [3 x [2 x float]] %y ret [3 x [2 x float ]] %select @@ -139,8 +136,7 @@ define nofpclass(inf) [3 x [2 x float]] @ret_float_array(i1 %cond, [3 x [2 x flo define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_lhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_lhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float [[X]] -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float [[X]] ; %select = select i1 %cond, float 0x7FF0000000000000, float %x ret float %select @@ -150,8 +146,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_lhs(i1 %cond, float define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float [[X]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 ret float %select @@ -161,8 +156,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_rhs(i1 %cond, float define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_or_ninf(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_or_ninf ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float 0xFFF0000000000000 -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float poison ; %select = select i1 %cond, float 0x7FF0000000000000, float 0xFFF0000000000000 ret float %select @@ -172,8 +166,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_or_ninf(i1 %cond, fl define nofpclass(inf) float @ret_nofpclass_inf__select_ninf_or_pinf(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_ninf_or_pinf ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0xFFF0000000000000, float 0x7FF0000000000000 -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float poison ; %select = select i1 %cond, float 0xFFF0000000000000, float 0x7FF0000000000000 ret float %select @@ -183,8 +176,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_ninf_or_pinf(i1 %cond, fl define nofpclass(ninf) float @ret_nofpclass_ninf__select_ninf_or_pinf(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(ninf) float @ret_nofpclass_ninf__select_ninf_or_pinf ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0xFFF0000000000000, float 0x7FF0000000000000 -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float 0x7FF0000000000000 ; %select = select i1 %cond, 
float 0xFFF0000000000000, float 0x7FF0000000000000
   ret float %select
@@ -194,8 +186,7 @@ define nofpclass(ninf) float @ret_nofpclass_ninf__select_ninf_or_pinf(i1 %cond,
 define nofpclass(pinf) float @ret_nofpclass_pinf__select_ninf_or_pinf(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(pinf) float @ret_nofpclass_pinf__select_ninf_or_pinf
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float 0xFFF0000000000000, float 0x7FF0000000000000
-; CHECK-NEXT:    ret float [[SELECT]]
+; CHECK-NEXT:    ret float 0xFFF0000000000000
 ;
   %select = select i1 %cond, float 0xFFF0000000000000, float 0x7FF0000000000000
   ret float %select
@@ -205,8 +196,7 @@ define nofpclass(pinf) float @ret_nofpclass_pinf__select_ninf_or_pinf(i1 %cond,
 define nofpclass(zero) float @ret_nofpclass_zero__select_pzero_or_nzero(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(zero) float @ret_nofpclass_zero__select_pzero_or_nzero
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float 0.000000e+00, float -0.000000e+00
-; CHECK-NEXT:    ret float [[SELECT]]
+; CHECK-NEXT:    ret float poison
 ;
   %select = select i1 %cond, float 0.0, float -0.0
   ret float %select
@@ -216,8 +206,7 @@ define nofpclass(zero) float @ret_nofpclass_zero__select_pzero_or_nzero(i1 %cond
 define nofpclass(nzero) float @ret_nofpclass_nzero__select_pzero_or_nzero(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(nzero) float @ret_nofpclass_nzero__select_pzero_or_nzero
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float 0.000000e+00, float -0.000000e+00
-; CHECK-NEXT:    ret float [[SELECT]]
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
   %select = select i1 %cond, float 0.0, float -0.0
   ret float %select
@@ -227,8 +216,7 @@ define nofpclass(nzero) float @ret_nofpclass_nzero__select_pzero_or_nzero(i1 %co
 define nofpclass(pzero) float @ret_nofpclass_pzero__select_pzero_or_nzero(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(pzero) float @ret_nofpclass_pzero__select_pzero_or_nzero
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float 0.000000e+00, float -0.000000e+00
-; CHECK-NEXT:    ret float [[SELECT]]
+; CHECK-NEXT:    ret float -0.000000e+00
 ;
   %select = select i1 %cond, float 0.0, float -0.0
   ret float %select
@@ -238,8 +226,7 @@ define nofpclass(pzero) float @ret_nofpclass_pzero__select_pzero_or_nzero(i1 %co
 define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector(<2 x i1> %cond, <2 x float> %x) {
 ; CHECK-LABEL: define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector
 ; CHECK-SAME: (<2 x i1> [[COND:%.*]], <2 x float> [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT:%.*]] = select <2 x i1> [[COND]], <2 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000>, <2 x float> [[X]]
-; CHECK-NEXT:    ret <2 x float> [[SELECT]]
+; CHECK-NEXT:    ret <2 x float> [[X]]
 ;
   %select = select <2 x i1> %cond, <2 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000>, <2 x float> %x
   ret <2 x float> %select
@@ -249,8 +236,7 @@ define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector(<2
 define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector_undef(<2 x i1> %cond, <2 x float> %x) {
 ; CHECK-LABEL: define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector_undef
 ; CHECK-SAME: (<2 x i1> [[COND:%.*]], <2 x float> [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT:%.*]] = select <2 x i1> [[COND]], <2 x float> <float 0x7FF0000000000000, float undef>, <2 x float> [[X]]
-; CHECK-NEXT:    ret <2 x float> [[SELECT]]
+; CHECK-NEXT:    ret <2 x float> [[X]]
 ;
   %select = select <2 x i1> %cond, <2 x float> <float 0x7FF0000000000000, float undef>, <2 x float> %x
   ret <2 x float> %select
@@ -260,8 +246,7 @@ define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector_und
 define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_mixed_inf_lhs_vector(<2 x i1> %cond, <2 x float> %x) {
 ; CHECK-LABEL: define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_mixed_inf_lhs_vector
 ; CHECK-SAME: (<2 x i1> [[COND:%.*]], <2 x float> [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT:%.*]] = select <2 x i1> [[COND]], <2 x float> <float 0x7FF0000000000000, float 0xFFF0000000000000>, <2 x float> [[X]]
-; CHECK-NEXT:    ret <2 x float> [[SELECT]]
+; CHECK-NEXT:    ret <2 x float> [[X]]
 ;
   %select = select <2 x i1> %cond, <2 x float> <float 0x7FF0000000000000, float 0xFFF0000000000000>, <2 x float> %x
   ret <2 x float> %select
@@ -327,8 +312,7 @@ define nofpclass(nan) float @ret_nofpclass_nan__select_pinf_rhs(i1 %cond, float
 define nofpclass(inf nan) float @ret_nofpclass_inf_nan__select_chain_inf_nan_0(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(nan inf) float @ret_nofpclass_inf_nan__select_chain_inf_nan_0
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT1:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float [[X]]
-; CHECK-NEXT:    ret float [[SELECT1]]
+; CHECK-NEXT:    ret float [[X]]
 ;
   %select0 = select i1 %cond, float 0x7FF8000000000000, float %x
   %select1 = select i1 %cond, float 0x7FF0000000000000, float %select0
@@ -338,8 +322,7 @@ define nofpclass(inf nan) float @ret_nofpclass_inf_nan__select_chain_inf_nan_0(i
 define nofpclass(inf nan) float @ret_nofpclass_inf_nan__select_chain_inf_nan_1(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(nan inf) float @ret_nofpclass_inf_nan__select_chain_inf_nan_1
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT1:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float 0x7FF8000000000000
-; CHECK-NEXT:    ret float [[SELECT1]]
+; CHECK-NEXT:    ret float poison
 ;
   %select0 = select i1 %cond, float %x, float 0x7FF8000000000000
   %select1 = select i1 %cond, float 0x7FF0000000000000, float %select0
@@ -360,8 +343,7 @@ define nofpclass(nan) float @ret_nofpclass_nan__select_chain_inf_nan(i1 %cond, f
 define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_0(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_0
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT1:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float [[X]]
-; CHECK-NEXT:    ret float [[SELECT1]]
+; CHECK-NEXT:    ret float [[X]]
 ;
   %select0 = select i1 %cond, float 0x7FF8000000000000, float %x
   %select1 = select i1 %cond, float 0x7FF0000000000000, float %select0
@@ -371,8 +353,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_0(i1 %cond,
 define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_1(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_1
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT1:%.*]] = select i1 [[COND]], float 0x7FF8000000000000, float 0x7FF0000000000000
-; CHECK-NEXT:    ret float [[SELECT1]]
+; CHECK-NEXT:    ret float 0x7FF8000000000000
 ;
   %select0 = select i1 %cond, float 0x7FF8000000000000, float %x
   %select1 = select i1 %cond, float %select0, float 0x7FF0000000000000
@@ -383,8 +364,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_1(i1 %cond,
 define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_ninf_rhs(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(inf) float
@ret_nofpclass_inf__fabs_select_ninf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0xFFF0000000000000 -; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[FABS]] ; %select = select i1 %cond, float %x, float 0xFFF0000000000000 @@ -396,8 +376,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_ninf_rhs(i1 %cond, f define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[FABS]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -421,8 +400,7 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives__fabs_ define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_no_positives__fabs_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_no_positives__fabs_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[FABS]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -446,9 +424,7 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives_na define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_no_positives_nan__fabs_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_nofpclass_no_positives_nan__fabs_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) -; CHECK-NEXT: ret float [[FABS]] +; CHECK-NEXT: ret float poison ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 %fabs = call float @llvm.fabs.f32(float %select) @@ -459,8 +435,7 @@ define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_no_positives_na define nofpclass(inf) float @ret_nofpclass_inf__fneg_select_ninf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fneg_select_ninf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0xFFF0000000000000 -; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[SELECT]] +; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[X]] ; CHECK-NEXT: ret float [[FNEG]] ; %select = select i1 %cond, float %x, float 0xFFF0000000000000 @@ -472,8 +447,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__fneg_select_ninf_rhs(i1 %cond, f define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_nonegatives_noinf___fneg_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_nofpclass_nonegatives_noinf___fneg_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: 
[[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[SELECT]] +; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[X]] ; CHECK-NEXT: ret float [[FNEG]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -485,8 +459,7 @@ define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_nonegatives_noinf___ define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_nonegatives_noinf___fneg_select_ninf_lhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_nofpclass_nonegatives_noinf___fneg_select_ninf_lhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0xFFF0000000000000, float [[X]] -; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[SELECT]] +; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[X]] ; CHECK-NEXT: ret float [[FNEG]] ; %select = select i1 %cond, float 0xFFF0000000000000, float %x @@ -510,8 +483,7 @@ define nofpclass(pzero psub pnorm pinf) float @ret_nofpclass_nopositives___fneg_ define nofpclass(inf) float @ret_nofpclass_inf__fneg_fabs_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fneg_fabs_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[FABS]] ; CHECK-NEXT: ret float [[FNEG]] ; @@ -525,8 +497,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__fneg_fabs_select_pinf_rhs(i1 %co define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives__fneg_fabs_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives__fneg_fabs_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[FABS]] ; CHECK-NEXT: ret float [[FNEG]] ; @@ -541,10 +512,7 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives__fneg_f define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_nonan__fneg_fabs_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives_nonan__fneg_fabs_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) -; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[FABS]] -; CHECK-NEXT: ret float [[FNEG]] +; CHECK-NEXT: ret float poison ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 %fabs = call float @llvm.fabs.f32(float %select) @@ -556,8 +524,7 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_non define nofpclass(inf) float @ret_nofpclass_inf__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__copysign_unknown_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 
[[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -568,8 +535,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__copysign_unknown_select_pinf_rhs define nofpclass(inf) float @ret_nofpclass_inf__copysign_positive_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__copysign_positive_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -580,8 +546,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__copysign_positive_select_pinf_rh define nofpclass(inf) float @ret_nofpclass_inf__copysign_negative_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__copysign_negative_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]] ; CHECK-NEXT: ret float [[COPYSIGN]] ; @@ -594,7 +559,8 @@ define nofpclass(inf) float @ret_nofpclass_inf__copysign_negative_select_pinf_rh define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives_copysign(float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_nopositives_copysign ; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]] ; CHECK-NEXT: ret float [[COPYSIGN]] ; %copysign = call float @llvm.copysign.f32(float %x, float %unknown.sign) @@ -605,7 +571,8 @@ define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives_copysig define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives_copysign_nnan_flag(float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_nopositives_copysign_nnan_flag ; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[COPYSIGN:%.*]] = call nnan float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call nnan float @llvm.fabs.f32(float [[X]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg nnan float [[TMP1]] ; CHECK-NEXT: ret float [[COPYSIGN]] ; %copysign = call nnan float @llvm.copysign.f32(float %x, float %unknown.sign) @@ -616,7 +583,8 @@ define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives_copysig define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_nopositives_nonan_copysign(float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_nofpclass_nopositives_nonan_copysign ; CHECK-SAME: (float 
[[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]] ; CHECK-NEXT: ret float [[COPYSIGN]] ; %copysign = call float @llvm.copysign.f32(float %x, float %unknown.sign) @@ -627,7 +595,7 @@ define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_nopositives_non define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_copysign(float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives_copysign ; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %copysign = call float @llvm.copysign.f32(float %x, float %unknown.sign) @@ -638,7 +606,7 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_copysig define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_copysign_nnan_flag(float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives_copysign_nnan_flag ; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[COPYSIGN:%.*]] = call nnan float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call nnan float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %copysign = call nnan float @llvm.copysign.f32(float %x, float %unknown.sign) @@ -649,7 +617,7 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_copysig define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_nonan_copysign(float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives_nonan_copysign ; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %copysign = call float @llvm.copysign.f32(float %x, float %unknown.sign) @@ -659,8 +627,7 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_non define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives__copysign_fabs_select_pinf_rhs(i1 %cond, float %x, float %sign) { ; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_nopositives__copysign_fabs_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[SIGN:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -673,8 +640,7 @@ define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives__copysi define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_no_negatives_noinf__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float 
@ret_nofpclass_no_negatives_noinf__copysign_unknown_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -686,8 +652,8 @@ define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_no_negatives_noinf__ define nofpclass(inf pnorm psub pzero) float @ret_nofpclass_no_positives_noinf__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(inf pzero psub pnorm) float @ret_nofpclass_no_positives_noinf__copysign_unknown_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]] ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -700,7 +666,7 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives__copys ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_no_negatives__copysign_unknown_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -713,7 +679,8 @@ define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_no_positives__copys ; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_no_positives__copysign_unknown_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]] ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -726,7 +693,7 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives_no ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_nofpclass_no_negatives_nonan__copysign_unknown_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -739,7 +706,8 @@ define nofpclass(nan pinf pnorm psub 
pzero) float @ret_nofpclass_no_positives_no ; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_nofpclass_no_positives_nonan__copysign_unknown_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]] ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -790,9 +758,7 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nan_negatives__ define nofpclass(nan ninf nnorm nsub zero) float @ret_nofpclass_nan_negatives_zero__select_clamp_pos_to_zero(float %x) { ; CHECK-LABEL: define nofpclass(nan ninf zero nsub nnorm) float @ret_nofpclass_nan_negatives_zero__select_clamp_pos_to_zero ; CHECK-SAME: (float [[X:%.*]]) { -; CHECK-NEXT: [[IS_GT_ZERO:%.*]] = fcmp ogt float [[X]], 0.000000e+00 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[IS_GT_ZERO]], float 0.000000e+00, float [[X]] -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float [[X]] ; %is.gt.zero = fcmp ogt float %x, 0.0 %select = select i1 %is.gt.zero, float 0.0, float %x @@ -803,9 +769,7 @@ define nofpclass(nan ninf nnorm nsub zero) float @ret_nofpclass_nan_negatives_ze define nofpclass(ninf nnorm nsub zero) float @ret_nofpclass_negatives_zero__select_clamp_pos_to_zero(float %x) { ; CHECK-LABEL: define nofpclass(ninf zero nsub nnorm) float @ret_nofpclass_negatives_zero__select_clamp_pos_to_zero ; CHECK-SAME: (float [[X:%.*]]) { -; CHECK-NEXT: [[IS_GT_ZERO:%.*]] = fcmp ogt float [[X]], 0.000000e+00 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[IS_GT_ZERO]], float 0.000000e+00, float [[X]] -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float [[X]] ; %is.gt.zero = fcmp ogt float %x, 0.0 %select = select i1 %is.gt.zero, float 0.0, float %x @@ -819,8 +783,7 @@ define nofpclass(inf) float @ret_nofpclass_noinfs__assumed_isinf__select_pinf_lh ; CHECK-NEXT: [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: [[X_IS_INF:%.*]] = fcmp oeq float [[FABS_X]], 0x7FF0000000000000 ; CHECK-NEXT: call void @llvm.assume(i1 [[X_IS_INF]]) -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float [[Y]] -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float [[Y]] ; %fabs.x = call float @llvm.fabs.f32(float %x) %x.is.inf = fcmp oeq float %fabs.x, 0x7FF0000000000000 @@ -838,18 +801,13 @@ define nofpclass(nan inf nzero nsub nnorm) float @powr_issue64870(float nofpclas ; CHECK-NEXT: [[I1:%.*]] = tail call float @llvm.log2.f32(float [[I]]) ; CHECK-NEXT: [[I2:%.*]] = fmul float [[I1]], [[Y]] ; CHECK-NEXT: [[I3:%.*]] = tail call nofpclass(ninf nzero nsub nnorm) float @llvm.exp2.f32(float [[I2]]) -; CHECK-NEXT: [[I4:%.*]] = fcmp olt float [[Y]], 0.000000e+00 -; CHECK-NEXT: [[I5:%.*]] = select i1 [[I4]], float 0x7FF0000000000000, float 0.000000e+00 ; CHECK-NEXT: [[I6:%.*]] = fcmp oeq float [[X]], 0.000000e+00 -; CHECK-NEXT: [[I7:%.*]] = select i1 [[I6]], float [[I5]], float [[I3]] +; CHECK-NEXT: [[I7:%.*]] = select i1 [[I6]], float 0.000000e+00, float [[I3]] ; CHECK-NEXT: [[I8:%.*]] = fcmp oeq float [[Y]], 0.000000e+00 -; CHECK-NEXT: [[I9:%.*]] = select i1 [[I6]], float 0x7FF8000000000000, float 1.000000e+00 -; CHECK-NEXT: [[I10:%.*]] = select i1 [[I8]], float [[I9]], float [[I7]] ; CHECK-NEXT: 
[[I11:%.*]] = fcmp oeq float [[X]], 1.000000e+00 -; CHECK-NEXT: [[I12:%.*]] = select i1 [[I11]], float 1.000000e+00, float [[I10]] -; CHECK-NEXT: [[I13:%.*]] = fcmp olt float [[X]], 0.000000e+00 -; CHECK-NEXT: [[I14:%.*]] = select i1 [[I13]], float 0x7FF8000000000000, float [[I12]] -; CHECK-NEXT: ret float [[I14]] +; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[I11]], i1 true, i1 [[I8]] +; CHECK-NEXT: [[I12:%.*]] = select i1 [[TMP0]], float 1.000000e+00, float [[I7]] +; CHECK-NEXT: ret float [[I12]] ; entry: %i = tail call float @llvm.fabs.f32(float %x) @@ -881,12 +839,8 @@ define nofpclass(nan inf nzero nsub nnorm) float @test_powr_issue64870_2(float n ; CHECK-NEXT: [[I4:%.*]] = select i1 [[I]], float 0x7FF8000000000000, float [[ARG1]] ; CHECK-NEXT: [[I5:%.*]] = fmul float [[I4]], [[I3]] ; CHECK-NEXT: [[I6:%.*]] = tail call noundef nofpclass(ninf nzero nsub nnorm) float @llvm.exp2.f32(float noundef [[I5]]) -; CHECK-NEXT: [[I7:%.*]] = fcmp olt float [[I4]], 0.000000e+00 -; CHECK-NEXT: [[I8:%.*]] = select i1 [[I7]], float 0x7FF0000000000000, float 0.000000e+00 -; CHECK-NEXT: [[I9:%.*]] = fcmp ueq float [[I4]], 0.000000e+00 ; CHECK-NEXT: [[I10:%.*]] = fcmp oeq float [[I2]], 0.000000e+00 -; CHECK-NEXT: [[I11:%.*]] = select i1 [[I9]], float 0x7FF8000000000000, float [[I8]] -; CHECK-NEXT: [[I12:%.*]] = select i1 [[I10]], float [[I11]], float [[I6]] +; CHECK-NEXT: [[I12:%.*]] = select i1 [[I10]], float 0.000000e+00, float [[I6]] ; CHECK-NEXT: ret float [[I12]] ; bb: @@ -923,16 +877,10 @@ define nofpclass(nan inf) float @pow_f32(float nofpclass(nan inf) %arg, float no ; CHECK-NEXT: [[I11:%.*]] = and i1 [[I7]], [[I10]] ; CHECK-NEXT: [[I12:%.*]] = select i1 [[I11]], float [[ARG]], float 1.000000e+00 ; CHECK-NEXT: [[I13:%.*]] = tail call noundef float @llvm.copysign.f32(float noundef [[I4]], float noundef [[I12]]) -; CHECK-NEXT: [[I14:%.*]] = fcmp olt float [[ARG]], 0.000000e+00 -; CHECK-NEXT: [[I15:%.*]] = select i1 [[I7]], float [[I13]], float 0x7FF8000000000000 -; CHECK-NEXT: [[I16:%.*]] = select i1 [[I14]], float [[I15]], float [[I13]] ; CHECK-NEXT: [[I17:%.*]] = fcmp oeq float [[ARG]], 0.000000e+00 -; CHECK-NEXT: [[I18:%.*]] = fcmp olt float [[ARG1]], 0.000000e+00 -; CHECK-NEXT: [[I19:%.*]] = xor i1 [[I17]], [[I18]] -; CHECK-NEXT: [[I20:%.*]] = select i1 [[I19]], float 0.000000e+00, float 0x7FF0000000000000 ; CHECK-NEXT: [[I21:%.*]] = select i1 [[I11]], float [[ARG]], float 0.000000e+00 -; CHECK-NEXT: [[I22:%.*]] = tail call noundef nofpclass(nan sub norm) float @llvm.copysign.f32(float noundef [[I20]], float noundef [[I21]]) -; CHECK-NEXT: [[I23:%.*]] = select i1 [[I17]], float [[I22]], float [[I16]] +; CHECK-NEXT: [[I22:%.*]] = tail call noundef nofpclass(nan sub norm) float @llvm.copysign.f32(float noundef 0.000000e+00, float noundef [[I21]]) +; CHECK-NEXT: [[I23:%.*]] = select i1 [[I17]], float [[I22]], float [[I13]] ; CHECK-NEXT: [[I24:%.*]] = fcmp oeq float [[ARG]], 1.000000e+00 ; CHECK-NEXT: [[I25:%.*]] = fcmp oeq float [[ARG1]], 0.000000e+00 ; CHECK-NEXT: [[I26:%.*]] = or i1 [[I24]], [[I25]] @@ -977,8 +925,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_call_only_inf(i ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_call_only_inf ; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) { ; CHECK-NEXT: [[MUST_BE_INF:%.*]] = call nofpclass(nan zero sub norm) float @extern() -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[MUST_BE_INF]], float [[Y]] -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float [[Y]] ; %must.be.inf = call 
nofpclass(nan norm zero sub) float @extern() %select = select i1 %cond, float %must.be.inf, float %y @@ -989,7 +936,7 @@ define nofpclass(pinf) float @ret_nofpclass_pinf__nofpclass_call_only_inf(i1 %co ; CHECK-LABEL: define nofpclass(pinf) float @ret_nofpclass_pinf__nofpclass_call_only_inf ; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) { ; CHECK-NEXT: [[MUST_BE_INF:%.*]] = call nofpclass(nan zero sub norm) float @extern() -; CHECK-NEXT: ret float [[MUST_BE_INF]] +; CHECK-NEXT: ret float 0xFFF0000000000000 ; %must.be.inf = call nofpclass(nan norm zero sub) float @extern() ret float %must.be.inf @@ -999,7 +946,7 @@ define nofpclass(ninf) float @ret_nofpclass_ninf__nofpclass_call_only_inf(i1 %co ; CHECK-LABEL: define nofpclass(ninf) float @ret_nofpclass_ninf__nofpclass_call_only_inf ; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) { ; CHECK-NEXT: [[MUST_BE_INF:%.*]] = call nofpclass(nan zero sub norm) float @extern() -; CHECK-NEXT: ret float [[MUST_BE_INF]] +; CHECK-NEXT: ret float 0x7FF0000000000000 ; %must.be.inf = call nofpclass(nan norm zero sub) float @extern() ret float %must.be.inf @@ -1009,7 +956,7 @@ define nofpclass(nzero) float @ret_nofpclass_nzero__nofpclass_call_only_zero(i1 ; CHECK-LABEL: define nofpclass(nzero) float @ret_nofpclass_nzero__nofpclass_call_only_zero ; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) { ; CHECK-NEXT: [[MUST_BE_ZERO:%.*]] = call nofpclass(nan inf sub norm) float @extern() -; CHECK-NEXT: ret float [[MUST_BE_ZERO]] +; CHECK-NEXT: ret float 0.000000e+00 ; %must.be.zero = call nofpclass(nan sub norm inf) float @extern() ret float %must.be.zero @@ -1019,7 +966,7 @@ define nofpclass(pzero) float @ret_nofpclass_pzero__nofpclass_call_only_zero(i1 ; CHECK-LABEL: define nofpclass(pzero) float @ret_nofpclass_pzero__nofpclass_call_only_zero ; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) { ; CHECK-NEXT: [[MUST_BE_ZERO:%.*]] = call nofpclass(nan inf sub norm) float @extern() -; CHECK-NEXT: ret float [[MUST_BE_ZERO]] +; CHECK-NEXT: ret float -0.000000e+00 ; %must.be.zero = call nofpclass(nan sub norm inf) float @extern() ret float %must.be.zero @@ -1133,8 +1080,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__recursive_phi_0(i1 %cond0, float ; CHECK-NEXT: [[LOOP_COND:%.*]] = call i1 @loop.cond() ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[RET]], label [[LOOP]] ; CHECK: ret: -; CHECK-NEXT: [[PHI_RET:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0x7FF0000000000000, [[LOOP]] ] -; CHECK-NEXT: ret float [[PHI_RET]] +; CHECK-NEXT: ret float 0.000000e+00 ; entry: br i1 %cond0, label %loop, label %ret @@ -1159,7 +1105,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__recursive_phi_1(i1 %cond0, float ; CHECK-NEXT: [[LOOP_COND:%.*]] = call i1 @loop.cond() ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[RET]], label [[LOOP]] ; CHECK: ret: -; CHECK-NEXT: ret float 0x7FF0000000000000 +; CHECK-NEXT: ret float poison ; entry: br i1 %cond0, label %loop, label %ret @@ -1180,8 +1126,8 @@ define nofpclass(inf) float @ret_nofpclass_inf__phi_switch_repeated_predecessor( ; CHECK-SAME: (i32 [[SWITCH:%.*]], float [[UNKNOWN:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: switch i32 [[SWITCH]], label [[RET:%.*]] [ -; CHECK-NEXT: i32 0, label [[LOOP:%.*]] -; CHECK-NEXT: i32 1, label [[LOOP]] +; CHECK-NEXT: i32 0, label [[LOOP:%.*]] +; CHECK-NEXT: i32 1, label [[LOOP]] ; CHECK-NEXT: ] ; CHECK: loop: ; CHECK-NEXT: [[PHI_LOOP:%.*]] = phi float [ 0x7FF0000000000000, [[ENTRY:%.*]] ], [ 0x7FF0000000000000, [[ENTRY]] ], [ [[UNKNOWN]], [[LOOP]] ] @@ -1211,8 +1157,7 @@ ret: define 
nofpclass(inf) float @ret_nofpclass_inf__arithmetic_fence_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__arithmetic_fence_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[FENCE:%.*]] = call float @llvm.arithmetic.fence.f32(float [[SELECT]]) +; CHECK-NEXT: [[FENCE:%.*]] = call float @llvm.arithmetic.fence.f32(float [[X]]) ; CHECK-NEXT: ret float [[FENCE]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 -- cgit v1.1 From 35d6ae8110e082e9a4704416dfbe83d5a3b16ed1 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 8 Feb 2024 09:44:51 +0100 Subject: [InstCombine] Handle multi-use in simplifyAndOrWithOpReplaced() (#81006) Slightly generalize simplifyAndOrWithOpReplaced() by allowing it to perform simplifications (without creating new instructions) in multi-use cases. This way we can remove existing patterns without worrying about multi-use edge cases. I've opted to change the general way the implementation works to be more similar to the standard simplifyWithOpReplaced(). We perform the operand replacement generically, and then try to simplify the result or create a new instruction if we're allowed to do so. --- .../Transforms/InstCombine/InstCombineAndOrXor.cpp | 92 +++++++++++----------- llvm/test/Transforms/InstCombine/or.ll | 3 +- 2 files changed, 47 insertions(+), 48 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index aa3b9da..a53eb39 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -2217,47 +2217,47 @@ foldBitwiseLogicWithIntrinsics(BinaryOperator &I, } } -// Try to simplify X | Y by replacing occurrences of Y in X with 0. -// Similarly, simplify X & Y by replacing occurrences of Y in X with -1. +// Try to simplify V by replacing occurrences of Op with RepOp, but only look +// through bitwise operations. In particular, for X | Y we try to replace Y with +// 0 inside X and for X & Y we try to replace Y with -1 inside X. // Return the simplified result of X if successful, and nullptr otherwise. -static Value *simplifyAndOrWithOpReplaced(Value *X, Value *Y, bool IsAnd, +// If SimplifyOnly is true, no new instructions will be created. +static Value *simplifyAndOrWithOpReplaced(Value *V, Value *Op, Value *RepOp, + bool SimplifyOnly, InstCombinerImpl &IC, unsigned Depth = 0) { - if (isa(X) || X == Y) + if (Op == RepOp) return nullptr; - Value *RHS; - if (match(X, m_c_And(m_Specific(Y), m_Value(RHS)))) { - return IsAnd ? RHS : Constant::getNullValue(X->getType()); - } else if (match(X, m_c_Or(m_Specific(Y), m_Value(RHS)))) { - return IsAnd ? Constant::getAllOnesValue(X->getType()) : RHS; - } else if (match(X, m_c_Xor(m_Specific(Y), m_Value(RHS)))) { - if (IsAnd) { - if (X->hasOneUse()) - return IC.Builder.CreateNot(RHS); + if (V == Op) + return RepOp; - if (Value *NotRHS = - IC.getFreelyInverted(RHS, RHS->hasOneUse(), &IC.Builder)) - return NotRHS; - } else - return RHS; - } + auto *I = dyn_cast(V); + if (!I || !I->isBitwiseLogicOp() || Depth >= 3) + return nullptr; - // Replace uses of Y in X recursively. - Value *Op0, *Op1; - if (Depth < 2 && match(X, m_BitwiseLogic(m_Value(Op0), m_Value(Op1)))) { - // TODO: Relax the one-use constraint to clean up existing hard-coded - // simplifications. 
- if (!X->hasOneUse()) - return nullptr; - Value *NewOp0 = simplifyAndOrWithOpReplaced(Op0, Y, IsAnd, IC, Depth + 1); - Value *NewOp1 = simplifyAndOrWithOpReplaced(Op1, Y, IsAnd, IC, Depth + 1); - if (!NewOp0 && !NewOp1) - return nullptr; - return IC.Builder.CreateBinOp(cast(X)->getOpcode(), - NewOp0 ? NewOp0 : Op0, NewOp1 ? NewOp1 : Op1); - } - return nullptr; + if (!I->hasOneUse()) + SimplifyOnly = true; + + Value *NewOp0 = simplifyAndOrWithOpReplaced(I->getOperand(0), Op, RepOp, + SimplifyOnly, IC, Depth + 1); + Value *NewOp1 = simplifyAndOrWithOpReplaced(I->getOperand(1), Op, RepOp, + SimplifyOnly, IC, Depth + 1); + if (!NewOp0 && !NewOp1) + return nullptr; + + if (!NewOp0) + NewOp0 = I->getOperand(0); + if (!NewOp1) + NewOp1 = I->getOperand(1); + + if (Value *Res = simplifyBinOp(I->getOpcode(), NewOp0, NewOp1, + IC.getSimplifyQuery().getWithInstruction(I))) + return Res; + + if (SimplifyOnly) + return nullptr; + return IC.Builder.CreateBinOp(I->getOpcode(), NewOp0, NewOp1); } // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches @@ -2781,9 +2781,13 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { if (Instruction *Res = foldBitwiseLogicWithIntrinsics(I, Builder)) return Res; - if (Value *V = simplifyAndOrWithOpReplaced(Op0, Op1, /*IsAnd*/ true, *this)) + if (Value *V = + simplifyAndOrWithOpReplaced(Op0, Op1, Constant::getAllOnesValue(Ty), + /*SimplifyOnly*/ false, *this)) return BinaryOperator::CreateAnd(V, Op1); - if (Value *V = simplifyAndOrWithOpReplaced(Op1, Op0, /*IsAnd*/ true, *this)) + if (Value *V = + simplifyAndOrWithOpReplaced(Op1, Op0, Constant::getAllOnesValue(Ty), + /*SimplifyOnly*/ false, *this)) return BinaryOperator::CreateAnd(Op0, V); return nullptr; @@ -3602,14 +3606,6 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (match(Op1, m_Xor(m_Specific(B), m_Specific(A)))) return BinaryOperator::CreateOr(Op1, C); - // ((A & B) ^ C) | B -> C | B - if (match(Op0, m_c_Xor(m_c_And(m_Value(A), m_Specific(Op1)), m_Value(C)))) - return BinaryOperator::CreateOr(C, Op1); - - // B | ((A & B) ^ C) -> B | C - if (match(Op1, m_c_Xor(m_c_And(m_Value(A), m_Specific(Op0)), m_Value(C)))) - return BinaryOperator::CreateOr(Op0, C); - if (Instruction *DeMorgan = matchDeMorgansLaws(I, *this)) return DeMorgan; @@ -3965,9 +3961,13 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Instruction *Res = foldBitwiseLogicWithIntrinsics(I, Builder)) return Res; - if (Value *V = simplifyAndOrWithOpReplaced(Op0, Op1, /*IsAnd*/ false, *this)) + if (Value *V = + simplifyAndOrWithOpReplaced(Op0, Op1, Constant::getNullValue(Ty), + /*SimplifyOnly*/ false, *this)) return BinaryOperator::CreateOr(V, Op1); - if (Value *V = simplifyAndOrWithOpReplaced(Op1, Op0, /*IsAnd*/ false, *this)) + if (Value *V = + simplifyAndOrWithOpReplaced(Op1, Op0, Constant::getNullValue(Ty), + /*SimplifyOnly*/ false, *this)) return BinaryOperator::CreateOr(Op0, V); return nullptr; diff --git a/llvm/test/Transforms/InstCombine/or.ll b/llvm/test/Transforms/InstCombine/or.ll index 51863af..1b1a6ff 100644 --- a/llvm/test/Transforms/InstCombine/or.ll +++ b/llvm/test/Transforms/InstCombine/or.ll @@ -1938,8 +1938,7 @@ define i32 @test_or_and_and_multiuse(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[AND1]], [[C:%.*]] ; CHECK-NEXT: call void @use(i32 [[AND1]]) ; CHECK-NEXT: call void @use(i32 [[AND2]]) -; CHECK-NEXT: [[OR:%.*]] = or i32 [[AND2]], [[A]] -; CHECK-NEXT: ret i32 [[OR]] +; CHECK-NEXT: ret i32 [[A]] ; %and1 = and i32 %a, %b %and2 = and i32 %and1, 
%c
-- cgit v1.1
From 7c0d52ca91d32e693ca245fb82f2402a34212fc3 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 8 Feb 2024 09:47:49 +0100 Subject: [ValueTracking] Support dominating known bits condition in and/or (#74728)
This extends computeKnownBits() support for dominating conditions to also handle and/or conditions. We'll look through either `and` or `or`, depending on which edge we're considering. This change is mainly for the sake of completeness, so we don't start missing optimizations if SimplifyCFG decides to merge some branches.
--- llvm/lib/Analysis/DomConditionCache.cpp | 48 ++++++++++++++++--------- llvm/lib/Analysis/ValueTracking.cpp | 32 +++++++++++------ llvm/test/Transforms/InstCombine/known-bits.ll | 15 +++----- llvm/test/Transforms/LoopVectorize/induction.ll | 30 ++++++++-------- 4 files changed, 74 insertions(+), 51 deletions(-)
diff --git a/llvm/lib/Analysis/DomConditionCache.cpp b/llvm/lib/Analysis/DomConditionCache.cpp index c7f4cab..3dad0c2 100644 --- a/llvm/lib/Analysis/DomConditionCache.cpp +++ b/llvm/lib/Analysis/DomConditionCache.cpp @@ -34,23 +34,39 @@ static void findAffectedValues(Value *Cond, } }; - ICmpInst::Predicate Pred; - Value *A; - if (match(Cond, m_ICmp(Pred, m_Value(A), m_Constant()))) { - AddAffected(A); + bool TopLevelIsAnd = match(Cond, m_LogicalAnd()); + SmallVector Worklist; + SmallPtrSet Visited; + Worklist.push_back(Cond); + while (!Worklist.empty()) { + Value *V = Worklist.pop_back_val(); + if (!Visited.insert(V).second) + continue; - if (ICmpInst::isEquality(Pred)) { - Value *X; - // (X & C) or (X | C) or (X ^ C). - // (X << C) or (X >>_s C) or (X >>_u C). - if (match(A, m_BitwiseLogic(m_Value(X), m_ConstantInt())) || - match(A, m_Shift(m_Value(X), m_ConstantInt()))) - AddAffected(X); - } else { - Value *X; - // Handle (A + C1) u< C2, which is the canonical form of A > C3 && A < C4. - if (match(A, m_Add(m_Value(X), m_ConstantInt()))) - AddAffected(X); + ICmpInst::Predicate Pred; + Value *A, *B; + // Only recurse into and/or if it matches the top-level and/or type. + if (TopLevelIsAnd ? match(V, m_LogicalAnd(m_Value(A), m_Value(B))) + : match(V, m_LogicalOr(m_Value(A), m_Value(B)))) { + Worklist.push_back(A); + Worklist.push_back(B); + } else if (match(V, m_ICmp(Pred, m_Value(A), m_Constant()))) { + AddAffected(A); + + if (ICmpInst::isEquality(Pred)) { + Value *X; + // (X & C) or (X | C) or (X ^ C). + // (X << C) or (X >>_s C) or (X >>_u C). + if (match(A, m_BitwiseLogic(m_Value(X), m_ConstantInt())) || + match(A, m_Shift(m_Value(X), m_ConstantInt()))) + AddAffected(X); + } else { + Value *X; + // Handle (A + C1) u< C2, which is the canonical form of + // A > C3 && A < C4. + if (match(A, m_Add(m_Value(X), m_ConstantInt()))) + AddAffected(X); + } } } } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 58db81f..0e40a02 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -706,28 +706,40 @@ static void computeKnownBitsFromCmp(const Value *V, CmpInst::Predicate Pred, } } +static void computeKnownBitsFromCond(const Value *V, Value *Cond, + KnownBits &Known, unsigned Depth, + const SimplifyQuery &SQ, bool Invert) { + Value *A, *B; + if (Depth < MaxAnalysisRecursionDepth && + (Invert ?
match(Cond, m_LogicalOr(m_Value(A), m_Value(B))) + : match(Cond, m_LogicalAnd(m_Value(A), m_Value(B))))) { + computeKnownBitsFromCond(V, A, Known, Depth + 1, SQ, Invert); + computeKnownBitsFromCond(V, B, Known, Depth + 1, SQ, Invert); + } + + if (auto *Cmp = dyn_cast(Cond)) + computeKnownBitsFromCmp( + V, Invert ? Cmp->getInversePredicate() : Cmp->getPredicate(), + Cmp->getOperand(0), Cmp->getOperand(1), Known, SQ); +} + void llvm::computeKnownBitsFromContext(const Value *V, KnownBits &Known, - unsigned Depth, const SimplifyQuery &Q) { + unsigned Depth, const SimplifyQuery &Q) { if (!Q.CxtI) return; if (Q.DC && Q.DT) { // Handle dominating conditions. for (BranchInst *BI : Q.DC->conditionsFor(V)) { - auto *Cmp = dyn_cast(BI->getCondition()); - if (!Cmp) - continue; - BasicBlockEdge Edge0(BI->getParent(), BI->getSuccessor(0)); if (Q.DT->dominates(Edge0, Q.CxtI->getParent())) - computeKnownBitsFromCmp(V, Cmp->getPredicate(), Cmp->getOperand(0), - Cmp->getOperand(1), Known, Q); + computeKnownBitsFromCond(V, BI->getCondition(), Known, Depth, Q, + /*Invert*/ false); BasicBlockEdge Edge1(BI->getParent(), BI->getSuccessor(1)); if (Q.DT->dominates(Edge1, Q.CxtI->getParent())) - computeKnownBitsFromCmp(V, Cmp->getInversePredicate(), - Cmp->getOperand(0), Cmp->getOperand(1), Known, - Q); + computeKnownBitsFromCond(V, BI->getCondition(), Known, Depth, Q, + /*Invert*/ true); } if (Known.hasConflict()) diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll index e346330..246579c 100644 --- a/llvm/test/Transforms/InstCombine/known-bits.ll +++ b/llvm/test/Transforms/InstCombine/known-bits.ll @@ -105,8 +105,7 @@ define i8 @test_cond_and(i8 %x, i1 %c) { ; CHECK-NEXT: [[COND:%.*]] = and i1 [[CMP]], [[C:%.*]] ; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] ; CHECK: if: -; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4 -; CHECK-NEXT: ret i8 [[OR1]] +; CHECK-NEXT: ret i8 -4 ; CHECK: exit: ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4 ; CHECK-NEXT: ret i8 [[OR2]] @@ -133,8 +132,7 @@ define i8 @test_cond_and_commuted(i8 %x, i1 %c1, i1 %c2) { ; CHECK-NEXT: [[COND:%.*]] = and i1 [[C3]], [[CMP]] ; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] ; CHECK: if: -; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4 -; CHECK-NEXT: ret i8 [[OR1]] +; CHECK-NEXT: ret i8 -4 ; CHECK: exit: ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4 ; CHECK-NEXT: ret i8 [[OR2]] @@ -161,8 +159,7 @@ define i8 @test_cond_logical_and(i8 %x, i1 %c) { ; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i1 [[C:%.*]], i1 false ; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] ; CHECK: if: -; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4 -; CHECK-NEXT: ret i8 [[OR1]] +; CHECK-NEXT: ret i8 -4 ; CHECK: exit: ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4 ; CHECK-NEXT: ret i8 [[OR2]] @@ -218,8 +215,7 @@ define i8 @test_cond_inv_or(i8 %x, i1 %c) { ; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4 ; CHECK-NEXT: ret i8 [[OR1]] ; CHECK: exit: -; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4 -; CHECK-NEXT: ret i8 [[OR2]] +; CHECK-NEXT: ret i8 -4 ; %and = and i8 %x, 3 %cmp = icmp ne i8 %and, 0 @@ -242,8 +238,7 @@ define i8 @test_cond_inv_logical_or(i8 %x, i1 %c) { ; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP_NOT]], i1 [[C:%.*]], i1 false ; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] ; CHECK: if: -; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4 -; CHECK-NEXT: ret i8 [[OR1]] +; CHECK-NEXT: ret i8 -4 ; CHECK: exit: ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4 ; CHECK-NEXT: ret 
i8 [[OR2]] diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 29d8719d..50a5cc6 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -3523,10 +3523,10 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; IND-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] ; IND-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: -; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -2 +; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 510 ; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; IND-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]] +; IND-NEXT: [[IND_END2:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] ; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0 ; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; IND-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], @@ -3589,10 +3589,10 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; UNROLL-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] ; UNROLL-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: -; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -4 +; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 508 ; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; UNROLL-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]] +; UNROLL-NEXT: [[IND_END2:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] ; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0 ; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], @@ -3733,10 +3733,10 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; INTERLEAVE-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] ; INTERLEAVE-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: -; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -8 +; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 504 ; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; INTERLEAVE-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]] +; INTERLEAVE-NEXT: [[IND_END2:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] ; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT]], i64 0 ; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], @@ -3907,11 +3907,11 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; IND-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] ; IND-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: -; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -2 +; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 510 ; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; IND-NEXT: [[EXT_MUL5:%.*]] = add i32 [[N_VEC]], [[EXT]] -; IND-NEXT: [[IND_END1:%.*]] = shl i32 [[EXT_MUL5]], 2 +; IND-NEXT: [[EXT_MUL5:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] +; IND-NEXT: [[IND_END1:%.*]] = shl nuw 
nsw i32 [[EXT_MUL5]], 2 ; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0 ; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; IND-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], @@ -3976,11 +3976,11 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; UNROLL-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] ; UNROLL-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: -; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -4 +; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 508 ; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; UNROLL-NEXT: [[EXT_MUL6:%.*]] = add i32 [[N_VEC]], [[EXT]] -; UNROLL-NEXT: [[IND_END1:%.*]] = shl i32 [[EXT_MUL6]], 2 +; UNROLL-NEXT: [[EXT_MUL6:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] +; UNROLL-NEXT: [[IND_END1:%.*]] = shl nuw nsw i32 [[EXT_MUL6]], 2 ; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0 ; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], @@ -4126,11 +4126,11 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; INTERLEAVE-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] ; INTERLEAVE-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: -; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -8 +; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 504 ; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; INTERLEAVE-NEXT: [[EXT_MUL6:%.*]] = add i32 [[N_VEC]], [[EXT]] -; INTERLEAVE-NEXT: [[IND_END1:%.*]] = shl i32 [[EXT_MUL6]], 2 +; INTERLEAVE-NEXT: [[EXT_MUL6:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] +; INTERLEAVE-NEXT: [[IND_END1:%.*]] = shl nuw nsw i32 [[EXT_MUL6]], 2 ; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT_MUL]], i64 0 ; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], -- cgit v1.1 From 7ec6e7351458924946e9afaadf9788cb233095b9 Mon Sep 17 00:00:00 2001 From: Sven van Haastregt Date: Thu, 8 Feb 2024 08:58:13 +0000 Subject: [DAG] Fix typos in comments; NFC --- llvm/include/llvm/CodeGen/SelectionDAG.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index b9ec307..886ec0b 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1613,10 +1613,10 @@ public: /// Expand the specified \c ISD::VACOPY node as the Legalize pass would. SDValue expandVACopy(SDNode *Node); - /// Returs an GlobalAddress of the function from the current module with + /// Return a GlobalAddress of the function from the current module with /// name matching the given ExternalSymbol. Additionally can provide the /// matched function. - /// Panics the function doesn't exists. + /// Panic if the function doesn't exist. 
SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction = nullptr); @@ -2255,7 +2255,7 @@ public: std::pair GetDependentSplitDestVTs(const EVT &VT, const EVT &EnvVT, bool *HiIsEmpty) const; - /// Split the vector with EXTRACT_SUBVECTOR using the provides + /// Split the vector with EXTRACT_SUBVECTOR using the provided /// VTs and return the low/high part. std::pair SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT); -- cgit v1.1 From dd9511d3e46094ec15282bce6eba163fed2226a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 7 Feb 2024 16:06:59 +0100 Subject: [clang][Interp][NFC] Convert test case to verify=expected,both style --- clang/test/AST/Interp/builtin-functions.cpp | 103 +++++++++++----------------- 1 file changed, 39 insertions(+), 64 deletions(-) diff --git a/clang/test/AST/Interp/builtin-functions.cpp b/clang/test/AST/Interp/builtin-functions.cpp index d6ed2d8..3aa01d5 100644 --- a/clang/test/AST/Interp/builtin-functions.cpp +++ b/clang/test/AST/Interp/builtin-functions.cpp @@ -1,11 +1,11 @@ -// RUN: %clang_cc1 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify -// RUN: %clang_cc1 -Wno-string-plus-int -fexperimental-new-constant-interpreter -triple i686 %s -verify -// RUN: %clang_cc1 -Wno-string-plus-int -verify=ref %s -Wno-constant-evaluated -// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify -// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter -triple i686 %s -verify -// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -verify=ref %s -Wno-constant-evaluated -// RUN: %clang_cc1 -triple avr -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify -// RUN: %clang_cc1 -triple avr -std=c++20 -Wno-string-plus-int -verify=ref %s -Wno-constant-evaluated +// RUN: %clang_cc1 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify=expected,both +// RUN: %clang_cc1 -Wno-string-plus-int -fexperimental-new-constant-interpreter -triple i686 %s -verify=expected,both +// RUN: %clang_cc1 -Wno-string-plus-int -verify=ref,both %s -Wno-constant-evaluated +// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify=expected,both +// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter -triple i686 %s -verify=expected,both +// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -verify=ref,both %s -Wno-constant-evaluated +// RUN: %clang_cc1 -triple avr -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify=expected,both +// RUN: %clang_cc1 -triple avr -std=c++20 -Wno-string-plus-int -verify=ref,both %s -Wno-constant-evaluated namespace strcmp { @@ -23,23 +23,17 @@ namespace strcmp { static_assert(__builtin_strcmp("abab\0banana", "abab") == 0, ""); static_assert(__builtin_strcmp("abab", "abab\0banana") == 0, ""); static_assert(__builtin_strcmp("abab\0banana", "abab\0canada") == 0, ""); - static_assert(__builtin_strcmp(0, "abab") == 0, ""); // expected-error {{not an integral constant}} \ - // expected-note {{dereferenced null}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant}} \ - // ref-note {{dereferenced null}} - static_assert(__builtin_strcmp("abab", 0) == 0, ""); // expected-error {{not an integral constant}} \ - // expected-note {{dereferenced null}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant}} \ - // 
ref-note {{dereferenced null}} + static_assert(__builtin_strcmp(0, "abab") == 0, ""); // both-error {{not an integral constant}} \ + // both-note {{dereferenced null}} \ + // expected-note {{in call to}} + static_assert(__builtin_strcmp("abab", 0) == 0, ""); // both-error {{not an integral constant}} \ + // both-note {{dereferenced null}} \ + // expected-note {{in call to}} static_assert(__builtin_strcmp(kFoobar, kFoobazfoobar) == -1, ""); - static_assert(__builtin_strcmp(kFoobar, kFoobazfoobar + 6) == 0, ""); // expected-error {{not an integral constant}} \ - // expected-note {{dereferenced one-past-the-end}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant}} \ - // ref-note {{dereferenced one-past-the-end}} + static_assert(__builtin_strcmp(kFoobar, kFoobazfoobar + 6) == 0, ""); // both-error {{not an integral constant}} \ + // both-note {{dereferenced one-past-the-end}} \ + // expected-note {{in call to}} } /// Copied from constant-expression-cxx11.cpp @@ -69,41 +63,27 @@ constexpr const char *a = "foo\0quux"; static_assert(check(b), ""); static_assert(check(c), ""); - constexpr int over1 = __builtin_strlen(a + 9); // expected-error {{constant expression}} \ - // expected-note {{one-past-the-end}} \ - // expected-note {{in call to}} \ - // ref-error {{constant expression}} \ - // ref-note {{one-past-the-end}} - constexpr int over2 = __builtin_strlen(b + 9); // expected-error {{constant expression}} \ - // expected-note {{one-past-the-end}} \ - // expected-note {{in call to}} \ - // ref-error {{constant expression}} \ - // ref-note {{one-past-the-end}} - constexpr int over3 = __builtin_strlen(c + 9); // expected-error {{constant expression}} \ - // expected-note {{one-past-the-end}} \ - // expected-note {{in call to}} \ - // ref-error {{constant expression}} \ - // ref-note {{one-past-the-end}} - - constexpr int under1 = __builtin_strlen(a - 1); // expected-error {{constant expression}} \ - // expected-note {{cannot refer to element -1}} \ - // ref-error {{constant expression}} \ - // ref-note {{cannot refer to element -1}} - constexpr int under2 = __builtin_strlen(b - 1); // expected-error {{constant expression}} \ - // expected-note {{cannot refer to element -1}} \ - // ref-error {{constant expression}} \ - // ref-note {{cannot refer to element -1}} - constexpr int under3 = __builtin_strlen(c - 1); // expected-error {{constant expression}} \ - // expected-note {{cannot refer to element -1}} \ - // ref-error {{constant expression}} \ - // ref-note {{cannot refer to element -1}} + constexpr int over1 = __builtin_strlen(a + 9); // both-error {{constant expression}} \ + // both-note {{one-past-the-end}} \ + // expected-note {{in call to}} + constexpr int over2 = __builtin_strlen(b + 9); // both-error {{constant expression}} \ + // both-note {{one-past-the-end}} \ + // expected-note {{in call to}} + constexpr int over3 = __builtin_strlen(c + 9); // both-error {{constant expression}} \ + // both-note {{one-past-the-end}} \ + // expected-note {{in call to}} + + constexpr int under1 = __builtin_strlen(a - 1); // both-error {{constant expression}} \ + // both-note {{cannot refer to element -1}} + constexpr int under2 = __builtin_strlen(b - 1); // both-error {{constant expression}} \ + // both-note {{cannot refer to element -1}} + constexpr int under3 = __builtin_strlen(c - 1); // both-error {{constant expression}} \ + // both-note {{cannot refer to element -1}} constexpr char d[] = { 'f', 'o', 'o' }; // no nul terminator. 
- constexpr int bad = __builtin_strlen(d); // expected-error {{constant expression}} \ - // expected-note {{one-past-the-end}} \ - // expected-note {{in call to}} \ - // ref-error {{constant expression}} \ - // ref-note {{one-past-the-end}} + constexpr int bad = __builtin_strlen(d); // both-error {{constant expression}} \ + // both-note {{one-past-the-end}} \ + // expected-note {{in call to}} } namespace nan { @@ -115,8 +95,7 @@ namespace nan { // expected-error@-2 {{must be initialized by a constant expression}} #endif - constexpr double NaN3 = __builtin_nan("foo"); // expected-error {{must be initialized by a constant expression}} \ - // ref-error {{must be initialized by a constant expression}} + constexpr double NaN3 = __builtin_nan("foo"); // both-error {{must be initialized by a constant expression}} constexpr float NaN4 = __builtin_nanf(""); //constexpr long double NaN5 = __builtin_nanf128(""); @@ -126,8 +105,7 @@ namespace nan { /// FIXME: Current interpreter misses diagnostics. constexpr char f2[] = {'0', 'x', 'A', 'E'}; /// No trailing 0 byte. - constexpr double NaN7 = __builtin_nan(f2); // ref-error {{must be initialized by a constant expression}} \ - // expected-error {{must be initialized by a constant expression}} \ + constexpr double NaN7 = __builtin_nan(f2); // both-error {{must be initialized by a constant expression}} \ // expected-note {{read of dereferenced one-past-the-end pointer}} \ // expected-note {{in call to}} static_assert(!__builtin_issignaling(__builtin_nan("")), ""); @@ -370,9 +348,6 @@ namespace EhReturnDataRegno { case __builtin_eh_return_data_regno(0): // constant foldable. break; } - - __builtin_eh_return_data_regno(X); // expected-error {{argument to '__builtin_eh_return_data_regno' must be a constant integer}} \ - // ref-error {{argument to '__builtin_eh_return_data_regno' must be a constant integer}} - + __builtin_eh_return_data_regno(X); // both-error {{argument to '__builtin_eh_return_data_regno' must be a constant integer}} } }
-- cgit v1.1
From ef05b4b520ee342db6a3d6c5607f8e8729246316 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 8 Feb 2024 09:31:26 +0000 Subject: [BasicAA] More vscale tests. NFC
This time with i8 geps and vscale intrinsics, along with multiple vscale intrinsics that can be treated as identical.
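For reference, the pattern being exercised looks roughly like the sketch below (illustrative only, with made-up names; not one of the added tests). Both vscale calls return the same value within a function, so offsets built from either call can be treated as equal:

define void @sketch(ptr %p) {
  %v1 = call i64 @llvm.vscale.i64()
  %v2 = call i64 @llvm.vscale.i64()
  ; The two offsets below are equal because both calls yield the same
  ; vscale, letting BasicAA reason about the resulting i8 geps together.
  %off1 = mul nsw i64 %v1, 16
  %off2 = mul nsw i64 %v2, 16
  %gep1 = getelementptr i8, ptr %p, i64 %off1
  %gep2 = getelementptr i8, ptr %p, i64 %off2
  %l1 = load i8, ptr %gep1
  %l2 = load i8, ptr %gep2
  ret void
}
declare i64 @llvm.vscale.i64()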
--- llvm/test/Analysis/BasicAA/vscale.ll | 168 +++++++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) diff --git a/llvm/test/Analysis/BasicAA/vscale.ll b/llvm/test/Analysis/BasicAA/vscale.ll index 3fff435..1b9118b 100644 --- a/llvm/test/Analysis/BasicAA/vscale.ll +++ b/llvm/test/Analysis/BasicAA/vscale.ll @@ -309,6 +309,174 @@ define void @v1v2types(ptr %p) vscale_range(1,16) { ret void } +; VScale intrinsic offset tests + +; CHECK-LABEL: vscale_neg_notscalable +; CHECK-DAG: NoAlias: <4 x i32>* %p, <4 x i32>* %vm16 +; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %p +; CHECK-DAG: MayAlias: <4 x i32>* %m16, <4 x i32>* %vm16 +; CHECK-DAG: MayAlias: <4 x i32>* %p, <4 x i32>* %vm16m16 +; CHECK-DAG: NoAlias: <4 x i32>* %vm16, <4 x i32>* %vm16m16 +; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %vm16m16 +; CHECK-DAG: MayAlias: <4 x i32>* %m16pv16, <4 x i32>* %p +; CHECK-DAG: NoAlias: <4 x i32>* %m16pv16, <4 x i32>* %vm16 +; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %m16pv16 +; CHECK-DAG: MayAlias: <4 x i32>* %m16pv16, <4 x i32>* %vm16m16 +define void @vscale_neg_notscalable(ptr %p) { + %v = call i64 @llvm.vscale.i64() + %vp = mul nsw i64 %v, 16 + %vm = mul nsw i64 %v, -16 + %vm16 = getelementptr i8, ptr %p, i64 %vm + %m16 = getelementptr <4 x i32>, ptr %p, i64 -1 + %vm16m16 = getelementptr <4 x i32>, ptr %vm16, i64 -1 + %m16pv16 = getelementptr i8, ptr %m16, i64 %vp + load <4 x i32>, ptr %p + load <4 x i32>, ptr %vm16 + load <4 x i32>, ptr %m16 + load <4 x i32>, ptr %vm16m16 + load <4 x i32>, ptr %m16pv16 + ret void +} + +; CHECK-LABEL: vscale_neg_scalable +; CHECK-DAG: MayAlias: * %p, * %vm16 +; CHECK-DAG: MayAlias: * %m16, * %p +; CHECK-DAG: MayAlias: * %m16, * %vm16 +; CHECK-DAG: MayAlias: * %p, * %vm16m16 +; CHECK-DAG: MayAlias: * %vm16, * %vm16m16 +; CHECK-DAG: MayAlias: * %m16, * %vm16m16 +; CHECK-DAG: MayAlias: * %m16pv16, * %p +; CHECK-DAG: MayAlias: * %m16pv16, * %vm16 +; CHECK-DAG: MayAlias: * %m16, * %m16pv16 +; CHECK-DAG: MayAlias: * %m16pv16, * %vm16m16 +define void @vscale_neg_scalable(ptr %p) { + %v = call i64 @llvm.vscale.i64() + %vp = mul nsw i64 %v, 16 + %vm = mul nsw i64 %v, -16 + %vm16 = getelementptr i8, ptr %p, i64 %vm + %m16 = getelementptr <4 x i32>, ptr %p, i64 -1 + %vm16m16 = getelementptr <4 x i32>, ptr %vm16, i64 -1 + %m16pv16 = getelementptr i8, ptr %m16, i64 %vp + load , ptr %p + load , ptr %vm16 + load , ptr %m16 + load , ptr %vm16m16 + load , ptr %m16pv16 + ret void +} + +; CHECK-LABEL: vscale_pos_notscalable +; CHECK-DAG: NoAlias: <4 x i32>* %p, <4 x i32>* %vm16 +; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %p +; CHECK-DAG: MayAlias: <4 x i32>* %m16, <4 x i32>* %vm16 +; CHECK-DAG: MayAlias: <4 x i32>* %p, <4 x i32>* %vm16m16 +; CHECK-DAG: NoAlias: <4 x i32>* %vm16, <4 x i32>* %vm16m16 +; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %vm16m16 +; CHECK-DAG: MayAlias: <4 x i32>* %m16pv16, <4 x i32>* %p +; CHECK-DAG: NoAlias: <4 x i32>* %m16pv16, <4 x i32>* %vm16 +; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %m16pv16 +; CHECK-DAG: MayAlias: <4 x i32>* %m16pv16, <4 x i32>* %vm16m16 +define void @vscale_pos_notscalable(ptr %p) { + %v = call i64 @llvm.vscale.i64() + %vp = mul nsw i64 %v, 16 + %vm = mul nsw i64 %v, -16 + %vm16 = getelementptr i8, ptr %p, i64 %vp + %m16 = getelementptr <4 x i32>, ptr %p, i64 1 + %vm16m16 = getelementptr <4 x i32>, ptr %vm16, i64 1 + %m16pv16 = getelementptr i8, ptr %m16, i64 %vm + load <4 x i32>, ptr %p + load <4 x i32>, ptr %vm16 + load <4 x i32>, ptr %m16 + load <4 x i32>, ptr %vm16m16 + load <4 x 
i32>, ptr %m16pv16 + ret void +} + +; CHECK-LABEL: vscale_pos_scalable +; CHECK-DAG: MayAlias: * %p, * %vm16 +; CHECK-DAG: MayAlias: * %m16, * %p +; CHECK-DAG: MayAlias: * %m16, * %vm16 +; CHECK-DAG: MayAlias: * %p, * %vm16m16 +; CHECK-DAG: MayAlias: * %vm16, * %vm16m16 +; CHECK-DAG: MayAlias: * %m16, * %vm16m16 +; CHECK-DAG: MayAlias: * %m16pv16, * %p +; CHECK-DAG: MayAlias: * %m16pv16, * %vm16 +; CHECK-DAG: MayAlias: * %m16, * %m16pv16 +; CHECK-DAG: MayAlias: * %m16pv16, * %vm16m16 +define void @vscale_pos_scalable(ptr %p) { + %v = call i64 @llvm.vscale.i64() + %vp = mul nsw i64 %v, 16 + %vm = mul nsw i64 %v, -16 + %vm16 = getelementptr i8, ptr %p, i64 %vp + %m16 = getelementptr <4 x i32>, ptr %p, i64 1 + %vm16m16 = getelementptr <4 x i32>, ptr %vm16, i64 1 + %m16pv16 = getelementptr i8, ptr %m16, i64 %vm + load , ptr %p + load , ptr %vm16 + load , ptr %m16 + load , ptr %vm16m16 + load , ptr %m16pv16 + ret void +} + +; CHECK-LABEL: vscale_v1v2types +; CHECK-DAG: MustAlias: <4 x i32>* %p, * %p +; CHECK-DAG: MayAlias: * %p, * %vm16 +; CHECK-DAG: MayAlias: <4 x i32>* %p, * %vm16 +; CHECK-DAG: MayAlias: * %p, <4 x i32>* %vm16 +; CHECK-DAG: NoAlias: <4 x i32>* %p, <4 x i32>* %vm16 +; CHECK-DAG: MustAlias: <4 x i32>* %vm16, * %vm16 +; CHECK-DAG: MayAlias: * %m16, * %p +; CHECK-DAG: MayAlias: * %m16, <4 x i32>* %p +; CHECK-DAG: MayAlias: * %m16, * %vm16 +; CHECK-DAG: MayAlias: * %m16, <4 x i32>* %vm16 +; CHECK-DAG: NoAlias: <4 x i32>* %m16, * %p +; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %p +; CHECK-DAG: MayAlias: <4 x i32>* %m16, * %vm16 +; CHECK-DAG: MayAlias: <4 x i32>* %m16, <4 x i32>* %vm16 +; CHECK-DAG: MustAlias: <4 x i32>* %m16, * %m16 +; CHECK-DAG: MayAlias: * %p, * %vp16 +; CHECK-DAG: MayAlias: <4 x i32>* %p, * %vp16 +; CHECK-DAG: MayAlias: * %vm16, * %vp16 +; CHECK-DAG: MayAlias: <4 x i32>* %vm16, * %vp16 +; CHECK-DAG: MayAlias: * %m16, * %vp16 +; CHECK-DAG: MayAlias: <4 x i32>* %m16, * %vp16 +define void @vscale_v1v2types(ptr %p) { + %v = call i64 @llvm.vscale.i64() + %vp = mul nsw i64 %v, 16 + %vm = mul nsw i64 %v, -16 + %vp16 = getelementptr i8, ptr %p, i64 %vp + %vm16 = getelementptr i8, ptr %p, i64 %vm + %m16 = getelementptr <4 x i32>, ptr %p, i64 -1 + load , ptr %p + load <4 x i32>, ptr %p + load , ptr %vm16 + load <4 x i32>, ptr %vm16 + load , ptr %m16 + load <4 x i32>, ptr %m16 + load , ptr %vp16 + ret void +} + +; CHECK-LABEL: twovscales +; CHECK-DAG: MayAlias: * %vp161, * %vp162 +; CHECK-DAG: MayAlias: * %vp161, * %vp161b +; CHECK-DAG: MayAlias: * %vp161b, * %vp162 +define void @twovscales(ptr %p) { + %v1 = call i64 @llvm.vscale.i64() + %v2 = call i64 @llvm.vscale.i64() + %vp1 = mul nsw i64 %v1, 16 + %vp2 = mul nsw i64 %v2, 16 + %vp3 = mul nsw i64 %v1, 17 + %vp161 = getelementptr i8, ptr %p, i64 %vp1 + %vp162 = getelementptr i8, ptr %p, i64 %vp2 + %vp161b = getelementptr i8, ptr %vp161, i64 %vp3 + load , ptr %vp161 + load , ptr %vp162 + load , ptr %vp161b + ret void +} + ; getelementptr recursion ; CHECK-LABEL: gep_recursion_level_1 -- cgit v1.1 From 9ac82f0d3ecf6c13669b0c7940920460c037a292 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 8 Feb 2024 11:45:57 +0200 Subject: [OpenMP] [cmake] In standalone mode, make Python3_EXECUTABLE available (#80828) When running the tests, we try to invoke them as "${Python3_EXECUTABLE} ${OPENMP_LLVM_LIT_EXECUTABLE}", but when running "find_package(Python3)" within the function "find_standalone_test_dependencies", the variable "Python3_EXECUTABLE" only gets set within the function scope. 
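(For illustration, a minimal sketch of that scoping behavior; the function name here is hypothetical, while the real code lives in find_standalone_test_dependencies:

function(locate_python_sketch)
  find_package(Python3 COMPONENTS Interpreter)
  # find_package() sets Python3_EXECUTABLE here, but only inside this
  # function's scope; it must be re-set with PARENT_SCOPE to become
  # visible to the caller.
  set(Python3_EXECUTABLE ${Python3_EXECUTABLE} PARENT_SCOPE)
endfunction()
)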
Tests have worked regardless of this in many cases, where executing the Python script directly succeeds. But for consistency, and to handle cases where the Python script can't be executed directly, make the Python3_EXECUTABLE variable available as intended.
--- openmp/cmake/OpenMPTesting.cmake | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/openmp/cmake/OpenMPTesting.cmake b/openmp/cmake/OpenMPTesting.cmake index df41956..ab2348ae 100644 --- a/openmp/cmake/OpenMPTesting.cmake +++ b/openmp/cmake/OpenMPTesting.cmake @@ -10,6 +10,8 @@ function(find_standalone_test_dependencies) message(WARNING "The check targets will not be available!") set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE) return() + else() + set(Python3_EXECUTABLE ${Python3_EXECUTABLE} PARENT_SCOPE) endif() # Find executables.
-- cgit v1.1
From 49ee2ffc65b7660bfe84cd842e083d6c0ee3e991 Mon Sep 17 00:00:00 2001 From: Evgeniy Date: Thu, 8 Feb 2024 02:06:22 -0800 Subject: [X86][GlobalISel] Reorganize br/brcond tests (NFC) (#80204)
Remove duplicated tests under GlobalISel, consolidating them to perform the checks with all three selectors.
--- llvm/test/CodeGen/X86/GlobalISel/br.ll | 19 - llvm/test/CodeGen/X86/GlobalISel/brcond.ll | 91 -- llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll | 293 ------ llvm/test/CodeGen/X86/fast-isel-cmp-branch3.ll | 469 --------- llvm/test/CodeGen/X86/isel-br.ll | 31 + llvm/test/CodeGen/X86/isel-brcond-fcmp.ll | 1341 ++++++++++++++++++++++++ llvm/test/CodeGen/X86/isel-brcond-icmp.ll | 1107 +++++++++++++++++++ 7 files changed, 2479 insertions(+), 872 deletions(-) delete mode 100644 llvm/test/CodeGen/X86/GlobalISel/br.ll delete mode 100644 llvm/test/CodeGen/X86/GlobalISel/brcond.ll delete mode 100644 llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll delete mode 100644 llvm/test/CodeGen/X86/fast-isel-cmp-branch3.ll create mode 100644 llvm/test/CodeGen/X86/isel-br.ll create mode 100644 llvm/test/CodeGen/X86/isel-brcond-fcmp.ll create mode 100644 llvm/test/CodeGen/X86/isel-brcond-icmp.ll
diff --git a/llvm/test/CodeGen/X86/GlobalISel/br.ll b/llvm/test/CodeGen/X86/GlobalISel/br.ll deleted file mode 100644 index 878fe98..0000000 --- a/llvm/test/CodeGen/X86/GlobalISel/br.ll +++ /dev/null @@ -1,19 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -O0 -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs %s -o - | FileCheck %s - -define void @uncondbr() { -; CHECK-LABEL: uncondbr: -; CHECK: # %bb.1: # %entry -; CHECK-NEXT: jmp .LBB0_3 -; CHECK-NEXT: .LBB0_2: # %end -; CHECK-NEXT: retq -; CHECK-NEXT: .LBB0_3: # %bb2 -; CHECK-NEXT: jmp .LBB0_2 -entry: - br label %bb2 -end: - ret void -bb2: - br label %end -} - diff --git a/llvm/test/CodeGen/X86/GlobalISel/brcond.ll b/llvm/test/CodeGen/X86/GlobalISel/brcond.ll deleted file mode 100644 index b38fbfd..0000000 --- a/llvm/test/CodeGen/X86/GlobalISel/brcond.ll +++ /dev/null @@ -1,91 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=X64 -; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=X86 - -define i32 @test_1(i32 %a, i32 %b, i32 %tValue, i32 %fValue) { -; X64-LABEL: test_1: -; X64: # %bb.0: # %entry -; X64-NEXT: cmpl %esi, %edi -; X64-NEXT: setl %al -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB0_2 -; X64-NEXT: # %bb.1: # %if.then -; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp) -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax -;
X64-NEXT: retq -; X64-NEXT: .LBB0_2: # %if.else -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: retq -; -; X86-LABEL: test_1: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %eax -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: setl %al -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB0_2 -; X86-NEXT: # %bb.1: # %if.then -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: jmp .LBB0_3 -; X86-NEXT: .LBB0_2: # %if.else -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: .LBB0_3: # %return -; X86-NEXT: movl %eax, (%esp) -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: popl %ecx -; X86-NEXT: .cfi_def_cfa_offset 4 -; X86-NEXT: retl -entry: - %retval = alloca i32, align 4 - %cmp = icmp slt i32 %a, %b - br i1 %cmp, label %if.then, label %if.else - -if.then: - store i32 %tValue, ptr %retval, align 4 - br label %return - -if.else: - store i32 %fValue, ptr %retval, align 4 - br label %return - -return: - %0 = load i32, ptr %retval, align 4 - ret i32 %0 -} - -define i32 @test_2(i32 %a) { -; X64-LABEL: test_2: -; X64: # %bb.0: # %entry -; X64-NEXT: testb $1, %dil -; X64-NEXT: je .LBB1_2 -; X64-NEXT: # %bb.1: # %if.then -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB1_2: # %if.else -; X64-NEXT: movl $1, %eax -; X64-NEXT: retq -; -; X86-LABEL: test_2: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB1_2 -; X86-NEXT: # %bb.1: # %if.then -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB1_2: # %if.else -; X86-NEXT: movl $1, %eax -; X86-NEXT: retl -entry: - %cmp = trunc i32 %a to i1 - br i1 %cmp, label %if.then, label %if.else - -if.then: - ret i32 0 -if.else: - ret i32 1 -} - diff --git a/llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll b/llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll deleted file mode 100644 index 475d8fc..0000000 --- a/llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll +++ /dev/null @@ -1,293 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s -; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s - -define i32 @fcmp_oeq(float %x, float %y) { -; CHECK-LABEL: fcmp_oeq -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jne {{LBB.+_1}} -; CHECK-NEXT: jp {{LBB.+_1}} - %1 = fcmp oeq float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ogt(float %x, float %y) { -; CHECK-LABEL: fcmp_ogt -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jbe {{LBB.+_1}} - %1 = fcmp ogt float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_oge(float %x, float %y) { -; CHECK-LABEL: fcmp_oge -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jb {{LBB.+_1}} - %1 = fcmp oge float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_olt(float %x, float %y) { -; CHECK-LABEL: fcmp_olt -; CHECK: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: jbe {{LBB.+_1}} - %1 = fcmp olt float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ole(float %x, float %y) { -; CHECK-LABEL: fcmp_ole -; CHECK: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: jb {{LBB.+_1}} - %1 = fcmp ole float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_one(float %x, float %y) { -; CHECK-LABEL: fcmp_one -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: je {{LBB.+_1}} - %1 = 
fcmp one float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ord(float %x, float %y) { -; CHECK-LABEL: fcmp_ord -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jp {{LBB.+_1}} - %1 = fcmp ord float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_uno(float %x, float %y) { -; CHECK-LABEL: fcmp_uno -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jp {{LBB.+_2}} - %1 = fcmp uno float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ueq(float %x, float %y) { -; CHECK-LABEL: fcmp_ueq -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: je {{LBB.+_2}} - %1 = fcmp ueq float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ugt(float %x, float %y) { -; CHECK-LABEL: fcmp_ugt -; CHECK: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: jae {{LBB.+_1}} - %1 = fcmp ugt float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_uge(float %x, float %y) { -; CHECK-LABEL: fcmp_uge -; CHECK: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: ja {{LBB.+_1}} - %1 = fcmp uge float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ult(float %x, float %y) { -; CHECK-LABEL: fcmp_ult -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jae {{LBB.+_1}} - %1 = fcmp ult float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ule(float %x, float %y) { -; CHECK-LABEL: fcmp_ule -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: ja {{LBB.+_1}} - %1 = fcmp ule float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_une(float %x, float %y) { -; CHECK-LABEL: fcmp_une -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jne {{LBB.+_2}} -; CHECK-NEXT: jnp {{LBB.+_1}} - %1 = fcmp une float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_eq(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_eq -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jne {{LBB.+_1}} - %1 = icmp eq i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_ne(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_ne -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: je {{LBB.+_1}} - %1 = icmp ne i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_ugt(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_ugt -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jbe {{LBB.+_1}} - %1 = icmp ugt i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_uge(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_uge -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jb {{LBB.+_1}} - %1 = icmp uge i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_ult(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_ult -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jae {{LBB.+_1}} - %1 = icmp ult i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_ule(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_ule -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: ja {{LBB.+_1}} - %1 = icmp ule i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_sgt(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_sgt -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jle {{LBB.+_1}} - %1 = icmp sgt i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 
-bb1: - ret i32 0 -} - -define i32 @icmp_sge(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_sge -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jl {{LBB.+_1}} - %1 = icmp sge i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_slt(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_slt -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jge {{LBB.+_1}} - %1 = icmp slt i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_sle(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_sle -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jg {{LBB.+_1}} - %1 = icmp sle i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - diff --git a/llvm/test/CodeGen/X86/fast-isel-cmp-branch3.ll b/llvm/test/CodeGen/X86/fast-isel-cmp-branch3.ll deleted file mode 100644 index 8f09b2e3..0000000 --- a/llvm/test/CodeGen/X86/fast-isel-cmp-branch3.ll +++ /dev/null @@ -1,469 +0,0 @@ -; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s - -define i32 @fcmp_oeq1(float %x) { -; CHECK-LABEL: fcmp_oeq1 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jp {{LBB.+_1}} - %1 = fcmp oeq float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_oeq2(float %x) { -; CHECK-LABEL: fcmp_oeq2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jne {{LBB.+_1}} -; CHECK-NEXT: jp {{LBB.+_1}} - %1 = fcmp oeq float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ogt1(float %x) { -; CHECK-LABEL: fcmp_ogt1 -; CHECK-NOT: ucomiss -; CHECK: movl $1, %eax - %1 = fcmp ogt float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ogt2(float %x) { -; CHECK-LABEL: fcmp_ogt2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jbe {{LBB.+_1}} - %1 = fcmp ogt float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_oge1(float %x) { -; CHECK-LABEL: fcmp_oge1 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jp {{LBB.+_1}} - %1 = fcmp oge float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_oge2(float %x) { -; CHECK-LABEL: fcmp_oge2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jb {{LBB.+_1}} - %1 = fcmp oge float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_olt1(float %x) { -; CHECK-LABEL: fcmp_olt1 -; CHECK-NOT: ucomiss -; CHECK: movl $1, %eax - %1 = fcmp olt float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_olt2(float %x) { -; CHECK-LABEL: fcmp_olt2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: jbe {{LBB.+_1}} - %1 = fcmp olt float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ole1(float %x) { -; CHECK-LABEL: fcmp_ole1 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jp {{LBB.+_1}} - %1 = fcmp ole float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ole2(float %x) { -; CHECK-LABEL: fcmp_ole2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: jb {{LBB.+_1}} - %1 = fcmp ole float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 
@fcmp_one1(float %x) { -; CHECK-LABEL: fcmp_one1 -; CHECK-NOT: ucomiss -; CHECK: movl $1, %eax - %1 = fcmp one float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_one2(float %x) { -; CHECK-LABEL: fcmp_one2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: je {{LBB.+_1}} - %1 = fcmp one float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ord1(float %x) { -; CHECK-LABEL: fcmp_ord1 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jp {{LBB.+_1}} - %1 = fcmp ord float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ord2(float %x) { -; CHECK-LABEL: fcmp_ord2 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jp {{LBB.+_1}} - %1 = fcmp ord float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_uno1(float %x) { -; CHECK-LABEL: fcmp_uno1 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jp {{LBB.+_2}} - %1 = fcmp uno float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_uno2(float %x) { -; CHECK-LABEL: fcmp_uno2 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jp {{LBB.+_2}} - %1 = fcmp uno float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ueq1(float %x) { -; CHECK-LABEL: fcmp_ueq1 -; CHECK-NOT: ucomiss - %1 = fcmp ueq float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ueq2(float %x) { -; CHECK-LABEL: fcmp_ueq2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: je {{LBB.+_2}} - %1 = fcmp ueq float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ugt1(float %x) { -; CHECK-LABEL: fcmp_ugt1 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jnp {{LBB.+_1}} - %1 = fcmp ugt float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ugt2(float %x) { -; CHECK-LABEL: fcmp_ugt2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: jae {{LBB.+_1}} - %1 = fcmp ugt float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_uge1(float %x) { -; CHECK-LABEL: fcmp_uge1 -; CHECK-NOT: ucomiss - %1 = fcmp uge float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_uge2(float %x) { -; CHECK-LABEL: fcmp_uge2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: ja {{LBB.+_1}} - %1 = fcmp uge float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ult1(float %x) { -; CHECK-LABEL: fcmp_ult1 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jnp {{LBB.+_1}} - %1 = fcmp ult float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ult2(float %x) { -; CHECK-LABEL: fcmp_ult2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jae {{LBB.+_1}} - %1 = fcmp ult float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ule1(float %x) { -; CHECK-LABEL: fcmp_ule1 -; CHECK-NOT: ucomiss - %1 = fcmp ule float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ule2(float %x) { -; 
CHECK-LABEL: fcmp_ule2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: ja {{LBB.+_1}} - %1 = fcmp ule float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_une1(float %x) { -; CHECK-LABEL: fcmp_une1 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jnp {{LBB.+_1}} - %1 = fcmp une float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_une2(float %x) { -; CHECK-LABEL: fcmp_une2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jne {{LBB.+_2}} -; CHECK-NEXT: jnp {{LBB.+_1}} - %1 = fcmp une float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_eq(i32 %x) { -; CHECK-LABEL: icmp_eq -; CHECK-NOT: cmpl -; CHECK: xorl %eax, %eax - %1 = icmp eq i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_ne(i32 %x) { -; CHECK-LABEL: icmp_ne -; CHECK-NOT: cmpl -; CHECK: movl $1, %eax - %1 = icmp ne i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_ugt(i32 %x) { -; CHECK-LABEL: icmp_ugt -; CHECK-NOT: cmpl -; CHECK: movl $1, %eax - %1 = icmp ugt i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_uge(i32 %x) { -; CHECK-LABEL: icmp_uge -; CHECK-NOT: cmpl -; CHECK: xorl %eax, %eax - %1 = icmp uge i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_ult(i32 %x) { -; CHECK-LABEL: icmp_ult -; CHECK-NOT: cmpl -; CHECK: movl $1, %eax - %1 = icmp ult i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_ule(i32 %x) { -; CHECK-LABEL: icmp_ule -; CHECK-NOT: cmpl -; CHECK: xorl %eax, %eax - %1 = icmp ule i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_sgt(i32 %x) { -; CHECK-LABEL: icmp_sgt -; CHECK-NOT: cmpl -; CHECK: movl $1, %eax - %1 = icmp sgt i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_sge(i32 %x) { -; CHECK-LABEL: icmp_sge -; CHECK-NOT: cmpl -; CHECK: xorl %eax, %eax - %1 = icmp sge i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_slt(i32 %x) { -; CHECK-LABEL: icmp_slt -; CHECK-NOT: cmpl -; CHECK: movl $1, %eax - %1 = icmp slt i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_sle(i32 %x) { -; CHECK-LABEL: icmp_sle -; CHECK-NOT: cmpl -; CHECK: xorl %eax, %eax - %1 = icmp sle i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - diff --git a/llvm/test/CodeGen/X86/isel-br.ll b/llvm/test/CodeGen/X86/isel-br.ll new file mode 100644 index 0000000..5388c89 --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-br.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -O0 -mtriple=i686-linux-gnu -global-isel=0 -verify-machineinstrs | FileCheck %s --check-prefix=DAG +; RUN: llc < %s -O0 -mtriple=i686-linux-gnu -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=DAG +; RUN: llc < %s -O0 -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 -verify-machineinstrs | FileCheck %s --check-prefix=GISEL +; RUN: llc < %s -O0 -mtriple=x86_64-linux-gnu -global-isel=0 | FileCheck %s --check-prefix=DAG +; 
RUN: llc < %s -O0 -mtriple=x86_64-linux-gnu -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=DAG +; RUN: llc < %s -O0 -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=GISEL + +define void @uncondbr() { +; DAG-LABEL: uncondbr: +; DAG: # %bb.0: # %entry +; DAG-NEXT: jmp .LBB0_2 +; DAG-NEXT: .LBB0_1: # %end +; DAG-NEXT: ret{{[l|q]}} +; DAG-NEXT: .LBB0_2: # %bb2 +; DAG-NEXT: jmp .LBB0_1 +; +; GISEL-LABEL: uncondbr: +; GISEL: # %bb.1: # %entry +; GISEL-NEXT: jmp .LBB0_3 +; GISEL-NEXT: .LBB0_2: # %end +; GISEL-NEXT: ret{{[l|q]}} +; GISEL-NEXT: .LBB0_3: # %bb2 +; GISEL-NEXT: jmp .LBB0_2 +entry: + br label %bb2 +end: + ret void +bb2: + br label %end +} diff --git a/llvm/test/CodeGen/X86/isel-brcond-fcmp.ll b/llvm/test/CodeGen/X86/isel-brcond-fcmp.ll new file mode 100644 index 0000000..5a28e09 --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-brcond-fcmp.ll @@ -0,0 +1,1341 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -global-isel=0 -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefixes=X64,SDAG-X64 +; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefixes=X64,FASTISEL-X64 +; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefixes=GISEL-X64 + +define i32 @fcmp_oeq(float %x, float %y) { +; X64-LABEL: fcmp_oeq: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jne LBB0_1 +; X64-NEXT: jp LBB0_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB0_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_oeq: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: sete %al +; GISEL-X64-NEXT: setnp %cl +; GISEL-X64-NEXT: andb %al, %cl +; GISEL-X64-NEXT: testb $1, %cl +; GISEL-X64-NEXT: je LBB0_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB0_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp oeq float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ogt(float %x, float %y) { +; X64-LABEL: fcmp_ogt: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jbe LBB1_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB1_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ogt: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: seta %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB1_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB1_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ogt float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_oge(float %x, float %y) { +; X64-LABEL: fcmp_oge: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jb LBB2_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB2_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_oge: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setae %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB2_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; 
GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB2_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp oge float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_olt(float %x, float %y) { +; X64-LABEL: fcmp_olt: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm1 +; X64-NEXT: jbe LBB3_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB3_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_olt: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1 +; GISEL-X64-NEXT: seta %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB3_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB3_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp olt float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ole(float %x, float %y) { +; X64-LABEL: fcmp_ole: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm1 +; X64-NEXT: jb LBB4_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB4_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ole: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1 +; GISEL-X64-NEXT: setae %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB4_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB4_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ole float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_one(float %x, float %y) { +; X64-LABEL: fcmp_one: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: je LBB5_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB5_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_one: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB5_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB5_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp one float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ord(float %x, float %y) { +; X64-LABEL: fcmp_ord: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jp LBB6_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB6_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ord: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setnp %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB6_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB6_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ord float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_uno(float %x, float %y) { +; X64-LABEL: fcmp_uno: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jp LBB7_2 +; X64-NEXT: ## %bb.1: ## %bb2 +; X64-NEXT: movl $1, 
%eax +; X64-NEXT: retq +; X64-NEXT: LBB7_2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_uno: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setp %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: jne LBB7_2 +; GISEL-X64-NEXT: ## %bb.1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB7_2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp uno float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ueq(float %x, float %y) { +; X64-LABEL: fcmp_ueq: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: je LBB8_2 +; X64-NEXT: ## %bb.1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; X64-NEXT: LBB8_2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ueq: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: sete %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: jne LBB8_2 +; GISEL-X64-NEXT: ## %bb.1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB8_2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ueq float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ugt(float %x, float %y) { +; X64-LABEL: fcmp_ugt: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm1 +; X64-NEXT: jae LBB9_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB9_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ugt: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1 +; GISEL-X64-NEXT: setb %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB9_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB9_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ugt float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_uge(float %x, float %y) { +; X64-LABEL: fcmp_uge: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm1 +; X64-NEXT: ja LBB10_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB10_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_uge: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1 +; GISEL-X64-NEXT: setbe %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB10_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB10_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp uge float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ult(float %x, float %y) { +; X64-LABEL: fcmp_ult: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jae LBB11_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB11_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ult: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setb %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB11_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB11_1: ## %bb2 +; GISEL-X64-NEXT: movl 
$1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ult float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ule(float %x, float %y) { +; X64-LABEL: fcmp_ule: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: ja LBB12_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB12_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ule: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setbe %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB12_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB12_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ule float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_une(float %x, float %y) { +; X64-LABEL: fcmp_une: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jne LBB13_2 +; X64-NEXT: jnp LBB13_1 +; X64-NEXT: LBB13_2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB13_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_une: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: setp %cl +; GISEL-X64-NEXT: orb %al, %cl +; GISEL-X64-NEXT: testb $1, %cl +; GISEL-X64-NEXT: je LBB13_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB13_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp une float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_oeq1(float %x) { +; X64-LABEL: fcmp_oeq1: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jp LBB14_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB14_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_oeq1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: sete %al +; GISEL-X64-NEXT: setnp %cl +; GISEL-X64-NEXT: andb %al, %cl +; GISEL-X64-NEXT: testb $1, %cl +; GISEL-X64-NEXT: je LBB14_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB14_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp oeq float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_oeq2(float %x) { +; X64-LABEL: fcmp_oeq2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jne LBB15_1 +; X64-NEXT: jp LBB15_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB15_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_oeq2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: sete %al +; GISEL-X64-NEXT: setnp %cl +; GISEL-X64-NEXT: andb %al, %cl +; GISEL-X64-NEXT: testb $1, %cl +; GISEL-X64-NEXT: je LBB15_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB15_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp oeq float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: 
+ ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ogt1(float %x) { +; SDAG-X64-LABEL: fcmp_ogt1: +; SDAG-X64: ## %bb.0: +; SDAG-X64-NEXT: xorl %eax, %eax +; SDAG-X64-NEXT: testb %al, %al +; SDAG-X64-NEXT: je LBB16_1 +; SDAG-X64-NEXT: ## %bb.2: ## %bb1 +; SDAG-X64-NEXT: xorl %eax, %eax +; SDAG-X64-NEXT: retq +; SDAG-X64-NEXT: LBB16_1: ## %bb2 +; SDAG-X64-NEXT: movl $1, %eax +; SDAG-X64-NEXT: retq + +; FASTISEL-X64-LABEL: fcmp_ogt1: +; FASTISEL-X64: ## %bb.0: +; FASTISEL-X64: movl $1, %eax +; FASTISEL-X64: retq + +; GISEL-X64-LABEL: fcmp_ogt1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: seta %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB16_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB16_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ogt float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ogt2(float %x) { +; X64-LABEL: fcmp_ogt2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jbe LBB17_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB17_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ogt2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: seta %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB17_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB17_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ogt float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_oge1(float %x) { +; X64-LABEL: fcmp_oge1: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jp LBB18_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB18_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_oge1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setae %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB18_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB18_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp oge float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_oge2(float %x) { +; X64-LABEL: fcmp_oge2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jb LBB19_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB19_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_oge2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setae %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB19_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB19_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp oge float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define 
i32 @fcmp_olt1(float %x) { +; GISEL-X64-LABEL: fcmp_olt1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: seta %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB20_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB20_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp olt float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_olt2(float %x) { +; X64-LABEL: fcmp_olt2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm0, %xmm1 +; X64-NEXT: jbe LBB21_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB21_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_olt2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1 +; GISEL-X64-NEXT: seta %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB21_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB21_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp olt float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ole1(float %x) { +; X64-LABEL: fcmp_ole1: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jp LBB22_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB22_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ole1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setae %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB22_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB22_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ole float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ole2(float %x) { +; X64-LABEL: fcmp_ole2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm0, %xmm1 +; X64-NEXT: jb LBB23_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB23_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ole2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1 +; GISEL-X64-NEXT: setae %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB23_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB23_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ole float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_one1(float %x) { +; GISEL-X64-LABEL: fcmp_one1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB24_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB24_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp one float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + 
ret i32 0 +} + +define i32 @fcmp_one2(float %x) { +; X64-LABEL: fcmp_one2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: je LBB25_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB25_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_one2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB25_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB25_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp one float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ord1(float %x) { +; X64-LABEL: fcmp_ord1: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jp LBB26_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB26_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ord1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setnp %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB26_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB26_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ord float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ord2(float %x) { +; X64-LABEL: fcmp_ord2: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jp LBB27_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB27_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ord2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setnp %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB27_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB27_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ord float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_uno1(float %x) { +; X64-LABEL: fcmp_uno1: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jp LBB28_2 +; X64-NEXT: ## %bb.1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; X64-NEXT: LBB28_2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_uno1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setp %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: jne LBB28_2 +; GISEL-X64-NEXT: ## %bb.1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB28_2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp uno float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_uno2(float %x) { +; X64-LABEL: fcmp_uno2: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jp LBB29_2 +; X64-NEXT: ## %bb.1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; X64-NEXT: LBB29_2: ## %bb1 +; 
X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_uno2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setp %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: jne LBB29_2 +; GISEL-X64-NEXT: ## %bb.1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB29_2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp uno float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ueq1(float %x) { +; GISEL-X64-LABEL: fcmp_ueq1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: sete %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: jne LBB30_2 +; GISEL-X64-NEXT: ## %bb.1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB30_2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ueq float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ueq2(float %x) { +; X64-LABEL: fcmp_ueq2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: je LBB31_2 +; X64-NEXT: ## %bb.1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; X64-NEXT: LBB31_2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ueq2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: sete %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: jne LBB31_2 +; GISEL-X64-NEXT: ## %bb.1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB31_2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ueq float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ugt1(float %x) { +; X64-LABEL: fcmp_ugt1: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jnp LBB32_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB32_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ugt1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setb %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB32_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB32_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ugt float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ugt2(float %x) { +; X64-LABEL: fcmp_ugt2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm0, %xmm1 +; X64-NEXT: jae LBB33_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB33_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ugt2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1 +; GISEL-X64-NEXT: setb %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB33_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB33_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq 
+ %1 = fcmp ugt float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_uge1(float %x) { +; GISEL-X64-LABEL: fcmp_uge1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setbe %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB34_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB34_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp uge float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_uge2(float %x) { +; X64-LABEL: fcmp_uge2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm0, %xmm1 +; X64-NEXT: ja LBB35_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB35_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_uge2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1 +; GISEL-X64-NEXT: setbe %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB35_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB35_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp uge float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ult1(float %x) { +; X64-LABEL: fcmp_ult1: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jnp LBB36_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB36_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ult1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setb %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB36_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB36_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ult float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ult2(float %x) { +; X64-LABEL: fcmp_ult2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jae LBB37_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB37_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ult2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setb %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB37_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB37_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ult float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ule1(float %x) { +; GISEL-X64-LABEL: fcmp_ule1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setbe %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB38_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB38_1: ## %bb2 +; GISEL-X64-NEXT: movl 
$1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ule float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ule2(float %x) { +; X64-LABEL: fcmp_ule2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: ja LBB39_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB39_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ule2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setbe %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB39_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB39_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ule float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_une1(float %x) { +; X64-LABEL: fcmp_une1: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jnp LBB40_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB40_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_une1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: setp %cl +; GISEL-X64-NEXT: orb %al, %cl +; GISEL-X64-NEXT: testb $1, %cl +; GISEL-X64-NEXT: je LBB40_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB40_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp une float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_une2(float %x) { +; X64-LABEL: fcmp_une2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jne LBB41_2 +; X64-NEXT: jnp LBB41_1 +; X64-NEXT: LBB41_2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB41_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_une2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: setp %cl +; GISEL-X64-NEXT: orb %al, %cl +; GISEL-X64-NEXT: testb $1, %cl +; GISEL-X64-NEXT: je LBB41_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB41_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp une float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} diff --git a/llvm/test/CodeGen/X86/isel-brcond-icmp.ll b/llvm/test/CodeGen/X86/isel-brcond-icmp.ll new file mode 100644 index 0000000..59a45d9 --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-brcond-icmp.ll @@ -0,0 +1,1107 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -global-isel=0 -mtriple=x86_64-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=X64,SDAG +; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=X64,FASTISEL +; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=x86_64-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=GISEL-X64 +; RUN: 
llc < %s -global-isel=0 -mtriple=i686-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=X86,SDAG +; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=i686-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=X86,FASTISEL +; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=i686-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=GISEL-X86 + +define i32 @icmp_eq_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_eq_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: jne LBB0_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB0_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_eq_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: sete %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB0_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB0_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_eq_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: jne LBB0_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB0_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_eq_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: sete %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB0_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB0_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp eq i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_ne_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_ne_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: je LBB1_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB1_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_ne_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB1_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB1_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_ne_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: je LBB1_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB1_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_ne_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setne %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB1_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB1_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp ne i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_ugt_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_ugt_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, 
%edi +; X64-NEXT: jbe LBB2_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB2_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_ugt_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: seta %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB2_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB2_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_ugt_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: jbe LBB2_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB2_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_ugt_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: seta %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB2_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB2_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp ugt i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_uge_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_uge_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: jb LBB3_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB3_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_uge_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setae %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB3_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB3_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_uge_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: jb LBB3_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB3_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_uge_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setae %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB3_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB3_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp uge i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_ult_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_ult_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: jae LBB4_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB4_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_ult_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setb %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB4_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB4_1: ## %bb2 +; 
GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_ult_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: jae LBB4_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB4_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_ult_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setb %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB4_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB4_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp ult i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_ule_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_ule_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: ja LBB5_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB5_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_ule_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setbe %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB5_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB5_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_ule_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: ja LBB5_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB5_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_ule_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setbe %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB5_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB5_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp ule i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_sgt_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_sgt_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: jle LBB6_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB6_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_sgt_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setg %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB6_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB6_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_sgt_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: jle LBB6_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB6_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_sgt_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setg %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB6_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB6_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp sgt i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_sge_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_sge_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: jl LBB7_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB7_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_sge_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setge %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB7_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB7_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_sge_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: jl LBB7_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB7_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_sge_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setge %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB7_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB7_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp sge i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_slt_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_slt_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: jge LBB8_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB8_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_slt_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setl %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB8_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB8_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_slt_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: jge LBB8_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB8_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_slt_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setl %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB8_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB8_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp slt i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_sle_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_sle_2: +; X64: ## 
%bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: jg LBB9_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB9_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_sle_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setle %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB9_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB9_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_sle_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: jg LBB9_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB9_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_sle_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setle %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB9_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB9_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp sle i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_eq(i32 %x) { +; SDAG-LABEL: icmp_eq: +; SDAG: ## %bb.0: +; SDAG-NEXT: movb $1, %al +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB10_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB10_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-LABEL: icmp_eq: +; FASTISEL: ## %bb.0: +; FASTISEL-NEXT: xorl %eax, %eax +; FASTISEL-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_eq: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: sete %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB10_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB10_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_eq: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: sete %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB10_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB10_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp eq i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_ne(i32 %x) { +; SDAG-LABEL: icmp_ne: +; SDAG: ## %bb.0: +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB11_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB11_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-LABEL: icmp_ne: +; FASTISEL: ## %bb.0: +; FASTISEL-NEXT: movl $1, %eax +; FASTISEL-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_ne: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB11_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB11_1: ## %bb2 +; 
GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_ne: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: setne %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB11_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB11_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp ne i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_ugt(i32 %x) { +; SDAG-LABEL: icmp_ugt: +; SDAG: ## %bb.0: +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB12_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB12_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-LABEL: icmp_ugt: +; FASTISEL: ## %bb.0: +; FASTISEL-NEXT: movl $1, %eax +; FASTISEL-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_ugt: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: seta %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB12_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB12_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_ugt: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: seta %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB12_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB12_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp ugt i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_uge(i32 %x) { +; SDAG-LABEL: icmp_uge: +; SDAG: ## %bb.0: +; SDAG-NEXT: movb $1, %al +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB13_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB13_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-X64-LABEL: icmp_uge: +; FASTISEL-X64: ## %bb.0: +; FASTISEL-X64-NEXT: xorl %eax, %eax +; FASTISEL-X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_uge: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: setae %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB13_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB13_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_uge: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: setae %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB13_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB13_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp uge i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_ult(i32 %x) { +; SDAG-LABEL: icmp_ult: +; SDAG: ## %bb.0: +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB14_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB14_1: ## %bb2 +; 
SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-X64-LABEL: icmp_ult: +; FASTISEL-X64: ## %bb.0: +; FASTISEL-X64-NEXT: movl $1, %eax +; FASTISEL-X64-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_ult: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: setb %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB14_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB14_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_ult: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: setb %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB14_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB14_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp ult i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_ule(i32 %x) { +; SDAG-LABEL: icmp_ule: +; SDAG: ## %bb.0: +; SDAG-NEXT: movb $1, %al +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB15_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB15_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-LABEL: icmp_ule: +; FASTISEL: ## %bb.0: +; FASTISEL-NEXT: xorl %eax, %eax +; FASTISEL-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_ule: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: setbe %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB15_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB15_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_ule: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: setbe %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB15_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB15_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp ule i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_sgt(i32 %x) { +; SDAG-LABEL: icmp_sgt: +; SDAG: ## %bb.0: +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB16_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB16_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-LABEL: icmp_sgt: +; FASTISEL: ## %bb.0: +; FASTISEL-NEXT: movl $1, %eax +; FASTISEL-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_sgt: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: setg %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB16_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB16_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_sgt: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: setg %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB16_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; 
GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB16_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp sgt i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_sge(i32 %x) { +; SDAG-LABEL: icmp_sge: +; SDAG: ## %bb.0: +; SDAG-NEXT: movb $1, %al +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB17_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB17_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-LABEL: icmp_sge: +; FASTISEL: ## %bb.0: +; FASTISEL-NEXT: xorl %eax, %eax +; FASTISEL-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_sge: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: setge %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB17_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB17_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_sge: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: setge %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB17_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB17_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp sge i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_slt(i32 %x) { +; SDAG-LABEL: icmp_slt: +; SDAG: ## %bb.0: +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB18_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB18_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-LABEL: icmp_slt: +; FASTISEL: ## %bb.0: +; FASTISEL-NEXT: movl $1, %eax +; FASTISEL-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_slt: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: setl %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB18_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB18_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_slt: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: setl %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB18_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB18_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp slt i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_sle(i32 %x) { +; SDAG-LABEL: icmp_sle: +; SDAG: ## %bb.0: +; SDAG-NEXT: movb $1, %al +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB19_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB19_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-LABEL: icmp_sle: +; FASTISEL: ## %bb.0: +; FASTISEL-NEXT: xorl %eax, %eax +; FASTISEL-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_sle: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: setle %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB19_1 +; GISEL-X64-NEXT: ## 
%bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB19_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_sle: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: setle %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB19_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB19_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp sle i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} -- cgit v1.1 From b85fe40cb88a6b4f640c2b757bd0d254ff1d032c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Thu, 8 Feb 2024 11:09:57 +0100 Subject: [clang][analyzer] Add missing stream related functions to StdLibraryFunctionsChecker. (#76979) Some stream functions were recently added to `StreamChecker` that were not modeled by `StdCLibraryFunctionsChecker`. To ensure consistency these functions are added to the other checker too. Some of the related tests are re-organized. --- .../Checkers/StdLibraryFunctionsChecker.cpp | 79 ++++++++++++-- .../Inputs/std-c-library-functions-POSIX.h | 15 ++- .../test/Analysis/std-c-library-functions-POSIX.c | 16 ++- clang/test/Analysis/std-c-library-functions.c | 4 +- clang/test/Analysis/stream-error.c | 26 ----- clang/test/Analysis/stream-noopen.c | 120 +++++++++++++++++---- clang/test/Analysis/stream.c | 25 ++++- 7 files changed, 221 insertions(+), 64 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 0c6293e6..6b8ac26 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -2023,13 +2023,6 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( {{EOFv, EOFv}, {0, UCharRangeMax}}, "an unsigned char value or EOF"))); - // The getc() family of functions that returns either a char or an EOF. - addToFunctionSummaryMap( - {"getc", "fgetc"}, Signature(ArgTypes{FilePtrTy}, RetType{IntTy}), - Summary(NoEvalCall) - .Case({ReturnValueCondition(WithinRange, - {{EOFv, EOFv}, {0, UCharRangeMax}})}, - ErrnoIrrelevant)); addToFunctionSummaryMap( "getchar", Signature(ArgTypes{}, RetType{IntTy}), Summary(NoEvalCall) @@ -2139,7 +2132,17 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( std::move(GetenvSummary)); } - if (ModelPOSIX) { + if (!ModelPOSIX) { + // Without POSIX use of 'errno' is not specified (in these cases). + // Add these functions without 'errno' checks. 
+ addToFunctionSummaryMap( + {"getc", "fgetc"}, Signature(ArgTypes{FilePtrTy}, RetType{IntTy}), + Summary(NoEvalCall) + .Case({ReturnValueCondition(WithinRange, + {{EOFv, EOFv}, {0, UCharRangeMax}})}, + ErrnoIrrelevant) + .ArgConstraint(NotNull(ArgNo(0)))); + } else { const auto ReturnsZeroOrMinusOne = ConstraintSet{ReturnValueCondition(WithinRange, Range(-1, 0))}; const auto ReturnsZero = @@ -2231,6 +2234,63 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( .Case(ReturnsMinusOne, ErrnoNEZeroIrrelevant, GenericFailureMsg) .ArgConstraint(NotNull(ArgNo(0)))); + std::optional Off_tTy = lookupTy("off_t"); + std::optional Off_tMax = getMaxValue(Off_tTy); + + // int fgetc(FILE *stream); + // 'getc' is the same as 'fgetc' but may be a macro + addToFunctionSummaryMap( + {"getc", "fgetc"}, Signature(ArgTypes{FilePtrTy}, RetType{IntTy}), + Summary(NoEvalCall) + .Case({ReturnValueCondition(WithinRange, {{0, UCharRangeMax}})}, + ErrnoMustNotBeChecked, GenericSuccessMsg) + .Case({ReturnValueCondition(WithinRange, SingleValue(EOFv))}, + ErrnoIrrelevant, GenericFailureMsg) + .ArgConstraint(NotNull(ArgNo(0)))); + + // int fputc(int c, FILE *stream); + // 'putc' is the same as 'fputc' but may be a macro + addToFunctionSummaryMap( + {"putc", "fputc"}, + Signature(ArgTypes{IntTy, FilePtrTy}, RetType{IntTy}), + Summary(NoEvalCall) + .Case({ArgumentCondition(0, WithinRange, Range(0, UCharRangeMax)), + ReturnValueCondition(BO_EQ, ArgNo(0))}, + ErrnoMustNotBeChecked, GenericSuccessMsg) + .Case({ArgumentCondition(0, OutOfRange, Range(0, UCharRangeMax)), + ReturnValueCondition(WithinRange, Range(0, UCharRangeMax))}, + ErrnoMustNotBeChecked, GenericSuccessMsg) + .Case({ReturnValueCondition(WithinRange, SingleValue(EOFv))}, + ErrnoNEZeroIrrelevant, GenericFailureMsg) + .ArgConstraint(NotNull(ArgNo(1)))); + + // char *fgets(char *restrict s, int n, FILE *restrict stream); + addToFunctionSummaryMap( + "fgets", + Signature(ArgTypes{CharPtrRestrictTy, IntTy, FilePtrRestrictTy}, + RetType{CharPtrTy}), + Summary(NoEvalCall) + .Case({ReturnValueCondition(BO_EQ, ArgNo(0))}, + ErrnoMustNotBeChecked, GenericSuccessMsg) + .Case({IsNull(Ret)}, ErrnoIrrelevant, GenericFailureMsg) + .ArgConstraint(NotNull(ArgNo(0))) + .ArgConstraint(ArgumentCondition(1, WithinRange, Range(0, IntMax))) + .ArgConstraint( + BufferSize(/*Buffer=*/ArgNo(0), /*BufSize=*/ArgNo(1))) + .ArgConstraint(NotNull(ArgNo(2)))); + + // int fputs(const char *restrict s, FILE *restrict stream); + addToFunctionSummaryMap( + "fputs", + Signature(ArgTypes{ConstCharPtrRestrictTy, FilePtrRestrictTy}, + RetType{IntTy}), + Summary(NoEvalCall) + .Case(ReturnsNonnegative, ErrnoMustNotBeChecked, GenericSuccessMsg) + .Case({ReturnValueCondition(WithinRange, SingleValue(EOFv))}, + ErrnoNEZeroIrrelevant, GenericFailureMsg) + .ArgConstraint(NotNull(ArgNo(0))) + .ArgConstraint(NotNull(ArgNo(1)))); + // int ungetc(int c, FILE *stream); addToFunctionSummaryMap( "ungetc", Signature(ArgTypes{IntTy, FilePtrTy}, RetType{IntTy}), @@ -2250,9 +2310,6 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( 0, WithinRange, {{EOFv, EOFv}, {0, UCharRangeMax}})) .ArgConstraint(NotNull(ArgNo(1)))); - std::optional Off_tTy = lookupTy("off_t"); - std::optional Off_tMax = getMaxValue(Off_tTy); - // int fseek(FILE *stream, long offset, int whence); // FIXME: It can be possible to get the 'SEEK_' values (like EOFv) and use // these for condition of arg 2. 
diff --git a/clang/test/Analysis/Inputs/std-c-library-functions-POSIX.h b/clang/test/Analysis/Inputs/std-c-library-functions-POSIX.h index 63e22eb..b146068 100644 --- a/clang/test/Analysis/Inputs/std-c-library-functions-POSIX.h +++ b/clang/test/Analysis/Inputs/std-c-library-functions-POSIX.h @@ -11,6 +11,7 @@ typedef unsigned long int pthread_t; typedef unsigned long time_t; typedef unsigned long clockid_t; typedef __INT64_TYPE__ off64_t; +typedef __INT64_TYPE__ fpos_t; typedef struct { int a; @@ -42,9 +43,22 @@ FILE *fopen(const char *restrict pathname, const char *restrict mode); FILE *tmpfile(void); FILE *freopen(const char *restrict pathname, const char *restrict mode, FILE *restrict stream); +FILE *fdopen(int fd, const char *mode); int fclose(FILE *stream); +int putc(int c, FILE *stream); +int fputc(int c, FILE *stream); +char *fgets(char *restrict s, int n, FILE *restrict stream); +int fputs(const char *restrict s, FILE *restrict stream); int fseek(FILE *stream, long offset, int whence); +int fgetpos(FILE *restrict stream, fpos_t *restrict pos); +int fsetpos(FILE *stream, const fpos_t *pos); +int fflush(FILE *stream); +long ftell(FILE *stream); int fileno(FILE *stream); +void rewind(FILE *stream); +void clearerr(FILE *stream); +int feof(FILE *stream); +int ferror(FILE *stream); long a64l(const char *str64); char *l64a(long value); int open(const char *path, int oflag, ...); @@ -100,7 +114,6 @@ int pclose(FILE *stream); int close(int fildes); long fpathconf(int fildes, int name); long pathconf(const char *path, int name); -FILE *fdopen(int fd, const char *mode); void rewinddir(DIR *dir); void seekdir(DIR *dirp, long loc); int rand_r(unsigned int *seedp); diff --git a/clang/test/Analysis/std-c-library-functions-POSIX.c b/clang/test/Analysis/std-c-library-functions-POSIX.c index 03aa8e2..b53f313 100644 --- a/clang/test/Analysis/std-c-library-functions-POSIX.c +++ b/clang/test/Analysis/std-c-library-functions-POSIX.c @@ -23,10 +23,22 @@ // CHECK: Loaded summary for: FILE *popen(const char *command, const char *type) // CHECK: Loaded summary for: int fclose(FILE *stream) // CHECK: Loaded summary for: int pclose(FILE *stream) +// CHECK: Loaded summary for: int getc(FILE *) +// CHECK: Loaded summary for: int fgetc(FILE *) +// CHECK: Loaded summary for: int putc(int c, FILE *stream) +// CHECK: Loaded summary for: int fputc(int c, FILE *stream) +// CHECK: Loaded summary for: char *fgets(char *restrict s, int n, FILE *restrict stream) +// CHECK: Loaded summary for: int fputs(const char *restrict s, FILE *restrict stream) // CHECK: Loaded summary for: int fseek(FILE *stream, long offset, int whence) -// CHECK: Loaded summary for: int fseeko(FILE *stream, off_t offset, int whence) -// CHECK: Loaded summary for: off_t ftello(FILE *stream) +// CHECK: Loaded summary for: int fgetpos(FILE *restrict stream, fpos_t *restrict pos) +// CHECK: Loaded summary for: int fsetpos(FILE *stream, const fpos_t *pos) +// CHECK: Loaded summary for: int fflush(FILE *stream) +// CHECK: Loaded summary for: long ftell(FILE *stream) // CHECK: Loaded summary for: int fileno(FILE *stream) +// CHECK: Loaded summary for: void rewind(FILE *stream) +// CHECK: Loaded summary for: void clearerr(FILE *stream) +// CHECK: Loaded summary for: int feof(FILE *stream) +// CHECK: Loaded summary for: int ferror(FILE *stream) // CHECK: Loaded summary for: long a64l(const char *str64) // CHECK: Loaded summary for: char *l64a(long value) // CHECK: Loaded summary for: int open(const char *path, int oflag, ...) 
diff --git a/clang/test/Analysis/std-c-library-functions.c b/clang/test/Analysis/std-c-library-functions.c index b7eb6b2..e6564e2 100644 --- a/clang/test/Analysis/std-c-library-functions.c +++ b/clang/test/Analysis/std-c-library-functions.c @@ -53,8 +53,6 @@ // CHECK-NEXT: Loaded summary for: int toupper(int) // CHECK-NEXT: Loaded summary for: int tolower(int) // CHECK-NEXT: Loaded summary for: int toascii(int) -// CHECK-NEXT: Loaded summary for: int getc(FILE *) -// CHECK-NEXT: Loaded summary for: int fgetc(FILE *) // CHECK-NEXT: Loaded summary for: int getchar(void) // CHECK-NEXT: Loaded summary for: unsigned int fread(void *restrict, size_t, size_t, FILE *restrict) // CHECK-NEXT: Loaded summary for: unsigned int fwrite(const void *restrict, size_t, size_t, FILE *restrict) @@ -63,6 +61,8 @@ // CHECK-NEXT: Loaded summary for: ssize_t getline(char **restrict, size_t *restrict, FILE *restrict) // CHECK-NEXT: Loaded summary for: ssize_t getdelim(char **restrict, size_t *restrict, int, FILE *restrict) // CHECK-NEXT: Loaded summary for: char *getenv(const char *) +// CHECK-NEXT: Loaded summary for: int getc(FILE *) +// CHECK-NEXT: Loaded summary for: int fgetc(FILE *) #include "Inputs/std-c-library-functions.h" diff --git a/clang/test/Analysis/stream-error.c b/clang/test/Analysis/stream-error.c index cd4b009..4bab075 100644 --- a/clang/test/Analysis/stream-error.c +++ b/clang/test/Analysis/stream-error.c @@ -491,32 +491,6 @@ void error_ftello(void) { fclose(F); } -void error_fflush_after_fclose(void) { - FILE *F = tmpfile(); - int Ret; - fflush(NULL); // no-warning - if (!F) - return; - if ((Ret = fflush(F)) != 0) - clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}} - fclose(F); - fflush(F); // expected-warning {{Stream might be already closed}} -} - -void error_fflush_on_open_failed_stream(void) { - FILE *F = tmpfile(); - if (!F) { - fflush(F); // no-warning - return; - } - fclose(F); -} - -void error_fflush_on_unknown_stream(FILE *F) { - fflush(F); // no-warning - fclose(F); // no-warning -} - void error_fflush_on_non_null_stream_clear_error_states(void) { FILE *F0 = tmpfile(), *F1 = tmpfile(); // `fflush` clears a non-EOF stream's error state. 
diff --git a/clang/test/Analysis/stream-noopen.c b/clang/test/Analysis/stream-noopen.c index 8ad101e..8bd01a9 100644 --- a/clang/test/Analysis/stream-noopen.c +++ b/clang/test/Analysis/stream-noopen.c @@ -57,6 +57,95 @@ void test_fwrite(FILE *F) { clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}} } +void test_fgetc(FILE *F) { + int Ret = fgetc(F); + clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} + if (Ret != EOF) { + if (errno) {} // expected-warning {{undefined}} + } else { + clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}} + // expected-warning@-1 {{FALSE}} + } + clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}} + clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}} +} + +void test_fputc(FILE *F) { + int Ret = fputc('a', F); + clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} + if (Ret != EOF) { + clang_analyzer_eval(Ret == 'a'); // expected-warning {{TRUE}} + if (errno) {} // expected-warning {{undefined}} + } else { + clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}} + } + clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}} + clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}} +} + +void test_fgets(char *Buf, int N, FILE *F) { + char *Ret = fgets(Buf, N, F); + clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} + clang_analyzer_eval(Buf != NULL); // expected-warning {{TRUE}} + clang_analyzer_eval(N >= 0); // expected-warning {{TRUE}} + if (Ret == Buf) { + if (errno) {} // expected-warning {{undefined}} + } else { + clang_analyzer_eval(Ret == 0); // expected-warning {{TRUE}} + clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}} + // expected-warning@-1 {{FALSE}} + } + clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}} + clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}} + + char Buf1[10]; + Ret = fgets(Buf1, 11, F); // expected-warning {{The 1st argument to 'fgets' is a buffer with size 10}} +} + +void test_fgets_bufsize(FILE *F) { + char Buf[10]; + fgets(Buf, 11, F); // expected-warning {{The 1st argument to 'fgets' is a buffer with size 10}} +} + +void test_fputs(char *Buf, FILE *F) { + int Ret = fputs(Buf, F); + clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} + clang_analyzer_eval(Buf != NULL); // expected-warning {{TRUE}} + if (Ret >= 0) { + if (errno) {} // expected-warning {{undefined}} + } else { + clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}} + clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}} + } + clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}} + clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}} +} + +void test_ungetc(FILE *F) { + int Ret = ungetc('X', F); + clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} + if (Ret == 'X') { + if (errno) {} // expected-warning {{undefined}} + } else { + clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}} + clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}} + } + clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}} + clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}} +} + +void test_ungetc_EOF(FILE *F, int C) { + int Ret = ungetc(EOF, F); + clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} + clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}} + clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}} + Ret = ungetc(C, F); + if (Ret == EOF) { + clang_analyzer_eval(C == EOF); // expected-warning {{TRUE}} + // expected-warning@-1{{FALSE}} + } +} + 
void test_fclose(FILE *F) { int Ret = fclose(F); clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} @@ -138,28 +227,17 @@ void test_rewind(FILE *F) { rewind(F); } -void test_ungetc(FILE *F) { - int Ret = ungetc('X', F); - clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} - if (Ret == 'X') { - if (errno) {} // expected-warning {{undefined}} - } else { - clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}} - clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}} - } - clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}} - clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}} -} - -void test_ungetc_EOF(FILE *F, int C) { - int Ret = ungetc(EOF, F); - clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} - clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}} - clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}} - Ret = ungetc(C, F); +void test_fflush(FILE *F) { + errno = 0; + int Ret = fflush(F); + clang_analyzer_eval(F != NULL); // expected-warning{{TRUE}} + // expected-warning@-1{{FALSE}} if (Ret == EOF) { - clang_analyzer_eval(C == EOF); // expected-warning {{TRUE}} - // expected-warning@-1{{FALSE}} + clang_analyzer_eval(errno != 0); // expected-warning{{TRUE}} + } else { + clang_analyzer_eval(Ret == 0); // expected-warning{{TRUE}} + clang_analyzer_eval(errno == 0); // expected-warning{{TRUE}} + // expected-warning@-1{{FALSE}} } } diff --git a/clang/test/Analysis/stream.c b/clang/test/Analysis/stream.c index 36a9b4e..378c915 100644 --- a/clang/test/Analysis/stream.c +++ b/clang/test/Analysis/stream.c @@ -1,7 +1,9 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.unix.Stream -verify %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.unix.Stream,debug.ExprInspection -verify %s #include "Inputs/system-header-simulator.h" +void clang_analyzer_eval(int); + void check_fread(void) { FILE *fp = tmpfile(); fread(0, 0, 0, fp); // expected-warning {{Stream pointer might be NULL}} @@ -316,3 +318,24 @@ void check_leak_noreturn_2(void) { } // expected-warning {{Opened stream never closed. Potential resource leak}} // FIXME: This warning should be placed at the `return` above. // See https://reviews.llvm.org/D83120 about details. + +void fflush_after_fclose(void) { + FILE *F = tmpfile(); + int Ret; + fflush(NULL); // no-warning + if (!F) + return; + if ((Ret = fflush(F)) != 0) + clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}} + fclose(F); + fflush(F); // expected-warning {{Stream might be already closed}} +} + +void fflush_on_open_failed_stream(void) { + FILE *F = tmpfile(); + if (!F) { + fflush(F); // no-warning + return; + } + fclose(F); +} -- cgit v1.1 From 8f2378d7fcf19ea00fbd3366c2125569ef084f93 Mon Sep 17 00:00:00 2001 From: Simon Camphausen Date: Thu, 8 Feb 2024 11:27:08 +0100 Subject: [mlir][EmitC] Add builders for call_opaque op (#80879) This allows omitting the default-valued attributes and therefore writing more compact code.
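For illustration only (a hypothetical C++ call site, not part of this patch; the callee name "load_data" is invented), the new builder lets clients leave out the two attribute parameters when they are empty:

    // `args` and `template_args` now default to empty ArrayAttrs and can be
    // omitted at the call site:
    builder.create<emitc::CallOpaqueOp>(loc, resultTypes, "load_data", operands);

Previously both ArrayAttrs had to be passed explicitly even when unused.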
--- mlir/include/mlir/Dialect/EmitC/IR/EmitC.td | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td index 39cc360..c50fdf3 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td +++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td @@ -122,6 +122,19 @@ def EmitC_CallOpaqueOp : EmitC_Op<"call_opaque", []> { Variadic:$operands ); let results = (outs Variadic); + let builders = [ + OpBuilder<(ins + "::mlir::TypeRange":$resultTypes, + "::llvm::StringRef":$callee, + "::mlir::ValueRange":$operands, + CArg<"::mlir::ArrayAttr", "{}">:$args, + CArg<"::mlir::ArrayAttr", "{}">:$template_args), [{ + build($_builder, $_state, resultTypes, callee, args, template_args, + operands); + }] + > + ]; + let assemblyFormat = [{ $callee `(` $operands `)` attr-dict `:` functional-type($operands, results) }]; -- cgit v1.1 From 1a42b3804f0ed1c4958c4f17216543a1623e3452 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 8 Feb 2024 10:27:34 +0000 Subject: [DebugInfo][RemoveDIs] Erase ranges of instructions individually (#81007) The BasicBlock::erase method simply removes a range of instructions from the instlist by unlinking them. However, now that we're attaching debug-info directly to instructions, some cleanup is required, so use eraseFromParent on each instruction instead. This is less efficient, but rare, and seemingly only WASM EH Prepare uses this method of BasicBlock. Detected via a memory leak check in asan. (asan is always the final boss for whatever I do). --- llvm/lib/IR/BasicBlock.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index bb55f48..fe9d0d0 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -677,7 +677,9 @@ BasicBlock *BasicBlock::splitBasicBlockBefore(iterator I, const Twine &BBName) { BasicBlock::iterator BasicBlock::erase(BasicBlock::iterator FromIt, BasicBlock::iterator ToIt) { - return InstList.erase(FromIt, ToIt); + for (Instruction &I : make_early_inc_range(make_range(FromIt, ToIt))) + I.eraseFromParent(); + return ToIt; } void BasicBlock::replacePhiUsesWith(BasicBlock *Old, BasicBlock *New) { -- cgit v1.1 From faa2f9658a0cd276f3415fad2676f8d90df51268 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 8 Feb 2024 10:44:43 +0000 Subject: [DebugInfo] Handle dbg.assigns in FastISel (#80734) There are some rare circumstances where dbg.assign intrinsics can reach FastISel. They are a more specialised kind of dbg.value intrinsic with more information about the originating alloca. They only occur during optimisation, but might reach FastISel through always_inlining an optimised function into an optnone function. This is a slight problem as it's not safe (for debug-info accuracy) to ignore any intrinsics, and for RemoveDIs (the intrinsic-replacement project) it causes a crash through an unhandled switch case. To get around this, we can just treat the dbg.assign as a dbg.value (it's an actual subclass) and use the variable location information from the dbg.value fields. This loses a small amount of debug-info about stack locations, but is more accurate than just ignoring the intrinsic. (This has popped up deep in an LTO build of a large codebase while testing RemoveDIs, I figured it'd be good to fix it for the intrinsic-form at the same time, just to demonstrate the correct behaviour). 
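As a rough sketch of the scenario described above (hypothetical code, not taken from this patch):

    // Under LTO, `callee` may already have been optimised (so, with -g, its
    // variable locations can be tracked by dbg.assign intrinsics) before it
    // is always_inlined into an optnone caller, whose body is then selected
    // by FastISel.
    __attribute__((always_inline)) static inline int callee(int x) {
      int local = x + 1;
      return local;
    }
    __attribute__((optnone)) int caller(int x) { return callee(x); }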
--- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 7 ++++ llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 10 ++++- .../X86/dont-drop-dbg-assigns-in-isels.ll | 46 ++++++++++++++++++++++ 3 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 llvm/test/DebugInfo/X86/dont-drop-dbg-assigns-in-isels.ll diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index dd38317..c1d8e89 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2120,6 +2120,13 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, ListSize, Alignment)); return true; } + case Intrinsic::dbg_assign: + // A dbg.assign is a dbg.value with more information about stack locations, + // typically produced during optimisation of variables with leaked + // addresses. We can treat it like a normal dbg_value intrinsic here; to + // benefit from the full analysis of stack/SSA locations, GlobalISel would + // need to register for and use the AssignmentTrackingAnalysis pass. + LLVM_FALLTHROUGH; case Intrinsic::dbg_value: { // This form of DBG_VALUE is target-independent. const DbgValueInst &DI = cast(CI); diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 4df79f4..f875652 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1197,7 +1197,8 @@ void FastISel::handleDbgInfo(const Instruction *II) { V = DPV.getVariableLocationOp(0); bool Res = false; - if (DPV.getType() == DPValue::LocationType::Value) { + if (DPV.getType() == DPValue::LocationType::Value || + DPV.getType() == DPValue::LocationType::Assign) { Res = lowerDbgValue(V, DPV.getExpression(), DPV.getVariable(), DPV.getDebugLoc()); } else { @@ -1393,6 +1394,13 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { return true; } + case Intrinsic::dbg_assign: + // A dbg.assign is a dbg.value with more information, typically produced + // during optimisation. If one reaches fastisel then something odd has + // happened (such as an optimised function being always-inlined into an + // optnone function). We will not be using the extra information in the + // dbg.assign in that case, just use its dbg.value fields. + LLVM_FALLTHROUGH; case Intrinsic::dbg_value: { // This form of DBG_VALUE is target-independent. 
const DbgValueInst *DI = cast<DbgValueInst>(II); diff --git a/llvm/test/DebugInfo/X86/dont-drop-dbg-assigns-in-isels.ll b/llvm/test/DebugInfo/X86/dont-drop-dbg-assigns-in-isels.ll new file mode 100644 index 0000000..77c9aa5 --- /dev/null +++ b/llvm/test/DebugInfo/X86/dont-drop-dbg-assigns-in-isels.ll @@ -0,0 +1,46 @@ +; RUN: llc %s -fast-isel -start-after=codegenprepare -stop-before=finalize-isel -o - | FileCheck %s +; RUN: llc %s -fast-isel -start-after=codegenprepare -stop-before=finalize-isel -o - --try-experimental-debuginfo-iterators | FileCheck %s +; RUN: llc %s -global-isel -start-after=codegenprepare -stop-before=finalize-isel -o - | FileCheck %s +; RUN: llc %s -global-isel -start-after=codegenprepare -stop-before=finalize-isel -o - --try-experimental-debuginfo-iterators | FileCheck %s + +target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +; CHECK: DBG_VALUE + +declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) + +define dso_local i32 @foo(i32 %a, i32 %b) local_unnamed_addr !dbg !8 { +entry: + call void @llvm.dbg.assign(metadata !DIArgList(i32 %a, i32 %b), metadata !16, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), metadata !21, metadata ptr undef, metadata !DIExpression()), !dbg !17 + %mul = mul nsw i32 %b, %a, !dbg !18 + ret i32 %mul, !dbg !18 +} + + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5, !19, !6} +!llvm.ident = !{!7} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "debug_value_list_selectiondag.cpp", directory: "/") +!2 = !{} +!3 = !{i32 2, !"CodeView", i32 1} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 2} +!6 = !{i32 7, !"PIC Level", i32 2} +!7 = !{!"clang version 11.0.0"} +!8 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !9, file: !9, line: 1, type: !10, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !13) +!9 = !DIFile(filename: ".\\debug_value_list.cpp", directory: "/tmp") +!10 = !DISubroutineType(types: !11) +!11 = !{!12, !12, !12} +!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!13 = !{!14, !15, !16} +!14 = !DILocalVariable(name: "b", arg: 2, scope: !8, file: !9, line: 1, type: !12) +!15 = !DILocalVariable(name: "a", arg: 1, scope: !8, file: !9, line: 1, type: !12) +!16 = !DILocalVariable(name: "c", scope: !8, file: !9, line: 2, type: !12) +!17 = !DILocation(line: 0, scope: !8) +!18 = !DILocation(line: 3, scope: !8) +!19 = !{i32 7, !"debug-info-assignment-tracking", i1 true} +!20 = !DILocalVariable(name: "d", scope: !8, file: !9, line: 2, type: !12) +!21 = distinct !DIAssignID() -- cgit v1.1 From 878234b3202c9fe343cd59c71b50c4c4c5dc1b8c Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 8 Feb 2024 11:07:33 +0000 Subject: [BasicAA] Scalable offset with scalable typesize. (#80818) This patch adds a simple alias analysis check for accesses that are scalable with an offset between them that is also trivially scalable (there are no other constant/variable offsets). We essentially divide each side by vscale and are left needing to check that the offset >= typesize.
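As a worked example, a sketch in the spirit of the vscale.ll cases updated below (adapted, not an exact excerpt):

    %v = call i64 @llvm.vscale.i64()
    %neg = mul nsw i64 %v, -16                 ; offset of -16 * vscale bytes
    %vm16 = getelementptr i8, ptr %p, i64 %neg
    ; A <vscale x 4 x i32> access is 16 * vscale bytes wide. Dividing both the
    ; offset and the type size by vscale leaves |-16| >= 16, so an access at
    ; %vm16 ends exactly where one at %p begins and the two cannot overlap:
    ; NoAlias.

Since vscale >= 1, the same conclusion holds when only the offset is scalable, provided the dependency distance is at least the type size.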
--- llvm/lib/Analysis/BasicAliasAnalysis.cpp | 21 +++++++++++++++++++++ llvm/test/Analysis/BasicAA/vscale.ll | 22 +++++++++++----------- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 19c4393..ae31814 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -1170,6 +1170,27 @@ AliasResult BasicAAResult::aliasGEP( } } + // VScale Alias Analysis - Given one scalable offset between accesses and a + // scalable typesize, we can divide each side by vscale, treating both values + // as a constant. We prove that Offset/vscale >= TypeSize/vscale. + if (DecompGEP1.VarIndices.size() == 1 && DecompGEP1.VarIndices[0].IsNSW && + DecompGEP1.VarIndices[0].Val.TruncBits == 0 && + DecompGEP1.Offset.isZero() && + PatternMatch::match(DecompGEP1.VarIndices[0].Val.V, + PatternMatch::m_VScale())) { + const VariableGEPIndex &ScalableVar = DecompGEP1.VarIndices[0]; + APInt Scale = + ScalableVar.IsNegated ? -ScalableVar.Scale : ScalableVar.Scale; + LocationSize VLeftSize = Scale.isNegative() ? V1Size : V2Size; + + // Note that we do not check that the typesize is scalable, as vscale >= 1 + // so noalias still holds so long as the dependency distance is at least as + // big as the typesize. + if (VLeftSize.hasValue() && + Scale.uge(VLeftSize.getValue().getKnownMinValue())) + return AliasResult::NoAlias; + } + // Bail on analysing scalable LocationSize if (V1Size.isScalable() || V2Size.isScalable()) return AliasResult::MayAlias; diff --git a/llvm/test/Analysis/BasicAA/vscale.ll b/llvm/test/Analysis/BasicAA/vscale.ll index 1b9118b..ce0c6f1 100644 --- a/llvm/test/Analysis/BasicAA/vscale.ll +++ b/llvm/test/Analysis/BasicAA/vscale.ll @@ -339,15 +339,15 @@ define void @vscale_neg_notscalable(ptr %p) { } ; CHECK-LABEL: vscale_neg_scalable -; CHECK-DAG: MayAlias: * %p, * %vm16 +; CHECK-DAG: NoAlias: * %p, * %vm16 ; CHECK-DAG: MayAlias: * %m16, * %p ; CHECK-DAG: MayAlias: * %m16, * %vm16 ; CHECK-DAG: MayAlias: * %p, * %vm16m16 ; CHECK-DAG: MayAlias: * %vm16, * %vm16m16 -; CHECK-DAG: MayAlias: * %m16, * %vm16m16 +; CHECK-DAG: NoAlias: * %m16, * %vm16m16 ; CHECK-DAG: MayAlias: * %m16pv16, * %p ; CHECK-DAG: MayAlias: * %m16pv16, * %vm16 -; CHECK-DAG: MayAlias: * %m16, * %m16pv16 +; CHECK-DAG: NoAlias: * %m16, * %m16pv16 ; CHECK-DAG: MayAlias: * %m16pv16, * %vm16m16 define void @vscale_neg_scalable(ptr %p) { %v = call i64 @llvm.vscale.i64() @@ -393,15 +393,15 @@ define void @vscale_pos_notscalable(ptr %p) { } ; CHECK-LABEL: vscale_pos_scalable -; CHECK-DAG: MayAlias: * %p, * %vm16 +; CHECK-DAG: NoAlias: * %p, * %vm16 ; CHECK-DAG: MayAlias: * %m16, * %p ; CHECK-DAG: MayAlias: * %m16, * %vm16 ; CHECK-DAG: MayAlias: * %p, * %vm16m16 ; CHECK-DAG: MayAlias: * %vm16, * %vm16m16 -; CHECK-DAG: MayAlias: * %m16, * %vm16m16 +; CHECK-DAG: NoAlias: * %m16, * %vm16m16 ; CHECK-DAG: MayAlias: * %m16pv16, * %p ; CHECK-DAG: MayAlias: * %m16pv16, * %vm16 -; CHECK-DAG: MayAlias: * %m16, * %m16pv16 +; CHECK-DAG: NoAlias: * %m16, * %m16pv16 ; CHECK-DAG: MayAlias: * %m16pv16, * %vm16m16 define void @vscale_pos_scalable(ptr %p) { %v = call i64 @llvm.vscale.i64() @@ -421,9 +421,9 @@ define void @vscale_pos_scalable(ptr %p) { ; CHECK-LABEL: vscale_v1v2types ; CHECK-DAG: MustAlias: <4 x i32>* %p, * %p -; CHECK-DAG: MayAlias: * %p, * %vm16 -; CHECK-DAG: MayAlias: <4 x i32>* %p, * %vm16 -; CHECK-DAG: MayAlias: * %p, <4 x i32>* %vm16 +; CHECK-DAG: NoAlias: * %p, * %vm16 +; CHECK-DAG: NoAlias: <4 x i32>* 
%p, * %vm16 +; CHECK-DAG: NoAlias: * %p, <4 x i32>* %vm16 ; CHECK-DAG: NoAlias: <4 x i32>* %p, <4 x i32>* %vm16 ; CHECK-DAG: MustAlias: <4 x i32>* %vm16, * %vm16 ; CHECK-DAG: MayAlias: * %m16, * %p @@ -435,8 +435,8 @@ define void @vscale_pos_scalable(ptr %p) { ; CHECK-DAG: MayAlias: <4 x i32>* %m16, * %vm16 ; CHECK-DAG: MayAlias: <4 x i32>* %m16, <4 x i32>* %vm16 ; CHECK-DAG: MustAlias: <4 x i32>* %m16, * %m16 -; CHECK-DAG: MayAlias: * %p, * %vp16 -; CHECK-DAG: MayAlias: <4 x i32>* %p, * %vp16 +; CHECK-DAG: NoAlias: * %p, * %vp16 +; CHECK-DAG: NoAlias: <4 x i32>* %p, * %vp16 ; CHECK-DAG: MayAlias: * %vm16, * %vp16 ; CHECK-DAG: MayAlias: <4 x i32>* %vm16, * %vp16 ; CHECK-DAG: MayAlias: * %m16, * %vp16 -- cgit v1.1 From 455c3966cd7305b40d6941b544a16c22120b4512 Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Thu, 8 Feb 2024 11:07:01 +0000 Subject: [RISCV][test] Add test coverage for RISCVInstrInfo::isCopyInstrImpl --- llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp | 63 ++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp index 5836239..5f3ce53 100644 --- a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp +++ b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp @@ -94,6 +94,69 @@ TEST_P(RISCVInstrInfoTest, IsAddImmediate) { } } +TEST_P(RISCVInstrInfoTest, IsCopyInstrImpl) { + const RISCVInstrInfo *TII = ST->getInstrInfo(); + DebugLoc DL; + + // ADDI. + + MachineInstr *MI1 = BuildMI(*MF, DL, TII->get(RISCV::ADDI), RISCV::X1) + .addReg(RISCV::X2) + .addImm(-128) + .getInstr(); + auto MI1Res = TII->isCopyInstrImpl(*MI1); + EXPECT_FALSE(MI1Res.has_value()); + + MachineInstr *MI2 = BuildMI(*MF, DL, TII->get(RISCV::ADDI), RISCV::X1) + .addReg(RISCV::X2) + .addImm(0) + .getInstr(); + auto MI2Res = TII->isCopyInstrImpl(*MI2); + ASSERT_TRUE(MI2Res.has_value()); + EXPECT_EQ(MI2Res->Destination->getReg(), RISCV::X1); + EXPECT_EQ(MI2Res->Source->getReg(), RISCV::X2); + + // Partial coverage of FSGNJ_* instructions. + + MachineInstr *MI3 = BuildMI(*MF, DL, TII->get(RISCV::FSGNJ_D), RISCV::F1_D) + .addReg(RISCV::F2_D) + .addReg(RISCV::F1_D) + .getInstr(); + auto MI3Res = TII->isCopyInstrImpl(*MI3); + EXPECT_FALSE(MI3Res.has_value()); + + MachineInstr *MI4 = BuildMI(*MF, DL, TII->get(RISCV::FSGNJ_D), RISCV::F1_D) + .addReg(RISCV::F2_D) + .addReg(RISCV::F2_D) + .getInstr(); + auto MI4Res = TII->isCopyInstrImpl(*MI4); + ASSERT_TRUE(MI4Res.has_value()); + EXPECT_EQ(MI4Res->Destination->getReg(), RISCV::F1_D); + EXPECT_EQ(MI4Res->Source->getReg(), RISCV::F2_D); + + // ADD. TODO: Should return true for add reg, x0 and add x0, reg. 
+ MachineInstr *MI5 = BuildMI(*MF, DL, TII->get(RISCV::ADD), RISCV::X1) + .addReg(RISCV::X2) + .addReg(RISCV::X3) + .getInstr(); + auto MI5Res = TII->isCopyInstrImpl(*MI5); + EXPECT_FALSE(MI5Res.has_value()); + + MachineInstr *MI6 = BuildMI(*MF, DL, TII->get(RISCV::ADD), RISCV::X1) + .addReg(RISCV::X0) + .addReg(RISCV::X2) + .getInstr(); + auto MI6Res = TII->isCopyInstrImpl(*MI6); + EXPECT_FALSE(MI6Res.has_value()); + + MachineInstr *MI7 = BuildMI(*MF, DL, TII->get(RISCV::ADD), RISCV::X1) + .addReg(RISCV::X2) + .addReg(RISCV::X0) + .getInstr(); + auto MI7Res = TII->isCopyInstrImpl(*MI7); + EXPECT_FALSE(MI7Res.has_value()); +} + TEST_P(RISCVInstrInfoTest, GetMemOperandsWithOffsetWidth) { const RISCVInstrInfo *TII = ST->getInstrInfo(); const TargetRegisterInfo *TRI = ST->getRegisterInfo(); -- cgit v1.1 From d7fb94b6daa643a764e9a756bc544f26c248dafd Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 8 Feb 2024 11:09:45 +0000 Subject: [lldb][TypeSynthetic][NFC] Make SyntheticChildrenFrontend::Update() return an enum (#80167) This patch changes the return value of `SyntheticChildrenFrontend::Update` to a scoped enum that aims to describe what the return value means. --- lldb/include/lldb/DataFormatters/TypeSynthetic.h | 27 ++++--- lldb/include/lldb/DataFormatters/VectorIterator.h | 2 +- lldb/include/lldb/lldb-enumerations.h | 9 +++ lldb/source/Core/ValueObjectSyntheticFilter.cpp | 6 +- lldb/source/DataFormatters/TypeSynthetic.cpp | 8 ++- lldb/source/DataFormatters/VectorType.cpp | 4 +- .../Plugins/Language/CPlusPlus/BlockPointer.cpp | 4 +- .../Plugins/Language/CPlusPlus/Coroutines.cpp | 16 ++--- .../source/Plugins/Language/CPlusPlus/Coroutines.h | 2 +- .../Plugins/Language/CPlusPlus/GenericBitset.cpp | 8 +-- .../Plugins/Language/CPlusPlus/GenericOptional.cpp | 8 +-- lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp | 63 ++++++++-------- lldb/source/Plugins/Language/CPlusPlus/LibCxx.h | 8 +-- .../Plugins/Language/CPlusPlus/LibCxxAtomic.cpp | 7 +- .../Language/CPlusPlus/LibCxxInitializerList.cpp | 10 +-- .../Plugins/Language/CPlusPlus/LibCxxList.cpp | 32 ++++----- .../Plugins/Language/CPlusPlus/LibCxxMap.cpp | 9 +-- .../Plugins/Language/CPlusPlus/LibCxxQueue.cpp | 8 +-- .../Language/CPlusPlus/LibCxxRangesRefView.cpp | 11 +-- .../Plugins/Language/CPlusPlus/LibCxxSpan.cpp | 9 +-- .../Plugins/Language/CPlusPlus/LibCxxTuple.cpp | 8 +-- .../Language/CPlusPlus/LibCxxUnorderedMap.cpp | 20 +++--- .../Plugins/Language/CPlusPlus/LibCxxVariant.cpp | 12 ++-- .../Plugins/Language/CPlusPlus/LibCxxVector.cpp | 28 ++++---- .../Plugins/Language/CPlusPlus/LibStdcpp.cpp | 44 ++++++------ .../Plugins/Language/CPlusPlus/LibStdcppTuple.cpp | 8 +-- .../Language/CPlusPlus/LibStdcppUniquePointer.cpp | 8 +-- lldb/source/Plugins/Language/ObjC/Cocoa.cpp | 4 +- lldb/source/Plugins/Language/ObjC/NSArray.cpp | 45 ++++++------ lldb/source/Plugins/Language/ObjC/NSDictionary.cpp | 83 ++++++++++++---------- lldb/source/Plugins/Language/ObjC/NSError.cpp | 12 ++-- lldb/source/Plugins/Language/ObjC/NSException.cpp | 9 ++- lldb/source/Plugins/Language/ObjC/NSIndexPath.cpp | 14 ++-- lldb/source/Plugins/Language/ObjC/NSSet.cpp | 46 ++++++------ 34 files changed, 321 insertions(+), 271 deletions(-) diff --git a/lldb/include/lldb/DataFormatters/TypeSynthetic.h b/lldb/include/lldb/DataFormatters/TypeSynthetic.h index 41be9b7e..23cc054b 100644 --- a/lldb/include/lldb/DataFormatters/TypeSynthetic.h +++ b/lldb/include/lldb/DataFormatters/TypeSynthetic.h @@ -49,14 +49,15 @@ public: virtual size_t GetIndexOfChildWithName(ConstString name) = 
0; - // this function is assumed to always succeed and it if fails, the front-end - // should know to deal with it in the correct way (most probably, by refusing - // to return any children) the return value of Update() should actually be - // interpreted as "ValueObjectSyntheticFilter cache is good/bad" if =true, - // ValueObjectSyntheticFilter is allowed to use the children it fetched - // previously and cached if =false, ValueObjectSyntheticFilter must throw - // away its cache, and query again for children - virtual bool Update() = 0; + /// This function is assumed to always succeed and if it fails, the front-end + /// should know to deal with it in the correct way (most probably, by refusing + /// to return any children). The return value of \ref Update should actually + /// be interpreted as "ValueObjectSyntheticFilter cache is good/bad". If this + /// function returns \ref lldb::ChildCacheState::eReuse, \ref + /// ValueObjectSyntheticFilter is allowed to use the children it fetched + /// previously and cached. Otherwise, \ref ValueObjectSyntheticFilter must + /// throw away its cache, and query again for children. + virtual lldb::ChildCacheState Update() = 0; // if this function returns false, then CalculateNumChildren() MUST return 0 // since UI frontends might validly decide not to inquire for children given @@ -116,7 +117,9 @@ public: return UINT32_MAX; } - bool Update() override { return false; } + lldb::ChildCacheState Update() override { + return lldb::ChildCacheState::eRefetch; + } bool MightHaveChildren() override { return false; } @@ -328,7 +331,9 @@ public: filter->GetExpressionPathAtIndex(idx), true); } - bool Update() override { return false; } + lldb::ChildCacheState Update() override { + return lldb::ChildCacheState::eRefetch; + } bool MightHaveChildren() override { return filter->GetCount() > 0; } @@ -427,7 +432,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; diff --git a/lldb/include/lldb/DataFormatters/VectorIterator.h b/lldb/include/lldb/DataFormatters/VectorIterator.h index 3414298..5f774bb 100644 --- a/lldb/include/lldb/DataFormatters/VectorIterator.h +++ b/lldb/include/lldb/DataFormatters/VectorIterator.h @@ -28,7 +28,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 392d333..7e9b538 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -1305,6 +1305,15 @@ enum CompletionType { eTerminatorCompletion = (1ul << 27) }; +/// Specifies if children need to be re-computed +/// after a call to \ref SyntheticChildrenFrontEnd::Update. +enum class ChildCacheState { + eRefetch = 0, ///< Children need to be recomputed dynamically. + + eReuse = 1, ///< Children did not change and don't need to be recomputed; + ///< re-use what we computed the last time we called Update. 
+}; + } // namespace lldb #endif // LLDB_LLDB_ENUMERATIONS_H diff --git a/lldb/source/Core/ValueObjectSyntheticFilter.cpp b/lldb/source/Core/ValueObjectSyntheticFilter.cpp index 43bc532..e8b4b02 100644 --- a/lldb/source/Core/ValueObjectSyntheticFilter.cpp +++ b/lldb/source/Core/ValueObjectSyntheticFilter.cpp @@ -43,7 +43,9 @@ public: bool MightHaveChildren() override { return m_backend.MightHaveChildren(); } - bool Update() override { return false; } + lldb::ChildCacheState Update() override { + return lldb::ChildCacheState::eRefetch; + } }; ValueObjectSynthetic::ValueObjectSynthetic(ValueObject &parent, @@ -177,7 +179,7 @@ bool ValueObjectSynthetic::UpdateValue() { } // let our backend do its update - if (!m_synth_filter_up->Update()) { + if (m_synth_filter_up->Update() == lldb::ChildCacheState::eRefetch) { LLDB_LOGF(log, "[ValueObjectSynthetic::UpdateValue] name=%s, synthetic " "filter said caches are stale - clearing", diff --git a/lldb/source/DataFormatters/TypeSynthetic.cpp b/lldb/source/DataFormatters/TypeSynthetic.cpp index de042e4..8a6f132 100644 --- a/lldb/source/DataFormatters/TypeSynthetic.cpp +++ b/lldb/source/DataFormatters/TypeSynthetic.cpp @@ -190,11 +190,13 @@ size_t ScriptedSyntheticChildren::FrontEnd::CalculateNumChildren(uint32_t max) { return m_interpreter->CalculateNumChildren(m_wrapper_sp, max); } -bool ScriptedSyntheticChildren::FrontEnd::Update() { +lldb::ChildCacheState ScriptedSyntheticChildren::FrontEnd::Update() { if (!m_wrapper_sp || m_interpreter == nullptr) - return false; + return lldb::ChildCacheState::eRefetch; - return m_interpreter->UpdateSynthProviderInstance(m_wrapper_sp); + return m_interpreter->UpdateSynthProviderInstance(m_wrapper_sp) + ? lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } bool ScriptedSyntheticChildren::FrontEnd::MightHaveChildren() { diff --git a/lldb/source/DataFormatters/VectorType.cpp b/lldb/source/DataFormatters/VectorType.cpp index 57dae0b..c94ca68 100644 --- a/lldb/source/DataFormatters/VectorType.cpp +++ b/lldb/source/DataFormatters/VectorType.cpp @@ -245,7 +245,7 @@ public: return child_sp; } - bool Update() override { + lldb::ChildCacheState Update() override { m_parent_format = m_backend.GetFormat(); CompilerType parent_type(m_backend.GetCompilerType()); CompilerType element_type; @@ -258,7 +258,7 @@ public: ::CalculateNumChildren(element_type, num_elements, m_child_type) .value_or(0); m_item_format = GetItemFormatForFormat(m_parent_format, m_child_type); - return false; + return lldb::ChildCacheState::eRefetch; } bool MightHaveChildren() override { return true; } diff --git a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp index 314a4ac..2e43aa3 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp @@ -136,7 +136,9 @@ public: // return true if this object is now safe to use forever without ever // updating again; the typical (and tested) answer here is 'false' - bool Update() override { return false; } + lldb::ChildCacheState Update() override { + return lldb::ChildCacheState::eRefetch; + } // maybe return false if the block pointer is, say, null bool MightHaveChildren() override { return true; } diff --git a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp index 6aeae97..7420174 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp +++ 
b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp @@ -125,24 +125,24 @@ lldb::ValueObjectSP lldb_private::formatters:: return lldb::ValueObjectSP(); } -bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd:: - Update() { +lldb::ChildCacheState +lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::Update() { m_resume_ptr_sp.reset(); m_destroy_ptr_sp.reset(); m_promise_ptr_sp.reset(); ValueObjectSP valobj_sp = m_backend.GetNonSyntheticValue(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; lldb::addr_t frame_ptr_addr = GetCoroFramePtrFromHandle(valobj_sp); if (frame_ptr_addr == 0 || frame_ptr_addr == LLDB_INVALID_ADDRESS) - return false; + return lldb::ChildCacheState::eRefetch; auto ts = valobj_sp->GetCompilerType().GetTypeSystem(); auto ast_ctx = ts.dyn_cast_or_null(); if (!ast_ctx) - return false; + return lldb::ChildCacheState::eRefetch; // Create the `resume` and `destroy` children. lldb::TargetSP target_sp = m_backend.GetTargetSP(); @@ -165,7 +165,7 @@ bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd:: CompilerType promise_type( valobj_sp->GetCompilerType().GetTypeTemplateArgument(0)); if (!promise_type) - return false; + return lldb::ChildCacheState::eRefetch; // Try to infer the promise_type if it was type-erased if (promise_type.IsVoidType()) { @@ -180,7 +180,7 @@ bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd:: // If we don't know the promise type, we don't display the `promise` member. // `CreateValueObjectFromAddress` below would fail for `void` types. if (promise_type.IsVoidType()) { - return false; + return lldb::ChildCacheState::eRefetch; } // Add the `promise` member. We intentionally add `promise` as a pointer type @@ -194,7 +194,7 @@ bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd:: if (error.Success()) m_promise_ptr_sp = promisePtr->Clone(ConstString("promise")); - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd:: diff --git a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h index b26cc9e..d38c7ec 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h +++ b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h @@ -38,7 +38,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; diff --git a/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp b/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp index 2876efc..ac31663 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp @@ -33,7 +33,7 @@ public: } bool MightHaveChildren() override { return true; } - bool Update() override; + lldb::ChildCacheState Update() override; size_t CalculateNumChildren() override { return m_elements.size(); } ValueObjectSP GetChildAtIndex(size_t idx) override; @@ -78,13 +78,13 @@ llvm::StringRef GenericBitsetFrontEnd::GetDataContainerMemberName() { llvm_unreachable("Unknown StdLib enum"); } -bool GenericBitsetFrontEnd::Update() { +lldb::ChildCacheState GenericBitsetFrontEnd::Update() { m_elements.clear(); m_first = nullptr; TargetSP target_sp = m_backend.GetTargetSP(); if (!target_sp) - return false; + return lldb::ChildCacheState::eRefetch; size_t size = 0; @@ -94,7 +94,7 @@ bool GenericBitsetFrontEnd::Update() { 
m_elements.assign(size, ValueObjectSP()); m_first = m_backend.GetChildMemberWithName(GetDataContainerMemberName()).get(); - return false; + return lldb::ChildCacheState::eRefetch; } ValueObjectSP GenericBitsetFrontEnd::GetChildAtIndex(size_t idx) { diff --git a/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp b/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp index 7415e91..57331ea 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp @@ -44,7 +44,7 @@ public: size_t CalculateNumChildren() override { return m_has_value ? 1U : 0U; } ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; private: bool m_has_value = false; @@ -61,7 +61,7 @@ GenericOptionalFrontend::GenericOptionalFrontend(ValueObject &valobj, } } -bool GenericOptionalFrontend::Update() { +lldb::ChildCacheState GenericOptionalFrontend::Update() { ValueObjectSP engaged_sp; if (m_stdlib == StdLib::LibCxx) @@ -71,14 +71,14 @@ ->GetChildMemberWithName("_M_engaged"); if (!engaged_sp) - return false; + return lldb::ChildCacheState::eRefetch; // _M_engaged/__engaged is a bool flag and is true if the optional contains a // value. Converting it to unsigned gives us a size of 1 if it contains a // value and 0 if not. m_has_value = engaged_sp->GetValueAsUnsigned(0) != 0; - return false; + return lldb::ChildCacheState::eRefetch; } ValueObjectSP GenericOptionalFrontend::GetChildAtIndex(size_t _idx) { diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp index d0bdbe1..a7d7066 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp @@ -231,21 +231,22 @@ lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd:: Update(); } -bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() { m_pair_sp.reset(); m_pair_ptr = nullptr; ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; TargetSP target_sp(valobj_sp->GetTargetSP()); if (!target_sp) - return false; + return lldb::ChildCacheState::eRefetch; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; // this must be a ValueObject* because it is a child of the ValueObject we // are producing children for it if were a ValueObjectSP, we would end up @@ -278,7 +279,7 @@ bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() { auto __i_(valobj_sp->GetChildMemberWithName("__i_")); if (!__i_) { m_pair_ptr = nullptr; - return false; + return lldb::ChildCacheState::eRefetch; } CompilerType pair_type( __i_->GetCompilerType().GetTypeTemplateArgument(0)); @@ -290,7 +291,7 @@ bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() { 0, name, &bit_offset_ptr, &bitfield_bit_size_ptr, &is_bitfield_ptr); if (!pair_type) { m_pair_ptr = nullptr; - return false; + return lldb::ChildCacheState::eRefetch; } auto addr(m_pair_ptr->GetValueAsUnsigned(LLDB_INVALID_ADDRESS)); @@ -299,7 +300,7 @@ bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() { auto ts = pair_type.GetTypeSystem(); auto ast_ctx = ts.dyn_cast_or_null<TypeSystemClang>(); if (!ast_ctx) - return false; + return lldb::ChildCacheState::eRefetch; // Mimick layout of
std::__tree_iterator::__ptr_ and read it in // from process memory. @@ -328,14 +329,14 @@ bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() { {"payload", pair_type}}); std::optional<uint64_t> size = tree_node_type.GetByteSize(nullptr); if (!size) - return false; + return lldb::ChildCacheState::eRefetch; WritableDataBufferSP buffer_sp(new DataBufferHeap(*size, 0)); ProcessSP process_sp(target_sp->GetProcessSP()); Status error; process_sp->ReadMemory(addr, buffer_sp->GetBytes(), buffer_sp->GetByteSize(), error); if (error.Fail()) - return false; + return lldb::ChildCacheState::eRefetch; DataExtractor extractor(buffer_sp, process_sp->GetByteOrder(), process_sp->GetAddressByteSize()); auto pair_sp = CreateValueObjectFromData( @@ -347,7 +348,7 @@ bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() { } } - return false; + return lldb::ChildCacheState::eRefetch; } size_t lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd:: @@ -399,22 +400,22 @@ lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: Update(); } -bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: - Update() { +lldb::ChildCacheState lldb_private::formatters:: + LibCxxUnorderedMapIteratorSyntheticFrontEnd::Update() { m_pair_sp.reset(); m_iter_ptr = nullptr; ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; TargetSP target_sp(valobj_sp->GetTargetSP()); if (!target_sp) - return false; + return lldb::ChildCacheState::eRefetch; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; auto exprPathOptions = ValueObject::GetValueForExpressionPathOptions() .DontCheckDotVsArrowSyntax() @@ -437,7 +438,7 @@ bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: auto iter_child(valobj_sp->GetChildMemberWithName("__i_")); if (!iter_child) { m_iter_ptr = nullptr; - return false; + return lldb::ChildCacheState::eRefetch; } CompilerType node_type(iter_child->GetCompilerType() .GetTypeTemplateArgument(0) @@ -455,19 +456,19 @@ bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: 0, name, &bit_offset_ptr, &bitfield_bit_size_ptr, &is_bitfield_ptr); if (!pair_type) { m_iter_ptr = nullptr; - return false; + return lldb::ChildCacheState::eRefetch; } uint64_t addr = m_iter_ptr->GetValueAsUnsigned(LLDB_INVALID_ADDRESS); m_iter_ptr = nullptr; if (addr == 0 || addr == LLDB_INVALID_ADDRESS) - return false; + return lldb::ChildCacheState::eRefetch; auto ts = pair_type.GetTypeSystem(); auto ast_ctx = ts.dyn_cast_or_null<TypeSystemClang>(); if (!ast_ctx) - return false; + return lldb::ChildCacheState::eRefetch; // Mimick layout of std::__hash_iterator::__node_ and read it in // from process memory.
@@ -489,14 +490,14 @@ bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: {"__value_", pair_type}}); std::optional<uint64_t> size = tree_node_type.GetByteSize(nullptr); if (!size) - return false; + return lldb::ChildCacheState::eRefetch; WritableDataBufferSP buffer_sp(new DataBufferHeap(*size, 0)); ProcessSP process_sp(target_sp->GetProcessSP()); Status error; process_sp->ReadMemory(addr, buffer_sp->GetBytes(), buffer_sp->GetByteSize(), error); if (error.Fail()) - return false; + return lldb::ChildCacheState::eRefetch; DataExtractor extractor(buffer_sp, process_sp->GetByteOrder(), process_sp->GetAddressByteSize()); auto pair_sp = CreateValueObjectFromData( @@ -505,7 +506,7 @@ bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: m_pair_sp = pair_sp->GetChildAtIndex(2); } - return false; + return lldb::ChildCacheState::eRefetch; } size_t lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: @@ -600,22 +601,23 @@ lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd::GetChildAtIndex( return lldb::ValueObjectSP(); } -bool lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd::Update() { m_cntrl = nullptr; ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; TargetSP target_sp(valobj_sp->GetTargetSP()); if (!target_sp) - return false; + return lldb::ChildCacheState::eRefetch; lldb::ValueObjectSP cntrl_sp(valobj_sp->GetChildMemberWithName("__cntrl_")); m_cntrl = cntrl_sp.get(); // need to store the raw pointer to avoid a circular // dependency - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd:: @@ -689,14 +691,15 @@ lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::GetChildAtIndex( return lldb::ValueObjectSP(); } -bool lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::Update() { ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP ptr_sp(valobj_sp->GetChildMemberWithName("__ptr_")); if (!ptr_sp) - return false; + return lldb::ChildCacheState::eRefetch; // Retrieve the actual pointer and the deleter, and clone them to give them // user-friendly names.
@@ -708,7 +711,7 @@ bool lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::Update() { if (deleter_sp) m_deleter_sp = deleter_sp->Clone(ConstString("deleter")); - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd:: diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h index 72da6b2..cc8e13d 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h @@ -91,7 +91,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -139,7 +139,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -170,7 +170,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -190,7 +190,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp index eacc608..c81b1e80 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp @@ -94,7 +94,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -110,12 +110,13 @@ lldb_private::formatters::LibcxxStdAtomicSyntheticFrontEnd:: LibcxxStdAtomicSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp) : SyntheticChildrenFrontEnd(*valobj_sp) {} -bool lldb_private::formatters::LibcxxStdAtomicSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxStdAtomicSyntheticFrontEnd::Update() { ValueObjectSP atomic_value = GetLibCxxAtomicValue(m_backend); if (atomic_value) m_real_child = GetLibCxxAtomicValue(m_backend).get(); - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::LibcxxStdAtomicSyntheticFrontEnd:: diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp index bfd7b88..3c33f94 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp @@ -30,7 +30,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -82,13 +82,13 @@ lldb::ValueObjectSP lldb_private::formatters:: m_element_type); } -bool lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd:: - Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd::Update() { m_start = nullptr; m_num_elements = 0; m_element_type = m_backend.GetCompilerType().GetTypeTemplateArgument(0); if (!m_element_type.IsValid()) - return false; + return lldb::ChildCacheState::eRefetch; if (std::optional<uint64_t> size = m_element_type.GetByteSize(nullptr)) { m_element_size = *size; @@ -96,7 +96,7 @@ bool lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd::
m_start = m_backend.GetChildMemberWithName("__begin_").get(); } - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd:: diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp index 2e2e2a8..e28ef81 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp @@ -109,7 +109,7 @@ public: return ExtractIndexFromString(name.GetCString()); } bool MightHaveChildren() override { return true; } - bool Update() override; + lldb::ChildCacheState Update() override; protected: AbstractListFrontEnd(ValueObject &valobj) @@ -138,7 +138,7 @@ public: size_t CalculateNumChildren() override; ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; }; class ListFrontEnd : public AbstractListFrontEnd { @@ -151,7 +151,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; private: lldb::addr_t m_node_address = 0; @@ -160,7 +160,7 @@ private: } // end anonymous namespace -bool AbstractListFrontEnd::Update() { +lldb::ChildCacheState AbstractListFrontEnd::Update() { m_loop_detected = 0; m_count = UINT32_MAX; m_head = nullptr; @@ -180,10 +180,10 @@ bool AbstractListFrontEnd::Update() { list_type = list_type.GetNonReferenceType(); if (list_type.GetNumTemplateArguments() == 0) - return false; + return lldb::ChildCacheState::eRefetch; m_element_type = list_type.GetTypeTemplateArgument(0); - return false; + return lldb::ChildCacheState::eRefetch; } bool AbstractListFrontEnd::HasLoop(size_t count) { @@ -284,22 +284,22 @@ ValueObjectSP ForwardListFrontEnd::GetChildAtIndex(size_t idx) { m_element_type); } -bool ForwardListFrontEnd::Update() { +lldb::ChildCacheState ForwardListFrontEnd::Update() { AbstractListFrontEnd::Update(); Status err; ValueObjectSP backend_addr(m_backend.AddressOf(err)); if (err.Fail() || !backend_addr) - return false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP impl_sp(m_backend.GetChildMemberWithName("__before_begin_")); if (!impl_sp) - return false; + return lldb::ChildCacheState::eRefetch; impl_sp = GetFirstValueOfLibCXXCompressedPair(*impl_sp); if (!impl_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_head = impl_sp->GetChildMemberWithName("__next_").get(); - return false; + return lldb::ChildCacheState::eRefetch; } ListFrontEnd::ListFrontEnd(lldb::ValueObjectSP valobj_sp) @@ -394,7 +394,7 @@ lldb::ValueObjectSP ListFrontEnd::GetChildAtIndex(size_t idx) { m_element_type); } -bool ListFrontEnd::Update() { +lldb::ChildCacheState ListFrontEnd::Update() { AbstractListFrontEnd::Update(); m_tail = nullptr; m_node_address = 0; @@ -402,16 +402,16 @@ bool ListFrontEnd::Update() { Status err; ValueObjectSP backend_addr(m_backend.AddressOf(err)); if (err.Fail() || !backend_addr) - return false; + return lldb::ChildCacheState::eRefetch; m_node_address = backend_addr->GetValueAsUnsigned(0); if (!m_node_address || m_node_address == LLDB_INVALID_ADDRESS) - return false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP impl_sp(m_backend.GetChildMemberWithName("__end_")); if (!impl_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_head = impl_sp->GetChildMemberWithName("__next_").get(); m_tail = impl_sp->GetChildMemberWithName("__prev_").get(); - return false; + return lldb::ChildCacheState::eRefetch; } 
SyntheticChildrenFrontEnd *formatters::LibcxxStdListSyntheticFrontEndCreator( diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp index d3ee63a..d208acf 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp @@ -181,7 +181,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -405,15 +405,16 @@ lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::GetChildAtIndex( return potential_child_sp; } -bool lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::Update() { m_count = UINT32_MAX; m_tree = m_root_node = nullptr; m_iterators.clear(); m_tree = m_backend.GetChildMemberWithName("__tree_").get(); if (!m_tree) - return false; + return lldb::ChildCacheState::eRefetch; m_root_node = m_tree->GetChildMemberWithName("__begin_node_").get(); - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd:: diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp index c31940a..83f93b1 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp @@ -26,7 +26,7 @@ public: } bool MightHaveChildren() override { return true; } - bool Update() override; + lldb::ChildCacheState Update() override; size_t CalculateNumChildren() override { return m_container_sp ? m_container_sp->GetNumChildren() : 0; @@ -47,13 +47,13 @@ private: }; } // namespace -bool QueueFrontEnd::Update() { +lldb::ChildCacheState QueueFrontEnd::Update() { m_container_sp = nullptr; ValueObjectSP c_sp = m_backend.GetChildMemberWithName("c"); if (!c_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_container_sp = c_sp->GetSyntheticValue().get(); - return false; + return lldb::ChildCacheState::eRefetch; } SyntheticChildrenFrontEnd * diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp index 6aeb557..c032d67 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp @@ -38,7 +38,7 @@ public: return m_range_sp; } - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override { return true; } @@ -59,17 +59,18 @@ lldb_private::formatters::LibcxxStdRangesRefViewSyntheticFrontEnd:: Update(); } -bool lldb_private::formatters::LibcxxStdRangesRefViewSyntheticFrontEnd:: - Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxStdRangesRefViewSyntheticFrontEnd::Update() { ValueObjectSP range_ptr = GetChildMemberWithName(m_backend, {ConstString("__range_")}); if (!range_ptr) - return false; + return lldb::ChildCacheState::eRefetch; lldb_private::Status error; m_range_sp = range_ptr->Dereference(error); - return error.Success(); + return error.Success() ? 
lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } lldb_private::SyntheticChildrenFrontEnd * diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp index ec062ed..4ddfaef 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp @@ -53,7 +53,7 @@ public: // This function checks for a '__size' member to determine the number // of elements in the span. If no such member exists, we get the size // from the only other place it can be: the template argument. - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -93,12 +93,13 @@ lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd::GetChildAtIndex( m_element_type); } -bool lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd::Update() { // Get element type. ValueObjectSP data_type_finder_sp = GetChildMemberWithName( m_backend, {ConstString("__data_"), ConstString("__data")}); if (!data_type_finder_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_element_type = data_type_finder_sp->GetCompilerType().GetPointeeType(); @@ -122,7 +123,7 @@ bool lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd::Update() { } } - return true; + return lldb::ChildCacheState::eReuse; } bool lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd:: diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp index 9024ed4..54687101 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp @@ -25,7 +25,7 @@ public: } bool MightHaveChildren() override { return true; } - bool Update() override; + lldb::ChildCacheState Update() override; size_t CalculateNumChildren() override { return m_elements.size(); } ValueObjectSP GetChildAtIndex(size_t idx) override; @@ -40,7 +40,7 @@ private: }; } -bool TupleFrontEnd::Update() { +lldb::ChildCacheState TupleFrontEnd::Update() { m_elements.clear(); m_base = nullptr; @@ -51,11 +51,11 @@ bool TupleFrontEnd::Update() { base_sp = m_backend.GetChildMemberWithName("base_"); } if (!base_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_base = base_sp.get(); m_elements.assign(base_sp->GetCompilerType().GetNumDirectBaseClasses(), nullptr); - return false; + return lldb::ChildCacheState::eRefetch; } ValueObjectSP TupleFrontEnd::GetChildAtIndex(size_t idx) { diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp index 1a85d37..4cac52f 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp @@ -37,7 +37,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -193,41 +193,41 @@ lldb::ValueObjectSP lldb_private::formatters:: m_element_type); } -bool lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd:: - Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::Update() { m_num_elements = 0; m_next_element = nullptr; m_elements_cache.clear(); ValueObjectSP table_sp = m_backend.GetChildMemberWithName("__table_"); if (!table_sp) - return 
false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP p2_sp = table_sp->GetChildMemberWithName("__p2_"); if (!p2_sp) - return false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP num_elements_sp = GetFirstValueOfLibCXXCompressedPair(*p2_sp); if (!num_elements_sp) - return false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP p1_sp = table_sp->GetChildMemberWithName("__p1_"); if (!p1_sp) - return false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP value_sp = GetFirstValueOfLibCXXCompressedPair(*p1_sp); if (!value_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_tree = value_sp->GetChildMemberWithName("__next_").get(); if (m_tree == nullptr) - return false; + return lldb::ChildCacheState::eRefetch; m_num_elements = num_elements_sp->GetValueAsUnsigned(0); if (m_num_elements > 0) m_next_element = m_tree; - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd:: diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp index e863ccc..ecbb7cf 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp @@ -204,7 +204,7 @@ public: } bool MightHaveChildren() override { return true; } - bool Update() override; + lldb::ChildCacheState Update() override; size_t CalculateNumChildren() override { return m_size; } ValueObjectSP GetChildAtIndex(size_t idx) override; @@ -213,24 +213,24 @@ private: }; } // namespace -bool VariantFrontEnd::Update() { +lldb::ChildCacheState VariantFrontEnd::Update() { m_size = 0; ValueObjectSP impl_sp = formatters::GetChildMemberWithName( m_backend, {ConstString("__impl_"), ConstString("__impl")}); if (!impl_sp) - return false; + return lldb::ChildCacheState::eRefetch; LibcxxVariantIndexValidity validity = LibcxxVariantGetIndexValidity(impl_sp); if (validity == LibcxxVariantIndexValidity::Invalid) - return false; + return lldb::ChildCacheState::eRefetch; if (validity == LibcxxVariantIndexValidity::NPos) - return true; + return lldb::ChildCacheState::eReuse; m_size = 1; - return false; + return lldb::ChildCacheState::eRefetch; } ValueObjectSP VariantFrontEnd::GetChildAtIndex(size_t idx) { diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp index 9d88fcf..0c3c3f0 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp @@ -29,7 +29,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -50,7 +50,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override { return true; } @@ -116,17 +116,18 @@ lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::GetChildAtIndex( m_element_type); } -bool lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::Update() { m_start = m_finish = nullptr; ValueObjectSP data_type_finder_sp( m_backend.GetChildMemberWithName("__end_cap_")); if (!data_type_finder_sp) - return false; + return lldb::ChildCacheState::eRefetch; data_type_finder_sp = GetFirstValueOfLibCXXCompressedPair(*data_type_finder_sp); if 
(!data_type_finder_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_element_type = data_type_finder_sp->GetCompilerType().GetPointeeType(); if (std::optional<uint64_t> size = m_element_type.GetByteSize(nullptr)) { @@ -138,7 +139,7 @@ bool lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::Update() { m_finish = m_backend.GetChildMemberWithName("__end_").get(); } } - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd:: @@ -226,29 +227,30 @@ lldb_private::formatters::LibcxxVectorBoolSyntheticFrontEnd::GetChildAtIndex( } }*/ -bool lldb_private::formatters::LibcxxVectorBoolSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxVectorBoolSyntheticFrontEnd::Update() { m_children.clear(); ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); ValueObjectSP size_sp(valobj_sp->GetChildMemberWithName("__size_")); if (!size_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_count = size_sp->GetValueAsUnsigned(0); if (!m_count) - return true; + return lldb::ChildCacheState::eReuse; ValueObjectSP begin_sp(valobj_sp->GetChildMemberWithName("__begin_")); if (!begin_sp) { m_count = 0; - return false; + return lldb::ChildCacheState::eRefetch; } m_base_data_address = begin_sp->GetValueAsUnsigned(0); if (!m_base_data_address) { m_count = 0; - return false; + return lldb::ChildCacheState::eRefetch; } - return false; + return lldb::ChildCacheState::eRefetch; } size_t lldb_private::formatters::LibcxxVectorBoolSyntheticFrontEnd:: diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp index 23af50f..4115518 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp @@ -47,7 +47,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -68,7 +68,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -94,29 +94,29 @@ LibstdcppMapIteratorSyntheticFrontEnd::LibstdcppMapIteratorSyntheticFrontEnd( Update(); } -bool LibstdcppMapIteratorSyntheticFrontEnd::Update() { +lldb::ChildCacheState LibstdcppMapIteratorSyntheticFrontEnd::Update() { ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; TargetSP target_sp(valobj_sp->GetTargetSP()); if (!target_sp) - return false; + return lldb::ChildCacheState::eRefetch; bool is_64bit = (target_sp->GetArchitecture().GetAddressByteSize() == 8); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); ValueObjectSP _M_node_sp(valobj_sp->GetChildMemberWithName("_M_node")); if (!_M_node_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_pair_address = _M_node_sp->GetValueAsUnsigned(0); if (m_pair_address == 0) - return false; + return lldb::ChildCacheState::eRefetch; m_pair_address += (is_64bit ?
32 : 16); @@ -124,12 +124,12 @@ bool LibstdcppMapIteratorSyntheticFrontEnd::Update() { if (my_type.GetNumTemplateArguments() >= 1) { CompilerType pair_type = my_type.GetTypeTemplateArgument(0); if (!pair_type) - return false; + return lldb::ChildCacheState::eRefetch; m_pair_type = pair_type; } else - return false; + return lldb::ChildCacheState::eRefetch; - return true; + return lldb::ChildCacheState::eReuse; } size_t LibstdcppMapIteratorSyntheticFrontEnd::CalculateNumChildren() { @@ -193,22 +193,22 @@ lldb_private::formatters::VectorIteratorSyntheticFrontEnd:: Update(); } -bool VectorIteratorSyntheticFrontEnd::Update() { +lldb::ChildCacheState VectorIteratorSyntheticFrontEnd::Update() { m_item_sp.reset(); ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP item_ptr = formatters::GetChildMemberWithName(*valobj_sp, m_item_names); if (!item_ptr) - return false; + return lldb::ChildCacheState::eRefetch; if (item_ptr->GetValueAsUnsigned(0) == 0) - return false; + return lldb::ChildCacheState::eRefetch; Status err; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); m_item_sp = CreateValueObjectFromAddress( @@ -216,7 +216,7 @@ bool VectorIteratorSyntheticFrontEnd::Update() { item_ptr->GetCompilerType().GetPointeeType()); if (err.Fail()) m_item_sp.reset(); - return false; + return lldb::ChildCacheState::eRefetch; } size_t VectorIteratorSyntheticFrontEnd::CalculateNumChildren() { return 1; } @@ -390,23 +390,23 @@ LibStdcppSharedPtrSyntheticFrontEnd::GetChildAtIndex(size_t idx) { return lldb::ValueObjectSP(); } -bool LibStdcppSharedPtrSyntheticFrontEnd::Update() { +lldb::ChildCacheState LibStdcppSharedPtrSyntheticFrontEnd::Update() { auto backend = m_backend.GetSP(); if (!backend) - return false; + return lldb::ChildCacheState::eRefetch; auto valobj_sp = backend->GetNonSyntheticValue(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; auto ptr_obj_sp = valobj_sp->GetChildMemberWithName("_M_ptr"); if (!ptr_obj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_obj = ptr_obj_sp->Clone(ConstString("pointer")).get(); m_obj_obj = nullptr; - return false; + return lldb::ChildCacheState::eRefetch; } bool LibStdcppSharedPtrSyntheticFrontEnd::MightHaveChildren() { return true; } diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp index f1bfeae..189f956 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp @@ -30,7 +30,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -53,12 +53,12 @@ LibStdcppTupleSyntheticFrontEnd::LibStdcppTupleSyntheticFrontEnd( Update(); } -bool LibStdcppTupleSyntheticFrontEnd::Update() { +lldb::ChildCacheState LibStdcppTupleSyntheticFrontEnd::Update() { m_members.clear(); ValueObjectSP valobj_backend_sp = m_backend.GetSP(); if (!valobj_backend_sp) - return false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP next_child_sp = valobj_backend_sp->GetNonSyntheticValue(); while (next_child_sp != nullptr) { @@ -83,7 +83,7 @@ bool LibStdcppTupleSyntheticFrontEnd::Update() { } } - return false; + return lldb::ChildCacheState::eRefetch; } bool LibStdcppTupleSyntheticFrontEnd::MightHaveChildren() { return 
true; } diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp index a84d641..3b0f632 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp @@ -30,7 +30,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -84,11 +84,11 @@ ValueObjectSP LibStdcppUniquePtrSyntheticFrontEnd::GetTuple() { return obj_child_sp; } -bool LibStdcppUniquePtrSyntheticFrontEnd::Update() { +lldb::ChildCacheState LibStdcppUniquePtrSyntheticFrontEnd::Update() { ValueObjectSP tuple_sp = GetTuple(); if (!tuple_sp) - return false; + return lldb::ChildCacheState::eRefetch; std::unique_ptr<SyntheticChildrenFrontEnd> tuple_frontend( LibStdcppTupleSyntheticFrontEndCreator(nullptr, tuple_sp)); @@ -110,7 +110,7 @@ bool LibStdcppUniquePtrSyntheticFrontEnd::Update() { } m_obj_obj = nullptr; - return false; + return lldb::ChildCacheState::eRefetch; } bool LibStdcppUniquePtrSyntheticFrontEnd::MightHaveChildren() { return true; } diff --git a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp index f1a7e04..64047dc 100644 --- a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp +++ b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp @@ -1044,7 +1044,9 @@ public: return lldb::ValueObjectSP(); } - bool Update() override { return false; } + lldb::ChildCacheState Update() override { + return lldb::ChildCacheState::eRefetch; + } bool MightHaveChildren() override { return false; } diff --git a/lldb/source/Plugins/Language/ObjC/NSArray.cpp b/lldb/source/Plugins/Language/ObjC/NSArray.cpp index 7d0004c..09bf7a2 100644 --- a/lldb/source/Plugins/Language/ObjC/NSArray.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSArray.cpp @@ -54,7 +54,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override = 0; + lldb::ChildCacheState Update() override = 0; bool MightHaveChildren() override; @@ -81,7 +81,7 @@ public: ~GenericNSArrayMSyntheticFrontEnd() override; - bool Update() override; + lldb::ChildCacheState Update() override; protected: lldb::addr_t GetDataAddress() override; @@ -218,7 +218,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -306,7 +306,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -323,7 +323,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -500,9 +500,8 @@ lldb_private::formatters::NSArrayMSyntheticFrontEndBase::GetChildAtIndex( } template <typename D32, typename D64> -bool -lldb_private::formatters:: - GenericNSArrayMSyntheticFrontEnd<D32, D64>::Update() { +lldb::ChildCacheState +lldb_private::formatters::GenericNSArrayMSyntheticFrontEnd<D32, D64>::Update() { ValueObjectSP valobj_sp = m_backend.GetSP(); m_ptr_size = 0; delete m_data_32; @@ -510,13 +509,13 @@ lldb_private::formatters:: delete m_data_64; m_data_64 = nullptr; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); Status error; error.Clear(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return
false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size; if (m_ptr_size == 4) { @@ -529,7 +528,8 @@ lldb_private::formatters:: error); } - return error.Success(); + return error.Success() ? lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } bool @@ -641,9 +641,9 @@ lldb_private::formatters::GenericNSArrayISyntheticFrontEnd:: } template -bool -lldb_private::formatters::GenericNSArrayISyntheticFrontEnd:: - Update() { +lldb::ChildCacheState +lldb_private::formatters::GenericNSArrayISyntheticFrontEnd::Update() { ValueObjectSP valobj_sp = m_backend.GetSP(); m_ptr_size = 0; delete m_data_32; @@ -651,13 +651,13 @@ lldb_private::formatters::GenericNSArrayISyntheticFrontEnd:: delete m_data_64; m_data_64 = nullptr; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); Status error; error.Clear(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size; if (m_ptr_size == 4) { @@ -670,7 +670,8 @@ lldb_private::formatters::GenericNSArrayISyntheticFrontEnd:: error); } - return error.Success(); + return error.Success() ? lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } template @@ -723,8 +724,9 @@ lldb_private::formatters::NSArray0SyntheticFrontEnd::CalculateNumChildren() { return 0; } -bool lldb_private::formatters::NSArray0SyntheticFrontEnd::Update() { - return false; +lldb::ChildCacheState +lldb_private::formatters::NSArray0SyntheticFrontEnd::Update() { + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::NSArray0SyntheticFrontEnd::MightHaveChildren() { @@ -757,8 +759,9 @@ lldb_private::formatters::NSArray1SyntheticFrontEnd::CalculateNumChildren() { return 1; } -bool lldb_private::formatters::NSArray1SyntheticFrontEnd::Update() { - return false; +lldb::ChildCacheState +lldb_private::formatters::NSArray1SyntheticFrontEnd::Update() { + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::NSArray1SyntheticFrontEnd::MightHaveChildren() { diff --git a/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp b/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp index d377ee7..9c252a9 100644 --- a/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp @@ -107,7 +107,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -148,7 +148,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -180,7 +180,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -213,7 +213,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -234,7 +234,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -266,9 +266,9 @@ namespace 
Foundation1100 { size_t CalculateNumChildren() override; lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - - bool Update() override; - + + lldb::ChildCacheState Update() override; + bool MightHaveChildren() override; size_t GetIndexOfChildWithName(ConstString name) override; @@ -613,7 +613,8 @@ size_t lldb_private::formatters::NSDictionaryISyntheticFrontEnd:: return (m_data_32 ? m_data_32->_used : m_data_64->_used); } -bool lldb_private::formatters::NSDictionaryISyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::NSDictionaryISyntheticFrontEnd::Update() { m_children.clear(); delete m_data_32; m_data_32 = nullptr; @@ -622,13 +623,13 @@ bool lldb_private::formatters::NSDictionaryISyntheticFrontEnd::Update() { m_ptr_size = 0; ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); Status error; error.Clear(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); m_order = process_sp->GetByteOrder(); uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size; @@ -642,9 +643,9 @@ bool lldb_private::formatters::NSDictionaryISyntheticFrontEnd::Update() { error); } if (error.Fail()) - return false; + return lldb::ChildCacheState::eRefetch; m_data_ptr = data_location + m_ptr_size; - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::NSDictionaryISyntheticFrontEnd:: @@ -750,20 +751,23 @@ size_t lldb_private::formatters::NSCFDictionarySyntheticFrontEnd:: return m_hashtable.GetCount(); } -bool lldb_private::formatters::NSCFDictionarySyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::NSCFDictionarySyntheticFrontEnd::Update() { m_children.clear(); ValueObjectSP valobj_sp = m_backend.GetSP(); m_ptr_size = 0; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); m_order = process_sp->GetByteOrder(); - return m_hashtable.Update(valobj_sp->GetValueAsUnsigned(0), m_exe_ctx_ref); + return m_hashtable.Update(valobj_sp->GetValueAsUnsigned(0), m_exe_ctx_ref) + ? 
lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::NSCFDictionarySyntheticFrontEnd:: @@ -881,30 +885,33 @@ size_t lldb_private::formatters::NSConstantDictionarySyntheticFrontEnd:: return m_size; } -bool lldb_private::formatters::NSConstantDictionarySyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::NSConstantDictionarySyntheticFrontEnd::Update() { ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); Status error; error.Clear(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); m_order = process_sp->GetByteOrder(); uint64_t valobj_addr = valobj_sp->GetValueAsUnsigned(0); m_size = process_sp->ReadUnsignedIntegerFromMemory( valobj_addr + 2 * m_ptr_size, m_ptr_size, 0, error); if (error.Fail()) - return false; + return lldb::ChildCacheState::eRefetch; m_keys_ptr = process_sp->ReadPointerFromMemory(valobj_addr + 3 * m_ptr_size, error); if (error.Fail()) - return false; + return lldb::ChildCacheState::eRefetch; m_objects_ptr = process_sp->ReadPointerFromMemory(valobj_addr + 4 * m_ptr_size, error); - return !error.Fail(); + + return error.Success() ? lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::NSConstantDictionarySyntheticFrontEnd:: @@ -992,9 +999,10 @@ size_t lldb_private::formatters::NSDictionary1SyntheticFrontEnd:: return 1; } -bool lldb_private::formatters::NSDictionary1SyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::NSDictionary1SyntheticFrontEnd::Update() { m_pair.reset(); - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::NSDictionary1SyntheticFrontEnd:: @@ -1087,9 +1095,9 @@ lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd::Calcul } template -bool -lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd:: - Update() { +lldb::ChildCacheState +lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd::Update() { m_children.clear(); ValueObjectSP valobj_sp = m_backend.GetSP(); m_ptr_size = 0; @@ -1098,13 +1106,13 @@ lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd:: delete m_data_64; m_data_64 = nullptr; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); Status error; error.Clear(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); m_order = process_sp->GetByteOrder(); uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size; @@ -1118,7 +1126,8 @@ lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd:: error); } - return error.Success(); + return error.Success() ? lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } template @@ -1249,9 +1258,8 @@ lldb_private::formatters::Foundation1100:: return (m_data_32 ? 
m_data_32->_used : m_data_64->_used); } -bool -lldb_private::formatters::Foundation1100:: - NSDictionaryMSyntheticFrontEnd::Update() { +lldb::ChildCacheState lldb_private::formatters::Foundation1100:: + NSDictionaryMSyntheticFrontEnd::Update() { m_children.clear(); ValueObjectSP valobj_sp = m_backend.GetSP(); m_ptr_size = 0; @@ -1260,13 +1268,13 @@ lldb_private::formatters::Foundation1100:: delete m_data_64; m_data_64 = nullptr; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); Status error; error.Clear(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); m_order = process_sp->GetByteOrder(); uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size; @@ -1280,7 +1288,8 @@ lldb_private::formatters::Foundation1100:: error); } - return error.Success(); + return error.Success() ? lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } bool diff --git a/lldb/source/Plugins/Language/ObjC/NSError.cpp b/lldb/source/Plugins/Language/ObjC/NSError.cpp index 99eeb2d..ce52ae5 100644 --- a/lldb/source/Plugins/Language/ObjC/NSError.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSError.cpp @@ -133,17 +133,17 @@ public: return m_child_sp; } - bool Update() override { + lldb::ChildCacheState Update() override { m_child_ptr = nullptr; m_child_sp.reset(); ProcessSP process_sp(m_backend.GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; lldb::addr_t userinfo_location = DerefToNSErrorPointer(m_backend); if (userinfo_location == LLDB_INVALID_ADDRESS) - return false; + return lldb::ChildCacheState::eRefetch; size_t ptr_size = process_sp->GetAddressByteSize(); @@ -152,17 +152,17 @@ public: lldb::addr_t userinfo = process_sp->ReadPointerFromMemory(userinfo_location, error); if (userinfo == LLDB_INVALID_ADDRESS || error.Fail()) - return false; + return lldb::ChildCacheState::eRefetch; InferiorSizedWord isw(userinfo, *process_sp); TypeSystemClangSP scratch_ts_sp = ScratchTypeSystemClang::GetForTarget(process_sp->GetTarget()); if (!scratch_ts_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_child_sp = CreateValueObjectFromData( "_userInfo", isw.GetAsData(process_sp->GetByteOrder()), m_backend.GetExecutionContextRef(), scratch_ts_sp->GetBasicType(lldb::eBasicTypeObjCID)); - return false; + return lldb::ChildCacheState::eRefetch; } bool MightHaveChildren() override { return true; } diff --git a/lldb/source/Plugins/Language/ObjC/NSException.cpp b/lldb/source/Plugins/Language/ObjC/NSException.cpp index 29805bb..e8011e5 100644 --- a/lldb/source/Plugins/Language/ObjC/NSException.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSException.cpp @@ -137,14 +137,17 @@ public: return lldb::ValueObjectSP(); } - bool Update() override { + lldb::ChildCacheState Update() override { m_name_sp.reset(); m_reason_sp.reset(); m_userinfo_sp.reset(); m_reserved_sp.reset(); - return ExtractFields(m_backend, &m_name_sp, &m_reason_sp, &m_userinfo_sp, - &m_reserved_sp); + const auto ret = ExtractFields(m_backend, &m_name_sp, &m_reason_sp, + &m_userinfo_sp, &m_reserved_sp); + + return ret ? 
lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } bool MightHaveChildren() override { return true; } diff --git a/lldb/source/Plugins/Language/ObjC/NSIndexPath.cpp b/lldb/source/Plugins/Language/ObjC/NSIndexPath.cpp index 2a4ce80..69e6ab1 100644 --- a/lldb/source/Plugins/Language/ObjC/NSIndexPath.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSIndexPath.cpp @@ -46,17 +46,17 @@ public: return m_impl.GetIndexAtIndex(idx, m_uint_star_type); } - bool Update() override { + lldb::ChildCacheState Update() override { m_impl.Clear(); auto type_system = m_backend.GetCompilerType().GetTypeSystem(); if (!type_system) - return false; + return lldb::ChildCacheState::eRefetch; auto ast = ScratchTypeSystemClang::GetForTarget( *m_backend.GetExecutionContextRef().GetTargetSP()); if (!ast) - return false; + return lldb::ChildCacheState::eRefetch; m_uint_star_type = ast->GetPointerSizedIntType(false); @@ -65,18 +65,18 @@ public: ProcessSP process_sp = m_backend.GetProcessSP(); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; ObjCLanguageRuntime *runtime = ObjCLanguageRuntime::Get(*process_sp); if (!runtime) - return false; + return lldb::ChildCacheState::eRefetch; ObjCLanguageRuntime::ClassDescriptorSP descriptor( runtime->GetClassDescriptor(m_backend)); if (!descriptor.get() || !descriptor->IsValid()) - return false; + return lldb::ChildCacheState::eRefetch; uint64_t info_bits(0), value_bits(0), payload(0); @@ -119,7 +119,7 @@ public: } } } - return false; + return lldb::ChildCacheState::eRefetch; } bool MightHaveChildren() override { return m_impl.m_mode != Mode::Invalid; } diff --git a/lldb/source/Plugins/Language/ObjC/NSSet.cpp b/lldb/source/Plugins/Language/ObjC/NSSet.cpp index ed1751c..ede6485 100644 --- a/lldb/source/Plugins/Language/ObjC/NSSet.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSSet.cpp @@ -50,7 +50,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -88,7 +88,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -121,7 +121,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -237,7 +237,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -426,7 +426,8 @@ lldb_private::formatters::NSSetISyntheticFrontEnd::CalculateNumChildren() { return (m_data_32 ? 
m_data_32->_used : m_data_64->_used); } -bool lldb_private::formatters::NSSetISyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::NSSetISyntheticFrontEnd::Update() { m_children.clear(); delete m_data_32; m_data_32 = nullptr; @@ -435,13 +436,13 @@ bool lldb_private::formatters::NSSetISyntheticFrontEnd::Update() { m_ptr_size = 0; ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size; Status error; @@ -455,9 +456,9 @@ bool lldb_private::formatters::NSSetISyntheticFrontEnd::Update() { error); } if (error.Fail()) - return false; + return lldb::ChildCacheState::eRefetch; m_data_ptr = data_location + m_ptr_size; - return true; + return lldb::ChildCacheState::eReuse; } bool lldb_private::formatters::NSSetISyntheticFrontEnd::MightHaveChildren() { @@ -561,20 +562,23 @@ lldb_private::formatters::NSCFSetSyntheticFrontEnd::CalculateNumChildren() { return m_hashtable.GetCount(); } -bool lldb_private::formatters::NSCFSetSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::NSCFSetSyntheticFrontEnd::Update() { m_children.clear(); ValueObjectSP valobj_sp = m_backend.GetSP(); m_ptr_size = 0; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); m_order = process_sp->GetByteOrder(); - return m_hashtable.Update(valobj_sp->GetValueAsUnsigned(0), m_exe_ctx_ref); + return m_hashtable.Update(valobj_sp->GetValueAsUnsigned(0), m_exe_ctx_ref) + ? lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::NSCFSetSyntheticFrontEnd::MightHaveChildren() { @@ -701,9 +705,8 @@ lldb_private::formatters:: } template -bool -lldb_private::formatters:: - GenericNSSetMSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::GenericNSSetMSyntheticFrontEnd::Update() { m_children.clear(); ValueObjectSP valobj_sp = m_backend.GetSP(); m_ptr_size = 0; @@ -712,13 +715,13 @@ lldb_private::formatters:: delete m_data_64; m_data_64 = nullptr; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size; Status error; @@ -731,7 +734,8 @@ lldb_private::formatters:: process_sp->ReadMemory(data_location, m_data_64, sizeof(D64), error); } - return error.Success(); + return error.Success() ? 
lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } template -- cgit v1.1 From b35c5197629494cb675948fe33d2fdcd75b5aafa Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 11:43:29 +0000 Subject: [DAG] tryToFoldExtendOfConstant - share the same SDLoc argument instead of recreating it over and over again. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 61 +++++++++++++-------------- 1 file changed, 29 insertions(+), 32 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 4adea02..d3cd9b1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12739,12 +12739,12 @@ static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI, /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND). /// Vector extends are not folded if operations are legal; this is to /// avoid introducing illegal build_vector dag nodes. -static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, +static SDValue tryToFoldExtendOfConstant(SDNode *N, const SDLoc &DL, + const TargetLowering &TLI, SelectionDAG &DAG, bool LegalTypes) { unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); - SDLoc DL(N); assert((ISD::isExtOpcode(Opcode) || ISD::isExtVecInRegOpcode(Opcode)) && "Expected EXTEND dag node in input!"); @@ -13400,7 +13400,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { if (N0.isUndef()) return DAG.getConstant(0, DL, VT); - if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) + if (SDValue Res = tryToFoldExtendOfConstant(N, DL, TLI, DAG, LegalTypes)) return Res; // fold (sext (sext x)) -> (sext x) @@ -13669,7 +13669,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (N0.isUndef()) return DAG.getConstant(0, DL, VT); - if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) + if (SDValue Res = tryToFoldExtendOfConstant(N, DL, TLI, DAG, LegalTypes)) return Res; // fold (zext (zext x)) -> (zext x) @@ -13937,12 +13937,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); // aext(undef) = undef if (N0.isUndef()) return DAG.getUNDEF(VT); - if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) + if (SDValue Res = tryToFoldExtendOfConstant(N, DL, TLI, DAG, LegalTypes)) return Res; // fold (aext (aext x)) -> (aext x) @@ -13951,7 +13952,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { if (N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND) - return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); + return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0)); // fold (aext (aext_extend_vector_inreg x)) -> (aext_extend_vector_inreg x) // fold (aext (zext_extend_vector_inreg x)) -> (zext_extend_vector_inreg x) @@ -13959,7 +13960,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG || N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) - return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); + return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0)); // fold (aext (truncate (load x))) -> (aext (smaller load x)) // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n))) @@ -13977,7 
+13978,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { // fold (aext (truncate x)) if (N0.getOpcode() == ISD::TRUNCATE) - return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); + return DAG.getAnyExtOrTrunc(N0.getOperand(0), DL, VT); // Fold (aext (and (trunc x), cst)) -> (and x, cst) // if the trunc is not free. @@ -13985,7 +13986,6 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { N0.getOperand(0).getOpcode() == ISD::TRUNCATE && N0.getOperand(1).getOpcode() == ISD::Constant && !TLI.isTruncateFree(N0.getOperand(0).getOperand(0), N0.getValueType())) { - SDLoc DL(N); SDValue X = DAG.getAnyExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT); SDValue Y = DAG.getNode(ISD::ANY_EXTEND, DL, VT, N0.getOperand(1)); assert(isa<ConstantSDNode>(Y) && "Expected constant to be folded!"); @@ -14011,9 +14011,9 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs, TLI); if (DoXform) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, - LN0->getChain(), LN0->getBasePtr(), - N0.getValueType(), LN0->getMemOperand()); + SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, LN0->getChain(), + LN0->getBasePtr(), N0.getValueType(), + LN0->getMemOperand()); ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND); // If the load value is used only by N, replace it via CombineTo N. bool NoReplaceTrunc = N0.hasOneUse(); @@ -14039,9 +14039,9 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { ISD::LoadExtType ExtType = LN0->getExtensionType(); EVT MemVT = LN0->getMemoryVT(); if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) { - SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N), - VT, LN0->getChain(), LN0->getBasePtr(), - MemVT, LN0->getMemOperand()); + SDValue ExtLoad = + DAG.getExtLoad(ExtType, DL, VT, LN0->getChain(), LN0->getBasePtr(), + MemVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); recursivelyDeleteUnusedNodes(LN0); @@ -14069,23 +14069,20 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { // we know that the element size of the sext'd result matches the // element size of the compare operands. 
if (VT.getSizeInBits() == N00VT.getSizeInBits()) - return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0), - N0.getOperand(1), - cast<CondCodeSDNode>(N0.getOperand(2))->get()); + return DAG.getSetCC(DL, VT, N0.getOperand(0), N0.getOperand(1), + cast<CondCodeSDNode>(N0.getOperand(2))->get()); // If the desired elements are smaller or larger than the source // elements we can use a matching integer vector type and then // truncate/any extend EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger(); - SDValue VsetCC = - DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0), - N0.getOperand(1), - cast<CondCodeSDNode>(N0.getOperand(2))->get()); - return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT); + SDValue VsetCC = DAG.getSetCC( + DL, MatchingVectorType, N0.getOperand(0), N0.getOperand(1), + cast<CondCodeSDNode>(N0.getOperand(2))->get()); + return DAG.getAnyExtOrTrunc(VsetCC, DL, VT); } // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc - SDLoc DL(N); if (SDValue SCC = SimplifySelectCC( DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT), DAG.getConstant(0, DL, VT), @@ -14637,10 +14634,9 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { return SDValue(); } -static SDValue -foldExtendVectorInregToExtendOfSubvector(SDNode *N, const TargetLowering &TLI, - SelectionDAG &DAG, - bool LegalOperations) { +static SDValue foldExtendVectorInregToExtendOfSubvector( + SDNode *N, const SDLoc &DL, const TargetLowering &TLI, SelectionDAG &DAG, + bool LegalOperations) { unsigned InregOpcode = N->getOpcode(); unsigned Opcode = DAG.getOpcode_EXTEND(InregOpcode); @@ -14667,28 +14663,29 @@ foldExtendVectorInregToExtendOfSubvector(SDNode *N, const TargetLowering &TLI, if (LegalOperations && !TLI.isOperationLegal(Opcode, VT)) return SDValue(); - return DAG.getNode(Opcode, SDLoc(N), VT, Src); + return DAG.getNode(Opcode, DL, VT, Src); } SDValue DAGCombiner::visitEXTEND_VECTOR_INREG(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); if (N0.isUndef()) { // aext_vector_inreg(undef) = undef because the top bits are undefined. // {s/z}ext_vector_inreg(undef) = 0 because the top bits must be the same. return N->getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ? DAG.getUNDEF(VT) - : DAG.getConstant(0, SDLoc(N), VT); + : DAG.getConstant(0, DL, VT); } - if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) + if (SDValue Res = tryToFoldExtendOfConstant(N, DL, TLI, DAG, LegalTypes)) return Res; if (SimplifyDemandedVectorElts(SDValue(N, 0))) return SDValue(N, 0); - if (SDValue R = foldExtendVectorInregToExtendOfSubvector(N, TLI, DAG, + if (SDValue R = foldExtendVectorInregToExtendOfSubvector(N, DL, TLI, DAG, LegalOperations)) return R; -- cgit v1.1 From a643ab852a63a14dba86e031247734c5e3d5adb9 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 8 Feb 2024 11:49:04 +0000 Subject: [DebugInfo][RemoveDIs] Final omnibus test fixing for RemoveDIs (#81125) With this, I get a clean test suite running under RemoveDIs, the non-intrinsic representation of debug-info, including under asan. We've previously established that we generate identical binaries for some large projects, so this is just edge-case cleanup. The changes: * CodeGenPrepare fixups need to apply to dbg.assigns as well as dbg.values (a dbg.assign is a dbg.value). * Pin a test for constant-deletion to intrinsic debug-info: this very rare scenario uses a different kill-location sigil in dbg.value mode to RemoveDIs mode, which generates spurious test differences. 
* Suppress a memory leak in a unit test: the code for dealing with trailing debug-info in a block is necessarily fiddly, leading to this leak when testing it. Developer-facing interfaces for moving instructions around always deal with this behind the scenes. * SROA, when replacing some vector-loads, needs to insert the replacement loads ahead of any debug-info records so that their values remain dominated by a definition. Set the head-bit indicating our insertion should come before debug-info. --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 3 ++- llvm/lib/Transforms/Scalar/SROA.cpp | 7 ++++++- .../Generic/assignment-tracking/codegenprepare/sunk-addr.ll | 5 +++++ llvm/test/Transforms/GlobalOpt/localize-constexpr-debuginfo.ll | 7 ++++++- llvm/test/Transforms/SROA/vector-promotion.ll | 4 ++++ llvm/unittests/IR/BasicBlockDbgInfoTest.cpp | 1 + 6 files changed, 24 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 5383b15..09c4922 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -8455,7 +8455,8 @@ bool CodeGenPrepare::fixupDPValuesOnInst(Instruction &I) { // FIXME: should updating debug-info really cause the "changed" flag to fire, // which can cause a function to be reprocessed? bool CodeGenPrepare::fixupDPValue(DPValue &DPV) { - if (DPV.Type != DPValue::LocationType::Value) + if (DPV.Type != DPValue::LocationType::Value && + DPV.Type != DPValue::LocationType::Assign) return false; // Does this DPValue refer to a sunk address calculation? diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index bdbaf4f..e92e245 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2956,7 +2956,12 @@ private: assert(DL.typeSizeEqualsStoreSize(LI.getType()) && "Non-byte-multiple bit width"); // Move the insertion point just past the load so that we can refer to it. - IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI))); + BasicBlock::iterator LIIt = std::next(LI.getIterator()); + // Ensure the insertion point comes before any debug-info immediately + // after the load, so that variable values referring to the load are + // dominated by it. + LIIt.setHeadBit(true); + IRB.SetInsertPoint(LI.getParent(), LIIt); // Create a placeholder value with the same type as LI to use as the // basis for the new value. This allows us to replace the uses of LI with // the computed value, and then replace the placeholder with LI, leaving diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/codegenprepare/sunk-addr.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/codegenprepare/sunk-addr.ll index 70548465..8b226aa 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/codegenprepare/sunk-addr.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/codegenprepare/sunk-addr.ll @@ -3,6 +3,11 @@ ; RUN: -mtriple=x86_64-unknown-unknown %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg." +;; Test with RemoveDIs non-intrinsic debug-info too. +; RUN: llc -start-before=codegenprepare -stop-after=codegenprepare \ +; RUN: -mtriple=x86_64-unknown-unknown %s -o - --try-experimental-debuginfo-iterators \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg." + ;; Check that when CodeGenPrepare moves an address computation to a block it's ;; used in its dbg.assign uses are updated. 
;; diff --git a/llvm/test/Transforms/GlobalOpt/localize-constexpr-debuginfo.ll b/llvm/test/Transforms/GlobalOpt/localize-constexpr-debuginfo.ll index 18dc038..5d6cc7d 100644 --- a/llvm/test/Transforms/GlobalOpt/localize-constexpr-debuginfo.ll +++ b/llvm/test/Transforms/GlobalOpt/localize-constexpr-debuginfo.ll @@ -1,4 +1,9 @@ -; RUN: opt -S < %s -passes=globalopt | FileCheck %s +; RUN: opt -S < %s -passes=globalopt --experimental-debuginfo-iterators=false | FileCheck %s +;; FIXME: this test is pinned to not use RemoveDIs non-intrinsic debug-info. +;; Constant-deletion takes a slightly different path and (correctly) replaces +;; the operand of the debug-info record with poison instead of a null pointer. +;; This is a spurious test difference that we'll suppress for turning RemoveDIs +;; on. target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll index e2aa1e2..e48dd5b 100644 --- a/llvm/test/Transforms/SROA/vector-promotion.ll +++ b/llvm/test/Transforms/SROA/vector-promotion.ll @@ -2,6 +2,10 @@ ; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG ; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG ; RUN: opt < %s -passes=debugify,sroa -S | FileCheck %s --check-prefix=DEBUG +;; Ensure that these work with non-intrinsic variable locations. +; RUN: opt < %s -passes='sroa' -S --try-experimental-debuginfo-iterators | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG +; RUN: opt < %s -passes='sroa' -S --try-experimental-debuginfo-iterators | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG +; RUN: opt < %s -passes=debugify,sroa -S --try-experimental-debuginfo-iterators | FileCheck %s --check-prefix=DEBUG target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" %S1 = type { i64, [42 x float] } diff --git a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp index 827b4a9..ef2b288 100644 --- a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp +++ b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp @@ -1476,6 +1476,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) { // ... except for some dangling DPValues. EXPECT_NE(Exit.getTrailingDPValues(), nullptr); EXPECT_FALSE(Exit.getTrailingDPValues()->empty()); + Exit.getTrailingDPValues()->eraseFromParent(); Exit.deleteTrailingDPValues(); UseNewDbgInfoFormat = false; -- cgit v1.1 From 7d4733a267cafa2109dc43b151dbae5716f372e4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 11:49:17 +0000 Subject: [X86] LowerBUILD_VECTOR - share the same SDLoc argument instead of recreating it over and over again. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 87 ++++++++++++++++----------------- 1 file changed, 42 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b5b76c6..f310010 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7135,6 +7135,7 @@ static bool isFoldableUseOfShuffle(SDNode *N) { /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, + const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // VBROADCAST requires AVX. 
@@ -7145,8 +7146,6 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, MVT VT = BVOp->getSimpleValueType(0); unsigned NumElts = VT.getVectorNumElements(); - SDLoc dl(BVOp); - assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); @@ -7492,14 +7491,13 @@ static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, } // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. -static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, +static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); assert((VT.getVectorElementType() == MVT::i1) && "Unexpected type in LowerBUILD_VECTORvXi1!"); - - SDLoc dl(Op); if (ISD::isBuildVectorAllZeros(Op.getNode()) || ISD::isBuildVectorAllOnes(Op.getNode())) return Op; @@ -7618,7 +7616,7 @@ LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) { /// See the corrected implementation in isHopBuildVector(). Can we reduce this /// code because it is only used for partial h-op matching now? static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, - SelectionDAG &DAG, + const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1) { EVT VT = N->getValueType(0); @@ -7928,6 +7926,7 @@ static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, /// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or /// X86ISD::FMSUBADD node. static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, + const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Opnd0, Opnd1; @@ -7938,7 +7937,6 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, return SDValue(); MVT VT = BV->getSimpleValueType(0); - SDLoc DL(BV); // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; @@ -8057,22 +8055,22 @@ static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, } static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, - SelectionDAG &DAG, unsigned HOpcode, - SDValue V0, SDValue V1) { + const SDLoc &DL, SelectionDAG &DAG, + unsigned HOpcode, SDValue V0, SDValue V1) { // If either input vector is not the same size as the build vector, // extract/insert the low bits to the correct size. // This is free (examples: zmm --> xmm, xmm --> ymm). 
MVT VT = BV->getSimpleValueType(0); unsigned Width = VT.getSizeInBits(); if (V0.getValueSizeInBits() > Width) - V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width); + V0 = extractSubVector(V0, 0, DAG, DL, Width); else if (V0.getValueSizeInBits() < Width) - V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width); + V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width); if (V1.getValueSizeInBits() > Width) - V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width); + V1 = extractSubVector(V1, 0, DAG, DL, Width); else if (V1.getValueSizeInBits() < Width) - V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width); + V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width); unsigned NumElts = VT.getVectorNumElements(); APInt DemandedElts = APInt::getAllOnes(NumElts); @@ -8084,17 +8082,17 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, unsigned HalfNumElts = NumElts / 2; if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) { MVT HalfVT = VT.getHalfNumVectorElementsVT(); - V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128); - V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128); - SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1); - return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256); + V0 = extractSubVector(V0, 0, DAG, DL, 128); + V1 = extractSubVector(V1, 0, DAG, DL, 128); + SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1); + return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256); } - return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1); + return DAG.getNode(HOpcode, DL, VT, V0, V1); } /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. -static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, +static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // We need at least 2 non-undef elements to make this worthwhile by default. @@ -8114,7 +8112,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, unsigned HOpcode; SDValue V0, V1; if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) - return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); + return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1); } // Try harder to match 256-bit ops by using extract/concat. 
@@ -8134,22 +8132,21 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, if (BV->getOperand(i)->isUndef()) NumUndefsHI++; - SDLoc DL(BV); SDValue InVec0, InVec1; if (VT == MVT::v8i32 || VT == MVT::v16i16) { SDValue InVec2, InVec3; unsigned X86Opcode; bool CanFold = true; - if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && - isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2, + if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) X86Opcode = X86ISD::HADD; - else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0, + else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0, InVec1) && - isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2, + isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) @@ -8179,15 +8176,16 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || VT == MVT::v16i16) { unsigned X86Opcode; - if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) + if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0, + InVec1)) X86Opcode = X86ISD::HADD; - else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0, + else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HSUB; - else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0, + else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHADD; - else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, + else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHSUB; else @@ -8218,10 +8216,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, /// NOTE: Its not in our interest to start make a general purpose vectorizer /// from this, but enough scalar bit operations are created from the later /// legalization + scalarization stages to need basic support. -static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, +static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); MVT VT = Op->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -8296,9 +8293,9 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, /// Create a vector constant without a load. SSE/AVX provide the bare minimum /// functionality to do this, so it's all zeros, all ones, or some derivation /// that is cheap to calculate. -static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, +static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { - SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); // Vectors containing all zeros can be matched by pxor and xorps. @@ -8322,7 +8319,7 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, /// from a vector of source values and a vector of extraction indices. 
/// The vectors might be manipulated to match the type of the permute op. static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, - SDLoc &DL, SelectionDAG &DAG, + const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT ShuffleVT = VT; EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger(); @@ -8590,7 +8587,8 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, // TODO: Utilize pshufb and zero mask blending to support more efficient // construction of vectors with constant-0 elements. static SDValue -LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG, +LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue SrcVec, IndicesVec; // Check for a match of the permute source vector and permute index elements. @@ -8629,7 +8627,6 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG, return SDValue(); } - SDLoc DL(V); MVT VT = V.getSimpleValueType(); return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget); } @@ -8645,14 +8642,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Generate vectors for predicate vectors. if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) - return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget); + return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget); if (VT.getVectorElementType() == MVT::bf16 && (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget); - if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget)) - return VectorConstant; + if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget)) + return VectorCst; unsigned EVTBits = EltVT.getSizeInBits(); APInt UndefMask = APInt::getZero(NumElems); @@ -8747,13 +8744,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } } - if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG)) + if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG)) return AddSub; - if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) + if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG)) return HorizontalOp; - if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG)) + if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG)) return Broadcast; - if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG)) + if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG)) return BitOp; unsigned NumZero = ZeroMask.popcount(); @@ -8901,8 +8898,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (IsAllConstants) return SDValue(); - if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget)) - return V; + if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget)) + return V; // See if we can use a vector load to get all of the elements. { -- cgit v1.1 From 8e707f8444692762b35fde3e94bbcb02686272a5 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Thu, 8 Feb 2024 12:33:43 +0000 Subject: [Flang][Lower] NFC: Update target-features/target-cpu tests (#80984) Previously, some of these lowering tests inadvertently relied on a default triple not introducing any target features. This caused failures when compiling on a ppc64le-linux-unknown-gnu system. 
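As an illustration (a minimal sketch, not itself part of the patch; the file name and FileCheck prefixes are placeholders), the host-dependent pattern being removed relies on the implicit default triple, while the new pattern pins the target explicitly:
! Host-dependent: the default triple decides which target features appear.
! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
! Host-independent: triple and CPU are stated explicitly.
! RUN: %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu -target-cpu x86-64 %s -o - | FileCheck %s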
This patch updates these lowering tests to always explicitly set the target triple and check that the -target-cpu and -target-features compiler options are processed as expected. --- flang/test/Lower/target-features-amdgcn.f90 | 23 ++++++++++++----------- flang/test/Lower/target-features-x86_64.f90 | 16 +++++++--------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/flang/test/Lower/target-features-amdgcn.f90 b/flang/test/Lower/target-features-amdgcn.f90 index 1f0439b..382230d 100644 --- a/flang/test/Lower/target-features-amdgcn.f90 +++ b/flang/test/Lower/target-features-amdgcn.f90 @@ -1,21 +1,22 @@ ! REQUIRES: amdgpu-registered-target -! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s --check-prefixes=ALL,NONE -! RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefixes=ALL,TRIPLE -! RUN: %flang_fc1 -emit-fir -target-cpu gfx90a %s -o - | FileCheck %s --check-prefixes=ALL,CPU -! RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa -target-cpu gfx90a %s -o - | FileCheck %s --check-prefixes=ALL,BOTH +! RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa -target-cpu gfx90a %s -o - | FileCheck %s --check-prefixes=ALL,CPU +! RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa -target-feature +sse %s -o - | FileCheck %s --check-prefixes=ALL,FEATURE +! RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa -target-cpu gfx90a -target-feature +sse %s -o - | FileCheck %s --check-prefixes=ALL,BOTH ! ALL: module attributes { -! NONE-NOT: fir.target_cpu -! NONE-NOT: fir.target_features - -! TRIPLE-SAME: fir.target_cpu = "generic-hsa" -! TRIPLE-NOT: fir.target_features - ! CPU-SAME: fir.target_cpu = "gfx90a" -! CPU-NOT: fir.target_features +! CPU-SAME: fir.target_features = #llvm.target_features<[ +! CPU-SAME: "+gfx90a-insts" +! CPU-SAME: ]> + +! FEATURE-SAME: fir.target_features = #llvm.target_features<[ +! FEATURE-NOT: "+gfx90a-insts" +! FEATURE-SAME: "+sse" +! FEATURE-SAME: ]> ! BOTH-SAME: fir.target_cpu = "gfx90a" ! BOTH-SAME: fir.target_features = #llvm.target_features<[ ! BOTH-SAME: "+gfx90a-insts" +! BOTH-SAME: "+sse" ! BOTH-SAME: ]> diff --git a/flang/test/Lower/target-features-x86_64.f90 b/flang/test/Lower/target-features-x86_64.f90 index 1b628b6..282c479 100644 --- a/flang/test/Lower/target-features-x86_64.f90 +++ b/flang/test/Lower/target-features-x86_64.f90 @@ -1,19 +1,17 @@ ! REQUIRES: x86-registered-target -! RUN: %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu %s -o - | FileCheck %s --check-prefixes=ALL,NONE ! RUN: %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu -target-cpu x86-64 %s -o - | FileCheck %s --check-prefixes=ALL,CPU ! RUN: %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu -target-feature +sse %s -o - | FileCheck %s --check-prefixes=ALL,FEATURE ! RUN: %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu -target-cpu x86-64 -target-feature +sse %s -o - | FileCheck %s --check-prefixes=ALL,BOTH ! ALL: module attributes { -! NONE-NOT: fir.target_cpu -! NONE-NOT: fir.target_features +! CPU-SAME: fir.target_cpu = "x86-64" -! CPU-SAME: fir.target_cpu = "x86-64" -! CPU-NOT: fir.target_features - -! FEATURE-NOT: fir.target_cpu -! FEATURE-SAME: fir.target_features = #llvm.target_features<["+sse"]> +! FEATURE-SAME: fir.target_features = #llvm.target_features<[ +! FEATURE-SAME: "+sse" +! FEATURE-SAME: ]> ! BOTH-SAME: fir.target_cpu = "x86-64" -! BOTH-SAME: fir.target_features = #llvm.target_features<["+sse"]> +! BOTH-SAME: fir.target_features = #llvm.target_features<[ +! BOTH-SAME: "+sse" +! 
BOTH-SAME: ]> -- cgit v1.1 From 42902d22d1272c1bc10132b06be2d5251b17f225 Mon Sep 17 00:00:00 2001 From: Zain Jaffal Date: Tue, 2 Jan 2024 16:52:59 +0000 Subject: [InstCombine] Add tests for x / sqrt(y / z) with fast-math --- llvm/test/Transforms/InstCombine/fdiv-sqrt.ll | 85 +++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/fdiv-sqrt.ll diff --git a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll new file mode 100644 index 0000000..a8d4b6d --- /dev/null +++ b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +declare double @llvm.sqrt.f64(double) + +define double @sqrt_div_fast(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_fast( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv fast double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv fast double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv fast double %y, %z + %sqrt = call fast double @llvm.sqrt.f64(double %div) + %div1 = fdiv fast double %x, %sqrt + ret double %div1 +} + +define double @sqrt_div(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv double %y, %z + %sqrt = call double @llvm.sqrt.f64(double %div) + %div1 = fdiv double %x, %sqrt + ret double %div1 +} + +define double @sqrt_div_reassoc_arcp(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_reassoc_arcp( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv reassoc arcp double %y, %z + %sqrt = call reassoc arcp double @llvm.sqrt.f64(double %div) + %div1 = fdiv reassoc arcp double %x, %sqrt + ret double %div1 +} + +declare void @use(double) +define double @sqrt_div_fast_multiple_uses_1(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_fast_multiple_uses_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv fast double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: call void @use(double [[DIV]]) +; CHECK-NEXT: [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv fast double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv fast double %y, %z + call void @use(double %div) + %sqrt = call fast double @llvm.sqrt.f64(double %div) + %div1 = fdiv fast double %x, %sqrt + ret double %div1 +} + +define double @sqrt_div_fast_multiple_uses_2(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_fast_multiple_uses_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv fast double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: call void @use(double [[SQRT]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv fast double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv fast double 
%y, %z + %sqrt = call fast double @llvm.sqrt.f64(double %div) + call void @use(double %sqrt) + %div1 = fdiv fast double %x, %sqrt + ret double %div1 +} + -- cgit v1.1 From e50189b0fdf382e3e0d5fc5e58fe81a78d0de7c8 Mon Sep 17 00:00:00 2001 From: Zain Jaffal Date: Sat, 6 Jan 2024 17:31:48 +0000 Subject: [InstCombine] Add additional tests for fdiv-sqrt Add more tests where some of the instructions have missing flags. --- llvm/test/Transforms/InstCombine/fdiv-sqrt.ll | 96 ++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 3 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll index a8d4b6d..346271b 100644 --- a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll +++ b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll @@ -42,9 +42,99 @@ define double @sqrt_div_reassoc_arcp(double %x, double %y, double %z) { ; CHECK-NEXT: ret double [[DIV1]] ; entry: - %div = fdiv reassoc arcp double %y, %z - %sqrt = call reassoc arcp double @llvm.sqrt.f64(double %div) - %div1 = fdiv reassoc arcp double %x, %sqrt + %div = fdiv arcp reassoc double %y, %z + %sqrt = call arcp reassoc double @llvm.sqrt.f64(double %div) + %div1 = fdiv arcp reassoc double %x, %sqrt + ret double %div1 +} + +define double @sqrt_div_reassoc_missing(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_reassoc_missing( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv arcp double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv arcp double %y, %z + %sqrt = call arcp reassoc double @llvm.sqrt.f64(double %div) + %div1 = fdiv arcp reassoc double %x, %sqrt + ret double %div1 +} + +define double @sqrt_div_reassoc_missing2(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_reassoc_missing2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call arcp double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv arcp reassoc double %y, %z + %sqrt = call arcp double @llvm.sqrt.f64(double %div) + %div1 = fdiv arcp reassoc double %x, %sqrt + ret double %div1 +} + +define double @sqrt_div_reassoc_missing3(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_reassoc_missing3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv arcp double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv arcp reassoc double %y, %z + %sqrt = call arcp reassoc double @llvm.sqrt.f64(double %div) + %div1 = fdiv arcp double %x, %sqrt + ret double %div1 +} + +define double @sqrt_div_arcp_missing(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_arcp_missing( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv reassoc double %y, %z + %sqrt = call arcp reassoc double @llvm.sqrt.f64(double %div) + %div1 = fdiv arcp reassoc double %x, %sqrt + ret 
double %div1 +} + +define double @sqrt_div_arcp_missing2(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_arcp_missing2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv arcp reassoc double %y, %z + %sqrt = call reassoc double @llvm.sqrt.f64(double %div) + %div1 = fdiv arcp reassoc double %x, %sqrt + ret double %div1 +} + +define double @sqrt_div_arcp_missing3(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_arcp_missing3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv arcp reassoc double %y, %z + %sqrt = call arcp reassoc double @llvm.sqrt.f64(double %div) + %div1 = fdiv reassoc double %x, %sqrt ret double %div1 } -- cgit v1.1 From 4b72c5e8277f8688f7ce0bc953f9f3ea54420358 Mon Sep 17 00:00:00 2001 From: whisperity Date: Thu, 8 Feb 2024 13:37:55 +0100 Subject: [clang][Sema] Subclass `-Wshorten-64-to-32` under `-Wimplicit-int-conversion` (#80814) Although "implicit int conversions" is supposed to be a superset containing the more specific "64-to-32" case, the two groups were previously disjoint and were enabled together only by the much larger `-Wconversion`. --- clang/docs/ReleaseNotes.rst | 7 ++++++- clang/include/clang/Basic/DiagnosticGroups.td | 6 +++--- clang/test/Sema/conversion-64-32.c | 6 +++++- .../conversion-implicit-int-includes-64-to-32.c | 21 +++++++++++++++++++++ 4 files changed, 35 insertions(+), 5 deletions(-) create mode 100644 clang/test/Sema/conversion-implicit-int-includes-64-to-32.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 52a48c7..e158284 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -149,7 +149,12 @@ Improvements to Clang's diagnostics prints. - Clang now diagnoses member template declarations with multiple declarators. -- Clang now diagnoses use of the ``template`` keyword after declarative nested name specifiers. + +- Clang now diagnoses use of the ``template`` keyword after declarative nested + name specifiers. + +- The ``-Wshorten-64-to-32`` diagnostic is now grouped under ``-Wimplicit-int-conversion`` instead + of ``-Wconversion``. Fixes `#69444 `_. 
Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 6765721..975eca0a 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -108,8 +108,10 @@ def EnumConversion : DiagGroup<"enum-conversion", EnumCompareConditional]>; def ObjCSignedCharBoolImplicitIntConversion : DiagGroup<"objc-signed-char-bool-implicit-int-conversion">; +def Shorten64To32 : DiagGroup<"shorten-64-to-32">; def ImplicitIntConversion : DiagGroup<"implicit-int-conversion", - [ObjCSignedCharBoolImplicitIntConversion]>; + [Shorten64To32, + ObjCSignedCharBoolImplicitIntConversion]>; def ImplicitConstIntFloatConversion : DiagGroup<"implicit-const-int-float-conversion">; def ImplicitIntFloatConversion : DiagGroup<"implicit-int-float-conversion", [ImplicitConstIntFloatConversion]>; @@ -631,7 +633,6 @@ def Shadow : DiagGroup<"shadow", [ShadowFieldInConstructorModified, def ShadowAll : DiagGroup<"shadow-all", [Shadow, ShadowFieldInConstructor, ShadowUncapturedLocal, ShadowField]>; -def Shorten64To32 : DiagGroup<"shorten-64-to-32">; def : DiagGroup<"sign-promo">; def SignCompare : DiagGroup<"sign-compare">; def SwitchDefault : DiagGroup<"switch-default">; @@ -942,7 +943,6 @@ def Conversion : DiagGroup<"conversion", EnumConversion, BitFieldEnumConversion, FloatConversion, - Shorten64To32, IntConversion, ImplicitIntConversion, ImplicitFloatConversion, diff --git a/clang/test/Sema/conversion-64-32.c b/clang/test/Sema/conversion-64-32.c index dc417ed..c172dd1 100644 --- a/clang/test/Sema/conversion-64-32.c +++ b/clang/test/Sema/conversion-64-32.c @@ -9,9 +9,13 @@ typedef long long long2 __attribute__((__vector_size__(16))); int4 test1(long2 a) { int4 v127 = a; // no warning. - return v127; + return v127; } int test2(long v) { return v / 2; // expected-warning {{implicit conversion loses integer precision: 'long' to 'int'}} } + +char test3(short s) { + return s * 2; // no warning. +} diff --git a/clang/test/Sema/conversion-implicit-int-includes-64-to-32.c b/clang/test/Sema/conversion-implicit-int-includes-64-to-32.c new file mode 100644 index 0000000..e22ccbe --- /dev/null +++ b/clang/test/Sema/conversion-implicit-int-includes-64-to-32.c @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -Wimplicit-int-conversion -triple x86_64-apple-darwin %s + +int test0(long v) { + return v; // expected-warning {{implicit conversion loses integer precision}} +} + +typedef int int4 __attribute__ ((vector_size(16))); +typedef long long long2 __attribute__((__vector_size__(16))); + +int4 test1(long2 a) { + int4 v127 = a; // no warning. + return v127; +} + +int test2(long v) { + return v / 2; // expected-warning {{implicit conversion loses integer precision: 'long' to 'int'}} +} + +char test3(short s) { + return s * 2; // expected-warning {{implicit conversion loses integer precision: 'int' to 'char'}} +} -- cgit v1.1 From 448fe73428a810eb67617e07c23510033a21de5a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 12:34:50 +0000 Subject: [X86] Add X86::getVectorRegisterWidth helper. NFC. Replaces internal helper used by addConstantComments to allow reuse in a future patch. 
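For context, a minimal sketch of the intended call pattern, mirroring the existing use sites in addConstantComments (the operand index is illustrative): // Width in bits (128, 256 or 512) of the vector register class of operand 0. unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); As the implementation below shows, the helper maps the VR128/VR128X, VR256/VR256X and VR512 register classes to 128, 256 and 512 bits respectively, and hits llvm_unreachable for any other register class.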
--- llvm/lib/Target/X86/X86InstrInfo.cpp | 12 ++++++++++++ llvm/lib/Target/X86/X86InstrInfo.h | 3 +++ llvm/lib/Target/X86/X86MCInstLower.cpp | 24 ++++++------------------ 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 0d30a31..0f21880 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -3423,6 +3423,18 @@ unsigned X86::getSwappedVCMPImm(unsigned Imm) { return Imm; } +unsigned X86::getVectorRegisterWidth(const MCOperandInfo &Info) { + if (Info.RegClass == X86::VR128RegClassID || + Info.RegClass == X86::VR128XRegClassID) + return 128; + if (Info.RegClass == X86::VR256RegClassID || + Info.RegClass == X86::VR256XRegClassID) + return 256; + if (Info.RegClass == X86::VR512RegClassID) + return 512; + llvm_unreachable("Unknown register class!"); +} + /// Return true if the Reg is X87 register. static bool isX87Reg(unsigned Reg) { return (Reg == X86::FPCW || Reg == X86::FPSW || diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index ee0d2d0..996a24d 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -77,6 +77,9 @@ unsigned getSwappedVPCOMImm(unsigned Imm); /// Get the VCMP immediate if the opcodes are swapped. unsigned getSwappedVCMPImm(unsigned Imm); +/// Get the width of the vector register operand. +unsigned getVectorRegisterWidth(const MCOperandInfo &Info); + /// Check if the instruction is X87 instruction. bool isX87Instruction(MachineInstr &MI); diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index b336ba3..d3b7d97 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1388,18 +1388,6 @@ PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) { return MBBI; } -static unsigned getRegisterWidth(const MCOperandInfo &Info) { - if (Info.RegClass == X86::VR128RegClassID || - Info.RegClass == X86::VR128XRegClassID) - return 128; - if (Info.RegClass == X86::VR256RegClassID || - Info.RegClass == X86::VR256XRegClassID) - return 256; - if (Info.RegClass == X86::VR512RegClassID) - return 512; - llvm_unreachable("Unknown register class!"); -} - static unsigned getSrcIdx(const MachineInstr* MI, unsigned SrcIdx) { if (X86II::isKMasked(MI->getDesc().TSFlags)) { // Skip mask operand. 
@@ -1648,7 +1636,7 @@ static void printZeroExtend(const MachineInstr *MI, MCStreamer &OutStreamer, CS << " = "; SmallVector Mask; - unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]); + unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); assert((Width % DstEltBits) == 0 && (DstEltBits % SrcEltBits) == 0 && "Illegal extension ratio"); DecodeZeroExtendMask(SrcEltBits, DstEltBits, Width / DstEltBits, false, Mask); @@ -1753,7 +1741,7 @@ static void addConstantComments(const MachineInstr *MI, case X86::VPSHUFBZrmkz: { unsigned SrcIdx = getSrcIdx(MI, 1); if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) { - unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]); + unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); SmallVector Mask; DecodePSHUFBMask(C, Width, Mask); if (!Mask.empty()) @@ -1775,7 +1763,7 @@ case X86::VPERMILPSZrmkz: { unsigned SrcIdx = getSrcIdx(MI, 1); if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) { - unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]); + unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); SmallVector Mask; DecodeVPERMILPMask(C, 32, Width, Mask); if (!Mask.empty()) @@ -1796,7 +1784,7 @@ case X86::VPERMILPDZrmkz: { unsigned SrcIdx = getSrcIdx(MI, 1); if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) { - unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]); + unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); SmallVector Mask; DecodeVPERMILPMask(C, 64, Width, Mask); if (!Mask.empty()) @@ -1824,7 +1812,7 @@ } if (auto *C = X86::getConstantFromPool(*MI, 3)) { - unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]); + unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); SmallVector Mask; DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask); if (!Mask.empty()) @@ -1835,7 +1823,7 @@ case X86::VPPERMrrm: { if (auto *C = X86::getConstantFromPool(*MI, 3)) { - unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]); + unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); SmallVector Mask; DecodeVPPERMMask(C, Width, Mask); if (!Mask.empty()) -- cgit v1.1 From 6ea76c1328e04799981c78b3661a175a2ba47cec Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 8 Feb 2024 12:41:55 +0000 Subject: [NFCI][RemoveDIs] Build LLVM with RemoveDIs iterators This commit flips a bit to make LLVM build with "debuginfo iterators", causing BasicBlock::iterator to contain a bit that's used for debug-info purposes. More about this can be read on Discourse [0], but the runtime impact of this should be negligible (iterators usually end up being inlined), and there should be no change to LLVM's behaviour as a result of this commit. What this does mean though, is that roughly 400 debug-info tests where we've added "--try-experimental-debuginfo-iterators" to RUN lines are going to start operating in RemoveDIs mode. These are already tested on the new-debug-iterators buildbot [1], and I've even tested with asan, so I'm not _expecting_ any turbulence. 
[0] https://discourse.llvm.org/t/rfc-instruction-api-changes-needed-to-eliminate-debug-intrinsics-from-ir/68939 [1] https://lab.llvm.org/buildbot/#/builders/275 --- llvm/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 485c76b..c31980a 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -654,7 +654,7 @@ option(LLVM_EXTERNALIZE_DEBUGINFO "Generate dSYM files and strip executables and libraries (Darwin Only)" OFF) option(LLVM_EXPERIMENTAL_DEBUGINFO_ITERATORS - "Add extra Booleans to ilist_iterators to communicate facts for debug-info" OFF) + "Add extra Booleans to ilist_iterators to communicate facts for debug-info" ON) set(LLVM_CODESIGNING_IDENTITY "" CACHE STRING "Sign executables and dylibs with the given identity or skip if empty (Darwin Only)") -- cgit v1.1 From ec1fcb381d884ca53e2e0dd4075f946c8f002de2 Mon Sep 17 00:00:00 2001 From: agozillon Date: Thu, 8 Feb 2024 14:03:39 +0100 Subject: [Flang][bbc] Prevent bbc -emit-fir command invoking OpenMP passes twice (#80927) Currently, when the bbc tool is invoked with the emit-fir command, the pass pipeline is invoked twice for verification, causing the previously added OpenMP pass pipeline to be invoked multiple times. This change prevents that by using a separate pass manager and running it immediately when the OpenMP passes need to be executed. --- flang/tools/bbc/bbc.cpp | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index 9d5caf5..c9358c8 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -256,6 +256,22 @@ createTargetMachine(llvm::StringRef targetTriple, std::string &error) { /*Reloc::Model=*/std::nullopt)}; } +/// Build and execute the OpenMPFIRPassPipeline with its own instance +/// of the pass manager, allowing it to be invoked as soon as it's +/// required without impacting the main pass pipeline that may be invoked +/// more than once for verification. +static mlir::LogicalResult runOpenMPPasses(mlir::ModuleOp mlirModule) { + mlir::PassManager pm(mlirModule->getName(), + mlir::OpPassManager::Nesting::Implicit); + fir::createOpenMPFIRPassPipeline(pm, enableOpenMPDevice); + (void)mlir::applyPassManagerCLOptions(pm); + if (mlir::failed(pm.run(mlirModule))) { + llvm::errs() << "FATAL: failed to correctly apply OpenMP pass pipeline"; + return mlir::failure(); + } + return mlir::success(); +} + //===----------------------------------------------------------------------===// // Translate Fortran input to FIR, a dialect of MLIR. //===----------------------------------------------------------------------===// @@ -369,14 +385,16 @@ static mlir::LogicalResult convertFortranSourceToMLIR( "could not open output file ") << outputName; + // WARNING: This pipeline must be run immediately after the lowering to + // ensure that the FIR is correct with respect to OpenMP operations/ + // attributes. + if (enableOpenMP) + if (mlir::failed(runOpenMPPasses(mlirModule))) + return mlir::failure(); + // Otherwise run the default passes. mlir::PassManager pm(mlirModule->getName(), mlir::OpPassManager::Nesting::Implicit); - if (enableOpenMP) // WARNING: This pipeline must be run immediately after the lowering to // ensure that the FIR is correct with respect to OpenMP operations/ // attributes. 
- fir::createOpenMPFIRPassPipeline(pm, enableOpenMPDevice); pm.enableVerifier(/*verifyPasses=*/true); (void)mlir::applyPassManagerCLOptions(pm); if (passPipeline.hasAnyOccurrences()) { -- cgit v1.1 From 72f04fa0734f8559ad515f507a4a3ce3f461f196 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 8 Feb 2024 15:28:46 +0200 Subject: [OpenMP] [cmake] Don't use -fno-semantic-interposition on Windows (#81113) This was added in 4b7beab4187ab0766c3d7b272511d5751431a8da. When the flag was added implicitly elsewhere, it was added via llvm/cmake/modules/HandleLLVMOptions.cmake, where it wasn't added on Windows/Cygwin targets. This avoids one warning per object file in OpenMP. --- openmp/cmake/HandleOpenMPOptions.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/openmp/cmake/HandleOpenMPOptions.cmake b/openmp/cmake/HandleOpenMPOptions.cmake index 7134620..9387d9b 100644 --- a/openmp/cmake/HandleOpenMPOptions.cmake +++ b/openmp/cmake/HandleOpenMPOptions.cmake @@ -46,7 +46,11 @@ append_if(OPENMP_HAVE_WEXTRA_FLAG "-Wno-extra" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_WPEDANTIC_FLAG "-Wno-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_WMAYBE_UNINITIALIZED_FLAG "-Wno-maybe-uninitialized" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) -append_if(OPENMP_HAVE_NO_SEMANTIC_INTERPOSITION "-fno-semantic-interposition" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +if (NOT (WIN32 OR CYGWIN)) + # This flag is not relevant on Windows; the flag is accepted, but produces warnings + # about argument unused during compilation. + append_if(OPENMP_HAVE_NO_SEMANTIC_INTERPOSITION "-fno-semantic-interposition" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +endif() append_if(OPENMP_HAVE_FUNCTION_SECTIONS "-ffunction-section" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_DATA_SECTIONS "-fdata-sections" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) -- cgit v1.1 From 8697bbe2d4aed109520e83c6beab52196ec5b702 Mon Sep 17 00:00:00 2001 From: Mariya Podchishchaeva Date: Thu, 8 Feb 2024 16:31:57 +0300 Subject: [clang] Use CPlusPlus language option instead of Bool (#80975) As pointed out in https://github.com/llvm/llvm-project/pull/80724, we should not be checking `getLangOpts().Bool` when determining anything related to logical operators, since it only indicates that the bool keyword is present, not what semantics logical operators have. As a side effect, a missing `-Wpointer-bool-conversion` warning in OpenCL C was restored since, like C23, OpenCL C has the bool keyword but logical operators still return int. --- clang/lib/Sema/SemaChecking.cpp | 8 ++++---- clang/test/SemaOpenCL/operators.cl | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index c775ff2..f8b73c7 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -16129,10 +16129,10 @@ static void CheckConditionalOperator(Sema &S, AbstractConditionalOperator *E, /// Check conversion of given expression to boolean. /// Input argument E is a logical expression. static void CheckBoolLikeConversion(Sema &S, Expr *E, SourceLocation CC) { - // While C23 does have bool as a keyword, we still need to run the bool-like - // conversion checks as bools are still not used as the return type from - // "boolean" operators or as the input type for conditional operators. 
- if (S.getLangOpts().Bool && !S.getLangOpts().C23) + // Run the bool-like conversion checks only for C since there bools are + // still not used as the return type from "boolean" operators or as the input + // type for conditional operators. + if (S.getLangOpts().CPlusPlus) return; if (E->IgnoreParenImpCasts()->getType()->isAtomicType()) return; diff --git a/clang/test/SemaOpenCL/operators.cl b/clang/test/SemaOpenCL/operators.cl index cf359ac..76a7692 100644 --- a/clang/test/SemaOpenCL/operators.cl +++ b/clang/test/SemaOpenCL/operators.cl @@ -118,6 +118,6 @@ kernel void pointer_ops(){ bool b = !p; b = p==0; int i; - b = !&i; + b = !&i; // expected-warning {{address of 'i' will always evaluate to 'true'}} b = &i==(int *)1; } -- cgit v1.1 From fe8a62c46365f5ef0c15df2265bbf0026d0a4047 Mon Sep 17 00:00:00 2001 From: Uday Bondhugula Date: Thu, 8 Feb 2024 19:16:29 +0530 Subject: [MLIR] Fix crash in AffineMap::replace for zero result maps (#80930) Fix obvious bug in AffineMap::replace for the case of zero result maps. Extend/complete inferExprsFromList to work with empty expression lists. --- mlir/include/mlir/Dialect/Affine/IR/AffineOps.td | 3 ++- .../mlir/Dialect/Utils/StructuredOpsUtils.h | 4 +++- mlir/include/mlir/IR/AffineMap.h | 6 ++++-- mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp | 14 ++++++++----- mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp | 8 ++++++-- mlir/lib/Dialect/Affine/IR/AffineOps.cpp | 8 ++++++-- mlir/lib/Dialect/Linalg/Transforms/Split.cpp | 4 +++- mlir/lib/Dialect/Linalg/Utils/Utils.cpp | 17 +++++++-------- .../SparseTensor/Transforms/SparseGPUCodegen.cpp | 4 +++- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 7 ++++--- .../Vector/Transforms/LowerVectorContract.cpp | 4 +++- .../VectorTransferSplitRewritePatterns.cpp | 2 +- .../Dialect/Vector/Transforms/VectorTransforms.cpp | 9 +++++--- mlir/lib/IR/AffineMap.cpp | 24 ++++++++++++++-------- mlir/lib/IR/BuiltinTypes.cpp | 2 +- mlir/unittests/IR/AffineMapTest.cpp | 23 +++++++++++++++++++++ mlir/unittests/IR/CMakeLists.txt | 1 + 17 files changed, 99 insertions(+), 41 deletions(-) create mode 100644 mlir/unittests/IR/AffineMapTest.cpp diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td index 225e4d3..edcfcfd 100644 --- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td @@ -67,7 +67,8 @@ def AffineApplyOp : Affine_Op<"apply", [Pure]> { OpBuilder<(ins "ArrayRef ":$exprList,"ValueRange":$mapOperands), [{ build($_builder, $_state, $_builder.getIndexType(), - AffineMap::inferFromExprList(exprList).front(), mapOperands); + AffineMap::inferFromExprList(exprList, $_builder.getContext()) + .front(), mapOperands); }]> ]; diff --git a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h index 134c556..929a2a7 100644 --- a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h +++ b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h @@ -121,7 +121,9 @@ public: } bool layout(MapList l) { - auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); }; + auto infer = [&](MapList m) { + return AffineMap::inferFromExprList(m, ctx); + }; return maps == infer(l); } diff --git a/mlir/include/mlir/IR/AffineMap.h b/mlir/include/mlir/IR/AffineMap.h index cd751af..cce1412 100644 --- a/mlir/include/mlir/IR/AffineMap.h +++ b/mlir/include/mlir/IR/AffineMap.h @@ -122,9 +122,11 @@ public: /// `exprs.size()`, as many dims as the largest dim in `exprs` and as many /// symbols as 
the largest symbol in `exprs`. static SmallVector - inferFromExprList(ArrayRef> exprsList); + inferFromExprList(ArrayRef> exprsList, + MLIRContext *context); static SmallVector - inferFromExprList(ArrayRef> exprsList); + inferFromExprList(ArrayRef> exprsList, + MLIRContext *context); MLIRContext *getContext() const; diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index 1eb5678..f4f6dadf 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -2010,7 +2010,8 @@ public: } bool didEncounterError = false; - auto maps = AffineMap::inferFromExprList({srcExprs, dstExprs, dstExprs}); + auto maps = AffineMap::inferFromExprList({srcExprs, dstExprs, dstExprs}, + rewriter.getContext()); auto linalgOp = rewriter.create( loc, ArrayRef({resultTy, resultMaxTy}), input, ValueRange({filledTensorIdx, filledTensorMax}), maps, iteratorTypes, @@ -2351,9 +2352,11 @@ struct RFFT2dConverter final : public OpRewritePattern { createZeroTensor(rewriter, loc, outputType, dynamicSizes)}; // Indexing maps for input and output tensors - auto indexingMaps = AffineMap::inferFromExprList(llvm::ArrayRef{ - affineDimsExpr(rewriter, 0, 3, 4), affineDimsExpr(rewriter, 0, 1, 2), - affineDimsExpr(rewriter, 0, 1, 2)}); + auto indexingMaps = AffineMap::inferFromExprList( + llvm::ArrayRef{affineDimsExpr(rewriter, 0, 3, 4), + affineDimsExpr(rewriter, 0, 1, 2), + affineDimsExpr(rewriter, 0, 1, 2)}, + rewriter.getContext()); // Width and height dimensions of the original input. auto dimH = rewriter.createOrFold(loc, input, 1); @@ -2463,7 +2466,8 @@ struct FFT2dConverter final : OpRewritePattern { ArrayRef{RFFT2dConverter::affineDimsExpr(rewriter, 0, 3, 4), RFFT2dConverter::affineDimsExpr(rewriter, 0, 3, 4), RFFT2dConverter::affineDimsExpr(rewriter, 0, 1, 2), - RFFT2dConverter::affineDimsExpr(rewriter, 0, 1, 2)}); + RFFT2dConverter::affineDimsExpr(rewriter, 0, 1, 2)}, + rewriter.getContext()); // Width and height dimensions of the original input. auto dimH = rewriter.createOrFold(loc, input_real, 1); diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp index b63baf3..85fb8a5 100644 --- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp +++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp @@ -77,7 +77,9 @@ static void getXferIndices(RewriterBase &rewriter, TransferOpType xferOp, static bool contractSupportsMMAMatrixType(vector::ContractionOp contract, bool useNvGpu) { using MapList = ArrayRef>; - auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); }; + auto infer = [&](MapList m) { + return AffineMap::inferFromExprList(m, contract.getContext()); + }; AffineExpr m, n, k; bindDims(contract.getContext(), m, n, k); auto iteratorTypes = contract.getIteratorTypes().getValue(); @@ -394,7 +396,9 @@ struct PrepareContractToGPUMMA // Set up the parallel/reduction structure in right form. 
using MapList = ArrayRef>; - auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); }; + auto infer = [&](MapList m) { + return AffineMap::inferFromExprList(m, op.getContext()); + }; AffineExpr m, n, k; bindDims(rewriter.getContext(), m, n, k); static constexpr std::array perm = {1, 0}; diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index adb56ab..c4b1319 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -1145,7 +1145,9 @@ AffineApplyOp mlir::affine::makeComposedAffineApply(OpBuilder &b, Location loc, AffineExpr e, ArrayRef operands) { return makeComposedAffineApply( - b, loc, AffineMap::inferFromExprList(ArrayRef{e}).front(), + b, loc, + AffineMap::inferFromExprList(ArrayRef{e}, b.getContext()) + .front(), operands); } @@ -1220,7 +1222,9 @@ mlir::affine::makeComposedFoldedAffineApply(OpBuilder &b, Location loc, AffineExpr expr, ArrayRef operands) { return makeComposedFoldedAffineApply( - b, loc, AffineMap::inferFromExprList(ArrayRef{expr}).front(), + b, loc, + AffineMap::inferFromExprList(ArrayRef{expr}, b.getContext()) + .front(), operands); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Split.cpp b/mlir/lib/Dialect/Linalg/Transforms/Split.cpp index 0174db4..47b5fcd 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Split.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Split.cpp @@ -83,7 +83,9 @@ linalg::splitOp(RewriterBase &rewriter, TilingInterface op, unsigned dimension, bindDims(rewriter.getContext(), d0, d1, d2); OpFoldResult minSplitPoint = affine::makeComposedFoldedAffineMin( rewriter, op.getLoc(), - AffineMap::inferFromExprList(ArrayRef{d0, d1 + d2}).front(), + AffineMap::inferFromExprList(ArrayRef{d0, d1 + d2}, + rewriter.getContext()) + .front(), {splitPoint, offsets[dimension], sizes[dimension]}); // Compute the size of the second part. Return early if the second part would diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index 986b5f3..5d220c6 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -670,7 +670,8 @@ computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile, << ": make sure in bound with affine.min\n"); AffineExpr dim0, dim1, dim2; - bindDims(builder.getContext(), dim0, dim1, dim2); + MLIRContext *context = builder.getContext(); + bindDims(context, dim0, dim1, dim2); // Get the dimension size for this dimension. We need to first calculate // the max index and then plus one. This is important because for @@ -678,12 +679,12 @@ computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile, // form `(d0 * s0 + d1)`, where `d0`/`d1 is an output/filter window // dimension and `s0` is stride. Directly use the dimension size of // output/filer window dimensions will cause incorrect calculation. 
- AffineMap minusOneMap = - AffineMap::inferFromExprList({ArrayRef{dim0 - 1}}) - .front(); - AffineMap plusOneMap = - AffineMap::inferFromExprList({ArrayRef{dim0 + 1}}) - .front(); + AffineMap minusOneMap = AffineMap::inferFromExprList( + {ArrayRef{dim0 - 1}}, context) + .front(); + AffineMap plusOneMap = AffineMap::inferFromExprList( + {ArrayRef{dim0 + 1}}, context) + .front(); SmallVector maxIndices = llvm::to_vector(llvm::map_range(ubs, [&](OpFoldResult ub) { return makeComposedFoldedAffineApply(rewriter, loc, minusOneMap, @@ -696,7 +697,7 @@ computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile, // Compute min(dim - offset, size) to avoid out-of-bounds accesses. AffineMap minMap = AffineMap::inferFromExprList( - {ArrayRef{dim1 - dim2, dim0}}) + {ArrayRef{dim1 - dim2, dim0}}, context) .front(); size = makeComposedFoldedAffineMin(rewriter, loc, minMap, {size, d, offset}); diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp index 87a37a7..dd3af9d 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp @@ -1263,7 +1263,9 @@ struct LinalgOpRewriter : public OpRewritePattern { SmallVector maps = op.getIndexingMapsArray(); using MapList = ArrayRef>; - auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); }; + auto infer = [&](MapList m) { + return AffineMap::inferFromExprList(m, op.getContext()); + }; AffineExpr i, j, k; bindDims(getContext(), i, j, k); diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 4523544..5be6a62 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -675,9 +675,10 @@ void vector::ContractionOp::build(OpBuilder &builder, OperationState &result, ArrayRef iteratorTypes) { result.addOperands({lhs, rhs, acc}); result.addTypes(acc.getType()); - result.addAttribute(getIndexingMapsAttrName(result.name), - builder.getAffineMapArrayAttr( - AffineMap::inferFromExprList(indexingExprs))); + result.addAttribute( + getIndexingMapsAttrName(result.name), + builder.getAffineMapArrayAttr( + AffineMap::inferFromExprList(indexingExprs, builder.getContext()))); result.addAttribute( getIteratorTypesAttrName(result.name), builder.getArrayAttr(llvm::to_vector(llvm::map_range( diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp index 446eb85..0eaf9f7 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp @@ -695,7 +695,9 @@ ContractionOpToDotLowering::matchAndRewrite(vector::ContractionOp op, Value lhs = op.getLhs(), rhs = op.getRhs(); using MapList = ArrayRef>; - auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); }; + auto infer = [&](MapList m) { + return AffineMap::inferFromExprList(m, op.getContext()); + }; AffineExpr m, n, k; bindDims(rewriter.getContext(), m, n, k); SmallVector maps = op.getIndexingMapsArray(); diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp index f1a2716..b844c2b 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp @@ -209,7 +209,7 @@ 
createSubViewIntersection(RewriterBase &b, VectorTransferOpInterface xferOp, AffineExpr i, j, k; bindDims(xferOp.getContext(), i, j, k); SmallVector<AffineMap, 4> maps = - AffineMap::inferFromExprList(MapList{{i - j, k}}); + AffineMap::inferFromExprList(MapList{{i - j, k}}, b.getContext()); // affine_min(%dimMemRef - %index, %dimAlloc) Value affineMin = b.create<affine::AffineMinOp>( loc, index.getType(), maps[0], ValueRange{dimMemRef, index, dimAlloc}); diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp index 4034dc4..53ae138 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -160,8 +160,9 @@ struct MultiReduceToContract iteratorTypes.push_back(vector::IteratorType::reduction); } } - auto dstMap = AffineMap::get(/*dimCount=*/reductionMask.size(), - /*symCount=*/0, exprs, reduceOp.getContext()); + auto dstMap = + AffineMap::get(/*dimCount=*/reductionMask.size(), + /*symbolCount=*/0, exprs, reduceOp.getContext()); rewriter.replaceOpWithNewOp<vector::ContractionOp>( reduceOp, mulOp->getOperand(0), mulOp->getOperand(1), reduceOp.getAcc(), rewriter.getAffineMapArrayAttr({srcMap, srcMap, dstMap}), @@ -1399,7 +1400,9 @@ struct CanonicalizeContractMatmulToMMT final // Set up the parallel/reduction structure in right form. using MapList = ArrayRef<ArrayRef<AffineExpr>>; - auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); }; + auto infer = [&](MapList m) { + return AffineMap::inferFromExprList(m, op.getContext()); + }; AffineExpr m; AffineExpr n; AffineExpr k; diff --git a/mlir/lib/IR/AffineMap.cpp b/mlir/lib/IR/AffineMap.cpp index c280462..4aa0d4f 100644 --- a/mlir/lib/IR/AffineMap.cpp +++ b/mlir/lib/IR/AffineMap.cpp @@ -272,12 +272,16 @@ AffineMap AffineMap::getMultiDimMapWithTargets(unsigned numDims, return result; } +/// Creates an affine map for each list of AffineExpr's in `exprsList` +/// while inferring the right number of dimensional and symbolic inputs needed +/// based on the maximum dimensional and symbolic identifier appearing in the +/// expressions.
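+/// (Editor's illustrative note, not part of the original patch: with dims
+/// d0, d1 and symbol s0, passing {{d0 + s0}, {d1}} yields two maps, each
+/// with two dims and one symbol, since the largest dim (d1) and the largest
+/// symbol (s0) are taken across all the lists.)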
template static SmallVector -inferFromExprList(ArrayRef exprsList) { - assert(!exprsList.empty()); - assert(!exprsList[0].empty()); - auto context = exprsList[0][0].getContext(); +inferFromExprList(ArrayRef exprsList, + MLIRContext *context) { + if (exprsList.empty()) + return {}; int64_t maxDim = -1, maxSym = -1; getMaxDimAndSymbol(exprsList, maxDim, maxSym); SmallVector maps; @@ -289,13 +293,15 @@ inferFromExprList(ArrayRef exprsList) { } SmallVector -AffineMap::inferFromExprList(ArrayRef> exprsList) { - return ::inferFromExprList(exprsList); +AffineMap::inferFromExprList(ArrayRef> exprsList, + MLIRContext *context) { + return ::inferFromExprList(exprsList, context); } SmallVector -AffineMap::inferFromExprList(ArrayRef> exprsList) { - return ::inferFromExprList(exprsList); +AffineMap::inferFromExprList(ArrayRef> exprsList, + MLIRContext *context) { + return ::inferFromExprList(exprsList, context); } uint64_t AffineMap::getLargestKnownDivisorOfMapExprs() { @@ -521,7 +527,7 @@ AffineMap::replace(const DenseMap &map) const { newResults.reserve(getNumResults()); for (AffineExpr e : getResults()) newResults.push_back(e.replace(map)); - return AffineMap::inferFromExprList(newResults).front(); + return AffineMap::inferFromExprList(newResults, getContext()).front(); } AffineMap AffineMap::dropResults(const llvm::SmallBitVector &positions) const { diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp index 9b8ee3d..1794b38 100644 --- a/mlir/lib/IR/BuiltinTypes.cpp +++ b/mlir/lib/IR/BuiltinTypes.cpp @@ -921,7 +921,7 @@ AffineExpr mlir::makeCanonicalStridedLayoutExpr(ArrayRef sizes, return getAffineConstantExpr(0, context); assert(!exprs.empty() && "expected exprs"); - auto maps = AffineMap::inferFromExprList(exprs); + auto maps = AffineMap::inferFromExprList(exprs, context); assert(!maps.empty() && "Expected one non-empty map"); unsigned numDims = maps[0].getNumDims(), nSymbols = maps[0].getNumSymbols(); diff --git a/mlir/unittests/IR/AffineMapTest.cpp b/mlir/unittests/IR/AffineMapTest.cpp new file mode 100644 index 0000000..081afad --- /dev/null +++ b/mlir/unittests/IR/AffineMapTest.cpp @@ -0,0 +1,23 @@ +//===- AffineMapTest.cpp - unit tests for affine map API ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/Builders.h" +#include "gtest/gtest.h" + +using namespace mlir; + +// Test AffineMap replace API for the zero result case. 
+TEST(AffineMapTest, inferMapFromAffineExprs) { + MLIRContext ctx; + OpBuilder b(&ctx); + AffineMap map = b.getEmptyAffineMap(); + DenseMap<AffineExpr, AffineExpr> replacements; + map.replace(replacements); + EXPECT_EQ(map, map); +} diff --git a/mlir/unittests/IR/CMakeLists.txt b/mlir/unittests/IR/CMakeLists.txt index 1ed4686..e7e9c3b 100644 --- a/mlir/unittests/IR/CMakeLists.txt +++ b/mlir/unittests/IR/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_unittest(MLIRIRTests AdaptorTest.cpp + AffineMapTest.cpp AttributeTest.cpp DialectTest.cpp InterfaceTest.cpp -- cgit v1.1 From d63c8bee58b5d4dad9f1c550a342e782e0038f28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 8 Feb 2024 15:29:44 +0100 Subject: [clang][ExprConst] Remove unnecessary cast FD is a FunctionDecl, so no need to cast a FunctionDecl to a CXXMethodDecl just to assign it to a FunctionDecl. --- clang/lib/AST/ExprConstant.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 089bc20..02e153f 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -8006,7 +8006,8 @@ public: assert(CorrespondingCallOpSpecialization && "We must always have a function call operator specialization " "that corresponds to our static invoker specialization"); - FD = cast<CXXMethodDecl>(CorrespondingCallOpSpecialization); + assert(isa<CXXMethodDecl>(CorrespondingCallOpSpecialization)); + FD = CorrespondingCallOpSpecialization; } else FD = LambdaCallOp; } else if (FD->isReplaceableGlobalAllocationFunction()) { -- cgit v1.1 From 3ad63593dac390e320808f3de0e1906c5fa45c8a Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 8 Feb 2024 15:29:32 +0100 Subject: [PatternMatch] Add m_PtrAdd() matcher (NFC) This matches a getelementptr i8 instruction or constant expression, with a given pointer operand and index. --- llvm/include/llvm/IR/PatternMatch.h | 22 ++++++++++++++++++++++ llvm/unittests/IR/PatternMatch.cpp | 22 ++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index 3155e7d..fed5524 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -1614,6 +1614,21 @@ struct m_SplatOrUndefMask { } }; +template <typename PointerOpTy, typename OffsetOpTy> struct PtrAdd_match { + PointerOpTy PointerOp; + OffsetOpTy OffsetOp; + + PtrAdd_match(const PointerOpTy &PointerOp, const OffsetOpTy &OffsetOp) + : PointerOp(PointerOp), OffsetOp(OffsetOp) {} + + template <typename OpTy> bool match(OpTy *V) { + auto *GEP = dyn_cast<GEPOperator>(V); + return GEP && GEP->getSourceElementType()->isIntegerTy(8) && + PointerOp.match(GEP->getPointerOperand()) && + OffsetOp.match(GEP->idx_begin()->get()); + } +}; + /// Matches ShuffleVectorInst independently of mask value.
template <typename V1_t, typename V2_t> inline TwoOps_match<V1_t, V2_t, Instruction::ShuffleVector> @@ -1647,6 +1662,13 @@ inline auto m_GEP(const OperandTypes &...Ops) { return AnyOps_match<Instruction::GetElementPtr, OperandTypes...>(Ops...); } +/// Matches GEP with i8 source element type +template <typename PointerOpTy, typename OffsetOpTy> +inline PtrAdd_match<PointerOpTy, OffsetOpTy> +m_PtrAdd(const PointerOpTy &PointerOp, const OffsetOpTy &OffsetOp) { + return PtrAdd_match<PointerOpTy, OffsetOpTy>(PointerOp, OffsetOp); +} + //===----------------------------------------------------------------------===// // Matchers for CastInst classes // diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp index 885b134..883149c 100644 --- a/llvm/unittests/IR/PatternMatch.cpp +++ b/llvm/unittests/IR/PatternMatch.cpp @@ -1889,4 +1889,26 @@ TEST_F(PatternMatchTest, ConstExpr) { EXPECT_TRUE(match(V, m_ConstantExpr())); } +TEST_F(PatternMatchTest, PtrAdd) { + Type *PtrTy = PointerType::getUnqual(Ctx); + Type *IdxTy = Type::getInt64Ty(Ctx); + Constant *Null = Constant::getNullValue(PtrTy); + Constant *Offset = ConstantInt::get(IdxTy, 42); + Value *PtrAdd = IRB.CreatePtrAdd(Null, Offset); + Value *OtherGEP = IRB.CreateGEP(IdxTy, Null, Offset); + Value *PtrAddConst = + ConstantExpr::getGetElementPtr(Type::getInt8Ty(Ctx), Null, Offset); + + Value *A, *B; + EXPECT_TRUE(match(PtrAdd, m_PtrAdd(m_Value(A), m_Value(B)))); + EXPECT_EQ(A, Null); + EXPECT_EQ(B, Offset); + + EXPECT_TRUE(match(PtrAddConst, m_PtrAdd(m_Value(A), m_Value(B)))); + EXPECT_EQ(A, Null); + EXPECT_EQ(B, Offset); + + EXPECT_FALSE(match(OtherGEP, m_PtrAdd(m_Value(A), m_Value(B)))); +} + } // anonymous namespace. -- cgit v1.1 From d9e92765c5f9b0fa7adafa769dd13d37b6bca038 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Thu, 8 Feb 2024 22:34:52 +0800 Subject: [ConstantRange] Improve ConstantRange::binaryXor (#80146) `ConstantRange::binaryXor` gives poor results as it currently depends on `KnownBits::operator^`. Since `sub A, B` is canonicalized into `xor A, B` if `B` is the subset of `A`, this patch reverts the transform in `ConstantRange::binaryXor`, which will give better results. Alive2: https://alive2.llvm.org/ce/z/bmTMV9 Fixes #79696. --- llvm/lib/IR/ConstantRange.cpp | 17 +++++++++- llvm/test/Transforms/SCCP/pr79696.ll | 55 +++++++++++++++++++++++++++++++++ llvm/unittests/IR/ConstantRangeTest.cpp | 6 ++++ 3 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SCCP/pr79696.ll diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index cbb64b2..3394a1e 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -1467,7 +1467,22 @@ ConstantRange ConstantRange::binaryXor(const ConstantRange &Other) const { if (isSingleElement() && getSingleElement()->isAllOnes()) return Other.binaryNot(); - return fromKnownBits(toKnownBits() ^ Other.toKnownBits(), /*IsSigned*/false); + KnownBits LHSKnown = toKnownBits(); + KnownBits RHSKnown = Other.toKnownBits(); + KnownBits Known = LHSKnown ^ RHSKnown; + ConstantRange CR = fromKnownBits(Known, /*IsSigned*/ false); + // Typically the following code doesn't improve the result if BW = 1. + if (getBitWidth() == 1) + return CR; + + // If LHS is known to be the subset of RHS, treat LHS ^ RHS as RHS -nuw/nsw + // LHS. If RHS is known to be the subset of LHS, treat LHS ^ RHS as LHS + // -nuw/nsw RHS.
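+ // (Editor's illustration, not in the original patch: with LHS = [0, 51)
+ // and RHS = {63}, every possible LHS value only has bits that are set in
+ // 63, so LHS ^ 63 equals 63 - LHS exactly, yielding the tight range
+ // [13, 64) where plain known-bits reasoning gives a looser result.)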
+ if ((~LHSKnown.Zero).isSubsetOf(RHSKnown.One)) + CR = CR.intersectWith(Other.sub(*this), PreferredRangeType::Unsigned); + else if ((~RHSKnown.Zero).isSubsetOf(LHSKnown.One)) + CR = CR.intersectWith(this->sub(Other), PreferredRangeType::Unsigned); + return CR; } ConstantRange diff --git a/llvm/test/Transforms/SCCP/pr79696.ll b/llvm/test/Transforms/SCCP/pr79696.ll new file mode 100644 index 0000000..a860112 --- /dev/null +++ b/llvm/test/Transforms/SCCP/pr79696.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=ipsccp -S | FileCheck %s + +; Tests from PR79696 + +define i1 @constant_range_xor(i64 %a) { +; CHECK-LABEL: define i1 @constant_range_xor( +; CHECK-SAME: i64 [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A]], 8192 +; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[CTLZ:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A]], i1 true) +; CHECK-NEXT: [[CONV:%.*]] = xor i64 [[CTLZ]], 63 +; CHECK-NEXT: ret i1 false +; CHECK: else: +; CHECK-NEXT: ret i1 false +; +entry: + %cmp = icmp ugt i64 %a, 8192 + br i1 %cmp, label %then, label %else +then: + %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 true) ;[0, 50] + %conv = xor i64 %ctlz, 63 ;[13, 63] + %cmp1 = icmp ult i64 %conv, 13 + ret i1 %cmp1 +else: + ret i1 false +} + +define i1 @constant_range_xor_negative(i64 %a) { +; CHECK-LABEL: define i1 @constant_range_xor_negative( +; CHECK-SAME: i64 [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A]], 8192 +; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[CTLZ:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A]], i1 true) +; CHECK-NEXT: [[CONV:%.*]] = xor i64 [[CTLZ]], 62 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[CONV]], 13 +; CHECK-NEXT: ret i1 [[CMP1]] +; CHECK: else: +; CHECK-NEXT: ret i1 false +; +entry: + %cmp = icmp ugt i64 %a, 8192 + br i1 %cmp, label %then, label %else +then: + %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 true) ;[0, 50] + %conv = xor i64 %ctlz, 62 ;[12, 63] + %cmp1 = icmp ult i64 %conv, 13 + ret i1 %cmp1 +else: + ret i1 false +} diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp index e505af5..34a162a 100644 --- a/llvm/unittests/IR/ConstantRangeTest.cpp +++ b/llvm/unittests/IR/ConstantRangeTest.cpp @@ -2565,6 +2565,12 @@ TEST_F(ConstantRangeTest, binaryXor) { EXPECT_EQ(R16_35.binaryXor(R0_99), ConstantRange(APInt(8, 0), APInt(8, 128))); EXPECT_EQ(R0_99.binaryXor(R16_35), ConstantRange(APInt(8, 0), APInt(8, 128))); + // Treat xor A, B as sub nsw nuw A, B + ConstantRange R0_51(APInt(8, 0), APInt(8, 51)); + ConstantRange R63(APInt(8, 63)); + EXPECT_EQ(R0_51.binaryXor(R63), ConstantRange(APInt(8, 13), APInt(8, 64))); + EXPECT_EQ(R63.binaryXor(R0_51), ConstantRange(APInt(8, 13), APInt(8, 64))); + TestBinaryOpExhaustive( [](const ConstantRange &CR1, const ConstantRange &CR2) { return CR1.binaryXor(CR2); -- cgit v1.1 From 06774d6bbf32aff45b67d8c3753524ec36bf8869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 8 Feb 2024 09:55:07 +0100 Subject: [clang][Interp] Handle CXXInheritedCtorInitExprs We need to forward all arguments of the current function and call the ctor function. 
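[Editor's note: a minimal sketch of the C++ construct this patch handles, mirroring the tests it adds below; the names are illustrative only.]

struct A { int x; constexpr A(int x) : x(x) {} };
struct B : A { using A::A; }; // inheriting constructor
static_assert(B(42).x == 42, ""); // initialization goes through a CXXInheritedCtorInitExpr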
--- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 31 +++++++++++++++++ clang/lib/AST/Interp/ByteCodeExprGen.h | 1 + clang/test/AST/Interp/records.cpp | 60 ++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 59fddfc..21bc29f 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -2020,6 +2020,37 @@ bool ByteCodeExprGen::VisitObjCBoolLiteralExpr( return this->emitConst(E->getValue(), E); } +template +bool ByteCodeExprGen::VisitCXXInheritedCtorInitExpr( + const CXXInheritedCtorInitExpr *E) { + const CXXConstructorDecl *Ctor = E->getConstructor(); + assert(!Ctor->isTrivial() && + "Trivial CXXInheritedCtorInitExpr, implement. (possible?)"); + const Function *F = this->getFunction(Ctor); + assert(F); + assert(!F->hasRVO()); + assert(F->hasThisPointer()); + + if (!this->emitDupPtr(SourceInfo{})) + return false; + + // Forward all arguments of the current function (which should be a + // constructor itself) to the inherited ctor. + // This is necessary because the calling code has pushed the pointer + // of the correct base for us already, but the arguments need + // to come after. + unsigned Offset = align(primSize(PT_Ptr)); // instance pointer. + for (const ParmVarDecl *PD : Ctor->parameters()) { + PrimType PT = this->classify(PD->getType()).value_or(PT_Ptr); + + if (!this->emitGetParam(PT, Offset, E)) + return false; + Offset += align(primSize(PT)); + } + + return this->emitCall(F, E); +} + template bool ByteCodeExprGen::discard(const Expr *E) { if (E->containsErrors()) return false; diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h index 2c9cca5..c908a9b 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.h +++ b/clang/lib/AST/Interp/ByteCodeExprGen.h @@ -111,6 +111,7 @@ public: bool VisitGenericSelectionExpr(const GenericSelectionExpr *E); bool VisitChooseExpr(const ChooseExpr *E); bool VisitObjCBoolLiteralExpr(const ObjCBoolLiteralExpr *E); + bool VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *E); protected: bool visitExpr(const Expr *E) override; diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp index 5ce1e6e..1ef13f5 100644 --- a/clang/test/AST/Interp/records.cpp +++ b/clang/test/AST/Interp/records.cpp @@ -1223,3 +1223,63 @@ namespace IndirectFieldInit { #endif } + +namespace InheritedConstructor { + namespace PR47555 { + struct A { + int c; + int d; + constexpr A(int c, int d) : c(c), d(d){} + }; + struct B : A { using A::A; }; + + constexpr B b = {13, 1}; + static_assert(b.c == 13, ""); + static_assert(b.d == 1, ""); + } + + namespace PR47555_2 { + struct A { + int c; + int d; + double e; + constexpr A(int c, int &d, double e) : c(c), d(++d), e(e){} + }; + struct B : A { using A::A; }; + + constexpr int f() { + int a = 10; + B b = {10, a, 40.0}; + return a; + } + static_assert(f() == 11, ""); + } + + namespace AaronsTest { + struct T { + constexpr T(float) {} + }; + + struct Base { + constexpr Base(T t = 1.0f) {} + constexpr Base(float) {} + }; + + struct FirstMiddle : Base { + using Base::Base; + constexpr FirstMiddle() : Base(2.0f) {} + }; + + struct SecondMiddle : Base { + constexpr SecondMiddle() : Base(3.0f) {} + constexpr SecondMiddle(T t) : Base(t) {} + }; + + struct S : FirstMiddle, SecondMiddle { + using FirstMiddle::FirstMiddle; + constexpr S(int i) : S(4.0f) {} + }; + + constexpr S s(1); + } +} -- cgit v1.1 From 
c4b0dfcc99da7506bff6b57d563e5cbce9caf4cd Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 8 Feb 2024 09:44:42 -0500 Subject: [Clang] Fix a non-effective assertion (#81083) `PTy` here is literally `FTy->getParamType(i)`, which makes this assertion not work as expected. --- clang/lib/CodeGen/CGBuiltin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index e051cbc..a7a410d 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5908,7 +5908,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(PTy->canLosslesslyBitCastTo(FTy->getParamType(i)) && + assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. -- cgit v1.1 From fb6ef4233968ffefb616d1c779a5483ef1f140d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 8 Feb 2024 10:49:14 +0100 Subject: [clang][Interp][NFC] Convert records test to verify=expected,both style --- clang/test/AST/Interp/records.cpp | 187 ++++++++++++++------------------------ 1 file changed, 66 insertions(+), 121 deletions(-) diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp index 1ef13f5..fb50d1c 100644 --- a/clang/test/AST/Interp/records.cpp +++ b/clang/test/AST/Interp/records.cpp @@ -1,11 +1,11 @@ -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify %s -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++14 -verify %s -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++20 -verify %s -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -triple i686 -verify %s -// RUN: %clang_cc1 -verify=ref %s -// RUN: %clang_cc1 -verify=ref -std=c++14 %s -// RUN: %clang_cc1 -verify=ref -std=c++20 %s -// RUN: %clang_cc1 -verify=ref -triple i686 %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++14 -verify=expected,both %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++20 -verify=expected,both %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -triple i686 -verify=expected,both %s +// RUN: %clang_cc1 -verify=ref,both %s +// RUN: %clang_cc1 -verify=ref,both -std=c++14 %s +// RUN: %clang_cc1 -verify=ref,both -std=c++20 %s +// RUN: %clang_cc1 -verify=ref,both -triple i686 %s /// Used to crash. 
struct Empty {}; @@ -90,9 +90,8 @@ struct Ints2 { int a = 10; int b; }; -constexpr Ints2 ints22; // expected-error {{without a user-provided default constructor}} \ - // expected-error {{must be initialized by a constant expression}} \ - // ref-error {{without a user-provided default constructor}} +constexpr Ints2 ints22; // both-error {{without a user-provided default constructor}} \ + // expected-error {{must be initialized by a constant expression}} constexpr Ints2 I2 = Ints2{12, 25}; static_assert(I2.a == 12, ""); @@ -164,17 +163,13 @@ constexpr C RVOAndParams(int a) { } constexpr C RVOAndParamsResult2 = RVOAndParams(12); -class Bar { // expected-note {{definition of 'Bar' is not complete}} \ - // ref-note {{definition of 'Bar' is not complete}} +class Bar { // both-note {{definition of 'Bar' is not complete}} public: constexpr Bar(){} - constexpr Bar b; // expected-error {{cannot be constexpr}} \ - // expected-error {{has incomplete type 'const Bar'}} \ - // ref-error {{cannot be constexpr}} \ - // ref-error {{has incomplete type 'const Bar'}} + constexpr Bar b; // both-error {{cannot be constexpr}} \ + // both-error {{has incomplete type 'const Bar'}} }; -constexpr Bar B; // expected-error {{must be initialized by a constant expression}} \ - // ref-error {{must be initialized by a constant expression}} +constexpr Bar B; // both-error {{must be initialized by a constant expression}} constexpr Bar *pb = nullptr; constexpr int locals() { @@ -198,17 +193,13 @@ namespace thisPointer { constexpr int get12() { return 12; } }; - constexpr int foo() { // ref-error {{never produces a constant expression}} \ - // expected-error {{never produces a constant expression}} + constexpr int foo() { // both-error {{never produces a constant expression}} S *s = nullptr; - return s->get12(); // ref-note 2{{member call on dereferenced null pointer}} \ - // expected-note 2{{member call on dereferenced null pointer}} + return s->get12(); // both-note 2{{member call on dereferenced null pointer}} } - static_assert(foo() == 12, ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'foo()'}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'foo()'}} + static_assert(foo() == 12, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'foo()'}} }; struct FourBoolPairs { @@ -244,20 +235,16 @@ constexpr A a{}; static_assert(a.i == 100, ""); constexpr A a2{12}; static_assert(a2.i == 12, ""); -static_assert(a2.i == 200, ""); // ref-error {{static assertion failed}} \ - // ref-note {{evaluates to '12 == 200'}} \ - // expected-error {{static assertion failed}} \ - // expected-note {{evaluates to '12 == 200'}} +static_assert(a2.i == 200, ""); // both-error {{static assertion failed}} \ + // both-note {{evaluates to '12 == 200'}} struct S { int a = 0; constexpr int get5() const { return 5; } constexpr void fo() const { - this; // expected-warning {{expression result unused}} \ - // ref-warning {{expression result unused}} - this->a; // expected-warning {{expression result unused}} \ - // ref-warning {{expression result unused}} + this; // both-warning {{expression result unused}} + this->a; // both-warning {{expression result unused}} get5(); getInts(); } @@ -342,12 +329,9 @@ namespace InitializerTemporaries { // Invalid destructor. 
struct S { constexpr S() {} - constexpr ~S() noexcept(false) { throw 12; } // expected-error {{cannot use 'throw'}} \ - // expected-error {{never produces a constant expression}} \ - // expected-note 2{{subexpression not valid}} \ - // ref-error {{cannot use 'throw'}} \ - // ref-error {{never produces a constant expression}} \ - // ref-note 2{{subexpression not valid}} + constexpr ~S() noexcept(false) { throw 12; } // both-error {{cannot use 'throw'}} \ + // both-error {{never produces a constant expression}} \ + // both-note 2{{subexpression not valid}} }; constexpr int f() { @@ -355,10 +339,8 @@ namespace InitializerTemporaries { /// FIXME: Wrong source location below. return 12; // expected-note {{in call to '&S{}->~S()'}} } - static_assert(f() == 12); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'f()'}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'f()'}} + static_assert(f() == 12); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'f()'}} #endif @@ -423,7 +405,8 @@ namespace MI { namespace DeriveFailures { #if __cplusplus < 202002L - struct Base { // ref-note 2{{declared here}} expected-note {{declared here}} + struct Base { // both-note {{declared here}} \ + // ref-note {{declared here}} int Val; }; @@ -431,35 +414,29 @@ namespace DeriveFailures { int OtherVal; constexpr Derived(int i) : OtherVal(i) {} // ref-error {{never produces a constant expression}} \ - // ref-note 2{{non-constexpr constructor 'Base' cannot be used in a constant expression}} \ - // expected-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}} + // both-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}} \ + // ref-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}} }; - constexpr Derived D(12); // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{in call to 'Derived(12)'}} \ - // ref-note {{declared here}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{in call to 'Derived(12)'}} + constexpr Derived D(12); // both-error {{must be initialized by a constant expression}} \ + // both-note {{in call to 'Derived(12)'}} \ + // ref-note {{declared here}} - static_assert(D.Val == 0, ""); // ref-error {{not an integral constant expression}} \ + static_assert(D.Val == 0, ""); // both-error {{not an integral constant expression}} \ // ref-note {{initializer of 'D' is not a constant expression}} \ - // expected-error {{not an integral constant expression}} \ // expected-note {{read of uninitialized object}} #endif struct AnotherBase { int Val; - constexpr AnotherBase(int i) : Val(12 / i) {} //ref-note {{division by zero}} \ - //expected-note {{division by zero}} + constexpr AnotherBase(int i) : Val(12 / i) {} // both-note {{division by zero}} }; struct AnotherDerived : AnotherBase { constexpr AnotherDerived(int i) : AnotherBase(i) {} }; - constexpr AnotherBase Derp(0); // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{in call to 'AnotherBase(0)'}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{in call to 'AnotherBase(0)'}} + constexpr AnotherBase Derp(0); // both-error {{must be initialized by a constant expression}} \ + // both-note {{in call to 'AnotherBase(0)'}} struct YetAnotherBase { int Val; @@ -467,17 +444,14 @@ namespace DeriveFailures { }; struct 
YetAnotherDerived : YetAnotherBase { - using YetAnotherBase::YetAnotherBase; // ref-note {{declared here}} \ - // expected-note {{declared here}} + using YetAnotherBase::YetAnotherBase; // both-note {{declared here}} int OtherVal; constexpr bool doit() const { return Val == OtherVal; } }; - constexpr YetAnotherDerived Oops(0); // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{constructor inherited from base class 'YetAnotherBase' cannot be used in a constant expression}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{constructor inherited from base class 'YetAnotherBase' cannot be used in a constant expression}} + constexpr YetAnotherDerived Oops(0); // both-error {{must be initialized by a constant expression}} \ + // both-note {{constructor inherited from base class 'YetAnotherBase' cannot be used in a constant expression}} }; namespace EmptyCtor { @@ -543,18 +517,10 @@ namespace PointerArith { constexpr B *b1 = &b + 1; constexpr B *b2 = &b + 0; -#if 0 - constexpr A *a2 = &b + 1; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot access base class of pointer past the end of object}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{cannot access base class of pointer past the end of object}} - -#endif - constexpr const int *pn = &(&b + 1)->n; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot access field of pointer past the end of object}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{cannot access field of pointer past the end of object}} - + constexpr A *a2 = &b + 1; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot access base class of pointer past the end of object}} + constexpr const int *pn = &(&b + 1)->n; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot access field of pointer past the end of object}} } #if __cplusplus >= 202002L @@ -632,12 +598,9 @@ namespace Destructors { struct S { constexpr S() {} - constexpr ~S() { // expected-error {{never produces a constant expression}} \ - // ref-error {{never produces a constant expression}} - int i = 1 / 0; // expected-warning {{division by zero}} \ - // expected-note 2{{division by zero}} \ - // ref-warning {{division by zero}} \ - // ref-note 2{{division by zero}} + constexpr ~S() { // both-error {{never produces a constant expression}} + int i = 1 / 0; // both-warning {{division by zero}} \ + // both-note 2{{division by zero}} } }; constexpr int testS() { @@ -645,10 +608,8 @@ namespace Destructors { return 1; // expected-note {{in call to '&S{}->~S()'}} // FIXME: ^ Wrong line } - static_assert(testS() == 1); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'testS()'}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'testS()'}} + static_assert(testS() == 1); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'testS()'}} } namespace BaseToDerived { @@ -657,10 +618,8 @@ namespace A { struct B : A { int n; }; struct C : B {}; C c = {}; - constexpr C *pb = (C*)((A*)&c + 1); // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot access derived class of pointer past the end of object}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note 
{{cannot access derived class of pointer past the end of object}} + constexpr C *pb = (C*)((A*)&c + 1); // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot access derived class of pointer past the end of object}} } namespace B { struct A {}; @@ -894,10 +853,8 @@ namespace VirtualFromBase { // Virtual f(), not OK. constexpr X> xxs2; constexpr X *q = const_cast>*>(&xxs2); - static_assert(q->f() == sizeof(X), ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{cannot evaluate call to virtual function}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{cannot evaluate call to virtual function}} + static_assert(q->f() == sizeof(X), ""); // both-error {{not an integral constant expression}} \ + // both-note {{cannot evaluate call to virtual function}} } #endif @@ -1070,14 +1027,10 @@ namespace ParenInit { /// Not constexpr! O o1(0); - constinit O o2(0); // ref-error {{variable does not have a constant initializer}} \ - // ref-note {{required by 'constinit' specifier}} \ - // ref-note {{reference to temporary is not a constant expression}} \ - // ref-note {{temporary created here}} \ - // expected-error {{variable does not have a constant initializer}} \ - // expected-note {{required by 'constinit' specifier}} \ - // expected-note {{reference to temporary is not a constant expression}} \ - // expected-note {{temporary created here}} + constinit O o2(0); // both-error {{variable does not have a constant initializer}} \ + // both-note {{required by 'constinit' specifier}} \ + // both-note {{reference to temporary is not a constant expression}} \ + // both-note {{temporary created here}} } #endif @@ -1109,32 +1062,24 @@ namespace AccessOnNullptr { int a; }; - constexpr int a() { // expected-error {{never produces a constant expression}} \ - // ref-error {{never produces a constant expression}} + constexpr int a() { // both-error {{never produces a constant expression}} F *f = nullptr; - f->a = 0; // expected-note 2{{cannot access field of null pointer}} \ - // ref-note 2{{cannot access field of null pointer}} + f->a = 0; // both-note 2{{cannot access field of null pointer}} return f->a; } - static_assert(a() == 0, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'a()'}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'a()'}} + static_assert(a() == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'a()'}} - constexpr int a2() { // expected-error {{never produces a constant expression}} \ - // ref-error {{never produces a constant expression}} + constexpr int a2() { // both-error {{never produces a constant expression}} F *f = nullptr; - const int *a = &(f->a); // expected-note 2{{cannot access field of null pointer}} \ - // ref-note 2{{cannot access field of null pointer}} + const int *a = &(f->a); // both-note 2{{cannot access field of null pointer}} return f->a; } - static_assert(a2() == 0, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'a2()'}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'a2()'}} + static_assert(a2() == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'a2()'}} } namespace IndirectFieldInit { -- cgit v1.1 From 10cd0e7a8bdcd80c0b017f8d0b6b71dd61973b54 Mon Sep 17 00:00:00 2001 From: Tarun Prabhu Date: Thu, 8 Feb 2024 07:56:16 -0700 
Subject: [flang][docs] Update flang documentation regarding the test suite (#80755) Remove redundant reference to flang not being able to generate code. Add a reference to the gfortran tests that are part of the LLVM Test Suite. --- flang/docs/FortranLLVMTestSuite.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/flang/docs/FortranLLVMTestSuite.md b/flang/docs/FortranLLVMTestSuite.md index 62459e6..f07d415 100644 --- a/flang/docs/FortranLLVMTestSuite.md +++ b/flang/docs/FortranLLVMTestSuite.md @@ -12,12 +12,6 @@ first-time users read through [LLVM Test Suite Guide](https://llvm.org/docs/TestSuiteGuide.html) which describes the organizational structure of the test suite and how to run it. -Although the Flang driver is unable to generate code at this time, we -are neverthelesss incrementally adding Fortran tests into the LLVM -Test Suite. We are currently testing against GFortran while we make -progress towards completing the new Flang driver with full -code-generation capabilities. - ## Running the LLVM test-suite with Fortran Fortran support can be enabled by setting the following CMake variables: @@ -63,3 +57,12 @@ cmake -G "Ninja" -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ \ -DTEST_SUITE_FORTRAN:STRING=ON \ -DTEST_SUITE_SPEC2017_ROOT= .. ``` + +## Running the gfortran tests + +Tests from the gfortran test suite have been imported into the LLVM Test Suite. +The tests will be run automatically if the test suite is built following the +instructions described [above](#running-the-LLVM-test-suite-with-fortran). +There are additional configure-time options that can be used with the gfortran +tests. More details about those options and their purpose can be found in +[`Fortran/gfortran/README.md`](https://github.com/llvm/llvm-test-suite/tree/main/Fortran/gfortran/README.md)`. -- cgit v1.1 From cd183428a9af6d7dda2018a88aeb495f268716b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 8 Feb 2024 15:15:14 +0100 Subject: [clang][Interp] Fix handling of generic lambdas When compiling their static invoker, we need to get the right specialization. --- clang/lib/AST/Interp/ByteCodeEmitter.cpp | 30 +++++++++++++++++++++++++++++- clang/test/AST/Interp/lambda.cpp | 13 +++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/ByteCodeEmitter.cpp b/clang/lib/AST/Interp/ByteCodeEmitter.cpp index 8bbfa92..e697e24f 100644 --- a/clang/lib/AST/Interp/ByteCodeEmitter.cpp +++ b/clang/lib/AST/Interp/ByteCodeEmitter.cpp @@ -23,6 +23,34 @@ using namespace clang; using namespace clang::interp; Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) { + bool IsLambdaStaticInvoker = false; + if (const auto *MD = dyn_cast(FuncDecl); + MD && MD->isLambdaStaticInvoker()) { + // For a lambda static invoker, we might have to pick a specialized + // version if the lambda is generic. In that case, the picked function + // will *NOT* be a static invoker anymore. However, it will still + // be a non-static member function, this (usually) requiring an + // instance pointer. We suppress that later in this function. 
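+ // (Editor's illustration, not in the original patch: given
+ // `auto GL = [](auto a) { return a; };` converted to a `char (*)(char)`
+ // function pointer, the function compiled here is the call operator
+ // specialization `operator()<char>`, found via the invoker's template
+ // arguments.)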
+ IsLambdaStaticInvoker = true; + + const CXXRecordDecl *ClosureClass = MD->getParent(); + assert(ClosureClass->captures_begin() == ClosureClass->captures_end()); + if (ClosureClass->isGenericLambda()) { + const CXXMethodDecl *LambdaCallOp = ClosureClass->getLambdaCallOperator(); + assert(MD->isFunctionTemplateSpecialization() && + "A generic lambda's static-invoker function must be a " + "template specialization"); + const TemplateArgumentList *TAL = MD->getTemplateSpecializationArgs(); + FunctionTemplateDecl *CallOpTemplate = + LambdaCallOp->getDescribedFunctionTemplate(); + void *InsertPos = nullptr; + const FunctionDecl *CorrespondingCallOpSpecialization = + CallOpTemplate->findSpecialization(TAL->asArray(), InsertPos); + assert(CorrespondingCallOpSpecialization); + FuncDecl = cast(CorrespondingCallOpSpecialization); + } + } + // Set up argument indices. unsigned ParamOffset = 0; SmallVector ParamTypes; @@ -46,7 +74,7 @@ Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) { // InterpStack when calling the function. bool HasThisPointer = false; if (const auto *MD = dyn_cast(FuncDecl)) { - if (MD->isImplicitObjectMemberFunction()) { + if (MD->isImplicitObjectMemberFunction() && !IsLambdaStaticInvoker) { HasThisPointer = true; ParamTypes.push_back(PT_Ptr); ParamOffsets.push_back(ParamOffset); diff --git a/clang/test/AST/Interp/lambda.cpp b/clang/test/AST/Interp/lambda.cpp index f840089..a433e56 100644 --- a/clang/test/AST/Interp/lambda.cpp +++ b/clang/test/AST/Interp/lambda.cpp @@ -155,6 +155,19 @@ namespace StaticInvoker { return fp(i).a; } static_assert(sv6(12) == 12); + + + /// A generic lambda. + auto GL = [](auto a) { return a; }; + constexpr char (*fp2)(char) = GL; + static_assert(fp2('3') == '3', ""); + + struct GLS { + int a; + }; + auto GL2 = [](auto a) { return GLS{a}; }; + constexpr GLS (*fp3)(char) = GL2; + static_assert(fp3('3').a == '3', ""); } namespace LambdasAsParams { -- cgit v1.1 From 3e33b6f5de6905c98395a77b41d474b87ef9e677 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 8 Feb 2024 10:11:39 -0500 Subject: [libc++][NFC] Reformat a few files that had gotten mis-formatted Those appear to be oversights when committing patches in the last few months. --- libcxx/include/ostream | 36 ++++++++++++++++-------------------- libcxx/include/scoped_allocator | 4 ++-- libcxx/include/shared_mutex | 6 +++--- libcxx/include/string | 16 +++++++++------- libcxx/include/valarray | 4 ++-- libcxx/include/vector | 4 ++-- 6 files changed, 34 insertions(+), 36 deletions(-) diff --git a/libcxx/include/ostream b/libcxx/include/ostream index 180adda..2e26073 100644 --- a/libcxx/include/ostream +++ b/libcxx/include/ostream @@ -1090,11 +1090,10 @@ _LIBCPP_EXPORTED_FROM_ABI FILE* __get_ostream_file(ostream& __os); # ifndef _LIBCPP_HAS_NO_UNICODE template // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563). 
-_LIBCPP_HIDE_FROM_ABI void -__vprint_unicode(ostream& __os, string_view __fmt, format_args __args, bool __write_nl) { -#if _LIBCPP_AVAILABILITY_HAS_PRINT == 0 +_LIBCPP_HIDE_FROM_ABI void __vprint_unicode(ostream& __os, string_view __fmt, format_args __args, bool __write_nl) { +# if _LIBCPP_AVAILABILITY_HAS_PRINT == 0 return std::__vprint_nonunicode(__os, __fmt, __args, __write_nl); -#else +# else FILE* __file = std::__get_ostream_file(__os); if (!__file || !__print::__is_terminal(__file)) return std::__vprint_nonunicode(__os, __fmt, __args, __write_nl); @@ -1110,38 +1109,36 @@ __vprint_unicode(ostream& __os, string_view __fmt, format_args __args, bool __wr // This is the path for the native API, start with flushing. __os.flush(); -# ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { -# endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS ostream::sentry __s(__os); if (__s) { -# ifndef _LIBCPP_WIN32API +# ifndef _LIBCPP_WIN32API __print::__vprint_unicode_posix(__file, __fmt, __args, __write_nl, true); -# elif !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) +# elif !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) __print::__vprint_unicode_windows(__file, __fmt, __args, __write_nl, true); -# else -# error "Windows builds with wchar_t disabled are not supported." -# endif +# else +# error "Windows builds with wchar_t disabled are not supported." +# endif } -# ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS } catch (...) { __os.__set_badbit_and_consider_rethrow(); } -# endif // _LIBCPP_HAS_NO_EXCEPTIONS -#endif // _LIBCPP_AVAILABILITY_HAS_PRINT +# endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_AVAILABILITY_HAS_PRINT } template // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563). -_LIBCPP_HIDE_FROM_ABI inline void -vprint_unicode(ostream& __os, string_view __fmt, format_args __args) { +_LIBCPP_HIDE_FROM_ABI inline void vprint_unicode(ostream& __os, string_view __fmt, format_args __args) { std::__vprint_unicode(__os, __fmt, __args, false); } # endif // _LIBCPP_HAS_NO_UNICODE template -_LIBCPP_HIDE_FROM_ABI void -print(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) { +_LIBCPP_HIDE_FROM_ABI void print(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) { # ifndef _LIBCPP_HAS_NO_UNICODE if constexpr (__print::__use_unicode_execution_charset) std::__vprint_unicode(__os, __fmt.get(), std::make_format_args(__args...), false); @@ -1153,8 +1150,7 @@ print(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) { } template -_LIBCPP_HIDE_FROM_ABI void -println(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) { +_LIBCPP_HIDE_FROM_ABI void println(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) { # ifndef _LIBCPP_HAS_NO_UNICODE // Note the wording in the Standard is inefficient. The output of // std::format is a std::string which is then copied. 
This solution diff --git a/libcxx/include/scoped_allocator b/libcxx/include/scoped_allocator index 1626453..eff6fbd 100644 --- a/libcxx/include/scoped_allocator +++ b/libcxx/include/scoped_allocator @@ -476,8 +476,8 @@ public: } private: - _LIBCPP_HIDE_FROM_ABI explicit scoped_allocator_adaptor(outer_allocator_type&& __o, inner_allocator_type&& __i) _NOEXCEPT - : base(std::move(__o), std::move(__i)) {} + _LIBCPP_HIDE_FROM_ABI explicit scoped_allocator_adaptor( + outer_allocator_type&& __o, inner_allocator_type&& __i) _NOEXCEPT : base(std::move(__o), std::move(__i)) {} template _LIBCPP_HIDE_FROM_ABI void __construct(integral_constant, _Tp* __p, _Args&&... __args) { diff --git a/libcxx/include/shared_mutex b/libcxx/include/shared_mutex index ac66b3a..57f385b 100644 --- a/libcxx/include/shared_mutex +++ b/libcxx/include/shared_mutex @@ -124,9 +124,9 @@ template #include <__config> -# ifdef _LIBCPP_HAS_NO_THREADS -# error " is not supported since libc++ has been configured without support for threads." -# endif +#ifdef _LIBCPP_HAS_NO_THREADS +# error " is not supported since libc++ has been configured without support for threads." +#endif #include <__assert> // all public C++ headers provide the assertion handler #include <__availability> diff --git a/libcxx/include/string b/libcxx/include/string index ed4fdbe..530a223 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -938,7 +938,11 @@ public: // Turning off ASan instrumentation for variable initialization with _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS // does not work consistently during initialization of __r_, so we instead unpoison __str's memory manually first. // __str's memory needs to be unpoisoned only in the case where it's a short string. - : __r_([](basic_string &__s) -> decltype(__s.__r_)&& { if(!__s.__is_long()) __s.__annotate_delete(); return std::move(__s.__r_); }(__str)) { + : __r_([](basic_string& __s) -> decltype(__s.__r_)&& { + if (!__s.__is_long()) + __s.__annotate_delete(); + return std::move(__s.__r_); + }(__str)) { __str.__r_.first() = __rep(); __str.__annotate_new(0); if (!__is_long()) @@ -1918,7 +1922,7 @@ private: } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_new(size_type __current_size) const _NOEXCEPT { - (void) __current_size; + (void)__current_size; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + capacity() + 1, data() + __current_size + 1); @@ -1933,7 +1937,7 @@ private: } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_increase(size_type __n) const _NOEXCEPT { - (void) __n; + (void)__n; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + size() + 1, data() + size() + 1 + __n); @@ -1941,7 +1945,7 @@ private: } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_shrink(size_type __old_size) const _NOEXCEPT { - (void) __old_size; + (void)__old_size; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + __old_size + 1, data() + size() + 1); @@ -1952,9 +1956,7 @@ private: static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type __align_it(size_type __s) 
_NOEXCEPT { return (__s + (__a - 1)) & ~(__a - 1); } - enum { - __alignment = 8 - }; + enum { __alignment = 8 }; static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type __recommend(size_type __s) _NOEXCEPT { if (__s < __min_cap) { return static_cast<size_type>(__min_cap) - 1; } diff --git a/libcxx/include/valarray b/libcxx/include/valarray index 44adcd7..88b161e 100644 --- a/libcxx/include/valarray +++ b/libcxx/include/valarray @@ -2435,7 +2435,7 @@ template <class _Expr, __enable_if_t<__is_val_expr<_Expr>::value, int> > inline valarray<_Tp>& valarray<_Tp>::operator*=(const _Expr& __v) { size_t __i = 0; for (value_type* __t = __begin_; __t != __end_; ++__t, ++__i) - *__t *= std::__get(__v,__i); + *__t *= std::__get(__v, __i); return *this; } @@ -2444,7 +2444,7 @@ template <class _Expr, __enable_if_t<__is_val_expr<_Expr>::value, int> > inline valarray<_Tp>& valarray<_Tp>::operator/=(const _Expr& __v) { size_t __i = 0; for (value_type* __t = __begin_; __t != __end_; ++__t, ++__i) - *__t /= std::__get(__v,__i); + *__t /= std::__get(__v, __i); return *this; } diff --git a/libcxx/include/vector b/libcxx/include/vector index e9615ab..3934361 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -831,8 +831,8 @@ private: // For more details, see the "Using libc++" documentation page or // the documentation for __sanitizer_annotate_contiguous_container. - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __annotate_contiguous_container( - const void* __old_mid, const void* __new_mid) const { + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void + __annotate_contiguous_container(const void* __old_mid, const void* __new_mid) const { (void)__old_mid; (void)__new_mid; #ifndef _LIBCPP_HAS_NO_ASAN -- cgit v1.1 From 5452cbc4a6bfb905fedeacaa6f27895e249da1e5 Mon Sep 17 00:00:00 2001 From: ostannard Date: Thu, 8 Feb 2024 15:31:54 +0000 Subject: [AArch64] Indirect tail-calls cannot use x16 with pac-ret+pc (#81020) When using -mbranch-protection=pac-ret+pc, x16 is used in the function epilogue to hold the address of the signing instruction. This is used by a HINT instruction which can only use x16, so we can't change this. This means that we can't use it to hold the function pointer for an indirect tail-call. There is existing code to force indirect tail-calls to use x16 or x17 when BTI is enabled, so there are now 4 combinations:

bti  pac-ret+pc  Valid function pointer registers
off  off         Any non callee-saved register
on   off         x16 or x17
off  on          Any non callee-saved register except x16
on   on          x17

--- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 4 +- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4 +- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 4 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 47 +++++++++++++--- llvm/lib/Target/AArch64/AArch64RegisterInfo.td | 15 +++-- .../Target/AArch64/GISel/AArch64CallLowering.cpp | 15 +++-- .../AArch64/GISel/AArch64RegisterBankInfo.cpp | 4 +- .../branch-target-enforcement-indirect-calls.ll | 65 ++++++++++++++++++++++ llvm/test/CodeGen/AArch64/kcfi-bti.ll | 4 +- 9 files changed, 138 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index de24725..5b5ffd7 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -1602,7 +1602,9 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { // attributes (isCall, isReturn, etc.). We lower them to the real // instruction here.
case AArch64::TCRETURNri: - case AArch64::TCRETURNriBTI: + case AArch64::TCRETURNrix16x17: + case AArch64::TCRETURNrix17: + case AArch64::TCRETURNrinotx16: case AArch64::TCRETURNriALL: { MCInst TmpInst; TmpInst.setOpcode(AArch64::BR); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8573939..20290c9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -25700,7 +25700,9 @@ AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB, case AArch64::BLR: case AArch64::BLRNoIP: case AArch64::TCRETURNri: - case AArch64::TCRETURNriBTI: + case AArch64::TCRETURNrix16x17: + case AArch64::TCRETURNrix17: + case AArch64::TCRETURNrinotx16: break; default: llvm_unreachable("Unexpected CFI call opcode"); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 9add7d8..39c9609 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2503,7 +2503,9 @@ bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) { return false; case AArch64::TCRETURNdi: case AArch64::TCRETURNri: - case AArch64::TCRETURNriBTI: + case AArch64::TCRETURNrix16x17: + case AArch64::TCRETURNrix17: + case AArch64::TCRETURNrinotx16: case AArch64::TCRETURNriALL: return true; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 77fdb68..9c3a692 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -928,8 +928,25 @@ let RecomputePerFunction = 1 in { // Avoid generating STRQro if it is slow, unless we're optimizing for code size. def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || shouldOptForSize(MF)">; - def UseBTI : Predicate<[{ MF->getInfo()->branchTargetEnforcement() }]>; - def NotUseBTI : Predicate<[{ !MF->getInfo()->branchTargetEnforcement() }]>; + // Register restrictions for indirect tail-calls: + // - If branch target enforcement is enabled, indirect calls must use x16 or + // x17, because these are the only registers which can target the BTI C + // instruction. + // - If PAuthLR is enabled, x16 is used in the epilogue to hold the address + // of the signing instruction. This can't be changed because it is used by a + // HINT instruction which only accepts x16. We can't load anything from the + // stack after this because the authentication instruction checks that SP is + // the same as it was at function entry, so we can't have anything on the + // stack. 
+ + // BTI on, PAuthLR off: x16 or x17 + def TailCallX16X17 : Predicate<[{ MF->getInfo()->branchTargetEnforcement() && !MF->getInfo()->branchProtectionPAuthLR() }]>; + // BTI on, PAuthLR on: x17 only + def TailCallX17 : Predicate<[{ MF->getInfo()->branchTargetEnforcement() && MF->getInfo()->branchProtectionPAuthLR() }]>; + // BTI off, PAuthLR on: Any non-callee-saved register except x16 + def TailCallNotX16 : Predicate<[{ !MF->getInfo()->branchTargetEnforcement() && MF->getInfo()->branchProtectionPAuthLR() }]>; + // BTI off, PAuthLR off: Any non-callee-saved register + def TailCallAny : Predicate<[{ !MF->getInfo()->branchTargetEnforcement() && !MF->getInfo()->branchProtectionPAuthLR() }]>; def SLSBLRMitigation : Predicate<[{ MF->getSubtarget().hardenSlsBlr() }]>; def NoSLSBLRMitigation : Predicate<[{ !MF->getSubtarget().hardenSlsBlr() }]>; @@ -9121,18 +9138,30 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { // some verifier checks for outlined functions. def TCRETURNriALL : Pseudo<(outs), (ins GPR64:$dst, i32imm:$FPDiff), []>, Sched<[WriteBrReg]>; - // Indirect tail-call limited to only use registers (x16 and x17) which are - // allowed to tail-call a "BTI c" instruction. - def TCRETURNriBTI : Pseudo<(outs), (ins rtcGPR64:$dst, i32imm:$FPDiff), []>, + + // Indirect tail-calls with reduced register classes, needed for BTI and + // PAuthLR. + def TCRETURNrix16x17 : Pseudo<(outs), (ins tcGPRx16x17:$dst, i32imm:$FPDiff), []>, + Sched<[WriteBrReg]>; + def TCRETURNrix17 : Pseudo<(outs), (ins tcGPRx17:$dst, i32imm:$FPDiff), []>, + Sched<[WriteBrReg]>; + def TCRETURNrinotx16 : Pseudo<(outs), (ins tcGPRnotx16:$dst, i32imm:$FPDiff), []>, Sched<[WriteBrReg]>; } def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)), (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>, - Requires<[NotUseBTI]>; -def : Pat<(AArch64tcret rtcGPR64:$dst, (i32 timm:$FPDiff)), - (TCRETURNriBTI rtcGPR64:$dst, imm:$FPDiff)>, - Requires<[UseBTI]>; + Requires<[TailCallAny]>; +def : Pat<(AArch64tcret tcGPRx16x17:$dst, (i32 timm:$FPDiff)), + (TCRETURNrix16x17 tcGPRx16x17:$dst, imm:$FPDiff)>, + Requires<[TailCallX16X17]>; +def : Pat<(AArch64tcret tcGPRx17:$dst, (i32 timm:$FPDiff)), + (TCRETURNrix17 tcGPRx17:$dst, imm:$FPDiff)>, + Requires<[TailCallX17]>; +def : Pat<(AArch64tcret tcGPRnotx16:$dst, (i32 timm:$FPDiff)), + (TCRETURNrinotx16 tcGPRnotx16:$dst, imm:$FPDiff)>, + Requires<[TailCallNotX16]>; + def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)), (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)), diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index b70ab85..569944e 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -217,11 +217,16 @@ def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X2 X22, X23, X24, X25, X26, X27, X28, FP, LR)>; -// Restricted set of tail call registers, for use when branch target -// enforcement is enabled. These are the only registers which can be used to -// indirectly branch (not call) to the "BTI c" instruction at the start of a -// BTI-protected function. -def rtcGPR64 : RegisterClass<"AArch64", [i64], 64, (add X16, X17)>; +// Restricted sets of tail call registers, for use when branch target +// enforcement or PAuthLR are enabled. 
+// For BTI, x16 and x17 are the only registers which can be used to indirectly +// branch (not call) to the "BTI c" instruction at the start of a BTI-protected +// function. +// For PAuthLR, x16 must be used in the function epilogue for other purposes, +// so cannot hold the function pointer. +def tcGPRx17 : RegisterClass<"AArch64", [i64], 64, (add X17)>; +def tcGPRx16x17 : RegisterClass<"AArch64", [i64], 64, (add X16, X17)>; +def tcGPRnotx16 : RegisterClass<"AArch64", [i64], 64, (sub tcGPR64, X16)>; // Register set that excludes registers that are reserved for procedure calls. // This is used for pseudo-instructions that are actually implemented using a diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 55cad84..3dc3d31 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -1012,16 +1012,23 @@ bool AArch64CallLowering::isEligibleForTailCallOptimization( static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, bool IsTailCall) { + const AArch64FunctionInfo *FuncInfo = CallerF.getInfo(); + if (!IsTailCall) return IsIndirect ? getBLRCallOpcode(CallerF) : (unsigned)AArch64::BL; if (!IsIndirect) return AArch64::TCRETURNdi; - // When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use - // x16 or x17. - if (CallerF.getInfo()->branchTargetEnforcement()) - return AArch64::TCRETURNriBTI; + // When BTI or PAuthLR are enabled, there are restrictions on using x16 and + // x17 to hold the function pointer. + if (FuncInfo->branchTargetEnforcement()) { + if (FuncInfo->branchProtectionPAuthLR()) + return AArch64::TCRETURNrix17; + else + return AArch64::TCRETURNrix16x17; + } else if (FuncInfo->branchProtectionPAuthLR()) + return AArch64::TCRETURNrinotx16; return AArch64::TCRETURNri; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index b8e5e7b..0fc4d7f 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -273,7 +273,9 @@ AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, case AArch64::GPR64common_and_GPR64noipRegClassID: case AArch64::GPR64noip_and_tcGPR64RegClassID: case AArch64::tcGPR64RegClassID: - case AArch64::rtcGPR64RegClassID: + case AArch64::tcGPRx16x17RegClassID: + case AArch64::tcGPRx17RegClassID: + case AArch64::tcGPRnotx16RegClassID: case AArch64::WSeqPairsClassRegClassID: case AArch64::XSeqPairsClassRegClassID: case AArch64::MatrixIndexGPR32_8_11RegClassID: diff --git a/llvm/test/CodeGen/AArch64/branch-target-enforcement-indirect-calls.ll b/llvm/test/CodeGen/AArch64/branch-target-enforcement-indirect-calls.ll index de543f4..833a6d5 100644 --- a/llvm/test/CodeGen/AArch64/branch-target-enforcement-indirect-calls.ll +++ b/llvm/test/CodeGen/AArch64/branch-target-enforcement-indirect-calls.ll @@ -26,3 +26,68 @@ entry: ; CHECK: br {{x16|x17}} ret void } +define void @bti_enabled_force_x10(ptr %p) "branch-target-enforcement"="true" { +entry: + %p_x10 = tail call ptr asm "", "={x10},{x10},~{lr}"(ptr %p) + tail call void %p_x10() +; CHECK: br {{x16|x17}} + ret void +} + +; sign-return-address places no further restrictions on the tail-call register. 
+ +define void @bti_enabled_pac_enabled(ptr %p) "branch-target-enforcement"="true" "sign-return-address"="all" { +entry: + tail call void %p() +; CHECK: br {{x16|x17}} + ret void +} +define void @bti_enabled_pac_enabled_force_x10(ptr %p) "branch-target-enforcement"="true" "sign-return-address"="all" { +entry: + %p_x10 = tail call ptr asm "", "={x10},{x10},~{lr}"(ptr %p) + tail call void %p_x10() +; CHECK: br {{x16|x17}} + ret void +} + +; PAuthLR needs to use x16 to hold the address of the signing instruction. That +; can't be changed because the hint instruction only uses that register, so the +; only choice for the tail-call function pointer is x17. + +define void @bti_enabled_pac_pc_enabled(ptr %p) "branch-target-enforcement"="true" "sign-return-address"="all" "branch-protection-pauth-lr"="true" { +entry: + tail call void %p() +; CHECK: br x17 + ret void +} +define void @bti_enabled_pac_pc_enabled_force_x16(ptr %p) "branch-target-enforcement"="true" "sign-return-address"="all" "branch-protection-pauth-lr"="true" { +entry: + %p_x16 = tail call ptr asm "", "={x16},{x16},~{lr}"(ptr %p) + tail call void %p_x16() +; CHECK: br x17 + ret void +} + +; PAuthLR by itself prevents x16 from being used, but any other +; non-callee-saved register can be used. + +define void @pac_pc_enabled(ptr %p) "sign-return-address"="all" "branch-protection-pauth-lr"="true" { +entry: + tail call void %p() +; CHECK: br {{(x[0-9]|x1[0-578])$}} + ret void +} +define void @pac_pc_enabled_force_x16(ptr %p) "sign-return-address"="all" "branch-protection-pauth-lr"="true" { +entry: + %p_x16 = tail call ptr asm "", "={x16},{x16},~{lr}"(ptr %p) + tail call void %p_x16() +; CHECK: br {{(x[0-9]|x1[0-578])$}} + ret void +} +define void @pac_pc_enabled_force_x17(ptr %p) "sign-return-address"="all" "branch-protection-pauth-lr"="true" { +entry: + %p_x17 = tail call ptr asm "", "={x17},{x17},~{lr}"(ptr %p) + tail call void %p_x17() +; CHECK: br x17 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/kcfi-bti.ll b/llvm/test/CodeGen/AArch64/kcfi-bti.ll index 12cde43..d3febb5 100644 --- a/llvm/test/CodeGen/AArch64/kcfi-bti.ll +++ b/llvm/test/CodeGen/AArch64/kcfi-bti.ll @@ -73,11 +73,11 @@ define void @f3(ptr noundef %x) { ; MIR-LABEL: name: f3 ; MIR: body: -; ISEL: TCRETURNriBTI %1, 0, csr_aarch64_aapcs, implicit $sp, cfi-type 12345678 +; ISEL: TCRETURNrix16x17 %1, 0, csr_aarch64_aapcs, implicit $sp, cfi-type 12345678 ; KCFI: BUNDLE{{.*}} { ; KCFI-NEXT: KCFI_CHECK $x16, 12345678, implicit-def $x9, implicit-def $x16, implicit-def $x17, implicit-def $nzcv -; KCFI-NEXT: TCRETURNriBTI internal killed $x16, 0, csr_aarch64_aapcs, implicit $sp +; KCFI-NEXT: TCRETURNrix16x17 internal killed $x16, 0, csr_aarch64_aapcs, implicit $sp ; KCFI-NEXT: } tail call void %x() [ "kcfi"(i32 12345678) ] -- cgit v1.1 From 0802596df3d1ffd15f6b828a0f5c1e5b687a730f Mon Sep 17 00:00:00 2001 From: Daniel Chen Date: Thu, 8 Feb 2024 10:38:50 -0500 Subject: [Flang] Update the fix of PR 80738 to cover generic interface inside modules (#81087) The following test case crashes. The problem is that the fix for PR https://github.com/llvm/llvm-project/pull/80738 is not quite complete: it should call `GetUltimate()` on the `interface_` symbol before checking whether it is generic. A sketch of the corrected lookup is shown next.
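As a hedged illustration (not the committed code; the standalone helper and the exact API calls are simplified from `flang/lib/Semantics/resolve-names.cpp`), the corrected lookup resolves the use-association first:

```cpp
#include "flang/Semantics/symbol.h"
using namespace Fortran::semantics;

// Sketch only: a use-associated name wraps the module's symbol, so the
// generic-interface check must be made on the ultimate symbol, not on the
// use-association wrapper that `interface_` may name.
const Symbol *ResolveProcInterface(Symbol &interfaceSymbol) {
  Symbol &ultimate{interfaceSymbol.GetUltimate()};
  if (const auto *generic{ultimate.detailsIf<GenericDetails>()}) {
    return generic->specific(); // the specific procedure behind the generic
  }
  return &ultimate;
}
```

The Fortran reproducer follows: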
``` MODULE M CONTAINS FUNCTION Int(Arg) INTEGER :: Int, Arg Int = Arg END FUNCTION FUNCTION Int8(Arg) INTEGER(8) :: Int8, Arg Int8 = 8_8 END FUNCTION END MODULE MODULE M1 USE M INTERFACE Int8 MODULE PROCEDURE Int MODULE PROCEDURE Int8 END INTERFACE END MODULE PROGRAM PtrAssignGen USE M USE M1 IMPLICIT NONE INTERFACE Int MODULE PROCEDURE Int MODULE PROCEDURE Int8 END INTERFACE PROCEDURE(Int8), POINTER :: PtrInt8 PtrInt8 => Int8 IF ( PtrInt8(100_8) .NE. 8_8 ) ERROR STOP 12 END ``` --- flang/lib/Semantics/resolve-names.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 36deab9..2a42c791 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -5648,9 +5648,10 @@ void DeclarationVisitor::Post(const parser::ProcDecl &x) { const auto &name{std::get(x.t)}; const Symbol *procInterface{nullptr}; if (interfaceName_) { - procInterface = interfaceName_->symbol->has() - ? interfaceName_->symbol->get().specific() - : interfaceName_->symbol; + Symbol *ultimate{&interfaceName_->symbol->GetUltimate()}; + procInterface = ultimate->has() + ? ultimate->get().specific() + : ultimate; } auto attrs{HandleSaveName(name.source, GetAttrs())}; DerivedTypeDetails *dtDetails{nullptr}; -- cgit v1.1 From dc5da4851de5d29dd040d85a8387e2e5b4b12b7b Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 8 Feb 2024 16:41:02 +0100 Subject: [InstCombine] Add tests for #77108 (NFC) --- llvm/test/Transforms/InstCombine/dependent-ivs.ll | 374 ++++++++++++++++++++++ 1 file changed, 374 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/dependent-ivs.ll diff --git a/llvm/test/Transforms/InstCombine/dependent-ivs.ll b/llvm/test/Transforms/InstCombine/dependent-ivs.ll new file mode 100644 index 0000000..bd66791 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/dependent-ivs.ll @@ -0,0 +1,374 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +define void @int_iv_nuw(i64 %base, i64 %end) { +; CHECK-LABEL: define void @int_iv_nuw( +; CHECK-SAME: i64 [[BASE:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[IV2_NEXT]] = add nuw i64 [[IV_NEXT]], [[BASE]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv2 = phi i64 [ %iv2.next, %loop ], [ %base, %entry ] + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + call void @use.i64(i64 %iv2) + %iv.next = add nuw nsw i64 %iv, 4 + %iv2.next = add nuw i64 %iv.next, %base + %cmp = icmp eq i64 %iv.next, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @int_iv_nsw(i64 %base, i64 %end) { +; CHECK-LABEL: define void @int_iv_nsw( +; CHECK-SAME: i64 [[BASE:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], 
[[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[IV2_NEXT]] = add nsw i64 [[IV_NEXT]], [[BASE]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv2 = phi i64 [ %iv2.next, %loop ], [ %base, %entry ] + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + call void @use.i64(i64 %iv2) + %iv.next = add nuw nsw i64 %iv, 4 + %iv2.next = add nsw i64 %iv.next, %base + %cmp = icmp eq i64 %iv.next, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @int_iv_commuted(i64 %base, i64 %end) { +; CHECK-LABEL: define void @int_iv_commuted( +; CHECK-SAME: i64 [[BASE:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BASE2:%.*]] = mul i64 [[BASE]], 42 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE2]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[IV2_NEXT]] = add i64 [[BASE2]], [[IV_NEXT]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %base2 = mul i64 %base, 42 ; thwart complexity-based canonicalization + br label %loop + +loop: + %iv2 = phi i64 [ %iv2.next, %loop ], [ %base2, %entry ] + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + call void @use.i64(i64 %iv2) + %iv.next = add nuw nsw i64 %iv, 4 + %iv2.next = add i64 %base2, %iv.next + %cmp = icmp eq i64 %iv.next, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @int_iv_vector(<2 x i64> %base) { +; CHECK-LABEL: define void @int_iv_vector( +; CHECK-SAME: <2 x i64> [[BASE:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV2:%.*]] = phi <2 x i64> [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi <2 x i64> [ [[IV_NEXT:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ] +; CHECK-NEXT: call void @use.v2i64(<2 x i64> [[IV2]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw <2 x i64> [[IV]], +; CHECK-NEXT: [[IV2_NEXT]] = add <2 x i64> [[IV_NEXT]], [[BASE]] +; CHECK-NEXT: [[CMP:%.*]] = call i1 @get.i1() +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv2 = phi <2 x i64> [ %iv2.next, %loop ], [ %base, %entry ] + %iv = phi <2 x i64> [ %iv.next, %loop ], [ zeroinitializer, %entry ] + call void @use.v2i64(<2 x i64> %iv2) + %iv.next = add nuw nsw <2 x i64> %iv, + %iv2.next = add <2 x i64> %iv.next, %base + %cmp = call i1 @get.i1() + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @int_iv_loop_variant_step(i64 %base, i64 %end) { +; CHECK-LABEL: define void @int_iv_loop_variant_step( +; CHECK-SAME: i64 [[BASE:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) +; CHECK-NEXT: 
[[STEP:%.*]] = call i64 @get.i64() +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[STEP]] +; CHECK-NEXT: [[IV2_NEXT]] = add nuw i64 [[IV_NEXT]], [[BASE]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv2 = phi i64 [ %iv2.next, %loop ], [ %base, %entry ] + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + call void @use.i64(i64 %iv2) + %step = call i64 @get.i64() + %iv.next = add nuw nsw i64 %iv, %step + %iv2.next = add nuw i64 %iv.next, %base + %cmp = icmp eq i64 %iv.next, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @ptr_iv_inbounds(ptr %base, i64 %end) { +; CHECK-LABEL: define void @ptr_iv_inbounds( +; CHECK-SAME: ptr [[BASE:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_PTR:%.*]] = phi ptr [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: call void @use.p0(ptr [[IV_PTR]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv.ptr = phi ptr [ %iv.ptr.next, %loop ], [ %base, %entry ] + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + call void @use.p0(ptr %iv.ptr) + %iv.next = add nuw nsw i64 %iv, 4 + %iv.ptr.next = getelementptr inbounds i8, ptr %base, i64 %iv.next + %cmp = icmp eq i64 %iv.next, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @ptr_iv_no_inbounds(ptr %base, i64 %end) { +; CHECK-LABEL: define void @ptr_iv_no_inbounds( +; CHECK-SAME: ptr [[BASE:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_PTR:%.*]] = phi ptr [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: call void @use.p0(ptr [[IV_PTR]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr i8, ptr [[BASE]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv.ptr = phi ptr [ %iv.ptr.next, %loop ], [ %base, %entry ] + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + call void @use.p0(ptr %iv.ptr) + %iv.next = add nuw nsw i64 %iv, 4 + %iv.ptr.next = getelementptr i8, ptr %base, i64 %iv.next + %cmp = icmp eq i64 %iv.next, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @ptr_iv_vector(<2 x ptr> %base, i64 %end) { +; CHECK-LABEL: define void @ptr_iv_vector( +; CHECK-SAME: <2 x ptr> [[BASE:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_PTR:%.*]] = phi <2 x ptr> [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: call void @use.v2p0(<2 x ptr> [[IV_PTR]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw 
i64 [[IV]], 4 +; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr inbounds i8, <2 x ptr> [[BASE]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv.ptr = phi <2 x ptr> [ %iv.ptr.next, %loop ], [ %base, %entry ] + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + call void @use.v2p0(<2 x ptr> %iv.ptr) + %iv.next = add nuw nsw i64 %iv, 4 + %iv.ptr.next = getelementptr inbounds i8, <2 x ptr> %base, i64 %iv.next + %cmp = icmp eq i64 %iv.next, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @ptr_iv_vector2(<2 x ptr> %base) { +; CHECK-LABEL: define void @ptr_iv_vector2( +; CHECK-SAME: <2 x ptr> [[BASE:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_PTR:%.*]] = phi <2 x ptr> [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi <2 x i64> [ [[IV_NEXT:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ] +; CHECK-NEXT: call void @use.v2p0(<2 x ptr> [[IV_PTR]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw <2 x i64> [[IV]], +; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr i8, <2 x ptr> [[BASE]], <2 x i64> [[IV_NEXT]] +; CHECK-NEXT: [[CMP:%.*]] = call i1 @get.i1() +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv.ptr = phi <2 x ptr> [ %iv.ptr.next, %loop ], [ %base, %entry ] + %iv = phi <2 x i64> [ %iv.next, %loop ], [ zeroinitializer, %entry ] + call void @use.v2p0(<2 x ptr> %iv.ptr) + %iv.next = add nuw nsw <2 x i64> %iv, + %iv.ptr.next = getelementptr i8, <2 x ptr> %base, <2 x i64> %iv.next + %cmp = call i1 @get.i1() + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @wrong_start_value(i64 %base, i64 %end) { +; CHECK-LABEL: define void @wrong_start_value( +; CHECK-SAME: i64 [[BASE:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 1, [[ENTRY]] ] +; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[IV2_NEXT]] = add i64 [[IV_NEXT]], [[BASE]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv2 = phi i64 [ %iv2.next, %loop ], [ %base, %entry ] + %iv = phi i64 [ %iv.next, %loop ], [ 1, %entry ] + call void @use.i64(i64 %iv2) + %iv.next = add nuw nsw i64 %iv, 4 + %iv2.next = add i64 %base, %iv.next + %cmp = icmp eq i64 %iv.next, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @different_loops(i64 %base) { +; CHECK-LABEL: define void @different_loops( +; CHECK-SAME: i64 [[BASE:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP1:%.*]] +; CHECK: loop1: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP1]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: call void @use.i64(i64 [[IV]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[CMP:%.*]] = call i1 @get.i1() +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP2:%.*]], label [[LOOP1]] +; CHECK: loop2: +; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ 
[[IV2_NEXT:%.*]], [[LOOP2]] ], [ [[BASE]], [[LOOP1]] ] +; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) +; CHECK-NEXT: [[IV2_NEXT]] = add nuw i64 [[IV_NEXT]], [[BASE]] +; CHECK-NEXT: [[CMP2:%.*]] = call i1 @get.i1() +; CHECK-NEXT: br i1 [[CMP2]], label [[EXIT:%.*]], label [[LOOP2]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop1 + +loop1: + %iv = phi i64 [ %iv.next, %loop1 ], [ 0, %entry ] + call void @use.i64(i64 %iv) + %iv.next = add nuw nsw i64 %iv, 4 + %cmp = call i1 @get.i1() + br i1 %cmp, label %loop2, label %loop1 + +loop2: + %iv2 = phi i64 [ %iv2.next, %loop2 ], [ %base, %loop1 ] + call void @use.i64(i64 %iv2) + %iv2.next = add nuw i64 %base, %iv.next + %cmp2 = call i1 @get.i1() + br i1 %cmp2, label %exit, label %loop2 + +exit: + ret void +} + +declare void @use.p0(ptr) +declare void @use.v2p0(<2 x ptr>) +declare void @use.i64(i64) +declare void @use.v2i64(<2 x i64>) +declare i1 @get.i1() +declare i64 @get.i64() -- cgit v1.1 From fffcc5ca83ad2700a3586c1b849a36c6081e2023 Mon Sep 17 00:00:00 2001 From: Francesco Petrogalli Date: Thu, 8 Feb 2024 16:54:12 +0100 Subject: [CodeGen] Add ValueType v3i8 (NFCI). (#80826) --- llvm/include/llvm/CodeGen/ValueTypes.td | 363 +++++++++++++------------- llvm/lib/CodeGen/ValueTypes.cpp | 2 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 + 3 files changed, 187 insertions(+), 181 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index 55baaf8..1054738 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -97,192 +97,193 @@ def v128i4 : VTVec<128, i4, 32>; // 128 x i4 vector value def v1i8 : VTVec<1, i8, 33>; // 1 x i8 vector value def v2i8 : VTVec<2, i8, 34>; // 2 x i8 vector value -def v4i8 : VTVec<4, i8, 35>; // 4 x i8 vector value -def v8i8 : VTVec<8, i8, 36>; // 8 x i8 vector value -def v16i8 : VTVec<16, i8, 37>; // 16 x i8 vector value -def v32i8 : VTVec<32, i8, 38>; // 32 x i8 vector value -def v64i8 : VTVec<64, i8, 39>; // 64 x i8 vector value -def v128i8 : VTVec<128, i8, 40>; // 128 x i8 vector value -def v256i8 : VTVec<256, i8, 41>; // 256 x i8 vector value -def v512i8 : VTVec<512, i8, 42>; // 512 x i8 vector value -def v1024i8 : VTVec<1024, i8, 43>; // 1024 x i8 vector value - -def v1i16 : VTVec<1, i16, 44>; // 1 x i16 vector value -def v2i16 : VTVec<2, i16, 45>; // 2 x i16 vector value -def v3i16 : VTVec<3, i16, 46>; // 3 x i16 vector value -def v4i16 : VTVec<4, i16, 47>; // 4 x i16 vector value -def v8i16 : VTVec<8, i16, 48>; // 8 x i16 vector value -def v16i16 : VTVec<16, i16, 49>; // 16 x i16 vector value -def v32i16 : VTVec<32, i16, 50>; // 32 x i16 vector value -def v64i16 : VTVec<64, i16, 51>; // 64 x i16 vector value -def v128i16 : VTVec<128, i16, 52>; // 128 x i16 vector value -def v256i16 : VTVec<256, i16, 53>; // 256 x i16 vector value -def v512i16 : VTVec<512, i16, 54>; // 512 x i16 vector value - -def v1i32 : VTVec<1, i32, 55>; // 1 x i32 vector value -def v2i32 : VTVec<2, i32, 56>; // 2 x i32 vector value -def v3i32 : VTVec<3, i32, 57>; // 3 x i32 vector value -def v4i32 : VTVec<4, i32, 58>; // 4 x i32 vector value -def v5i32 : VTVec<5, i32, 59>; // 5 x i32 vector value -def v6i32 : VTVec<6, i32, 60>; // 6 x f32 vector value -def v7i32 : VTVec<7, i32, 61>; // 7 x f32 vector value -def v8i32 : VTVec<8, i32, 62>; // 8 x i32 vector value -def v9i32 : VTVec<9, i32, 63>; // 9 x i32 vector value -def v10i32 : VTVec<10, i32, 64>; // 10 x i32 vector value -def v11i32 : VTVec<11, i32, 65>; // 11 x i32 vector 
value -def v12i32 : VTVec<12, i32, 66>; // 12 x i32 vector value -def v16i32 : VTVec<16, i32, 67>; // 16 x i32 vector value -def v32i32 : VTVec<32, i32, 68>; // 32 x i32 vector value -def v64i32 : VTVec<64, i32, 69>; // 64 x i32 vector value -def v128i32 : VTVec<128, i32, 70>; // 128 x i32 vector value -def v256i32 : VTVec<256, i32, 71>; // 256 x i32 vector value -def v512i32 : VTVec<512, i32, 72>; // 512 x i32 vector value -def v1024i32 : VTVec<1024, i32, 73>; // 1024 x i32 vector value -def v2048i32 : VTVec<2048, i32, 74>; // 2048 x i32 vector value - -def v1i64 : VTVec<1, i64, 75>; // 1 x i64 vector value -def v2i64 : VTVec<2, i64, 76>; // 2 x i64 vector value -def v3i64 : VTVec<3, i64, 77>; // 3 x i64 vector value -def v4i64 : VTVec<4, i64, 78>; // 4 x i64 vector value -def v8i64 : VTVec<8, i64, 79>; // 8 x i64 vector value -def v16i64 : VTVec<16, i64, 80>; // 16 x i64 vector value -def v32i64 : VTVec<32, i64, 81>; // 32 x i64 vector value -def v64i64 : VTVec<64, i64, 82>; // 64 x i64 vector value -def v128i64 : VTVec<128, i64, 83>; // 128 x i64 vector value -def v256i64 : VTVec<256, i64, 84>; // 256 x i64 vector value - -def v1i128 : VTVec<1, i128, 85>; // 1 x i128 vector value - -def v1f16 : VTVec<1, f16, 86>; // 1 x f16 vector value -def v2f16 : VTVec<2, f16, 87>; // 2 x f16 vector value -def v3f16 : VTVec<3, f16, 88>; // 3 x f16 vector value -def v4f16 : VTVec<4, f16, 89>; // 4 x f16 vector value -def v8f16 : VTVec<8, f16, 90>; // 8 x f16 vector value -def v16f16 : VTVec<16, f16, 91>; // 16 x f16 vector value -def v32f16 : VTVec<32, f16, 92>; // 32 x f16 vector value -def v64f16 : VTVec<64, f16, 93>; // 64 x f16 vector value -def v128f16 : VTVec<128, f16, 94>; // 128 x f16 vector value -def v256f16 : VTVec<256, f16, 95>; // 256 x f16 vector value -def v512f16 : VTVec<512, f16, 96>; // 512 x f16 vector value - -def v2bf16 : VTVec<2, bf16, 97>; // 2 x bf16 vector value -def v3bf16 : VTVec<3, bf16, 98>; // 3 x bf16 vector value -def v4bf16 : VTVec<4, bf16, 99>; // 4 x bf16 vector value -def v8bf16 : VTVec<8, bf16, 100>; // 8 x bf16 vector value -def v16bf16 : VTVec<16, bf16, 101>; // 16 x bf16 vector value -def v32bf16 : VTVec<32, bf16, 102>; // 32 x bf16 vector value -def v64bf16 : VTVec<64, bf16, 103>; // 64 x bf16 vector value -def v128bf16 : VTVec<128, bf16, 104>; // 128 x bf16 vector value - -def v1f32 : VTVec<1, f32, 105>; // 1 x f32 vector value -def v2f32 : VTVec<2, f32, 106>; // 2 x f32 vector value -def v3f32 : VTVec<3, f32, 107>; // 3 x f32 vector value -def v4f32 : VTVec<4, f32, 108>; // 4 x f32 vector value -def v5f32 : VTVec<5, f32, 109>; // 5 x f32 vector value -def v6f32 : VTVec<6, f32, 110>; // 6 x f32 vector value -def v7f32 : VTVec<7, f32, 111>; // 7 x f32 vector value -def v8f32 : VTVec<8, f32, 112>; // 8 x f32 vector value -def v9f32 : VTVec<9, f32, 113>; // 9 x f32 vector value -def v10f32 : VTVec<10, f32, 114>; // 10 x f32 vector value -def v11f32 : VTVec<11, f32, 115>; // 11 x f32 vector value -def v12f32 : VTVec<12, f32, 116>; // 12 x f32 vector value -def v16f32 : VTVec<16, f32, 117>; // 16 x f32 vector value -def v32f32 : VTVec<32, f32, 118>; // 32 x f32 vector value -def v64f32 : VTVec<64, f32, 119>; // 64 x f32 vector value -def v128f32 : VTVec<128, f32, 120>; // 128 x f32 vector value -def v256f32 : VTVec<256, f32, 121>; // 256 x f32 vector value -def v512f32 : VTVec<512, f32, 122>; // 512 x f32 vector value -def v1024f32 : VTVec<1024, f32, 123>; // 1024 x f32 vector value -def v2048f32 : VTVec<2048, f32, 124>; // 2048 x f32 vector value - -def v1f64 : 
VTVec<1, f64, 125>; // 1 x f64 vector value -def v2f64 : VTVec<2, f64, 126>; // 2 x f64 vector value -def v3f64 : VTVec<3, f64, 127>; // 3 x f64 vector value -def v4f64 : VTVec<4, f64, 128>; // 4 x f64 vector value -def v8f64 : VTVec<8, f64, 129>; // 8 x f64 vector value -def v16f64 : VTVec<16, f64, 130>; // 16 x f64 vector value -def v32f64 : VTVec<32, f64, 131>; // 32 x f64 vector value -def v64f64 : VTVec<64, f64, 132>; // 64 x f64 vector value -def v128f64 : VTVec<128, f64, 133>; // 128 x f64 vector value -def v256f64 : VTVec<256, f64, 134>; // 256 x f64 vector value - -def nxv1i1 : VTScalableVec<1, i1, 135>; // n x 1 x i1 vector value -def nxv2i1 : VTScalableVec<2, i1, 136>; // n x 2 x i1 vector value -def nxv4i1 : VTScalableVec<4, i1, 137>; // n x 4 x i1 vector value -def nxv8i1 : VTScalableVec<8, i1, 138>; // n x 8 x i1 vector value -def nxv16i1 : VTScalableVec<16, i1, 139>; // n x 16 x i1 vector value -def nxv32i1 : VTScalableVec<32, i1, 140>; // n x 32 x i1 vector value -def nxv64i1 : VTScalableVec<64, i1, 141>; // n x 64 x i1 vector value - -def nxv1i8 : VTScalableVec<1, i8, 142>; // n x 1 x i8 vector value -def nxv2i8 : VTScalableVec<2, i8, 143>; // n x 2 x i8 vector value -def nxv4i8 : VTScalableVec<4, i8, 144>; // n x 4 x i8 vector value -def nxv8i8 : VTScalableVec<8, i8, 145>; // n x 8 x i8 vector value -def nxv16i8 : VTScalableVec<16, i8, 146>; // n x 16 x i8 vector value -def nxv32i8 : VTScalableVec<32, i8, 147>; // n x 32 x i8 vector value -def nxv64i8 : VTScalableVec<64, i8, 148>; // n x 64 x i8 vector value - -def nxv1i16 : VTScalableVec<1, i16, 149>; // n x 1 x i16 vector value -def nxv2i16 : VTScalableVec<2, i16, 150>; // n x 2 x i16 vector value -def nxv4i16 : VTScalableVec<4, i16, 151>; // n x 4 x i16 vector value -def nxv8i16 : VTScalableVec<8, i16, 152>; // n x 8 x i16 vector value -def nxv16i16 : VTScalableVec<16, i16, 153>; // n x 16 x i16 vector value -def nxv32i16 : VTScalableVec<32, i16, 154>; // n x 32 x i16 vector value - -def nxv1i32 : VTScalableVec<1, i32, 155>; // n x 1 x i32 vector value -def nxv2i32 : VTScalableVec<2, i32, 156>; // n x 2 x i32 vector value -def nxv4i32 : VTScalableVec<4, i32, 157>; // n x 4 x i32 vector value -def nxv8i32 : VTScalableVec<8, i32, 158>; // n x 8 x i32 vector value -def nxv16i32 : VTScalableVec<16, i32, 159>; // n x 16 x i32 vector value -def nxv32i32 : VTScalableVec<32, i32, 160>; // n x 32 x i32 vector value - -def nxv1i64 : VTScalableVec<1, i64, 161>; // n x 1 x i64 vector value -def nxv2i64 : VTScalableVec<2, i64, 162>; // n x 2 x i64 vector value -def nxv4i64 : VTScalableVec<4, i64, 163>; // n x 4 x i64 vector value -def nxv8i64 : VTScalableVec<8, i64, 164>; // n x 8 x i64 vector value -def nxv16i64 : VTScalableVec<16, i64, 165>; // n x 16 x i64 vector value -def nxv32i64 : VTScalableVec<32, i64, 166>; // n x 32 x i64 vector value - -def nxv1f16 : VTScalableVec<1, f16, 167>; // n x 1 x f16 vector value -def nxv2f16 : VTScalableVec<2, f16, 168>; // n x 2 x f16 vector value -def nxv4f16 : VTScalableVec<4, f16, 169>; // n x 4 x f16 vector value -def nxv8f16 : VTScalableVec<8, f16, 170>; // n x 8 x f16 vector value -def nxv16f16 : VTScalableVec<16, f16, 171>; // n x 16 x f16 vector value -def nxv32f16 : VTScalableVec<32, f16, 172>; // n x 32 x f16 vector value - -def nxv1bf16 : VTScalableVec<1, bf16, 173>; // n x 1 x bf16 vector value -def nxv2bf16 : VTScalableVec<2, bf16, 174>; // n x 2 x bf16 vector value -def nxv4bf16 : VTScalableVec<4, bf16, 175>; // n x 4 x bf16 vector value -def nxv8bf16 : VTScalableVec<8, bf16, 
176>; // n x 8 x bf16 vector value -def nxv16bf16 : VTScalableVec<16, bf16, 177>; // n x 16 x bf16 vector value -def nxv32bf16 : VTScalableVec<32, bf16, 178>; // n x 32 x bf16 vector value - -def nxv1f32 : VTScalableVec<1, f32, 179>; // n x 1 x f32 vector value -def nxv2f32 : VTScalableVec<2, f32, 180>; // n x 2 x f32 vector value -def nxv4f32 : VTScalableVec<4, f32, 181>; // n x 4 x f32 vector value -def nxv8f32 : VTScalableVec<8, f32, 182>; // n x 8 x f32 vector value -def nxv16f32 : VTScalableVec<16, f32, 183>; // n x 16 x f32 vector value - -def nxv1f64 : VTScalableVec<1, f64, 184>; // n x 1 x f64 vector value -def nxv2f64 : VTScalableVec<2, f64, 185>; // n x 2 x f64 vector value -def nxv4f64 : VTScalableVec<4, f64, 186>; // n x 4 x f64 vector value -def nxv8f64 : VTScalableVec<8, f64, 187>; // n x 8 x f64 vector value - -def x86mmx : ValueType<64, 188>; // X86 MMX value -def FlagVT : ValueType<0, 189> { // Pre-RA sched glue +def v3i8 : VTVec<3, i8, 35>; // 3 x i8 vector value +def v4i8 : VTVec<4, i8, 36>; // 4 x i8 vector value +def v8i8 : VTVec<8, i8, 37>; // 8 x i8 vector value +def v16i8 : VTVec<16, i8, 38>; // 16 x i8 vector value +def v32i8 : VTVec<32, i8, 39>; // 32 x i8 vector value +def v64i8 : VTVec<64, i8, 40>; // 64 x i8 vector value +def v128i8 : VTVec<128, i8, 41>; // 128 x i8 vector value +def v256i8 : VTVec<256, i8, 42>; // 256 x i8 vector value +def v512i8 : VTVec<512, i8, 43>; // 512 x i8 vector value +def v1024i8 : VTVec<1024, i8, 44>; // 1024 x i8 vector value + +def v1i16 : VTVec<1, i16, 45>; // 1 x i16 vector value +def v2i16 : VTVec<2, i16, 46>; // 2 x i16 vector value +def v3i16 : VTVec<3, i16, 47>; // 3 x i16 vector value +def v4i16 : VTVec<4, i16, 48>; // 4 x i16 vector value +def v8i16 : VTVec<8, i16, 49>; // 8 x i16 vector value +def v16i16 : VTVec<16, i16, 50>; // 16 x i16 vector value +def v32i16 : VTVec<32, i16, 51>; // 32 x i16 vector value +def v64i16 : VTVec<64, i16, 52>; // 64 x i16 vector value +def v128i16 : VTVec<128, i16, 53>; // 128 x i16 vector value +def v256i16 : VTVec<256, i16, 54>; // 256 x i16 vector value +def v512i16 : VTVec<512, i16, 55>; // 512 x i16 vector value + +def v1i32 : VTVec<1, i32, 56>; // 1 x i32 vector value +def v2i32 : VTVec<2, i32, 57>; // 2 x i32 vector value +def v3i32 : VTVec<3, i32, 58>; // 3 x i32 vector value +def v4i32 : VTVec<4, i32, 59>; // 4 x i32 vector value +def v5i32 : VTVec<5, i32, 60>; // 5 x i32 vector value +def v6i32 : VTVec<6, i32, 61>; // 6 x f32 vector value +def v7i32 : VTVec<7, i32, 62>; // 7 x f32 vector value +def v8i32 : VTVec<8, i32, 63>; // 8 x i32 vector value +def v9i32 : VTVec<9, i32, 64>; // 9 x i32 vector value +def v10i32 : VTVec<10, i32, 65>; // 10 x i32 vector value +def v11i32 : VTVec<11, i32, 66>; // 11 x i32 vector value +def v12i32 : VTVec<12, i32, 67>; // 12 x i32 vector value +def v16i32 : VTVec<16, i32, 68>; // 16 x i32 vector value +def v32i32 : VTVec<32, i32, 69>; // 32 x i32 vector value +def v64i32 : VTVec<64, i32, 70>; // 64 x i32 vector value +def v128i32 : VTVec<128, i32, 71>; // 128 x i32 vector value +def v256i32 : VTVec<256, i32, 72>; // 256 x i32 vector value +def v512i32 : VTVec<512, i32, 73>; // 512 x i32 vector value +def v1024i32 : VTVec<1024, i32, 74>; // 1024 x i32 vector value +def v2048i32 : VTVec<2048, i32, 75>; // 2048 x i32 vector value + +def v1i64 : VTVec<1, i64, 76>; // 1 x i64 vector value +def v2i64 : VTVec<2, i64, 77>; // 2 x i64 vector value +def v3i64 : VTVec<3, i64, 78>; // 3 x i64 vector value +def v4i64 : VTVec<4, i64, 79>; // 4 x i64 vector 
value +def v8i64 : VTVec<8, i64, 80>; // 8 x i64 vector value +def v16i64 : VTVec<16, i64, 81>; // 16 x i64 vector value +def v32i64 : VTVec<32, i64, 82>; // 32 x i64 vector value +def v64i64 : VTVec<64, i64, 83>; // 64 x i64 vector value +def v128i64 : VTVec<128, i64, 84>; // 128 x i64 vector value +def v256i64 : VTVec<256, i64, 85>; // 256 x i64 vector value + +def v1i128 : VTVec<1, i128, 86>; // 1 x i128 vector value + +def v1f16 : VTVec<1, f16, 87>; // 1 x f16 vector value +def v2f16 : VTVec<2, f16, 88>; // 2 x f16 vector value +def v3f16 : VTVec<3, f16, 89>; // 3 x f16 vector value +def v4f16 : VTVec<4, f16, 90>; // 4 x f16 vector value +def v8f16 : VTVec<8, f16, 91>; // 8 x f16 vector value +def v16f16 : VTVec<16, f16, 92>; // 16 x f16 vector value +def v32f16 : VTVec<32, f16, 93>; // 32 x f16 vector value +def v64f16 : VTVec<64, f16, 94>; // 64 x f16 vector value +def v128f16 : VTVec<128, f16, 95>; // 128 x f16 vector value +def v256f16 : VTVec<256, f16, 96>; // 256 x f16 vector value +def v512f16 : VTVec<512, f16, 97>; // 512 x f16 vector value + +def v2bf16 : VTVec<2, bf16, 98>; // 2 x bf16 vector value +def v3bf16 : VTVec<3, bf16, 99>; // 3 x bf16 vector value +def v4bf16 : VTVec<4, bf16, 100>; // 4 x bf16 vector value +def v8bf16 : VTVec<8, bf16, 101>; // 8 x bf16 vector value +def v16bf16 : VTVec<16, bf16, 102>; // 16 x bf16 vector value +def v32bf16 : VTVec<32, bf16, 103>; // 32 x bf16 vector value +def v64bf16 : VTVec<64, bf16, 104>; // 64 x bf16 vector value +def v128bf16 : VTVec<128, bf16, 105>; // 128 x bf16 vector value + +def v1f32 : VTVec<1, f32, 106>; // 1 x f32 vector value +def v2f32 : VTVec<2, f32, 107>; // 2 x f32 vector value +def v3f32 : VTVec<3, f32, 108>; // 3 x f32 vector value +def v4f32 : VTVec<4, f32, 109>; // 4 x f32 vector value +def v5f32 : VTVec<5, f32, 110>; // 5 x f32 vector value +def v6f32 : VTVec<6, f32, 111>; // 6 x f32 vector value +def v7f32 : VTVec<7, f32, 112>; // 7 x f32 vector value +def v8f32 : VTVec<8, f32, 113>; // 8 x f32 vector value +def v9f32 : VTVec<9, f32, 114>; // 9 x f32 vector value +def v10f32 : VTVec<10, f32, 115>; // 10 x f32 vector value +def v11f32 : VTVec<11, f32, 116>; // 11 x f32 vector value +def v12f32 : VTVec<12, f32, 117>; // 12 x f32 vector value +def v16f32 : VTVec<16, f32, 118>; // 16 x f32 vector value +def v32f32 : VTVec<32, f32, 119>; // 32 x f32 vector value +def v64f32 : VTVec<64, f32, 120>; // 64 x f32 vector value +def v128f32 : VTVec<128, f32, 121>; // 128 x f32 vector value +def v256f32 : VTVec<256, f32, 122>; // 256 x f32 vector value +def v512f32 : VTVec<512, f32, 123>; // 512 x f32 vector value +def v1024f32 : VTVec<1024, f32, 124>; // 1024 x f32 vector value +def v2048f32 : VTVec<2048, f32, 125>; // 2048 x f32 vector value + +def v1f64 : VTVec<1, f64, 126>; // 1 x f64 vector value +def v2f64 : VTVec<2, f64, 127>; // 2 x f64 vector value +def v3f64 : VTVec<3, f64, 128>; // 3 x f64 vector value +def v4f64 : VTVec<4, f64, 129>; // 4 x f64 vector value +def v8f64 : VTVec<8, f64, 130>; // 8 x f64 vector value +def v16f64 : VTVec<16, f64, 131>; // 16 x f64 vector value +def v32f64 : VTVec<32, f64, 132>; // 32 x f64 vector value +def v64f64 : VTVec<64, f64, 133>; // 64 x f64 vector value +def v128f64 : VTVec<128, f64, 134>; // 128 x f64 vector value +def v256f64 : VTVec<256, f64, 135>; // 256 x f64 vector value + +def nxv1i1 : VTScalableVec<1, i1, 136>; // n x 1 x i1 vector value +def nxv2i1 : VTScalableVec<2, i1, 137>; // n x 2 x i1 vector value +def nxv4i1 : VTScalableVec<4, i1, 138>; // n x 4 x i1 vector 
value +def nxv8i1 : VTScalableVec<8, i1, 139>; // n x 8 x i1 vector value +def nxv16i1 : VTScalableVec<16, i1, 140>; // n x 16 x i1 vector value +def nxv32i1 : VTScalableVec<32, i1, 141>; // n x 32 x i1 vector value +def nxv64i1 : VTScalableVec<64, i1, 142>; // n x 64 x i1 vector value + +def nxv1i8 : VTScalableVec<1, i8, 143>; // n x 1 x i8 vector value +def nxv2i8 : VTScalableVec<2, i8, 144>; // n x 2 x i8 vector value +def nxv4i8 : VTScalableVec<4, i8, 145>; // n x 4 x i8 vector value +def nxv8i8 : VTScalableVec<8, i8, 146>; // n x 8 x i8 vector value +def nxv16i8 : VTScalableVec<16, i8, 147>; // n x 16 x i8 vector value +def nxv32i8 : VTScalableVec<32, i8, 148>; // n x 32 x i8 vector value +def nxv64i8 : VTScalableVec<64, i8, 149>; // n x 64 x i8 vector value + +def nxv1i16 : VTScalableVec<1, i16, 150>; // n x 1 x i16 vector value +def nxv2i16 : VTScalableVec<2, i16, 151>; // n x 2 x i16 vector value +def nxv4i16 : VTScalableVec<4, i16, 152>; // n x 4 x i16 vector value +def nxv8i16 : VTScalableVec<8, i16, 153>; // n x 8 x i16 vector value +def nxv16i16 : VTScalableVec<16, i16, 154>; // n x 16 x i16 vector value +def nxv32i16 : VTScalableVec<32, i16, 155>; // n x 32 x i16 vector value + +def nxv1i32 : VTScalableVec<1, i32, 156>; // n x 1 x i32 vector value +def nxv2i32 : VTScalableVec<2, i32, 157>; // n x 2 x i32 vector value +def nxv4i32 : VTScalableVec<4, i32, 158>; // n x 4 x i32 vector value +def nxv8i32 : VTScalableVec<8, i32, 159>; // n x 8 x i32 vector value +def nxv16i32 : VTScalableVec<16, i32, 160>; // n x 16 x i32 vector value +def nxv32i32 : VTScalableVec<32, i32, 161>; // n x 32 x i32 vector value + +def nxv1i64 : VTScalableVec<1, i64, 162>; // n x 1 x i64 vector value +def nxv2i64 : VTScalableVec<2, i64, 163>; // n x 2 x i64 vector value +def nxv4i64 : VTScalableVec<4, i64, 164>; // n x 4 x i64 vector value +def nxv8i64 : VTScalableVec<8, i64, 165>; // n x 8 x i64 vector value +def nxv16i64 : VTScalableVec<16, i64, 166>; // n x 16 x i64 vector value +def nxv32i64 : VTScalableVec<32, i64, 167>; // n x 32 x i64 vector value + +def nxv1f16 : VTScalableVec<1, f16, 168>; // n x 1 x f16 vector value +def nxv2f16 : VTScalableVec<2, f16, 169>; // n x 2 x f16 vector value +def nxv4f16 : VTScalableVec<4, f16, 170>; // n x 4 x f16 vector value +def nxv8f16 : VTScalableVec<8, f16, 171>; // n x 8 x f16 vector value +def nxv16f16 : VTScalableVec<16, f16, 172>; // n x 16 x f16 vector value +def nxv32f16 : VTScalableVec<32, f16, 173>; // n x 32 x f16 vector value + +def nxv1bf16 : VTScalableVec<1, bf16, 174>; // n x 1 x bf16 vector value +def nxv2bf16 : VTScalableVec<2, bf16, 175>; // n x 2 x bf16 vector value +def nxv4bf16 : VTScalableVec<4, bf16, 176>; // n x 4 x bf16 vector value +def nxv8bf16 : VTScalableVec<8, bf16, 177>; // n x 8 x bf16 vector value +def nxv16bf16 : VTScalableVec<16, bf16, 178>; // n x 16 x bf16 vector value +def nxv32bf16 : VTScalableVec<32, bf16, 179>; // n x 32 x bf16 vector value + +def nxv1f32 : VTScalableVec<1, f32, 180>; // n x 1 x f32 vector value +def nxv2f32 : VTScalableVec<2, f32, 181>; // n x 2 x f32 vector value +def nxv4f32 : VTScalableVec<4, f32, 182>; // n x 4 x f32 vector value +def nxv8f32 : VTScalableVec<8, f32, 183>; // n x 8 x f32 vector value +def nxv16f32 : VTScalableVec<16, f32, 184>; // n x 16 x f32 vector value + +def nxv1f64 : VTScalableVec<1, f64, 185>; // n x 1 x f64 vector value +def nxv2f64 : VTScalableVec<2, f64, 186>; // n x 2 x f64 vector value +def nxv4f64 : VTScalableVec<4, f64, 187>; // n x 4 x f64 vector value +def nxv8f64 : 
VTScalableVec<8, f64, 188>; // n x 8 x f64 vector value + +def x86mmx : ValueType<64, 189>; // X86 MMX value +def FlagVT : ValueType<0, 190> { // Pre-RA sched glue let LLVMName = "Glue"; } -def isVoid : ValueType<0, 190>; // Produces no value -def untyped : ValueType<8, 191> { // Produces an untyped value +def isVoid : ValueType<0, 191>; // Produces no value +def untyped : ValueType<8, 192> { // Produces an untyped value let LLVMName = "Untyped"; } -def funcref : ValueType<0, 192>; // WebAssembly's funcref type -def externref : ValueType<0, 193>; // WebAssembly's externref type -def x86amx : ValueType<8192, 194>; // X86 AMX value -def i64x8 : ValueType<512, 195>; // 8 Consecutive GPRs (AArch64) +def funcref : ValueType<0, 193>; // WebAssembly's funcref type +def externref : ValueType<0, 194>; // WebAssembly's externref type +def x86amx : ValueType<8192, 195>; // X86 AMX value +def i64x8 : ValueType<512, 196>; // 8 Consecutive GPRs (AArch64) def aarch64svcount - : ValueType<16, 196>; // AArch64 predicate-as-counter -def spirvbuiltin : ValueType<0, 197>; // SPIR-V's builtin type + : ValueType<16, 197>; // AArch64 predicate-as-counter +def spirvbuiltin : ValueType<0, 198>; // SPIR-V's builtin type def token : ValueType<0, 248>; // TokenTy def MetadataVT : ValueType<0, 249> { // Metadata diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index ba3b9e0..731fcab 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -264,6 +264,8 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { return FixedVectorType::get(Type::getInt8Ty(Context), 1); case MVT::v2i8: return FixedVectorType::get(Type::getInt8Ty(Context), 2); + case MVT::v3i8: + return FixedVectorType::get(Type::getInt8Ty(Context), 3); case MVT::v4i8: return FixedVectorType::get(Type::getInt8Ty(Context), 4); case MVT::v8i8: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 10569d9..528257e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -308,8 +308,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand); setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand); setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand); setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand); -- cgit v1.1 From b14731fe93d0db9a59984783051880795ae0992d Mon Sep 17 00:00:00 2001 From: erichkeane Date: Thu, 8 Feb 2024 07:57:57 -0800 Subject: [OpenACC][NFC] Fix parse result from 'set' Apparently 'set' was being parsed as 'shutdown'. There isn't really any way of detecting this without getting into a Sema implementation, but I am fixing it now as I noticed it; a sketch of the parsing pattern involved follows.
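For context, a minimal, self-contained sketch of the `llvm::StringSwitch` pattern involved (the enum and function here are invented for illustration and are not the actual Clang code):

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

enum class DirKind { Init, Shutdown, Set, Invalid };

// Every .Case body compiles no matter which enumerator it names, so a
// pasted line that maps "set" to Shutdown is only caught by a semantic
// test, matching the commit's observation about needing Sema support.
static DirKind classifyDirective(llvm::StringRef S) {
  return llvm::StringSwitch<DirKind>(S)
      .Case("init", DirKind::Init)
      .Case("shutdown", DirKind::Shutdown)
      .Case("set", DirKind::Set) // before the fix this line returned Shutdown
      .Default(DirKind::Invalid);
}
```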
--- clang/lib/Parse/ParseOpenACC.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index 1fee9f8..e099d07 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -54,7 +54,7 @@ OpenACCDirectiveKindEx getOpenACCDirectiveKind(Token Tok) { .Case("declare", OpenACCDirectiveKind::Declare) .Case("init", OpenACCDirectiveKind::Init) .Case("shutdown", OpenACCDirectiveKind::Shutdown) - .Case("set", OpenACCDirectiveKind::Shutdown) + .Case("set", OpenACCDirectiveKind::Set) .Case("update", OpenACCDirectiveKind::Update) .Case("wait", OpenACCDirectiveKind::Wait) .Default(OpenACCDirectiveKind::Invalid); -- cgit v1.1 From 067d2779fcfc62dd429177f350b8cefe49b65b51 Mon Sep 17 00:00:00 2001 From: ian Bearman Date: Thu, 8 Feb 2024 07:59:37 -0800 Subject: [MLIR] Setting MemorySpace During Bufferization (#78484) Collection of changes with the goal of being able to convert `encoding` to `memorySpace` during bufferization:

- new API for the encoder, allowing the implementation to select the destination memory space
- update existing bufferization implementations to support the new interface

--- .../Dialect/Bufferization/IR/BufferizableOpInterface.h | 15 ++++++++++----- .../Arith/Transforms/BufferizableOpInterfaceImpl.cpp | 13 +++++++------ .../Bufferization/IR/BufferizableOpInterface.cpp | 14 ++++++++------ mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp | 4 ++-- mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp | 8 ++++++-- .../Transforms/FuncBufferizableOpInterfaceImpl.cpp | 5 +++-- .../Tensor/Transforms/BufferizableOpInterfaceImpl.cpp | 17 ++++++++++------- .../Dialect/Bufferization/TestTensorCopyInsertion.cpp | 6 ++++-- 8 files changed, 50 insertions(+), 32 deletions(-) diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h index 226a2fb..d8cfeee 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h @@ -257,6 +257,9 @@ struct BufferizationOptions { /// Parameters: Value, memory space, bufferization options using UnknownTypeConverterFn = std::function; + // Produce a MemorySpace attribute from a tensor type + using DefaultMemorySpaceFn = + std::function(TensorType t)>; BufferizationOptions(); @@ -296,11 +299,6 @@ struct BufferizationOptions { /// bufferized or not. bool bufferizeFunctionBoundaries = false; - /// The default memory space that should be used when it cannot be inferred - /// from the context. If case of std::nullopt, bufferization fails when the - /// memory space cannot be inferred at any point. - std::optional defaultMemorySpace = Attribute(); - /// Certain ops have aliasing OpOperand/OpResult invariants (e.g., scf.for). /// If this flag is set to `false`, those invariants are no longer enforced /// with buffer copies. @@ -351,6 +349,13 @@ struct BufferizationOptions { /// used. UnknownTypeConverterFn unknownTypeConverterFn = nullptr; + // Use during type conversion to determine the memory space for memref based + // on the original tensor type if the memory space cannot be inferred. + // Returning std::nullopt will cause bufferization to fail (useful to indicate + // failure to determine memory space for a tensor type). + DefaultMemorySpaceFn defaultMemorySpaceFn = + [](TensorType t) -> std::optional { return Attribute(); }; + /// Seed for the analysis fuzzer.
If set to `0`, the fuzzer is deactivated. /// Should be used only with `testAnalysisOnly = true`. unsigned analysisFuzzerSeed = 0; diff --git a/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp index f69b255..d7492c9 100644 --- a/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp @@ -26,17 +26,18 @@ struct ConstantOpInterface LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const BufferizationOptions &options) const { auto constantOp = cast(op); + auto type = constantOp.getType().dyn_cast(); + + // Only ranked tensors are supported. + if (!type) + return failure(); Attribute memorySpace; - if (options.defaultMemorySpace.has_value()) - memorySpace = *options.defaultMemorySpace; + if (auto memSpace = options.defaultMemorySpaceFn(type)) + memorySpace = *memSpace; else return constantOp->emitError("could not infer memory space"); - // Only ranked tensors are supported. - if (!isa(constantOp.getType())) - return failure(); - // Only constants inside a module are supported. auto moduleOp = constantOp->getParentOfType(); if (!moduleOp) diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp index 6ca9702..8f0f6d1 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp @@ -682,11 +682,12 @@ bufferization::getBufferType(Value value, const BufferizationOptions &options, return bufferizableOp.getBufferType(value, options, invocationStack); // Op is not bufferizable. - if (!options.defaultMemorySpace.has_value()) + auto memSpace = + options.defaultMemorySpaceFn(value.getType().cast()); + if (!memSpace.has_value()) return op->emitError("could not infer memory space"); - return getMemRefType(value, options, /*layout=*/{}, - *options.defaultMemorySpace); + return getMemRefType(value, options, /*layout=*/{}, *memSpace); } bool bufferization::hasTensorSemantics(Operation *op) { @@ -936,11 +937,12 @@ FailureOr bufferization::detail::defaultGetBufferType( // If we do not know the memory space and there is no default memory space, // report a failure. 
- if (!options.defaultMemorySpace.has_value()) + auto memSpace = + options.defaultMemorySpaceFn(value.getType().cast<TensorType>()); + if (!memSpace.has_value()) return op->emitError("could not infer memory space"); - return getMemRefType(value, options, /*layout=*/{}, - *options.defaultMemorySpace); + return getMemRefType(value, options, /*layout=*/{}, *memSpace); } bool bufferization::detail::defaultIsRepetitiveRegion( diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp index eb4a96f..34a0c59 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp @@ -234,8 +234,8 @@ AllocTensorOp::getBufferType(Value value, const BufferizationOptions &options, if (failed(copyBufferType)) return failure(); memorySpace = copyBufferType->getMemorySpace(); - } else if (options.defaultMemorySpace.has_value()) { - memorySpace = *options.defaultMemorySpace; + } else if (auto ms = options.defaultMemorySpaceFn(getType())) { + memorySpace = *ms; } else { return getOperation()->emitError("could not infer memory space"); } diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp index dc94b72..208cbda 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp @@ -210,8 +210,12 @@ struct OneShotBufferizePass opt.dumpAliasSets = dumpAliasSets; opt.setFunctionBoundaryTypeConversion( parseLayoutMapOption(functionBoundaryTypeConversion)); - if (mustInferMemorySpace) - opt.defaultMemorySpace = std::nullopt; + if (mustInferMemorySpace) { + opt.defaultMemorySpaceFn = + [](TensorType t) -> std::optional<Attribute> { + return std::nullopt; + }; + } opt.printConflicts = printConflicts; opt.testAnalysisOnly = testAnalysisOnly; opt.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries; diff --git a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp index 07cd1f9..4cdbbf3 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp @@ -66,7 +66,7 @@ getBufferizedFunctionArgType(FuncOp funcOp, int64_t index, assert(tensorType && "expected TensorType"); BaseMemRefType memrefType = options.functionArgTypeConverterFn( - tensorType, *options.defaultMemorySpace, funcOp, options); + tensorType, *options.defaultMemorySpaceFn(tensorType), funcOp, options); auto layoutAttr = funcOp.getArgAttrOfType<AffineMapAttr>( index, BufferizationDialect::kBufferLayoutAttrName); @@ -443,7 +443,8 @@ struct FuncOpInterface // Note: If `inferFunctionResultLayout = true`, casts are later folded // away.
BaseMemRefType resultType = options.functionArgTypeConverterFn( - tensorType, *options.defaultMemorySpace, funcOp, options); + tensorType, *options.defaultMemorySpaceFn(tensorType), funcOp, + options); Value toMemrefOp = rewriter.create<bufferization::ToMemrefOp>( loc, resultType, returnVal); returnValues.push_back(toMemrefOp); diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp index 678b7c0..957f631 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp @@ -473,14 +473,14 @@ struct FromElementsOpInterface LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const BufferizationOptions &options) const { auto fromElementsOp = cast<tensor::FromElementsOp>(op); + auto tensorType = cast<RankedTensorType>(fromElementsOp.getType()); // TODO: Implement memory space for this op. - if (options.defaultMemorySpace != Attribute()) + if (options.defaultMemorySpaceFn(tensorType) != Attribute()) return op->emitError("memory space not implemented yet"); // Allocate a buffer for the result. Location loc = op->getLoc(); - auto tensorType = cast<RankedTensorType>(fromElementsOp.getType()); auto shape = tensorType.getShape(); // TODO: Create alloc_tensor ops during TensorCopyInsertion. FailureOr<Value> tensorAlloc = allocateTensorForShapedValue( @@ -588,8 +588,10 @@ struct GenerateOpInterface const BufferizationOptions &options) const { auto generateOp = cast<tensor::GenerateOp>(op); + auto type = generateOp.getResult().getType(); + // TODO: Implement memory space for this op. - if (options.defaultMemorySpace != Attribute()) + if (options.defaultMemorySpaceFn(type) != Attribute()) return op->emitError("memory space not implemented yet"); // Allocate memory. @@ -1007,10 +1009,6 @@ struct SplatOpInterface OpBuilder::InsertionGuard g(rewriter); auto splatOp = cast<tensor::SplatOp>(op); - // TODO: Implement memory space for this op. - if (options.defaultMemorySpace != Attribute()) - return op->emitError("memory space not implemented yet"); - // Allocate memory. Location loc = op->getLoc(); FailureOr<Value> tensorAlloc = allocateTensorForShapedValue( @@ -1021,6 +1019,11 @@ struct SplatOpInterface // Create linalg::MapOp. auto tensorType = cast<RankedTensorType>(tensorAlloc->getType()); + + // TODO: Implement memory space for this op. + if (options.defaultMemorySpaceFn(tensorType) != Attribute()) + return op->emitError("memory space not implemented yet"); + auto linalgOp = rewriter.create<linalg::MapOp>(loc, tensorType, /*inputs=*/ValueRange(), /*init=*/*tensorAlloc); diff --git a/mlir/test/lib/Dialect/Bufferization/TestTensorCopyInsertion.cpp b/mlir/test/lib/Dialect/Bufferization/TestTensorCopyInsertion.cpp index fedfbe3..2991a3c 100644 --- a/mlir/test/lib/Dialect/Bufferization/TestTensorCopyInsertion.cpp +++ b/mlir/test/lib/Dialect/Bufferization/TestTensorCopyInsertion.cpp @@ -44,8 +44,10 @@ struct TestTensorCopyInsertionPass bufferization::OneShotBufferizationOptions options; options.allowReturnAllocsFromLoops = allowReturnAllocsFromLoops; options.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries; - if (mustInferMemorySpace) - options.defaultMemorySpace = std::nullopt; + if (mustInferMemorySpace) { + options.defaultMemorySpaceFn = + [](TensorType t) -> std::optional<Attribute> { return std::nullopt; }; + } if (failed(bufferization::insertTensorCopies(getOperation(), options))) signalPassFailure(); } -- cgit v1.1
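Before the next patch, a hedged sketch of how a downstream pipeline might use the hook added above. Deriving the memref memory space from a ranked tensor's encoding attribute matches the commit's stated goal, but this lambda is an assumed example, not code from the patch:

// `options` is an mlir::bufferization::BufferizationOptions.
options.defaultMemorySpaceFn =
    [](TensorType t) -> std::optional<Attribute> {
  // One plausible policy: reuse the tensor's `encoding` as the memory
  // space of the resulting memref, when an encoding is present.
  if (auto rankedTy = dyn_cast<RankedTensorType>(t))
    if (Attribute enc = rankedTy.getEncoding())
      return enc;
  // Otherwise keep the old default: the empty (default) memory space.
  return Attribute();
};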
From 92eaf036bf22ecc276146cd073208e6a867af8d4 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 8 Feb 2024 16:13:22 +0000 Subject: [NFC][RemoveDIs] Remove conditional compilation for RemoveDIs (#81149) A colleague observes that switching the default value of LLVM_EXPERIMENTAL_DEBUGINFO_ITERATORS to "On" hasn't flipped the value in their CMakeCache.txt. This probably means that everyone with an existing build tree is not going to have support built in, meaning everyone in LLVM would need to clean+rebuild their worktree when we flip the switch on... which doesn't sound good. So instead, just delete the flag and everything it does, making everyone build and run ~400 lit tests in RemoveDIs mode. None of the buildbots have had trouble with this, so it Should Be Fine (TM). (Sending for review as this is changing various comments, and touches several different areas -- I don't want to get too punchy).
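As a hedged illustration of the now-unconditional behavior (BB is an assumed llvm::BasicBlock *): the head/tail bits survive iterator copies and assignments but are cleared whenever the iterator moves, which is what the unit-test changes below exercise:

BasicBlock::iterator It = BB->begin();
It.setHeadBit(true);            // mark "debug-info attaches before here"
BasicBlock::iterator Copy = It; // copy construction preserves the bits
assert(Copy.getHeadBit());
++Copy;                         // moving the iterator clears both bits
assert(!Copy.getHeadBit() && !Copy.getTailBit());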
--- llvm/CMakeLists.txt | 3 --- llvm/cmake/modules/HandleLLVMOptions.cmake | 4 ---- llvm/include/llvm/ADT/ilist_iterator.h | 23 ----------------------- llvm/tools/llc/llc.cpp | 8 ++------ llvm/tools/llvm-link/llvm-link.cpp | 8 ++------ llvm/tools/llvm-lto/llvm-lto.cpp | 8 ++------ llvm/tools/llvm-lto2/llvm-lto2.cpp | 8 ++------ llvm/tools/llvm-reduce/llvm-reduce.cpp | 8 ++------ llvm/tools/opt/optdriver.cpp | 8 ++------ llvm/unittests/ADT/IListIteratorBitsTest.cpp | 18 ++---------------- llvm/unittests/IR/BasicBlockDbgInfoTest.cpp | 6 ------ 11 files changed, 14 insertions(+), 88 deletions(-) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index c31980a..81f2753 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -653,9 +653,6 @@ option(LLVM_USE_OPROFILE option(LLVM_EXTERNALIZE_DEBUGINFO "Generate dSYM files and strip executables and libraries (Darwin Only)" OFF) -option(LLVM_EXPERIMENTAL_DEBUGINFO_ITERATORS - "Add extra Booleans to ilist_iterators to communicate facts for debug-info" ON) - set(LLVM_CODESIGNING_IDENTITY "" CACHE STRING "Sign executables and dylibs with the given identity or skip if empty (Darwin Only)") diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 0699a85..486df22 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -140,10 +140,6 @@ if(LLVM_ENABLE_EXPENSIVE_CHECKS) endif() endif() -if(LLVM_EXPERIMENTAL_DEBUGINFO_ITERATORS) - add_compile_definitions(EXPERIMENTAL_DEBUGINFO_ITERATORS) -endif() - if (LLVM_ENABLE_STRICT_FIXED_SIZE_VECTORS) add_compile_definitions(STRICT_FIXED_SIZE_VECTORS) endif() diff --git a/llvm/include/llvm/ADT/ilist_iterator.h b/llvm/include/llvm/ADT/ilist_iterator.h index 9047b9b..2393c4d 100644 --- a/llvm/include/llvm/ADT/ilist_iterator.h +++ b/llvm/include/llvm/ADT/ilist_iterator.h @@ -202,17 +202,12 @@ private: node_pointer NodePtr = nullptr; -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS - // (Default: Off) Allow extra position-information flags to be stored - // in iterators, in aid of removing debug-info intrinsics from LLVM. - /// Is this position intended to contain any debug-info immediately before /// the position? mutable bool HeadInclusiveBit = false; /// Is this position intended to contain any debug-info immediately after /// the position? mutable bool TailInclusiveBit = false; -#endif public: /// Create from an ilist_node. @@ -231,10 +226,8 @@ public: const ilist_iterator_w_bits &RHS, std::enable_if_t = nullptr) : NodePtr(RHS.NodePtr) { -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS HeadInclusiveBit = RHS.HeadInclusiveBit; TailInclusiveBit = RHS.TailInclusiveBit; -#endif } // This is templated so that we can allow assigning to a const iterator from @@ -243,10 +236,8 @@ public: std::enable_if_t operator=(const ilist_iterator_w_bits &RHS) { NodePtr = RHS.NodePtr; -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS HeadInclusiveBit = RHS.HeadInclusiveBit; TailInclusiveBit = RHS.TailInclusiveBit; -#endif return *this; } @@ -280,10 +271,8 @@ public: const_cast::node_reference>( *NodePtr)); -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS New.HeadInclusiveBit = HeadInclusiveBit; New.TailInclusiveBit = TailInclusiveBit; -#endif return New; } return ilist_iterator_w_bits(); @@ -309,18 +298,14 @@ public: // Increment and decrement operators... ilist_iterator_w_bits &operator--() { NodePtr = IsReverse ? 
NodePtr->getNext() : NodePtr->getPrev(); -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS HeadInclusiveBit = false; TailInclusiveBit = false; -#endif return *this; } ilist_iterator_w_bits &operator++() { NodePtr = IsReverse ? NodePtr->getPrev() : NodePtr->getNext(); -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS HeadInclusiveBit = false; TailInclusiveBit = false; -#endif return *this; } ilist_iterator_w_bits operator--(int) { @@ -340,18 +325,10 @@ public: /// Check for end. Only valid if ilist_sentinel_tracking. bool isEnd() const { return NodePtr ? NodePtr->isSentinel() : false; } -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS bool getHeadBit() const { return HeadInclusiveBit; } bool getTailBit() const { return TailInclusiveBit; } void setHeadBit(bool SetBit) const { HeadInclusiveBit = SetBit; } void setTailBit(bool SetBit) const { TailInclusiveBit = SetBit; } -#else - // Store and return no information if we're not using this feature. - bool getHeadBit() const { return false; } - bool getTailBit() const { return false; } - void setHeadBit(bool SetBit) const { (void)SetBit; } - void setTailBit(bool SetBit) const { (void)SetBit; } -#endif }; template struct simplify_type; diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp index 3e2567c..b292f70 100644 --- a/llvm/tools/llc/llc.cpp +++ b/llvm/tools/llc/llc.cpp @@ -365,15 +365,11 @@ int main(int argc, char **argv) { } // RemoveDIs debug-info transition: tests may request that we /try/ to use the - // new debug-info format, if it's built in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + // new debug-info format. if (TryUseNewDbgInfoFormat) { - // If LLVM was built with support for this, turn the new debug-info format - // on. + // Turn the new debug-info format on. UseNewDbgInfoFormat = true; } -#endif - (void)TryUseNewDbgInfoFormat; if (TimeTrace) timeTraceProfilerInitialize(TimeTraceGranularity, argv[0]); diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp index d50e067..e6c219a 100644 --- a/llvm/tools/llvm-link/llvm-link.cpp +++ b/llvm/tools/llvm-link/llvm-link.cpp @@ -473,15 +473,11 @@ int main(int argc, char **argv) { cl::ParseCommandLineOptions(argc, argv, "llvm linker\n"); // RemoveDIs debug-info transition: tests may request that we /try/ to use the - // new debug-info format, if it's built in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + // new debug-info format. if (TryUseNewDbgInfoFormat) { - // If LLVM was built with support for this, turn the new debug-info format - // on. + // Turn the new debug-info format on. UseNewDbgInfoFormat = true; } -#endif - (void)TryUseNewDbgInfoFormat; LLVMContext Context; Context.setDiagnosticHandler(std::make_unique(), diff --git a/llvm/tools/llvm-lto/llvm-lto.cpp b/llvm/tools/llvm-lto/llvm-lto.cpp index f272814..7943d69 100644 --- a/llvm/tools/llvm-lto/llvm-lto.cpp +++ b/llvm/tools/llvm-lto/llvm-lto.cpp @@ -945,15 +945,11 @@ int main(int argc, char **argv) { cl::ParseCommandLineOptions(argc, argv, "llvm LTO linker\n"); // RemoveDIs debug-info transition: tests may request that we /try/ to use the - // new debug-info format, if it's built in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + // new debug-info format. if (TryUseNewDbgInfoFormat) { - // If LLVM was built with support for this, turn the new debug-info format - // on. + // Turn the new debug-info format on. 
UseNewDbgInfoFormat = true; } -#endif - (void)TryUseNewDbgInfoFormat; if (OptLevel < '0' || OptLevel > '3') error("optimization level must be between 0 and 3"); diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp index c212374..d5de4f6 100644 --- a/llvm/tools/llvm-lto2/llvm-lto2.cpp +++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp @@ -230,15 +230,11 @@ static int run(int argc, char **argv) { cl::ParseCommandLineOptions(argc, argv, "Resolution-based LTO test harness"); // RemoveDIs debug-info transition: tests may request that we /try/ to use the - // new debug-info format, if it's built in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + // new debug-info format. if (TryUseNewDbgInfoFormat) { - // If LLVM was built with support for this, turn the new debug-info format - // on. + // Turn the new debug-info format on. UseNewDbgInfoFormat = true; } -#endif - (void)TryUseNewDbgInfoFormat; // FIXME: Workaround PR30396 which means that a symbol can appear // more than once if it is defined in module-level assembly and diff --git a/llvm/tools/llvm-reduce/llvm-reduce.cpp b/llvm/tools/llvm-reduce/llvm-reduce.cpp index 71ce0ca5..f913771 100644 --- a/llvm/tools/llvm-reduce/llvm-reduce.cpp +++ b/llvm/tools/llvm-reduce/llvm-reduce.cpp @@ -151,15 +151,11 @@ int main(int Argc, char **Argv) { cl::ParseCommandLineOptions(Argc, Argv, "LLVM automatic testcase reducer.\n"); // RemoveDIs debug-info transition: tests may request that we /try/ to use the - // new debug-info format, if it's built in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + // new debug-info format. if (TryUseNewDbgInfoFormat) { - // If LLVM was built with support for this, turn the new debug-info format - // on. + // Turn the new debug-info format on. UseNewDbgInfoFormat = true; } -#endif - (void)TryUseNewDbgInfoFormat; if (Argc == 1) { cl::PrintHelpMessage(); diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp index 3f66bfc..85f5294 100644 --- a/llvm/tools/opt/optdriver.cpp +++ b/llvm/tools/opt/optdriver.cpp @@ -462,15 +462,11 @@ extern "C" int optMain( argc, argv, "llvm .bc -> .bc modular optimizer and analysis printer\n"); // RemoveDIs debug-info transition: tests may request that we /try/ to use the - // new debug-info format, if it's built in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + // new debug-info format. if (TryUseNewDbgInfoFormat) { - // If LLVM was built with support for this, turn the new debug-info format - // on. + // Turn the new debug-info format on. UseNewDbgInfoFormat = true; } -#endif - (void)TryUseNewDbgInfoFormat; LLVMContext Context; diff --git a/llvm/unittests/ADT/IListIteratorBitsTest.cpp b/llvm/unittests/ADT/IListIteratorBitsTest.cpp index 167b30a..8ae73b1 100644 --- a/llvm/unittests/ADT/IListIteratorBitsTest.cpp +++ b/llvm/unittests/ADT/IListIteratorBitsTest.cpp @@ -55,10 +55,8 @@ TEST(IListIteratorBitsTest, ConsAndAssignment) { simple_ilist>::iterator I, I2; -// Two sets of tests: if we've compiled in the iterator bits, then check that -// HeadInclusiveBit and TailInclusiveBit are preserved on assignment and copy -// construction, but not on other operations. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + // Check that HeadInclusiveBit and TailInclusiveBit are preserved on + // assignment and copy construction, but not on other operations. 
I = L.begin(); EXPECT_FALSE(I.getHeadBit()); EXPECT_FALSE(I.getTailBit()); @@ -85,18 +83,6 @@ TEST(IListIteratorBitsTest, ConsAndAssignment) { simple_ilist>::iterator I3(I); EXPECT_TRUE(I3.getHeadBit()); EXPECT_TRUE(I3.getTailBit()); -#else - // The calls should be available, but shouldn't actually store information. - I = L.begin(); - EXPECT_FALSE(I.getHeadBit()); - EXPECT_FALSE(I.getTailBit()); - I.setHeadBit(true); - I.setTailBit(true); - EXPECT_FALSE(I.getHeadBit()); - EXPECT_FALSE(I.getTailBit()); - // Suppress warnings as we don't test with this variable. - (void)I2; -#endif } class dummy { diff --git a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp index ef2b288..53b191c 100644 --- a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp +++ b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp @@ -27,11 +27,6 @@ using namespace llvm; extern cl::opt UseNewDbgInfoFormat; -// None of these tests are meaningful or do anything if we do not have the -// experimental "head" bit compiled into ilist_iterator (aka -// ilist_iterator_w_bits), thus there's no point compiling these tests in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS - static std::unique_ptr parseIR(LLVMContext &C, const char *IR) { SMDiagnostic Err; std::unique_ptr Mod = parseAssemblyString(IR, Err, C); @@ -1535,4 +1530,3 @@ TEST(BasicBlockDbgInfoTest, DbgMoveToEnd) { } } // End anonymous namespace. -#endif // EXPERIMENTAL_DEBUGINFO_ITERATORS -- cgit v1.1 From 7d19dc50de2c81ead6af750bcddd139ac99a48b5 Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Thu, 8 Feb 2024 18:23:00 +0200 Subject: [AMDGPU][True16] Support VOP3 source DPP operands. (#80892) --- .../Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 43 ++++++-- .../AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 38 +++++++ .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 1 + llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 32 ++++-- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 23 ++++- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 6 ++ .../AMDGPU/GlobalISel/inst-select-fceil.s16.mir | 6 +- .../AMDGPU/GlobalISel/inst-select-ffloor.s16.mir | 6 +- llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir | 4 +- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s | 85 ++++++++++++++++ .../MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 64 ++++++------ .../AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s | 25 +++++ .../test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 24 +++-- .../AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt | 111 +++++++++++++++------ .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 51 ++++++++-- 15 files changed, 410 insertions(+), 109 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s create mode 100644 llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 225e781..a94da99 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -314,8 +314,9 @@ public: return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::f64); } - bool isRegOrInlineImmWithFP16InputMods() const { - return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::f16); + template bool isRegOrInlineImmWithFP16InputMods() const { + return isRegOrInline( + IsFake16 ? 
AMDGPU::VS_32RegClassID : AMDGPU::VS_16RegClassID, MVT::f16); } bool isRegOrInlineImmWithFP32InputMods() const { @@ -8151,7 +8152,7 @@ ParseStatus AMDGPUAsmParser::parseOModSI(OperandVector &Operands) { // Determines which bit DST_OP_SEL occupies in the op_sel operand according to // the number of src operands present, then copies that bit into src0_modifiers. -void cvtVOP3DstOpSelOnly(MCInst &Inst) { +static void cvtVOP3DstOpSelOnly(MCInst &Inst, const MCRegisterInfo &MRI) { int Opc = Inst.getOpcode(); int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); if (OpSelIdx == -1) @@ -8168,23 +8169,34 @@ void cvtVOP3DstOpSelOnly(MCInst &Inst) { unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); - if ((OpSel & (1 << SrcNum)) != 0) { - int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); - uint32_t ModVal = Inst.getOperand(ModIdx).getImm(); - Inst.getOperand(ModIdx).setImm(ModVal | SISrcMods::DST_OP_SEL); + int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); + if (DstIdx == -1) + return; + + const MCOperand &DstOp = Inst.getOperand(DstIdx); + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); + uint32_t ModVal = Inst.getOperand(ModIdx).getImm(); + if (DstOp.isReg() && + MRI.getRegClass(AMDGPU::VGPR_16RegClassID).contains(DstOp.getReg())) { + if (AMDGPU::isHi(DstOp.getReg(), MRI)) + ModVal |= SISrcMods::DST_OP_SEL; + } else { + if ((OpSel & (1 << SrcNum)) != 0) + ModVal |= SISrcMods::DST_OP_SEL; } + Inst.getOperand(ModIdx).setImm(ModVal); } void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands) { cvtVOP3P(Inst, Operands); - cvtVOP3DstOpSelOnly(Inst); + cvtVOP3DstOpSelOnly(Inst, *getMRI()); } void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx) { cvtVOP3P(Inst, Operands, OptionalIdx); - cvtVOP3DstOpSelOnly(Inst); + cvtVOP3DstOpSelOnly(Inst, *getMRI()); } static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) { @@ -8433,8 +8445,17 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, uint32_t ModVal = 0; - if ((OpSel & (1 << J)) != 0) - ModVal |= SISrcMods::OP_SEL_0; + const MCOperand &SrcOp = Inst.getOperand(OpIdx); + if (SrcOp.isReg() && getMRI() + ->getRegClass(AMDGPU::VGPR_16RegClassID) + .contains(SrcOp.getReg())) { + bool VGPRSuffixIsHi = AMDGPU::isHi(SrcOp.getReg(), *getMRI()); + if (VGPRSuffixIsHi) + ModVal |= SISrcMods::OP_SEL_0; + } else { + if ((OpSel & (1 << J)) != 0) + ModVal |= SISrcMods::OP_SEL_0; + } if ((OpSelHi & (1 << J)) != 0) ModVal |= SISrcMods::OP_SEL_1; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index fba9eb5..85377d0 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -913,6 +913,41 @@ static VOPModifiers collectVOPModifiers(const MCInst &MI, return Modifiers; } +// Instructions decode the op_sel/suffix bits into the src_modifier +// operands. Copy those bits into the src operands for true16 VGPRs. 
+void AMDGPUDisassembler::convertTrue16OpSel(MCInst &MI) const { + const unsigned Opc = MI.getOpcode(); + const MCRegisterClass &ConversionRC = + MRI.getRegClass(AMDGPU::VGPR_16RegClassID); + constexpr std::array, 4> OpAndOpMods = { + {{AMDGPU::OpName::src0, AMDGPU::OpName::src0_modifiers, + SISrcMods::OP_SEL_0}, + {AMDGPU::OpName::src1, AMDGPU::OpName::src1_modifiers, + SISrcMods::OP_SEL_0}, + {AMDGPU::OpName::src2, AMDGPU::OpName::src2_modifiers, + SISrcMods::OP_SEL_0}, + {AMDGPU::OpName::vdst, AMDGPU::OpName::src0_modifiers, + SISrcMods::DST_OP_SEL}}}; + for (const auto &[OpName, OpModsName, OpSelMask] : OpAndOpMods) { + int OpIdx = AMDGPU::getNamedOperandIdx(Opc, OpName); + int OpModsIdx = AMDGPU::getNamedOperandIdx(Opc, OpModsName); + if (OpIdx == -1 || OpModsIdx == -1) + continue; + MCOperand &Op = MI.getOperand(OpIdx); + if (!Op.isReg()) + continue; + if (!ConversionRC.contains(Op.getReg())) + continue; + unsigned OpEnc = MRI.getEncodingValue(Op.getReg()); + const MCOperand &OpMods = MI.getOperand(OpModsIdx); + unsigned ModVal = OpMods.getImm(); + if (ModVal & OpSelMask) { // isHi + unsigned RegIdx = OpEnc & AMDGPU::HWEncoding::REG_IDX_MASK; + Op.setReg(ConversionRC.getRegister(RegIdx * 2 + 1)); + } + } +} + // MAC opcodes have special old and src2 operands. // src2 is tied to dst, while old is not tied (but assumed to be). bool AMDGPUDisassembler::isMacDPP(MCInst &MI) const { @@ -968,6 +1003,7 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned DescNumOps = MCII->get(Opc).getNumOperands(); if (MI.getNumOperands() < DescNumOps && AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) { + convertTrue16OpSel(MI); auto Mods = collectVOPModifiers(MI); insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel), AMDGPU::OpName::op_sel); @@ -991,6 +1027,8 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { if (isMacDPP(MI)) convertMacDPPInst(MI); + convertTrue16OpSel(MI); + int VDstInIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); if (VDstInIdx != -1) diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 5a89b30..02feaf55 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -203,6 +203,7 @@ public: DecodeStatus convertVOP3PDPPInst(MCInst &MI) const; DecodeStatus convertVOPCDPPInst(MCInst &MI) const; void convertMacDPPInst(MCInst &MI) const; + void convertTrue16OpSel(MCInst &MI) const; enum OpWidthTy { OPW32, diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index a812cdc..8bf0568 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -756,14 +756,14 @@ void SIFoldOperands::foldOperand( int UseOpIdx, SmallVectorImpl &FoldList, SmallVectorImpl &CopiesToReplace) const { - const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); + const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx); - if (!isUseSafeToFold(*UseMI, UseOp)) + if (!isUseSafeToFold(*UseMI, *UseOp)) return; // FIXME: Fold operands with subregs. 
- if (UseOp.isReg() && OpToFold.isReg() && - (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)) + if (UseOp->isReg() && OpToFold.isReg() && + (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister)) return; // Special case for REG_SEQUENCE: We can't fold literals into @@ -859,7 +859,6 @@ void SIFoldOperands::foldOperand( if (MovOp == AMDGPU::COPY) return; - UseMI->setDesc(TII->get(MovOp)); MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin(); MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end(); while (ImpOpI != ImpOpE) { @@ -867,6 +866,19 @@ void SIFoldOperands::foldOperand( ImpOpI++; UseMI->removeOperand(UseMI->getOperandNo(Tmp)); } + UseMI->setDesc(TII->get(MovOp)); + + if (MovOp == AMDGPU::V_MOV_B16_t16_e64) { + const auto &SrcOp = UseMI->getOperand(UseOpIdx); + MachineOperand NewSrcOp(SrcOp); + MachineFunction *MF = UseMI->getParent()->getParent(); + UseMI->removeOperand(1); + UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers + UseMI->addOperand(NewSrcOp); // src0 + UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel + UseOpIdx = 2; + UseOp = &UseMI->getOperand(UseOpIdx); + } CopiesToReplace.push_back(UseMI); } else { if (UseMI->isCopy() && OpToFold.isReg() && @@ -1027,7 +1039,7 @@ void SIFoldOperands::foldOperand( // Don't fold into target independent nodes. Target independent opcodes // don't have defined register classes. - if (UseDesc.isVariadic() || UseOp.isImplicit() || + if (UseDesc.isVariadic() || UseOp->isImplicit() || UseDesc.operands()[UseOpIdx].RegClass == -1) return; } @@ -1062,17 +1074,17 @@ void SIFoldOperands::foldOperand( TRI->getRegClass(FoldDesc.operands()[0].RegClass); // Split 64-bit constants into 32-bits for folding. - if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) { - Register UseReg = UseOp.getReg(); + if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) { + Register UseReg = UseOp->getReg(); const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg); if (AMDGPU::getRegBitWidth(*UseRC) != 64) return; APInt Imm(64, OpToFold.getImm()); - if (UseOp.getSubReg() == AMDGPU::sub0) { + if (UseOp->getSubReg() == AMDGPU::sub0) { Imm = Imm.getLoBits(32); } else { - assert(UseOp.getSubReg() == AMDGPU::sub1); + assert(UseOp->getSubReg() == AMDGPU::sub1); Imm = Imm.getHiBits(32); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 7edec5a..2259977 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1148,7 +1148,13 @@ def FPT16InputModsMatchClass : FPInputModsMatchClass<16> { def FP32InputModsMatchClass : FPInputModsMatchClass<32>; def FP64InputModsMatchClass : FPInputModsMatchClass<64>; -def FP16VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<16>; +class FP16VCSrcInputModsMatchClass + : FPVCSrcInputModsMatchClass<16> { + let Name = !if(IsFake16, "RegOrInlineImmWithFPFake16InputMods", + "RegOrInlineImmWithFPT16InputMods"); + let PredicateMethod = "isRegOrInlineImmWithFP16InputMods<" # + !if(IsFake16, "true", "false") # ">"; +} def FP32VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<32>; class InputMods : Operand { @@ -1166,7 +1172,8 @@ def FPT16InputMods : FPInputMods; def FP32InputMods : FPInputMods; def FP64InputMods : FPInputMods; -def FP16VCSrcInputMods : FPInputMods; +class FP16VCSrcInputMods + : FPInputMods>; def FP32VCSrcInputMods : FPInputMods; class IntInputModsMatchClass : AsmOperandClass { @@ -1653,11 +1660,11 @@ class 
getSrcModDPP_t16 { } // Return type of input modifiers operand for specified input operand for DPP -class getSrcModVOP3DPP { +class getSrcModVOP3DPP { Operand ret = !if (VT.isFP, !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), - FP16VCSrcInputMods, FP32VCSrcInputMods), + FP16VCSrcInputMods, FP32VCSrcInputMods), Int32VCSrcInputMods); } @@ -2450,6 +2457,10 @@ class VOP_PAT_GEN : VOPProfile : VOPProfile { let IsTrue16 = 1; let IsRealTrue16 = 1; + + let HasOpSel = 1; + let HasModifiers = 1; // All instructions at least have OpSel. + // Most DstVT are 16-bit, but not all. let DstRC = getVALUDstForVT.ret; let DstRC64 = getVALUDstForVT.ret; @@ -2461,6 +2472,10 @@ class VOPProfile_True16 : VOPProfile { let Src0ModDPP = getSrcModDPP_t16.ret; let Src1ModDPP = getSrcModDPP_t16.ret; let Src2ModDPP = getSrcModDPP_t16.ret; + let Src0VOP3DPP = VGPRSrc_16; + let Src0ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; let DstRC64 = getVALUDstForVT.ret; let Src0RC64 = getVOP3SrcForVT.ret; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index c9dbe02..aabb6c2 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1235,6 +1235,12 @@ def VGPRSrc_16_Lo128 : RegisterOperand { let EncoderMethod = "getMachineOpValueT16Lo128"; } +// True 16 operands. +def VGPRSrc_16 : RegisterOperand { + let DecoderMethod = "DecodeVGPR_16RegisterClass"; + let EncoderMethod = "getMachineOpValueT16"; +} + //===----------------------------------------------------------------------===// // ASrc_* Operands with an AccVGPR //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir index 84da311..014534a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir @@ -50,7 +50,7 @@ body: | ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]] - ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]] ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]] ; @@ -88,7 +88,7 @@ body: | ; GFX11: liveins: $sgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]] ; GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; @@ -127,7 +127,7 @@ body: | ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]] - ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit 
$mode, implicit $exec ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]] ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir index 30975a8..dcf9e16 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir @@ -59,7 +59,7 @@ body: | ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]] - ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]] ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]] ; @@ -97,7 +97,7 @@ body: | ; GFX11: liveins: $sgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]] ; GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; @@ -136,7 +136,7 @@ body: | ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]] - ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]] ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]] ; diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir index 7767aa5..9ae5f55 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir @@ -66,7 +66,7 @@ body: | ; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec ; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; REAL16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, implicit $mode, implicit $exec + ; REAL16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec ; ; FAKE16-LABEL: name: ceil_f16 ; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF @@ -87,7 +87,7 @@ body: | ; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec ; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; REAL16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, implicit $mode, implicit $exec + ; REAL16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec ; ; 
FAKE16-LABEL: name: floor_f16 ; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s new file mode 100644 index 0000000..1871a41 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s + +v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_mirror +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_half_mirror +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_shl:1 +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_shl:15 +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_shr:1 +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_shr:15 +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_ror:1 +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_ror:15 +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +v_floor_f16_e64_dpp v5, v1 row_mirror +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_half_mirror +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_shl:1 +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_shl:15 +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_shr:1 +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_shr:15 +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_ror:1 +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_ror:15 +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: 
[0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index 9a65c66..701a725 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_bfrev_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: [0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] @@ -42,46 +42,52 @@ v_bfrev_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 f v_bfrev_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: [0xff,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] -v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +v_ceil_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +v_ceil_f16_e64_dpp v5.l, v1.h quad_perm:[3,2,1,0] +// GFX11: [0x05,0x08,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_ceil_f16_e64_dpp v5.h, v1.l quad_perm:[3,2,1,0] +// GFX11: [0x05,0x40,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_ceil_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_ceil_f16_e64_dpp v5, v1 row_mirror +v_ceil_f16_e64_dpp v5.l, v1.l row_mirror // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_ceil_f16_e64_dpp v5, v1 row_half_mirror +v_ceil_f16_e64_dpp v5.l, v1.l row_half_mirror // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_ceil_f16_e64_dpp v5, v1 row_shl:1 +v_ceil_f16_e64_dpp v5.l, v1.l row_shl:1 // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_ceil_f16_e64_dpp v5, v1 row_shl:15 +v_ceil_f16_e64_dpp v5.l, v1.l row_shl:15 // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_ceil_f16_e64_dpp v5, v1 row_shr:1 +v_ceil_f16_e64_dpp v5.l, v1.l row_shr:1 // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_ceil_f16_e64_dpp v5, v1 row_shr:15 +v_ceil_f16_e64_dpp v5.l, v1.l row_shr:15 // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_ceil_f16_e64_dpp v5, v1 row_ror:1 +v_ceil_f16_e64_dpp v5.l, v1.l row_ror:1 // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_ceil_f16_e64_dpp v5, v1 row_ror:15 +v_ceil_f16_e64_dpp v5.l, v1.l row_ror:15 // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +v_ceil_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 
-v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +v_ceil_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +v_ceil_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +v_ceil_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_ceil_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] @@ -1512,46 +1518,46 @@ v_ffbl_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi v_ffbl_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: [0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] -v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +v_floor_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +v_floor_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_floor_f16_e64_dpp v5, v1 row_mirror +v_floor_f16_e64_dpp v5.l, v1.l row_mirror // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 row_half_mirror +v_floor_f16_e64_dpp v5.l, v1.l row_half_mirror // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 row_shl:1 +v_floor_f16_e64_dpp v5.l, v1.l row_shl:1 // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 row_shl:15 +v_floor_f16_e64_dpp v5.l, v1.l row_shl:15 // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 row_shr:1 +v_floor_f16_e64_dpp v5.l, v1.l row_shr:1 // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 row_shr:15 +v_floor_f16_e64_dpp v5.l, v1.l row_shr:15 // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 row_ror:1 +v_floor_f16_e64_dpp v5.l, v1.l row_ror:1 // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 row_ror:15 +v_floor_f16_e64_dpp v5.l, v1.l row_ror:15 // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +v_floor_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +v_floor_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +v_floor_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +v_floor_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 
fi:1 // GFX11: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_floor_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s new file mode 100644 index 0000000..1bef1fe2 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s @@ -0,0 +1,25 @@ +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s + +v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x00,0xdc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0x81,0xdc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x00,0xdb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0x81,0xdb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index 3897b82..043e0f9 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s v_bfrev_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] @@ -9,16 +9,22 @@ v_bfrev_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_bfrev_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: [0xff,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] -v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +v_ceil_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +v_ceil_f16_e64_dpp v5.l, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x08,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_ceil_f16_e64_dpp v5.h, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x40,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_ceil_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +v_ceil_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 // GFX11: [0x05,0x00,0xdc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +v_ceil_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: 
[0xff,0x81,0xdc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_ceil_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] @@ -375,16 +381,16 @@ v_ffbl_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_ffbl_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: [0xff,0x00,0xba,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] -v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +v_floor_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +v_floor_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +v_floor_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 // GFX11: [0x05,0x00,0xdb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +v_floor_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: [0xff,0x81,0xdb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_floor_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index cf29efa..fe50845 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -1,4 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s # GFX11: v_bfrev_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff @@ -42,48 +43,74 @@ # GFX11: v_bfrev_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 
0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX11: v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0x05,0x48,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 + +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX11: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0x05,0x08,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 + +# GFX11-REAL16: v_ceil_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_ceil_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0xff,0xc1,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 + # GFX11: v_ceil_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xa2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff @@ -1302,48 +1329,74 @@ # GFX11: v_exp_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xa5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xa5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX11: v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# 
GFX11: v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0x05,0x48,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 + +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX11: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0x05,0x08,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 + +# GFX11-REAL16: v_floor_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_floor_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0xff,0xc1,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 + # GFX11: v_floor_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xa4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index bfda6d1..c1b500e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -1,4 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s # GFX11: v_bfrev_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 @@ -6,18 +7,34 @@ # GFX11: v_bfrev_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xb8,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0xb8,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_ceil_f16_e64_dpp v5, v1 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX11: v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0x05,0x48,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 + +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX11: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0x05,0x08,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 + +# GFX11-REAL16: v_ceil_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_ceil_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0xff,0xc1,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 + # GFX11: v_ceil_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xa2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 @@ -288,18 +305,34 @@ # GFX11: v_exp_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xa5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xa5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX11: v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX11: v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0x05,0x48,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 + +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX11: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0x05,0x08,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 + +# GFX11-REAL16: v_floor_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_floor_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0xff,0xc1,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 + # GFX11: v_floor_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xa4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -- cgit v1.1 From b846613837d83989d99d33f4b90db7bad019aa8c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 14:01:38 +0000 Subject: [X86] X86FixupVectorConstants - add destination register width to rebuildSplatCst/rebuildZeroUpperCst/rebuildExtCst callbacks As found on #81136 - we aren't correctly handling cases where the constant pool entry is wider than the destination register width, causing incorrect scaling of the truncated constant for load-extension cases. This first patch just pulls out the destination register width argument; it's still currently driven by the constant pool entry, but that will be addressed in a followup.
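To make the mis-scaling concrete, here is a minimal standalone sketch of the load-extension rebuild step (illustrative only: gatherSrcElts and its parameter names are invented, not the pass's actual helpers). The raw constant-pool bits have to be clamped to the destination register width before being split into elements, which is exactly what threading the register width through the callbacks enables:

#include "llvm/ADT/APInt.h"

using llvm::APInt;

// Gather the low SrcEltBitWidth bits of each destination element. The
// zextOrTrunc clamps a possibly wider constant-pool entry to the register
// width; skipping it would derive the element split from the pool width
// and scale the truncated constant incorrectly.
static APInt gatherSrcElts(const APInt &PoolBits, unsigned RegBitWidth,
                           unsigned NumElts, unsigned SrcEltBitWidth) {
  APInt Bits = PoolBits.zextOrTrunc(RegBitWidth);
  unsigned DstEltBitWidth = RegBitWidth / NumElts;
  APInt TruncBits(NumElts * SrcEltBitWidth, 0);
  for (unsigned I = 0; I != NumElts; ++I) {
    APInt Elt = Bits.extractBits(DstEltBitWidth, I * DstEltBitWidth);
    TruncBits.insertBits(Elt.trunc(SrcEltBitWidth), I * SrcEltBitWidth);
  }
  return TruncBits;
}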
--- llvm/lib/Target/X86/X86FixupVectorConstants.cpp | 52 ++++++++++++++----------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index 9c46cee..9b90b5e 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -121,6 +121,13 @@ static std::optional extractConstantBits(const Constant *C) { return std::nullopt; } +static std::optional extractConstantBits(const Constant *C, + unsigned NumBits) { + if (std::optional Bits = extractConstantBits(C)) + return Bits->zextOrTrunc(NumBits); + return std::nullopt; +} + // Attempt to compute the splat width of bits data by normalizing the splat to // remove undefs. static std::optional getSplatableConstant(const Constant *C, @@ -217,16 +224,15 @@ static Constant *rebuildConstant(LLVMContext &Ctx, Type *SclTy, // Attempt to rebuild a normalized splat vector constant of the requested splat // width, built up of potentially smaller scalar values. -static Constant *rebuildSplatCst(const Constant *C, unsigned /*NumElts*/, - unsigned SplatBitWidth) { +static Constant *rebuildSplatCst(const Constant *C, unsigned /*NumBits*/, + unsigned /*NumElts*/, unsigned SplatBitWidth) { std::optional Splat = getSplatableConstant(C, SplatBitWidth); if (!Splat) return nullptr; // Determine scalar size to use for the constant splat vector, clamping as we // might have found a splat smaller than the original constant data. - const Type *OriginalType = C->getType(); - Type *SclTy = OriginalType->getScalarType(); + Type *SclTy = C->getType()->getScalarType(); unsigned NumSclBits = SclTy->getPrimitiveSizeInBits(); NumSclBits = std::min(NumSclBits, SplatBitWidth); @@ -236,20 +242,19 @@ static Constant *rebuildSplatCst(const Constant *C, unsigned /*NumElts*/, : 64; // Extract per-element bits. - return rebuildConstant(OriginalType->getContext(), SclTy, *Splat, NumSclBits); + return rebuildConstant(C->getContext(), SclTy, *Splat, NumSclBits); } -static Constant *rebuildZeroUpperCst(const Constant *C, unsigned /*NumElts*/, +static Constant *rebuildZeroUpperCst(const Constant *C, unsigned NumBits, + unsigned /*NumElts*/, unsigned ScalarBitWidth) { - Type *Ty = C->getType(); - Type *SclTy = Ty->getScalarType(); - unsigned NumBits = Ty->getPrimitiveSizeInBits(); + Type *SclTy = C->getType()->getScalarType(); unsigned NumSclBits = SclTy->getPrimitiveSizeInBits(); LLVMContext &Ctx = C->getContext(); if (NumBits > ScalarBitWidth) { // Determine if the upper bits are all zero. - if (std::optional Bits = extractConstantBits(C)) { + if (std::optional Bits = extractConstantBits(C, NumBits)) { if (Bits->countLeadingZeros() >= (NumBits - ScalarBitWidth)) { // If the original constant was made of smaller elements, try to retain // those types. 
@@ -266,16 +271,15 @@ static Constant *rebuildZeroUpperCst(const Constant *C, unsigned /*NumElts*/, return nullptr; } -static Constant *rebuildExtCst(const Constant *C, bool IsSExt, unsigned NumElts, +static Constant *rebuildExtCst(const Constant *C, bool IsSExt, + unsigned NumBits, unsigned NumElts, unsigned SrcEltBitWidth) { - Type *Ty = C->getType(); - unsigned NumBits = Ty->getPrimitiveSizeInBits(); unsigned DstEltBitWidth = NumBits / NumElts; assert((NumBits % NumElts) == 0 && (NumBits % SrcEltBitWidth) == 0 && (DstEltBitWidth % SrcEltBitWidth) == 0 && (DstEltBitWidth > SrcEltBitWidth) && "Illegal extension width"); - if (std::optional Bits = extractConstantBits(C)) { + if (std::optional Bits = extractConstantBits(C, NumBits)) { assert((Bits->getBitWidth() / DstEltBitWidth) == NumElts && (Bits->getBitWidth() % DstEltBitWidth) == 0 && "Unexpected constant extension"); @@ -290,19 +294,20 @@ static Constant *rebuildExtCst(const Constant *C, bool IsSExt, unsigned NumElts, TruncBits.insertBits(Elt.trunc(SrcEltBitWidth), I * SrcEltBitWidth); } + Type *Ty = C->getType(); return rebuildConstant(Ty->getContext(), Ty->getScalarType(), TruncBits, SrcEltBitWidth); } return nullptr; } -static Constant *rebuildSExtCst(const Constant *C, unsigned NumElts, - unsigned SrcEltBitWidth) { - return rebuildExtCst(C, true, NumElts, SrcEltBitWidth); +static Constant *rebuildSExtCst(const Constant *C, unsigned NumBits, + unsigned NumElts, unsigned SrcEltBitWidth) { + return rebuildExtCst(C, true, NumBits, NumElts, SrcEltBitWidth); } -static Constant *rebuildZExtCst(const Constant *C, unsigned NumElts, - unsigned SrcEltBitWidth) { - return rebuildExtCst(C, false, NumElts, SrcEltBitWidth); +static Constant *rebuildZExtCst(const Constant *C, unsigned NumBits, + unsigned NumElts, unsigned SrcEltBitWidth) { + return rebuildExtCst(C, false, NumBits, NumElts, SrcEltBitWidth); } bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, @@ -320,7 +325,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, int Op; int NumCstElts; int BitWidth; - std::function + std::function RebuildConstant; }; auto FixupConstant = [&](ArrayRef Fixups, unsigned OperandNo) { @@ -335,12 +340,13 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) && "Unexpected number of operands!"); if (auto *C = X86::getConstantFromPool(MI, OperandNo)) { + unsigned NumBits = C->getType()->getPrimitiveSizeInBits(); for (const FixupEntry &Fixup : Fixups) { if (Fixup.Op) { // Construct a suitable constant and adjust the MI to use the new // constant pool entry. 
- if (Constant *NewCst = - Fixup.RebuildConstant(C, Fixup.NumCstElts, Fixup.BitWidth)) { + if (Constant *NewCst = Fixup.RebuildConstant( + C, NumBits, Fixup.NumCstElts, Fixup.BitWidth)) { unsigned NewCPI = CP->getConstantPoolIndex(NewCst, Align(Fixup.BitWidth / 8)); MI.setDesc(TII->get(Fixup.Op)); -- cgit v1.1 From eb85c8edf576d27254fa37bf9ed72ec0867756f7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 15:59:19 +0000 Subject: [X86] Add test case for #81136 --- llvm/test/CodeGen/X86/pr81136.ll | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 llvm/test/CodeGen/X86/pr81136.ll diff --git a/llvm/test/CodeGen/X86/pr81136.ll b/llvm/test/CodeGen/X86/pr81136.ll new file mode 100644 index 0000000..8843adc --- /dev/null +++ b/llvm/test/CodeGen/X86/pr81136.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s + +; FIXME: Should be vpmovzxbq[128,1] instead of vpmovzxbd[128,1,0,0] +define i64 @PR81136(i32 %a0, i32 %a1, ptr %a2) { +; CHECK-LABEL: PR81136: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovd %edi, %xmm0 +; CHECK-NEXT: vmovd %esi, %xmm1 +; CHECK-NEXT: vmovdqa (%rdx), %ymm2 +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm4 = [128,1,0,0] +; CHECK-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0 +; CHECK-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; CHECK-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 +; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2 +; CHECK-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vandnpd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovmskpd %ymm0, %eax +; CHECK-NEXT: popcntl %eax, %eax +; CHECK-NEXT: negq %rax +; CHECK-NEXT: retq + %v0 = bitcast i32 %a0 to <2 x i16> + %v1 = bitcast i32 %a1 to <2 x i16> + %cmp15 = icmp sle <2 x i16> %v1, %v0 + %conv16 = sext <2 x i1> %cmp15 to <2 x i64> + %shuffle29 = shufflevector <2 x i64> %conv16, <2 x i64> , <4 x i32> + %data = load volatile <4 x i64>, ptr %a2, align 32 + %cmp65 = icmp ne <4 x i64> %data, + %cmp67 = icmp ne <4 x i64> %shuffle29, zeroinitializer + %and = and <4 x i1> %cmp65, %cmp67 + %mask = bitcast <4 x i1> %and to i4 + %cnt = tail call i4 @llvm.ctpop.i4(i4 %mask) + %cntz = zext i4 %cnt to i64 + %res = sub nsw i64 0, %cntz + ret i64 %res +} +declare i4 @llvm.ctpop.i4(i4) -- cgit v1.1 From f407be32fe8084fe02c4f16842548d21afdb447f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 16:31:09 +0000 Subject: [X86] X86FixupVectorConstants - rename FixupEntry::BitWidth to FixupEntry::MemBitWidth NFC. 
Make it clearer that this refers to the width of the constant element stored in memory - which won't match the register element width after a sext/zextload --- llvm/lib/Target/X86/X86FixupVectorConstants.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index 9b90b5e..32ca9c16 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -324,7 +324,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, struct FixupEntry { int Op; int NumCstElts; - int BitWidth; + int MemBitWidth; std::function RebuildConstant; }; @@ -332,23 +332,23 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, #ifdef EXPENSIVE_CHECKS assert(llvm::is_sorted(Fixups, [](const FixupEntry &A, const FixupEntry &B) { - return (A.NumCstElts * A.BitWidth) < - (B.NumCstElts * B.BitWidth); + return (A.NumCstElts * A.MemBitWidth) < + (B.NumCstElts * B.MemBitWidth); }) && "Constant fixup table not sorted in ascending constant size"); #endif assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) && "Unexpected number of operands!"); if (auto *C = X86::getConstantFromPool(MI, OperandNo)) { - unsigned NumBits = C->getType()->getPrimitiveSizeInBits(); + unsigned RegBitWidth = C->getType()->getPrimitiveSizeInBits(); for (const FixupEntry &Fixup : Fixups) { if (Fixup.Op) { // Construct a suitable constant and adjust the MI to use the new // constant pool entry. if (Constant *NewCst = Fixup.RebuildConstant( - C, NumBits, Fixup.NumCstElts, Fixup.BitWidth)) { + C, RegBitWidth, Fixup.NumCstElts, Fixup.MemBitWidth)) { unsigned NewCPI = - CP->getConstantPoolIndex(NewCst, Align(Fixup.BitWidth / 8)); + CP->getConstantPoolIndex(NewCst, Align(Fixup.MemBitWidth / 8)); MI.setDesc(TII->get(Fixup.Op)); MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI); return true; -- cgit v1.1 From 5aeabf2df92b92c71b5dbdb9ae82a37431aa2ee4 Mon Sep 17 00:00:00 2001 From: stephenpeckham <118857872+stephenpeckham@users.noreply.github.com> Date: Thu, 8 Feb 2024 10:44:19 -0600 Subject: [XCOFF][obj2yaml] Support SymbolAlignmentAndType as 2 separate fields in YAML. (#76828) XCOFF encodes a symbol type and alignment in a single 8-bit field. It is easier to read and write YAML files if the fields can be specified separately. This PR causes obj2yaml to write the fields separately and allows yaml2obj to read either the single combined field or the separate fields. --- llvm/include/llvm/ObjectYAML/XCOFFYAML.h | 7 ++ llvm/lib/ObjectYAML/XCOFFEmitter.cpp | 99 +++++++++++++------ llvm/lib/ObjectYAML/XCOFFYAML.cpp | 16 ++- llvm/test/tools/obj2yaml/XCOFF/aix.yaml | 12 ++- llvm/test/tools/obj2yaml/XCOFF/aux-symbols.yaml | 12 ++- llvm/test/tools/yaml2obj/XCOFF/aux-aligntype.yaml | 114 ++++++++++++++++++++++ llvm/test/tools/yaml2obj/XCOFF/aux-symbols.yaml | 25 +++++ llvm/tools/obj2yaml/xcoff2yaml.cpp | 4 +- 8 files changed, 250 insertions(+), 39 deletions(-) create mode 100644 llvm/test/tools/yaml2obj/XCOFF/aux-aligntype.yaml diff --git a/llvm/include/llvm/ObjectYAML/XCOFFYAML.h b/llvm/include/llvm/ObjectYAML/XCOFFYAML.h index f1e821f..dd359ac 100644 --- a/llvm/include/llvm/ObjectYAML/XCOFFYAML.h +++ b/llvm/include/llvm/ObjectYAML/XCOFFYAML.h @@ -121,6 +121,9 @@ struct CsectAuxEnt : AuxSymbolEnt { // Common fields for both XCOFF32 and XCOFF64. 
std::optional ParameterHashIndex; std::optional TypeChkSectNum; + std::optional SymbolType; + std::optional SymbolAlignment; + // The two previous values can be encoded as a single value. std::optional SymbolAlignmentAndType; std::optional StorageMappingClass; @@ -237,6 +240,10 @@ template <> struct ScalarEnumerationTraits { static void enumeration(IO &IO, XCOFF::StorageMappingClass &Value); }; +template <> struct ScalarEnumerationTraits { + static void enumeration(IO &IO, XCOFF::SymbolType &Value); +}; + template <> struct ScalarEnumerationTraits { static void enumeration(IO &IO, XCOFF::CFileStringType &Type); }; diff --git a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp index ccf768c..5b244ff 100644 --- a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp +++ b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp @@ -23,6 +23,7 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +using namespace llvm::object; namespace { @@ -56,14 +57,14 @@ private: bool writeSymbols(); void writeStringTable(); - void writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym); - void writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym); - void writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym); - void writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym); - void writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym); - void writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym); - void writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym); - void writeAuxSymbol(const std::unique_ptr &AuxSym); + bool writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym); + bool writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym); + bool writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym); + bool writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym); + bool writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym); + bool writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym); + bool writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym); + bool writeAuxSymbol(const std::unique_ptr &AuxSym); XCOFFYAML::Object &Obj; bool Is64Bit = false; @@ -181,7 +182,7 @@ bool XCOFFWriter::initStringTable() { StrTblBuilder.clear(); if (Obj.StrTbl.Strings) { - // All specified strings should be added to the string table. + // Add all specified strings to the string table. 
for (StringRef StringEnt : *Obj.StrTbl.Strings) StrTblBuilder.add(StringEnt); @@ -524,12 +525,44 @@ bool XCOFFWriter::writeRelocations() { return true; } -void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym) { +bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym) { + uint8_t SymAlignAndType = 0; + if (AuxSym.SymbolAlignmentAndType) { + if (AuxSym.SymbolType || AuxSym.SymbolAlignment) { + ErrHandler("cannot specify SymbolType or SymbolAlignment if " + "SymbolAlignmentAndType is specified"); + return false; + } + SymAlignAndType = *AuxSym.SymbolAlignmentAndType; + } else { + if (AuxSym.SymbolType) { + uint8_t SymbolType = *AuxSym.SymbolType; + if (SymbolType & ~XCOFFCsectAuxRef::SymbolTypeMask) { + ErrHandler("symbol type must be less than " + + Twine(1 + XCOFFCsectAuxRef::SymbolTypeMask)); + return false; + } + SymAlignAndType = SymbolType; + } + if (AuxSym.SymbolAlignment) { + const uint8_t ShiftedSymbolAlignmentMask = + XCOFFCsectAuxRef::SymbolAlignmentMask >> + XCOFFCsectAuxRef::SymbolAlignmentBitOffset; + + if (*AuxSym.SymbolAlignment & ~ShiftedSymbolAlignmentMask) { + ErrHandler("symbol alignment must be less than " + + Twine(1 + ShiftedSymbolAlignmentMask)); + return false; + } + SymAlignAndType |= (*AuxSym.SymbolAlignment + << XCOFFCsectAuxRef::SymbolAlignmentBitOffset); + } + } if (Is64Bit) { W.write(AuxSym.SectionOrLengthLo.value_or(0)); W.write(AuxSym.ParameterHashIndex.value_or(0)); W.write(AuxSym.TypeChkSectNum.value_or(0)); - W.write(AuxSym.SymbolAlignmentAndType.value_or(0)); + W.write(SymAlignAndType); W.write(AuxSym.StorageMappingClass.value_or(XCOFF::XMC_PR)); W.write(AuxSym.SectionOrLengthHi.value_or(0)); W.write(0); @@ -538,23 +571,25 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym) { W.write(AuxSym.SectionOrLength.value_or(0)); W.write(AuxSym.ParameterHashIndex.value_or(0)); W.write(AuxSym.TypeChkSectNum.value_or(0)); - W.write(AuxSym.SymbolAlignmentAndType.value_or(0)); + W.write(SymAlignAndType); W.write(AuxSym.StorageMappingClass.value_or(XCOFF::XMC_PR)); W.write(AuxSym.StabInfoIndex.value_or(0)); W.write(AuxSym.StabSectNum.value_or(0)); } + return true; } -void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym) { +bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym) { assert(Is64Bit && "can't write the exception auxiliary symbol for XCOFF32"); W.write(AuxSym.OffsetToExceptionTbl.value_or(0)); W.write(AuxSym.SizeOfFunction.value_or(0)); W.write(AuxSym.SymIdxOfNextBeyond.value_or(0)); W.write(0); W.write(XCOFF::AUX_EXCEPT); + return true; } -void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym) { +bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym) { if (Is64Bit) { W.write(AuxSym.PtrToLineNum.value_or(0)); W.write(AuxSym.SizeOfFunction.value_or(0)); @@ -568,9 +603,10 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym) { W.write(AuxSym.SymIdxOfNextBeyond.value_or(0)); W.OS.write_zeros(2); } + return true; } -void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym) { +bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym) { StringRef FileName = AuxSym.FileNameOrString.value_or(""); if (nameShouldBeInStringTable(FileName)) { W.write(0); @@ -586,9 +622,10 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym) { } else { W.OS.write_zeros(3); } + return true; } -void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym) { +bool 
XCOFFWriter::writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym) { if (Is64Bit) { W.write(AuxSym.LineNum.value_or(0)); W.OS.write_zeros(13); @@ -599,9 +636,10 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym) { W.write(AuxSym.LineNumLo.value_or(0)); W.OS.write_zeros(12); } + return true; } -void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym) { +bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym) { if (Is64Bit) { W.write(AuxSym.LengthOfSectionPortion.value_or(0)); W.write(AuxSym.NumberOfRelocEnt.value_or(0)); @@ -613,34 +651,36 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym) { W.write(AuxSym.NumberOfRelocEnt.value_or(0)); W.OS.write_zeros(6); } + return true; } -void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym) { +bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym) { assert(!Is64Bit && "can't write the stat auxiliary symbol for XCOFF64"); W.write(AuxSym.SectionLength.value_or(0)); W.write(AuxSym.NumberOfRelocEnt.value_or(0)); W.write(AuxSym.NumberOfLineNum.value_or(0)); W.OS.write_zeros(10); + return true; } -void XCOFFWriter::writeAuxSymbol( +bool XCOFFWriter::writeAuxSymbol( const std::unique_ptr &AuxSym) { if (auto AS = dyn_cast(AuxSym.get())) - writeAuxSymbol(*AS); + return writeAuxSymbol(*AS); else if (auto AS = dyn_cast(AuxSym.get())) - writeAuxSymbol(*AS); + return writeAuxSymbol(*AS); else if (auto AS = dyn_cast(AuxSym.get())) - writeAuxSymbol(*AS); + return writeAuxSymbol(*AS); else if (auto AS = dyn_cast(AuxSym.get())) - writeAuxSymbol(*AS); + return writeAuxSymbol(*AS); else if (auto AS = dyn_cast(AuxSym.get())) - writeAuxSymbol(*AS); + return writeAuxSymbol(*AS); else if (auto AS = dyn_cast(AuxSym.get())) - writeAuxSymbol(*AS); + return writeAuxSymbol(*AS); else if (auto AS = dyn_cast(AuxSym.get())) - writeAuxSymbol(*AS); - else - llvm_unreachable("unknown auxiliary symbol type"); + return writeAuxSymbol(*AS); + llvm_unreachable("unknown auxiliary symbol type"); + return false; } bool XCOFFWriter::writeSymbols() { @@ -698,7 +738,8 @@ bool XCOFFWriter::writeSymbols() { } else { for (const std::unique_ptr &AuxSym : YamlSym.AuxEntries) { - writeAuxSymbol(AuxSym); + if (!writeAuxSymbol(AuxSym)) + return false; } // Pad with zeros. 
if (NumOfAuxSym > YamlSym.AuxEntries.size()) diff --git a/llvm/lib/ObjectYAML/XCOFFYAML.cpp b/llvm/lib/ObjectYAML/XCOFFYAML.cpp index 398b09c..83bf613 100644 --- a/llvm/lib/ObjectYAML/XCOFFYAML.cpp +++ b/llvm/lib/ObjectYAML/XCOFFYAML.cpp @@ -127,6 +127,17 @@ void ScalarEnumerationTraits::enumeration( #undef ECase } +void ScalarEnumerationTraits::enumeration( + IO &IO, XCOFF::SymbolType &Value) { +#define ECase(X) IO.enumCase(Value, #X, XCOFF::X) + ECase(XTY_ER); + ECase(XTY_SD); + ECase(XTY_LD); + ECase(XTY_CM); +#undef ECase + IO.enumFallback(Value); +} + void ScalarEnumerationTraits::enumeration( IO &IO, XCOFFYAML::AuxSymbolType &Type) { #define ECase(X) IO.enumCase(Type, #X, XCOFFYAML::X) @@ -229,6 +240,8 @@ static void auxSymMapping(IO &IO, XCOFFYAML::CsectAuxEnt &AuxSym, bool Is64) { IO.mapOptional("ParameterHashIndex", AuxSym.ParameterHashIndex); IO.mapOptional("TypeChkSectNum", AuxSym.TypeChkSectNum); IO.mapOptional("SymbolAlignmentAndType", AuxSym.SymbolAlignmentAndType); + IO.mapOptional("SymbolType", AuxSym.SymbolType); + IO.mapOptional("SymbolAlignment", AuxSym.SymbolAlignment); IO.mapOptional("StorageMappingClass", AuxSym.StorageMappingClass); if (Is64) { IO.mapOptional("SectionOrLengthLo", AuxSym.SectionOrLengthLo); @@ -350,7 +363,8 @@ void MappingTraits::mapping(IO &IO, XCOFFYAML::Symbol &S) { IO.mapOptional("AuxEntries", S.AuxEntries); } -void MappingTraits::mapping(IO &IO, XCOFFYAML::StringTable &Str) { +void MappingTraits::mapping( + IO &IO, XCOFFYAML::StringTable &Str) { IO.mapOptional("ContentSize", Str.ContentSize); IO.mapOptional("Length", Str.Length); IO.mapOptional("Strings", Str.Strings); diff --git a/llvm/test/tools/obj2yaml/XCOFF/aix.yaml b/llvm/test/tools/obj2yaml/XCOFF/aix.yaml index fbd5fa0..9f2f68b 100644 --- a/llvm/test/tools/obj2yaml/XCOFF/aix.yaml +++ b/llvm/test/tools/obj2yaml/XCOFF/aix.yaml @@ -56,7 +56,8 @@ # CHECK32-NEXT: - Type: AUX_CSECT # CHECK32-NEXT: ParameterHashIndex: 0 # CHECK32-NEXT: TypeChkSectNum: 0 -# CHECK32-NEXT: SymbolAlignmentAndType: 0 +# CHECK32-NEXT: SymbolType: XTY_ER +# CHECK32-NEXT: SymbolAlignment: 0 # CHECK32-NEXT: StorageMappingClass: XMC_PR # CHECK32-NEXT: SectionOrLength: 0 # CHECK32-NEXT: StabInfoIndex: 0 @@ -71,7 +72,8 @@ # CHECK32-NEXT: - Type: AUX_CSECT # CHECK32-NEXT: ParameterHashIndex: 0 # CHECK32-NEXT: TypeChkSectNum: 0 -# CHECK32-NEXT: SymbolAlignmentAndType: 0 +# CHECK32-NEXT: SymbolType: XTY_ER +# CHECK32-NEXT: SymbolAlignment: 0 # CHECK32-NEXT: StorageMappingClass: XMC_PR # CHECK32-NEXT: SectionOrLength: 0 # CHECK32-NEXT: StabInfoIndex: 0 @@ -128,7 +130,8 @@ # CHECK64-NEXT: - Type: AUX_CSECT # CHECK64-NEXT: ParameterHashIndex: 0 # CHECK64-NEXT: TypeChkSectNum: 0 -# CHECK64-NEXT: SymbolAlignmentAndType: 0 +# CHECK64-NEXT: SymbolType: XTY_ER +# CHECK64-NEXT: SymbolAlignment: 0 # CHECK64-NEXT: StorageMappingClass: XMC_PR # CHECK64-NEXT: SectionOrLengthLo: 0 # CHECK64-NEXT: SectionOrLengthHi: 0 @@ -142,7 +145,8 @@ # CHECK64-NEXT: - Type: AUX_CSECT # CHECK64-NEXT: ParameterHashIndex: 0 # CHECK64-NEXT: TypeChkSectNum: 0 -# CHECK64-NEXT: SymbolAlignmentAndType: 0 +# CHECK64-NEXT: SymbolType: XTY_ER +# CHECK64-NEXT: SymbolAlignment: 0 # CHECK64-NEXT: StorageMappingClass: XMC_PR # CHECK64-NEXT: SectionOrLengthLo: 0 # CHECK64-NEXT: SectionOrLengthHi: 0 diff --git a/llvm/test/tools/obj2yaml/XCOFF/aux-symbols.yaml b/llvm/test/tools/obj2yaml/XCOFF/aux-symbols.yaml index 7f93b8d..8155ac1 100644 --- a/llvm/test/tools/obj2yaml/XCOFF/aux-symbols.yaml +++ b/llvm/test/tools/obj2yaml/XCOFF/aux-symbols.yaml @@ -34,7 +34,8 @@ # 
CHECK32-NEXT: - Type: AUX_CSECT # CHECK32-NEXT: ParameterHashIndex: 1 # CHECK32-NEXT: TypeChkSectNum: 2 -# CHECK32-NEXT: SymbolAlignmentAndType: 41 +# CHECK32-NEXT: SymbolType: XTY_SD +# CHECK32-NEXT: SymbolAlignment: 5 # CHECK32-NEXT: StorageMappingClass: XMC_PR # CHECK32-NEXT: SectionOrLength: 3 # CHECK32-NEXT: StabInfoIndex: 4 @@ -54,7 +55,8 @@ # CHECK32-NEXT: - Type: AUX_CSECT # CHECK32-NEXT: ParameterHashIndex: 1 # CHECK32-NEXT: TypeChkSectNum: 2 -# CHECK32-NEXT: SymbolAlignmentAndType: 17 +# CHECK32-NEXT: SymbolType: XTY_SD +# CHECK32-NEXT: SymbolAlignment: 2 # CHECK32-NEXT: StorageMappingClass: XMC_PR # CHECK32-NEXT: SectionOrLength: 4 # CHECK32-NEXT: StabInfoIndex: 5 @@ -174,7 +176,8 @@ Symbols: # CHECK64-NEXT: - Type: AUX_CSECT # CHECK64-NEXT: ParameterHashIndex: 1 # CHECK64-NEXT: TypeChkSectNum: 2 -# CHECK64-NEXT: SymbolAlignmentAndType: 41 +# CHECK64-NEXT: SymbolType: XTY_SD +# CHECK64-NEXT: SymbolAlignment: 5 # CHECK64-NEXT: StorageMappingClass: XMC_PR # CHECK64-NEXT: SectionOrLengthLo: 3 # CHECK64-NEXT: SectionOrLengthHi: 4 @@ -196,7 +199,8 @@ Symbols: # CHECK64-NEXT: - Type: AUX_CSECT # CHECK64-NEXT: ParameterHashIndex: 1 # CHECK64-NEXT: TypeChkSectNum: 2 -# CHECK64-NEXT: SymbolAlignmentAndType: 17 +# CHECK64-NEXT: SymbolType: XTY_SD +# CHECK64-NEXT: SymbolAlignment: 2 # CHECK64-NEXT: StorageMappingClass: XMC_PR # CHECK64-NEXT: SectionOrLengthLo: 3 # CHECK64-NEXT: SectionOrLengthHi: 4 diff --git a/llvm/test/tools/yaml2obj/XCOFF/aux-aligntype.yaml b/llvm/test/tools/yaml2obj/XCOFF/aux-aligntype.yaml new file mode 100644 index 0000000..190224d --- /dev/null +++ b/llvm/test/tools/yaml2obj/XCOFF/aux-aligntype.yaml @@ -0,0 +1,114 @@ +## Check that yaml2obj can parse SymbolAlignmentAndType, SymbolAlignment, +## and SymbolType. + +# RUN: yaml2obj %s --docnum=1 -DMAGIC=0x01DF -o %t32 +# RUN: obj2yaml %t32 | FileCheck %s --check-prefix=CHECK +# RUN: yaml2obj %s --docnum=1 -DMAGIC=0x01F7 -o %t64 +# RUN: obj2yaml %t64 | FileCheck %s --check-prefix=CHECK + +# CHECK: --- !XCOFF +# CHECK-NEXT: FileHeader: +# CHECK-NEXT: MagicNumber: +# CHECK: Symbols: +# CHECK: - Name: .fcn1 +# CHECK: NumberOfAuxEntries: 1 +# CHECK-NEXT: AuxEntries: +# CHECK-NEXT: - Type: AUX_CSECT +# CHECK: SymbolType: XTY_ER +# CHECK-NEXT: SymbolAlignment: 4 +# CHECK: - Name: .fcn2 +# CHECK: NumberOfAuxEntries: 1 +# CHECK-NEXT: AuxEntries: +# CHECK-NEXT: - Type: AUX_CSECT +# CHECK: SymbolType: XTY_SD +# CHECK-NEXT: SymbolAlignment: 2 +# CHECK: - Name: .fcn3 +# CHECK: NumberOfAuxEntries: 1 +# CHECK-NEXT: AuxEntries: +# CHECK-NEXT: - Type: AUX_CSECT +# CHECK: SymbolType: XTY_SD +# CHECK-NEXT: SymbolAlignment: 0 + +--- !XCOFF +FileHeader: + MagicNumber: [[MAGIC]] +Symbols: + - StorageClass: C_EXT + Name: .fcn1 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignment: 4 + - StorageClass: C_EXT + Name: .fcn2 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignment: 2 + SymbolType: XTY_SD + - StorageClass: C_EXT + Name: .fcn3 + AuxEntries: + - Type: AUX_CSECT + SymbolType: XTY_SD + +## Ensure that SymbolAlignment is in range. +# RUN: not yaml2obj %s --docnum=2 -o %t 2>&1 | FileCheck %s --check-prefix=ERROR1 +# ERROR1: symbol alignment must be less than 32 + +--- !XCOFF +FileHeader: + MagicNumber: 0x1F7 +Symbols: + - StorageClass: C_EXT + Name: .fcn1 + AuxEntries: + - Type: AUX_CSECT + SymbolType: XTY_SD + SymbolAlignment: 32 + SectionOrLengthLo: 4 + +## Ensure that neither SymbolAlignment nor SymbolType can be specified if +## SymbolAlignmentAndType is specified. 
+# RUN: not yaml2obj %s --docnum=3 -o %t 2>&1 | FileCheck %s --check-prefix=ERROR2 +# ERROR2: cannot specify SymbolType or SymbolAlignment if SymbolAlignmentAndType is specified + +--- !XCOFF +FileHeader: + MagicNumber: 0x1DF +Symbols: + - StorageClass: C_EXT + Name: .fcn1 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 17 + SymbolAlignment: 4 + SectionOrLength: 4 + +# RUN: not yaml2obj %s --docnum=4 -o %t 2>&1 | FileCheck %s --check-prefix=ERROR2 + +--- !XCOFF +FileHeader: + MagicNumber: 0x1DF +Symbols: + - StorageClass: C_EXT + Name: .fcn1 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 17 + SymbolAlignment: 4 + SymbolType: XTY_CM + SectionOrLength: 4 + +# RUN: not yaml2obj %s --docnum=5 -o %t 2>&1 | FileCheck %s --check-prefix=ERROR2 + +--- !XCOFF +FileHeader: + MagicNumber: 0x1F7 +Symbols: + - StorageClass: C_EXT + - StorageClass: C_EXT + Name: .fcn2 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 18 + SymbolType: XTY_SD + SectionOrLengthLo: 4 diff --git a/llvm/test/tools/yaml2obj/XCOFF/aux-symbols.yaml b/llvm/test/tools/yaml2obj/XCOFF/aux-symbols.yaml index fe75c19..04c774d 100644 --- a/llvm/test/tools/yaml2obj/XCOFF/aux-symbols.yaml +++ b/llvm/test/tools/yaml2obj/XCOFF/aux-symbols.yaml @@ -579,3 +579,28 @@ Symbols: AuxEntries: - Type: AUX_FILE FileNameOrString: foo + +## Case10: Specify a SymbolType outside the range of field definition. +# RUN: not yaml2obj %s -DSYMTYPE=8 --docnum=8 -o %t10 2>&1 | \ +# RUN: FileCheck %s --check-prefix BADSYMTYPE + +# BADSYMTYPE: error: symbol type must be less than 8 + +## Case11: Specify a SymbolType outside the range of its enumeration. +# RUN: yaml2obj %s -DSYMTYPE=7 --docnum=8 -o %t11 +# RUN: llvm-readobj --syms %t11 | FileCheck %s --check-prefix=STYPE + +--- !XCOFF +FileHeader: + MagicNumber: 0x1DF +Symbols: + - Name: aux_fcn_csect + StorageClass: C_EXT + Type: 0x20 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignment: 4 + SymbolType: [[SYMTYPE=]] + +# STYPE: SymbolAlignmentLog2: 4 +# STYPE-NEXT: SymbolType: 0x7 diff --git a/llvm/tools/obj2yaml/xcoff2yaml.cpp b/llvm/tools/obj2yaml/xcoff2yaml.cpp index 0acbf48..e426b64 100644 --- a/llvm/tools/obj2yaml/xcoff2yaml.cpp +++ b/llvm/tools/obj2yaml/xcoff2yaml.cpp @@ -209,7 +209,9 @@ void XCOFFDumper::dumpCsectAuxSym(XCOFFYAML::Symbol &Sym, XCOFFYAML::CsectAuxEnt CsectAuxSym; CsectAuxSym.ParameterHashIndex = AuxEntPtr.getParameterHashIndex(); CsectAuxSym.TypeChkSectNum = AuxEntPtr.getTypeChkSectNum(); - CsectAuxSym.SymbolAlignmentAndType = AuxEntPtr.getSymbolAlignmentAndType(); + CsectAuxSym.SymbolAlignment = AuxEntPtr.getAlignmentLog2(); + CsectAuxSym.SymbolType = + static_cast<XCOFF::SymbolType>(AuxEntPtr.getSymbolType()); CsectAuxSym.StorageMappingClass = AuxEntPtr.getStorageMappingClass(); if (Obj.is64Bit()) { -- cgit v1.1 From 58e8147d1690485ed0a6fcb59c7b6ea4b8cd2936 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 8 Feb 2024 08:49:11 -0800 Subject: [flang][openacc] Use original input for base address with optional (#80931) In #80317 the data op generation was updated to correctly use the #0 result from the hlfir.declare op. In the case of optionals that are not descriptors, it is preferable to use the original input for the varPtr value of the OpenACC data op. This patch also makes sure that the descriptor value of an optional is only accessed when present.
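As a rough model of the guard being added (plain C++ standing in for the fir.if regions the lowering actually builds with FirOpBuilder; Descriptor, readByteStride, and readExtent are invented names for illustration), the descriptor is dereferenced only on the is-present path and a neutral zero is produced otherwise, matching the genBoundsOps changes below:

#include <cstdint>
#include <optional>

// Invented stand-in for a Fortran array descriptor; the real lowering
// queries fir.box_dims on a fir.box value.
struct Descriptor {
  std::int64_t ByteStride;
  std::int64_t Extent;
};

// Mirrors the generated IR's shape: fir.if %is_present { box_dims ... }
// else { 0 }. An absent optional's descriptor is never touched.
static std::int64_t readByteStride(const std::optional<Descriptor> &Box) {
  return Box ? Box->ByteStride : 0;
}

static std::int64_t readExtent(const std::optional<Descriptor> &Box) {
  return Box ? Box->Extent : 0;
}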
--- flang/lib/Lower/DirectivesCommon.h | 93 ++++++++++++++++++++++++++------- flang/lib/Lower/OpenACC.cpp | 20 +++++-- flang/test/Lower/OpenACC/acc-bounds.f90 | 38 +++++++++++++- 3 files changed, 124 insertions(+), 27 deletions(-) diff --git a/flang/lib/Lower/DirectivesCommon.h b/flang/lib/Lower/DirectivesCommon.h index bd88037..8d560db 100644 --- a/flang/lib/Lower/DirectivesCommon.h +++ b/flang/lib/Lower/DirectivesCommon.h @@ -52,10 +52,13 @@ namespace lower { /// operations. struct AddrAndBoundsInfo { explicit AddrAndBoundsInfo() {} - explicit AddrAndBoundsInfo(mlir::Value addr) : addr(addr) {} - explicit AddrAndBoundsInfo(mlir::Value addr, mlir::Value isPresent) - : addr(addr), isPresent(isPresent) {} + explicit AddrAndBoundsInfo(mlir::Value addr, mlir::Value rawInput) + : addr(addr), rawInput(rawInput) {} + explicit AddrAndBoundsInfo(mlir::Value addr, mlir::Value rawInput, + mlir::Value isPresent) + : addr(addr), rawInput(rawInput), isPresent(isPresent) {} mlir::Value addr = nullptr; + mlir::Value rawInput = nullptr; mlir::Value isPresent = nullptr; }; @@ -615,20 +618,30 @@ getDataOperandBaseAddr(Fortran::lower::AbstractConverter &converter, fir::FirOpBuilder &builder, Fortran::lower::SymbolRef sym, mlir::Location loc) { mlir::Value symAddr = converter.getSymbolAddress(sym); + mlir::Value rawInput = symAddr; if (auto declareOp = - mlir::dyn_cast_or_null(symAddr.getDefiningOp())) + mlir::dyn_cast_or_null(symAddr.getDefiningOp())) { symAddr = declareOp.getResults()[0]; + rawInput = declareOp.getResults()[1]; + } // TODO: Might need revisiting to handle for non-shared clauses if (!symAddr) { if (const auto *details = - sym->detailsIf()) + sym->detailsIf()) { symAddr = converter.getSymbolAddress(details->symbol()); + rawInput = symAddr; + } } if (!symAddr) llvm::report_fatal_error("could not retrieve symbol address"); + mlir::Value isPresent; + if (Fortran::semantics::IsOptional(sym)) + isPresent = + builder.create(loc, builder.getI1Type(), rawInput); + if (auto boxTy = fir::unwrapRefType(symAddr.getType()).dyn_cast()) { if (boxTy.getEleTy().isa()) @@ -638,8 +651,6 @@ getDataOperandBaseAddr(Fortran::lower::AbstractConverter &converter, // `fir.ref>` type. 
if (symAddr.getType().isa()) { if (Fortran::semantics::IsOptional(sym)) { - mlir::Value isPresent = - builder.create(loc, builder.getI1Type(), symAddr); mlir::Value addr = builder.genIfOp(loc, {boxTy}, isPresent, /*withElseRegion=*/true) .genThen([&]() { @@ -652,14 +663,13 @@ getDataOperandBaseAddr(Fortran::lower::AbstractConverter &converter, builder.create(loc, mlir::ValueRange{absent}); }) .getResults()[0]; - return AddrAndBoundsInfo(addr, isPresent); + return AddrAndBoundsInfo(addr, rawInput, isPresent); } mlir::Value addr = builder.create(loc, symAddr); - return AddrAndBoundsInfo(addr); - ; + return AddrAndBoundsInfo(addr, rawInput, isPresent); } } - return AddrAndBoundsInfo(symAddr); + return AddrAndBoundsInfo(symAddr, rawInput, isPresent); } template @@ -807,7 +817,7 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc, Fortran::lower::StatementContext &stmtCtx, const std::list &subscripts, std::stringstream &asFortran, fir::ExtendedValue &dataExv, - bool dataExvIsAssumedSize, mlir::Value baseAddr, + bool dataExvIsAssumedSize, AddrAndBoundsInfo &info, bool treatIndexAsSection = false) { int dimension = 0; mlir::Type idxTy = builder.getIndexType(); @@ -831,11 +841,30 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value stride = one; bool strideInBytes = false; - if (fir::unwrapRefType(baseAddr.getType()).isa()) { - mlir::Value d = builder.createIntegerConstant(loc, idxTy, dimension); - auto dimInfo = builder.create(loc, idxTy, idxTy, idxTy, - baseAddr, d); - stride = dimInfo.getByteStride(); + if (fir::unwrapRefType(info.addr.getType()).isa()) { + if (info.isPresent) { + stride = + builder + .genIfOp(loc, idxTy, info.isPresent, /*withElseRegion=*/true) + .genThen([&]() { + mlir::Value d = + builder.createIntegerConstant(loc, idxTy, dimension); + auto dimInfo = builder.create( + loc, idxTy, idxTy, idxTy, info.addr, d); + builder.create(loc, dimInfo.getByteStride()); + }) + .genElse([&] { + mlir::Value zero = + builder.createIntegerConstant(loc, idxTy, 0); + builder.create(loc, zero); + }) + .getResults()[0]; + } else { + mlir::Value d = builder.createIntegerConstant(loc, idxTy, dimension); + auto dimInfo = builder.create(loc, idxTy, idxTy, + idxTy, info.addr, d); + stride = dimInfo.getByteStride(); + } strideInBytes = true; } @@ -919,7 +948,26 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc, } } - extent = fir::factory::readExtent(builder, loc, dataExv, dimension); + if (info.isPresent && + fir::unwrapRefType(info.addr.getType()).isa()) { + extent = + builder + .genIfOp(loc, idxTy, info.isPresent, /*withElseRegion=*/true) + .genThen([&]() { + mlir::Value ext = fir::factory::readExtent( + builder, loc, dataExv, dimension); + builder.create(loc, ext); + }) + .genElse([&] { + mlir::Value zero = + builder.createIntegerConstant(loc, idxTy, 0); + builder.create(loc, zero); + }) + .getResults()[0]; + } else { + extent = fir::factory::readExtent(builder, loc, dataExv, dimension); + } + if (dataExvIsAssumedSize && dimension + 1 == dataExvRank) { extent = zero; if (ubound && lbound) { @@ -976,6 +1024,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( dataExv = converter.genExprAddr(operandLocation, *exprBase, stmtCtx); info.addr = fir::getBase(dataExv); + info.rawInput = info.addr; asFortran << (*exprBase).AsFortran(); } else { const Fortran::parser::Name &name = @@ -993,7 +1042,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( bounds = genBoundsOps( builder, operandLocation, converter, stmtCtx, arrayElement->subscripts, asFortran, 
dataExv, - dataExvIsAssumedSize, info.addr, treatIndexAsSection); + dataExvIsAssumedSize, info, treatIndexAsSection); } asFortran << ')'; } else if (auto structComp = Fortran::parser::Unwrap< @@ -1001,6 +1050,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( fir::ExtendedValue compExv = converter.genExprAddr(operandLocation, *expr, stmtCtx); info.addr = fir::getBase(compExv); + info.rawInput = info.addr; if (fir::unwrapRefType(info.addr.getType()) .isa()) bounds = genBaseBoundsOps( @@ -1012,7 +1062,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( *Fortran::parser::GetLastName(*structComp).symbol); if (isOptional) info.isPresent = builder.create( - operandLocation, builder.getI1Type(), info.addr); + operandLocation, builder.getI1Type(), info.rawInput); if (auto loadOp = mlir::dyn_cast_or_null( info.addr.getDefiningOp())) { @@ -1020,6 +1070,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( fir::isPointerType(loadOp.getType())) info.addr = builder.create(operandLocation, info.addr); + info.rawInput = info.addr; } // If the component is an allocatable or pointer the result of @@ -1029,6 +1080,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( if (auto boxAddrOp = mlir::dyn_cast_or_null( info.addr.getDefiningOp())) { info.addr = boxAddrOp.getVal(); + info.rawInput = info.addr; bounds = genBoundsOpsFromBox( builder, operandLocation, converter, compExv, info); } @@ -1043,6 +1095,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( fir::ExtendedValue compExv = converter.genExprAddr(operandLocation, *expr, stmtCtx); info.addr = fir::getBase(compExv); + info.rawInput = info.addr; asFortran << (*expr).AsFortran(); } else if (const auto *dataRef{ std::get_if( diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 43f54c6..6ae270f 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -67,9 +67,12 @@ static Op createDataEntryOp(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value varPtrPtr; if (auto boxTy = baseAddr.getType().dyn_cast()) { if (isPresent) { + mlir::Type ifRetTy = boxTy.getEleTy(); + if (!fir::isa_ref_type(ifRetTy)) + ifRetTy = fir::ReferenceType::get(ifRetTy); baseAddr = builder - .genIfOp(loc, {boxTy.getEleTy()}, isPresent, + .genIfOp(loc, {ifRetTy}, isPresent, /*withElseRegion=*/true) .genThen([&]() { mlir::Value boxAddr = @@ -78,7 +81,7 @@ static Op createDataEntryOp(fir::FirOpBuilder &builder, mlir::Location loc, }) .genElse([&] { mlir::Value absent = - builder.create(loc, boxTy.getEleTy()); + builder.create(loc, ifRetTy); builder.create(loc, mlir::ValueRange{absent}); }) .getResults()[0]; @@ -295,9 +298,16 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList, asFortran, bounds, /*treatIndexAsSection=*/true); - Op op = createDataEntryOp( - builder, operandLocation, info.addr, asFortran, bounds, structured, - implicit, dataClause, info.addr.getType(), info.isPresent); + // If the input value is optional and is not a descriptor, we use the + // rawInput directly. + mlir::Value baseAddr = + ((info.addr.getType() != fir::unwrapRefType(info.rawInput.getType())) && + info.isPresent) + ? 
info.rawInput + : info.addr; + Op op = createDataEntryOp(builder, operandLocation, baseAddr, asFortran, + bounds, structured, implicit, dataClause, + baseAddr.getType(), info.isPresent); dataOperands.push_back(op.getAccPtr()); } } diff --git a/flang/test/Lower/OpenACC/acc-bounds.f90 b/flang/test/Lower/OpenACC/acc-bounds.f90 index bd96bc8..df97cbc 100644 --- a/flang/test/Lower/OpenACC/acc-bounds.f90 +++ b/flang/test/Lower/OpenACC/acc-bounds.f90 @@ -126,8 +126,8 @@ contains ! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref>>> {fir.bindc_name = "a", fir.optional}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QMopenacc_boundsFacc_optional_dataEa"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) -! CHECK: %[[IS_PRESENT:.*]] = fir.is_present %[[ARG0_DECL]]#0 : (!fir.ref>>>) -> i1 +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QMopenacc_boundsFacc_optional_dataEa"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[IS_PRESENT:.*]] = fir.is_present %[[ARG0_DECL]]#1 : (!fir.ref>>>) -> i1 ! CHECK: %[[BOX:.*]] = fir.if %[[IS_PRESENT]] -> (!fir.box>>) { ! CHECK: %[[LOAD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref>>> ! CHECK: fir.result %[[LOAD]] : !fir.box>> @@ -153,4 +153,38 @@ contains ! CHECK: %[[ATTACH:.*]] = acc.attach varPtr(%[[BOX_ADDR]] : !fir.ptr>) bounds(%[[BOUND]]) -> !fir.ptr> {name = "a"} ! CHECK: acc.data dataOperands(%[[ATTACH]] : !fir.ptr>) + subroutine acc_optional_data2(a, n) + integer :: n + real, optional :: a(n) + !$acc data no_create(a) + !$acc end data + end subroutine + +! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data2( +! CHECK-SAME: %[[A:.*]]: !fir.ref> {fir.bindc_name = "a", fir.optional}, %[[N:.*]]: !fir.ref {fir.bindc_name = "n"}) { +! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) {fortran_attrs = #fir.var_attrs, uniq_name = "_QMopenacc_boundsFacc_optional_data2Ea"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) +! CHECK: %[[NO_CREATE:.*]] = acc.nocreate varPtr(%[[DECL_A]]#1 : !fir.ref>) bounds(%10) -> !fir.ref> {name = "a"} +! CHECK: acc.data dataOperands(%[[NO_CREATE]] : !fir.ref>) { + + subroutine acc_optional_data3(a, n) + integer :: n + real, optional :: a(n) + !$acc data no_create(a(1:n)) + !$acc end data + end subroutine + +! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data3( +! CHECK-SAME: %[[A:.*]]: !fir.ref> {fir.bindc_name = "a", fir.optional}, %[[N:.*]]: !fir.ref {fir.bindc_name = "n"}) { +! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) {fortran_attrs = #fir.var_attrs, uniq_name = "_QMopenacc_boundsFacc_optional_data3Ea"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) +! CHECK: %[[PRES:.*]] = fir.is_present %[[DECL_A]]#1 : (!fir.ref>) -> i1 +! CHECK: %[[STRIDE:.*]] = fir.if %[[PRES]] -> (index) { +! CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[DECL_A]]#0, %c0{{.*}} : (!fir.box>, index) -> (index, index, index) +! CHECK: fir.result %[[DIMS]]#2 : index +! CHECK: } else { +! CHECK: fir.result %c0{{.*}} : index +! CHECK: } +! CHECK: %[[BOUNDS:.*]] = acc.bounds lowerbound(%c0{{.*}} : index) upperbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%[[STRIDE]] : index) startIdx(%c1 : index) {strideInBytes = true} +! CHECK: %[[NOCREATE:.*]] = acc.nocreate varPtr(%[[DECL_A]]#1 : !fir.ref>) bounds(%14) -> !fir.ref> {name = "a(1:n)"} +! 
CHECK: acc.data dataOperands(%[[NOCREATE]] : !fir.ref>) {
+
 end module
-- 
cgit v1.1


From 66d462d0a1ba1e510fff479baff8f21ecb924b1f Mon Sep 17 00:00:00 2001
From: Adrian Prantl
Date: Thu, 8 Feb 2024 08:54:52 -0800
Subject: Add missing textual header to module map

---
 clang/include/module.modulemap | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap
index 794526b..9285595 100644
--- a/clang/include/module.modulemap
+++ b/clang/include/module.modulemap
@@ -81,6 +81,7 @@ module Clang_Basic {
   textual header "clang/Basic/RISCVVTypes.def"
   textual header "clang/Basic/Sanitizers.def"
   textual header "clang/Basic/TargetCXXABI.def"
+  textual header "clang/Basic/TargetOSMacros.def"
   textual header "clang/Basic/TransformTypeTraits.def"
   textual header "clang/Basic/TokenKinds.def"
   textual header "clang/Basic/WebAssemblyReferenceTypes.def"
-- 
cgit v1.1


From 750981f1a2c6069cded709b75cc87d7abd05277a Mon Sep 17 00:00:00 2001
From: Adrian Prantl
Date: Thu, 8 Feb 2024 09:03:47 -0800
Subject: Fix a truly strange triple in testcase

---
 lldb/test/API/macosx/universal/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/test/API/macosx/universal/Makefile b/lldb/test/API/macosx/universal/Makefile
index 8712fde..7d4762f 100644
--- a/lldb/test/API/macosx/universal/Makefile
+++ b/lldb/test/API/macosx/universal/Makefile
@@ -14,7 +14,7 @@ testit.x86_64: testit.x86_64.o
 	$(CC) -isysroot $(SDKROOT) -target x86_64-apple-macosx10.9 -o testit.x86_64 $<

 testit.x86_64h.o: main.c
-	$(CC) -isysroot $(SDKROOT) -g -O0 -target x86_64h-apple-macosx10.9-apple-macosx10.9-apple-macosx10.9-apple-macosx10.9 -c -o testit.x86_64h.o $<
+	$(CC) -isysroot $(SDKROOT) -g -O0 -target x86_64h-apple-macosx10.9 -c -o testit.x86_64h.o $<

 testit.x86_64.o: main.c
 	$(CC) -isysroot $(SDKROOT) -g -O0 -target x86_64-apple-macosx10.9 -c -o testit.x86_64.o $<
-- 
cgit v1.1


From bdde5f9bea75e897bcc31a95b9c3376988c211cc Mon Sep 17 00:00:00 2001
From: Jeremy Morse
Date: Thu, 8 Feb 2024 16:40:48 +0000
Subject: [DebugInfo][RemoveDIs] Turn on non-intrinsic debug-info by default

This patch causes all variable-location debug-info to be converted into
non-intrinsic records as it passes through the optimisation /
instrumentation passes. There's a brief introduction here [0] and a more
detailed thread on what this means on discourse at [1].

If this commit is breaking your downstream tests, please see comment 12
in [1], which documents the kind of variation in tests we'd expect to
see from this change and what to do about it.
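Downstream projects that need time to migrate can restore the old
intrinsic-based format per invocation via the flag this patch flips; a
hedged usage sketch (the flag name comes from the diff below, the rest of
the invocation is illustrative):

  opt -experimental-debuginfo-iterators=false -passes=instcombine in.ll -S -o out.ll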
[0] https://llvm.org/docs/RemoveDIsDebugInfo.html [1] https://discourse.llvm.org/t/rfc-instruction-api-changes-needed-to-eliminate-debug-intrinsics-from-ir/68939 --- llvm/lib/IR/BasicBlock.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index fe9d0d0..bf02eba 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -34,7 +34,7 @@ cl::opt UseNewDbgInfoFormat("experimental-debuginfo-iterators", cl::desc("Enable communicating debuginfo positions " "through iterators, eliminating intrinsics"), - cl::init(false)); + cl::init(true)); DPMarker *BasicBlock::createMarker(Instruction *I) { assert(IsNewDbgInfoFormat && -- cgit v1.1 From f219cda7bd43696792ca4668ca5a9fbf55a9f09f Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Thu, 8 Feb 2024 09:16:12 -0800 Subject: [lldb] Fix printf formatting of std::time_t seconds (#81078) This formatter https://github.com/llvm/llvm-project/pull/78609 was originally passing the signed seconds (which can refer to times in the past) with an unsigned printf formatter, and had tests that expected to see negative values from the printf which always failed on macOS. I'm not clear how they ever passed on any platform. Fix the printf to print seconds as a signed value, and re-enable the tests. --- lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp | 6 ++--- .../libcxx/chrono/TestDataFormatterLibcxxChrono.py | 30 ++++++++++------------ 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp index a7d7066..7893aa7 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp @@ -1108,7 +1108,7 @@ bool lldb_private::formatters::LibcxxChronoSysSecondsSummaryProvider( const std::time_t seconds = ptr_sp->GetValueAsSigned(0); if (seconds < chrono_timestamp_min || seconds > chrono_timestamp_max) - stream.Printf("timestamp=%" PRIu64 " s", static_cast(seconds)); + stream.Printf("timestamp=%" PRId64 " s", static_cast(seconds)); else { std::array str; std::size_t size = @@ -1116,8 +1116,8 @@ bool lldb_private::formatters::LibcxxChronoSysSecondsSummaryProvider( if (size == 0) return false; - stream.Printf("date/time=%s timestamp=%" PRIu64 " s", str.data(), - static_cast(seconds)); + stream.Printf("date/time=%s timestamp=%" PRId64 " s", str.data(), + static_cast(seconds)); } return true; diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py index 9706f9e..a90fb82 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py @@ -54,17 +54,16 @@ class LibcxxChronoDataFormatterTestCase(TestBase): substrs=["ss_0 = date/time=1970-01-01T00:00:00Z timestamp=0 s"], ) - # FIXME disabled temporarily, macOS is printing this as an unsigned? 
- #self.expect( - # "frame variable ss_neg_date_time", - # substrs=[ - # "ss_neg_date_time = date/time=-32767-01-01T00:00:00Z timestamp=-1096193779200 s" - # ], - #) - #self.expect( - # "frame variable ss_neg_seconds", - # substrs=["ss_neg_seconds = timestamp=-1096193779201 s"], - #) + self.expect( + "frame variable ss_neg_date_time", + substrs=[ + "ss_neg_date_time = date/time=-32767-01-01T00:00:00Z timestamp=-1096193779200 s" + ], + ) + self.expect( + "frame variable ss_neg_seconds", + substrs=["ss_neg_seconds = timestamp=-1096193779201 s"], + ) self.expect( "frame variable ss_pos_date_time", @@ -77,11 +76,10 @@ class LibcxxChronoDataFormatterTestCase(TestBase): substrs=["ss_pos_seconds = timestamp=971890963200 s"], ) - # FIXME disabled temporarily, macOS is printing this as an unsigned? - #self.expect( - # "frame variable ss_min", - # substrs=["ss_min = timestamp=-9223372036854775808 s"], - #) + self.expect( + "frame variable ss_min", + substrs=["ss_min = timestamp=-9223372036854775808 s"], + ) self.expect( "frame variable ss_max", substrs=["ss_max = timestamp=9223372036854775807 s"], -- cgit v1.1 From af97edff70b0d9cb89729dc0d8af1d1ea101686e Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Thu, 8 Feb 2024 09:32:12 -0800 Subject: [lldb] Refactor GetFormatFromCString to always check for partial matches (NFC) (#81018) Refactors logic in `ParseInternal` that was previously calling `GetFormatFromCString` twice, once with `partial_match_ok` set to false, and the second time set to true. With this change, lldb formats (ie `%@`, `%S`, etc) are checked first. If a format is not one of those, then `GetFormatFromCString` is called once, and now always checks for partial matches. --- lldb/include/lldb/DataFormatters/FormatManager.h | 2 +- lldb/source/Core/FormatEntity.cpp | 26 ++++++++++-------------- lldb/source/DataFormatters/FormatManager.cpp | 17 +++++++--------- lldb/source/Interpreter/OptionArgParser.cpp | 3 +-- 4 files changed, 20 insertions(+), 28 deletions(-) diff --git a/lldb/include/lldb/DataFormatters/FormatManager.h b/lldb/include/lldb/DataFormatters/FormatManager.h index 986614f..db2fe99 100644 --- a/lldb/include/lldb/DataFormatters/FormatManager.h +++ b/lldb/include/lldb/DataFormatters/FormatManager.h @@ -138,7 +138,7 @@ public: } static bool GetFormatFromCString(const char *format_cstr, - bool partial_match_ok, lldb::Format &format); + lldb::Format &format); static char GetFormatAsFormatChar(lldb::Format format); diff --git a/lldb/source/Core/FormatEntity.cpp b/lldb/source/Core/FormatEntity.cpp index 3c665c2..fa5eadc 100644 --- a/lldb/source/Core/FormatEntity.cpp +++ b/lldb/source/Core/FormatEntity.cpp @@ -2151,11 +2151,7 @@ static Status ParseInternal(llvm::StringRef &format, Entry &parent_entry, if (entry.printf_format.find('%') == std::string::npos) { bool clear_printf = false; - if (FormatManager::GetFormatFromCString( - entry.printf_format.c_str(), false, entry.fmt)) { - // We have an LLDB format, so clear the printf format - clear_printf = true; - } else if (entry.printf_format.size() == 1) { + if (entry.printf_format.size() == 1) { switch (entry.printf_format[0]) { case '@': // if this is an @ sign, print ObjC description entry.number = ValueObject:: @@ -2198,20 +2194,20 @@ static Status ParseInternal(llvm::StringRef &format, Entry &parent_entry, eValueObjectRepresentationStyleExpressionPath; clear_printf = true; break; - default: + } + } + + if (entry.number == 0) { + if (FormatManager::GetFormatFromCString( + entry.printf_format.c_str(), entry.fmt)) { + clear_printf = true; + 
} else if (entry.printf_format == "tid") { + verify_is_thread_id = true; + } else { error.SetErrorStringWithFormat("invalid format: '%s'", entry.printf_format.c_str()); return error; } - } else if (FormatManager::GetFormatFromCString( - entry.printf_format.c_str(), true, entry.fmt)) { - clear_printf = true; - } else if (entry.printf_format == "tid") { - verify_is_thread_id = true; - } else { - error.SetErrorStringWithFormat("invalid format: '%s'", - entry.printf_format.c_str()); - return error; } // Our format string turned out to not be a printf style format diff --git a/lldb/source/DataFormatters/FormatManager.cpp b/lldb/source/DataFormatters/FormatManager.cpp index f1f135d..092fa3c 100644 --- a/lldb/source/DataFormatters/FormatManager.cpp +++ b/lldb/source/DataFormatters/FormatManager.cpp @@ -91,7 +91,7 @@ static bool GetFormatFromFormatChar(char format_char, Format &format) { } static bool GetFormatFromFormatName(llvm::StringRef format_name, - bool partial_match_ok, Format &format) { + Format &format) { uint32_t i; for (i = 0; i < g_num_format_infos; ++i) { if (format_name.equals_insensitive(g_format_infos[i].format_name)) { @@ -100,13 +100,11 @@ static bool GetFormatFromFormatName(llvm::StringRef format_name, } } - if (partial_match_ok) { - for (i = 0; i < g_num_format_infos; ++i) { - if (llvm::StringRef(g_format_infos[i].format_name) - .starts_with_insensitive(format_name)) { - format = g_format_infos[i].format; - return true; - } + for (i = 0; i < g_num_format_infos; ++i) { + if (llvm::StringRef(g_format_infos[i].format_name) + .starts_with_insensitive(format_name)) { + format = g_format_infos[i].format; + return true; } } format = eFormatInvalid; @@ -124,7 +122,6 @@ void FormatManager::Changed() { } bool FormatManager::GetFormatFromCString(const char *format_cstr, - bool partial_match_ok, lldb::Format &format) { bool success = false; if (format_cstr && format_cstr[0]) { @@ -134,7 +131,7 @@ bool FormatManager::GetFormatFromCString(const char *format_cstr, return true; } - success = GetFormatFromFormatName(format_cstr, partial_match_ok, format); + success = GetFormatFromFormatName(format_cstr, format); } if (!success) format = eFormatInvalid; diff --git a/lldb/source/Interpreter/OptionArgParser.cpp b/lldb/source/Interpreter/OptionArgParser.cpp index d13805a..75ccad8 100644 --- a/lldb/source/Interpreter/OptionArgParser.cpp +++ b/lldb/source/Interpreter/OptionArgParser.cpp @@ -93,8 +93,7 @@ Status OptionArgParser::ToFormat(const char *s, lldb::Format &format, *byte_size_ptr = 0; } - const bool partial_match_ok = true; - if (!FormatManager::GetFormatFromCString(s, partial_match_ok, format)) { + if (!FormatManager::GetFormatFromCString(s, format)) { StreamString error_strm; error_strm.Printf( "Invalid format character or name '%s'. Valid values are:\n", s); -- cgit v1.1 From bef25ae297d6d246bf0fa8667c8b08f9d5e8dae7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 17:31:06 +0000 Subject: [X86] X86FixupVectorConstants - use explicit register bitwidth for the loaded vector instead of using constant pool bitwidth Fixes #81136 - we might be loading from a constant pool entry wider than the destination register bitwidth, affecting the vextload scale calculation. 
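To make the scale problem concrete, here is a hand-worked sketch (a
hedged illustration in C++; the names are not the pass's actual
variables). The extend-load fixups rebuild the constant with a
per-element width derived from the register width divided by the element
count, so substituting the wider pool-entry width missizes every element:

  // Assumed numbers mirroring the PR scenario: a 128-bit destination
  // register whose constant-pool entry is 256 bits wide.
  unsigned NumElts = 2;
  unsigned WrongEltBits = 256 / NumElts; // pool width -> 128-bit elements
  unsigned RightEltBits = 128 / NumElts; // reg width  ->  64-bit elements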
ConvertToBroadcastAVX512 doesn't yet set an explicit bitwidth (it will default to the constant pool bitwidth) due to difficulties in looking up the original register width through the fold tables, but as we only use rebuildSplatCst this shouldn't cause any miscompilations, although it might prevent folding to broadcast if only the lower bits match a splatable pattern. --- llvm/lib/Target/X86/X86FixupVectorConstants.cpp | 35 ++++++++++++++----------- llvm/test/CodeGen/X86/pr81136.ll | 3 +-- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index 32ca9c16..da7dcbb 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -226,6 +226,7 @@ static Constant *rebuildConstant(LLVMContext &Ctx, Type *SclTy, // width, built up of potentially smaller scalar values. static Constant *rebuildSplatCst(const Constant *C, unsigned /*NumBits*/, unsigned /*NumElts*/, unsigned SplatBitWidth) { + // TODO: Truncate to NumBits once ConvertToBroadcastAVX512 support this. std::optional Splat = getSplatableConstant(C, SplatBitWidth); if (!Splat) return nullptr; @@ -328,7 +329,8 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, std::function RebuildConstant; }; - auto FixupConstant = [&](ArrayRef Fixups, unsigned OperandNo) { + auto FixupConstant = [&](ArrayRef Fixups, unsigned RegBitWidth, + unsigned OperandNo) { #ifdef EXPENSIVE_CHECKS assert(llvm::is_sorted(Fixups, [](const FixupEntry &A, const FixupEntry &B) { @@ -340,7 +342,8 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) && "Unexpected number of operands!"); if (auto *C = X86::getConstantFromPool(MI, OperandNo)) { - unsigned RegBitWidth = C->getType()->getPrimitiveSizeInBits(); + RegBitWidth = + RegBitWidth ? 
RegBitWidth : C->getType()->getPrimitiveSizeInBits(); for (const FixupEntry &Fixup : Fixups) { if (Fixup.Op) { // Construct a suitable constant and adjust the MI to use the new @@ -377,7 +380,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, // TODO: SSE3 MOVDDUP Handling return FixupConstant({{X86::MOVSSrm, 1, 32, rebuildZeroUpperCst}, {X86::MOVSDrm, 1, 64, rebuildZeroUpperCst}}, - 1); + 128, 1); case X86::VMOVAPDrm: case X86::VMOVAPSrm: case X86::VMOVUPDrm: @@ -386,7 +389,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VBROADCASTSSrm, 1, 32, rebuildSplatCst}, {X86::VMOVSDrm, 1, 64, rebuildZeroUpperCst}, {X86::VMOVDDUPrm, 1, 64, rebuildSplatCst}}, - 1); + 128, 1); case X86::VMOVAPDYrm: case X86::VMOVAPSYrm: case X86::VMOVUPDYrm: @@ -394,7 +397,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, return FixupConstant({{X86::VBROADCASTSSYrm, 1, 32, rebuildSplatCst}, {X86::VBROADCASTSDYrm, 1, 64, rebuildSplatCst}, {X86::VBROADCASTF128rm, 1, 128, rebuildSplatCst}}, - 1); + 256, 1); case X86::VMOVAPDZ128rm: case X86::VMOVAPSZ128rm: case X86::VMOVUPDZ128rm: @@ -403,7 +406,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VBROADCASTSSZ128rm, 1, 32, rebuildSplatCst}, {X86::VMOVSDZrm, 1, 64, rebuildZeroUpperCst}, {X86::VMOVDDUPZ128rm, 1, 64, rebuildSplatCst}}, - 1); + 128, 1); case X86::VMOVAPDZ256rm: case X86::VMOVAPSZ256rm: case X86::VMOVUPDZ256rm: @@ -412,7 +415,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {{X86::VBROADCASTSSZ256rm, 1, 32, rebuildSplatCst}, {X86::VBROADCASTSDZ256rm, 1, 64, rebuildSplatCst}, {X86::VBROADCASTF32X4Z256rm, 1, 128, rebuildSplatCst}}, - 1); + 256, 1); case X86::VMOVAPDZrm: case X86::VMOVAPSZrm: case X86::VMOVUPDZrm: @@ -421,7 +424,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VBROADCASTSDZrm, 1, 64, rebuildSplatCst}, {X86::VBROADCASTF32X4rm, 1, 128, rebuildSplatCst}, {X86::VBROADCASTF64X4rm, 1, 256, rebuildSplatCst}}, - 1); + 512, 1); /* Integer Loads */ case X86::MOVDQArm: case X86::MOVDQUrm: { @@ -440,7 +443,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {HasSSE41 ? X86::PMOVZXWDrm : 0, 4, 16, rebuildZExtCst}, {HasSSE41 ? X86::PMOVSXDQrm : 0, 2, 32, rebuildSExtCst}, {HasSSE41 ? X86::PMOVZXDQrm : 0, 2, 32, rebuildZExtCst}}; - return FixupConstant(Fixups, 1); + return FixupConstant(Fixups, 128, 1); } case X86::VMOVDQArm: case X86::VMOVDQUrm: { @@ -465,7 +468,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VPMOVZXWDrm, 4, 16, rebuildZExtCst}, {X86::VPMOVSXDQrm, 2, 32, rebuildSExtCst}, {X86::VPMOVZXDQrm, 2, 32, rebuildZExtCst}}; - return FixupConstant(Fixups, 1); + return FixupConstant(Fixups, 128, 1); } case X86::VMOVDQAYrm: case X86::VMOVDQUYrm: { @@ -490,7 +493,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {HasAVX2 ? X86::VPMOVZXWDYrm : 0, 8, 16, rebuildZExtCst}, {HasAVX2 ? X86::VPMOVSXDQYrm : 0, 4, 32, rebuildSExtCst}, {HasAVX2 ? 
X86::VPMOVZXDQYrm : 0, 4, 32, rebuildZExtCst}}; - return FixupConstant(Fixups, 1); + return FixupConstant(Fixups, 256, 1); } case X86::VMOVDQA32Z128rm: case X86::VMOVDQA64Z128rm: @@ -515,7 +518,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VPMOVZXWDZ128rm, 4, 16, rebuildZExtCst}, {X86::VPMOVSXDQZ128rm, 2, 32, rebuildSExtCst}, {X86::VPMOVZXDQZ128rm, 2, 32, rebuildZExtCst}}; - return FixupConstant(Fixups, 1); + return FixupConstant(Fixups, 128, 1); } case X86::VMOVDQA32Z256rm: case X86::VMOVDQA64Z256rm: @@ -539,7 +542,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VPMOVZXWDZ256rm, 8, 16, rebuildZExtCst}, {X86::VPMOVSXDQZ256rm, 4, 32, rebuildSExtCst}, {X86::VPMOVZXDQZ256rm, 4, 32, rebuildZExtCst}}; - return FixupConstant(Fixups, 1); + return FixupConstant(Fixups, 256, 1); } case X86::VMOVDQA32Zrm: case X86::VMOVDQA64Zrm: @@ -564,7 +567,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VPMOVZXWDZrm, 16, 16, rebuildZExtCst}, {X86::VPMOVSXDQZrm, 8, 32, rebuildSExtCst}, {X86::VPMOVZXDQZrm, 8, 32, rebuildZExtCst}}; - return FixupConstant(Fixups, 1); + return FixupConstant(Fixups, 512, 1); } } @@ -592,7 +595,9 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32; FixupEntry Fixups[] = {{(int)OpBcst32, 32, 32, rebuildSplatCst}, {(int)OpBcst64, 64, 64, rebuildSplatCst}}; - return FixupConstant(Fixups, OpNo); + // TODO: Add support for RegBitWidth, but currently rebuildSplatCst + // doesn't require it (defaults to Constant::getPrimitiveSizeInBits). + return FixupConstant(Fixups, 0, OpNo); } return false; }; diff --git a/llvm/test/CodeGen/X86/pr81136.ll b/llvm/test/CodeGen/X86/pr81136.ll index 8843adc..b4ac3fc 100644 --- a/llvm/test/CodeGen/X86/pr81136.ll +++ b/llvm/test/CodeGen/X86/pr81136.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s -; FIXME: Should be vpmovzxbq[128,1] instead of vpmovzxbd[128,1,0,0] define i64 @PR81136(i32 %a0, i32 %a1, ptr %a2) { ; CHECK-LABEL: PR81136: ; CHECK: # %bb.0: @@ -9,7 +8,7 @@ define i64 @PR81136(i32 %a0, i32 %a1, ptr %a2) { ; CHECK-NEXT: vmovd %esi, %xmm1 ; CHECK-NEXT: vmovdqa (%rdx), %ymm2 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm4 = [128,1,0,0] +; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm4 = [128,1] ; CHECK-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4 ; CHECK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -- cgit v1.1 From c8d431e0ed6ab6276bf45d1c36466faad8e4e4d1 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 8 Feb 2024 09:40:11 -0800 Subject: [riscv] Add test coverage in advance of a upcoming fix This is a reduced test case for a fix for the issue identified in https://github.com/llvm/llvm-project/issues/80910. 
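The ordering bug the upcoming fix targets can be reproduced in plain C++;
a hedged illustration (values chosen to expose the difference, not taken
from the patch):

  #include <cstdint>
  // Correct lowering: shift the full 64-bit value, then truncate.
  int32_t shiftThenTrunc(int64_t a) { return (int32_t)(a >> 1); }
  // The miscompiled order: truncate first, then shift.
  int32_t truncThenShift(int64_t a) { return (int32_t)a >> 1; }
  // For a = 0x100000000 (UINT32_MAX + 1):
  //   shiftThenTrunc(a) == INT32_MIN, but truncThenShift(a) == 0.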
---
 .../RISCV/rvv/fixed-vectors-buildvec-of-binop.ll | 34 ++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
index c8531ed..e376688 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
@@ -588,3 +588,37 @@ define <8 x i32> @add_constant_rhs_8xi32_partial(<8 x i32> %vin, i32 %a, i32 %b,
   %v3 = insertelement <8 x i32> %v2, i32 %e3, i32 7
   ret <8 x i32> %v3
 }
+
+; FIXME: This is currently showing a miscompile, we effectively
+; truncate before the ashr instead of after it, so if %a or %b
+; is e.g. UINT32_MAX+1 we get different result.
+define <2 x i32> @build_vec_of_trunc_op(i64 %a, i64 %b) {
+; RV32-LABEL: build_vec_of_trunc_op:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    slli a1, a1, 31
+; RV32-NEXT:    srli a0, a0, 1
+; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    slli a3, a3, 31
+; RV32-NEXT:    srli a2, a2, 1
+; RV32-NEXT:    or a2, a2, a3
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: build_vec_of_trunc_op:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    vslide1down.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 1
+; RV64-NEXT:    ret
+entry:
+  %conv11.i = ashr i64 %a, 1
+  %conv11.2 = ashr i64 %b, 1
+  %0 = trunc i64 %conv11.i to i32
+  %1 = trunc i64 %conv11.2 to i32
+  %2 = insertelement <2 x i32> zeroinitializer, i32 %0, i64 0
+  %3 = insertelement <2 x i32> %2, i32 %1, i64 1
+  ret <2 x i32> %3
+}
-- 
cgit v1.1


From 16d1a6486c25769d264a6ddb70a48bbb1c23c077 Mon Sep 17 00:00:00 2001
From: Cooper Partin
Date: Thu, 8 Feb 2024 09:50:21 -0800
Subject: [DirectX] Fix HLSL bitshifts to leverage the OpenCL pipeline for bitshifting (#81030)

Fixes #55106

In HLSL, bit shifts are defined to shift by the shift amount modulo the
bit width of the value's type. This contains the following changes: HLSL
codegen emits bit shifts as x << (y & (sizeof(x)*8 - 1)), and the
bit-shift masking leverages the existing OpenCL pipeline for this.

Tests were also added to validate this behavior.
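A hedged restatement of the masked semantics in plain C++ (it mirrors the
shl32 test added below; this is not compiler output):

  // The shift count is masked to the type's bit width minus one.
  int shl32(int V, int S) { return V << (S & 31); }
  // e.g. shl32(1, 33) == 2, because 33 & 31 == 1; previously an
  // out-of-range count was undefined behavior.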
Before this change the following was being emitted:

  ; Function Attrs: noinline nounwind optnone
  define noundef i32 @"?shl32@@YAHHH@Z"(i32 noundef %V, i32 noundef %S) #0 {
  entry:
    %S.addr = alloca i32, align 4
    %V.addr = alloca i32, align 4
    store i32 %S, ptr %S.addr, align 4
    store i32 %V, ptr %V.addr, align 4
    %0 = load i32, ptr %V.addr, align 4
    %1 = load i32, ptr %S.addr, align 4
    %shl = shl i32 %0, %1
    ret i32 %shl
  }

After this change:

  ; Function Attrs: noinline nounwind optnone
  define noundef i32 @"?shl32@@YAHHH@Z"(i32 noundef %V, i32 noundef %S) #0 {
  entry:
    %S.addr = alloca i32, align 4
    %V.addr = alloca i32, align 4
    store i32 %S, ptr %S.addr, align 4
    store i32 %V, ptr %V.addr, align 4
    %0 = load i32, ptr %V.addr, align 4
    %1 = load i32, ptr %S.addr, align 4
    %shl.mask = and i32 %1, 31
    %shl = shl i32 %0, %shl.mask
    ret i32 %shl
  }

---------

Co-authored-by: Cooper Partin
---
 clang/lib/CodeGen/CGExprScalar.cpp     |  4 ++--
 clang/test/CodeGenHLSL/shift-mask.hlsl | 35 ++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/shift-mask.hlsl

diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index df8f71c..fa03163 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -4168,7 +4168,7 @@ Value *ScalarExprEmitter::EmitShl(const BinOpInfo &Ops) {
   bool SanitizeBase = SanitizeSignedBase || SanitizeUnsignedBase;
   bool SanitizeExponent = CGF.SanOpts.has(SanitizerKind::ShiftExponent);
   // OpenCL 6.3j: shift values are effectively % word size of LHS.
-  if (CGF.getLangOpts().OpenCL)
+  if (CGF.getLangOpts().OpenCL || CGF.getLangOpts().HLSL)
     RHS = ConstrainShiftValue(Ops.LHS, RHS, "shl.mask");
   else if ((SanitizeBase || SanitizeExponent) &&
            isa(Ops.LHS->getType())) {
@@ -4237,7 +4237,7 @@ Value *ScalarExprEmitter::EmitShr(const BinOpInfo &Ops) {
     RHS = Builder.CreateIntCast(RHS, Ops.LHS->getType(), false, "sh_prom");

   // OpenCL 6.3j: shift values are effectively % word size of LHS.
- if (CGF.getLangOpts().OpenCL) + if (CGF.getLangOpts().OpenCL || CGF.getLangOpts().HLSL) RHS = ConstrainShiftValue(Ops.LHS, RHS, "shr.mask"); else if (CGF.SanOpts.has(SanitizerKind::ShiftExponent) && isa(Ops.LHS->getType())) { diff --git a/clang/test/CodeGenHLSL/shift-mask.hlsl b/clang/test/CodeGenHLSL/shift-mask.hlsl new file mode 100644 index 0000000..d046efa --- /dev/null +++ b/clang/test/CodeGenHLSL/shift-mask.hlsl @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s + +int shl32(int V, int S) { + return V << S; +} + +// CHECK: define noundef i32 @"?shl32{{[@$?.A-Za-z0-9_]+}}"(i32 noundef %V, i32 noundef %S) #0 { +// CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31 +// CHECK-DAG: %{{.*}} = shl i32 %{{.*}}, %[[Masked]] + +int shr32(int V, int S) { + return V >> S; +} + +// CHECK: define noundef i32 @"?shr32{{[@$?.A-Za-z0-9_]+}}"(i32 noundef %V, i32 noundef %S) #0 { +// CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31 +// CHECK-DAG: %{{.*}} = ashr i32 %{{.*}}, %[[Masked]] + +int64_t shl64(int64_t V, int64_t S) { + return V << S; +} + +// CHECK: define noundef i64 @"?shl64{{[@$?.A-Za-z0-9_]+}}"(i64 noundef %V, i64 noundef %S) #0 { +// CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63 +// CHECK-DAG: %{{.*}} = shl i64 %{{.*}}, %[[Masked]] + +int64_t shr64(int64_t V, int64_t S) { + return V >> S; +} + +// CHECK: define noundef i64 @"?shr64{{[@$?.A-Za-z0-9_]+}}"(i64 noundef %V, i64 noundef %S) #0 { +// CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63 +// CHECK-DAG: %{{.*}} = ashr i64 %{{.*}}, %[[Masked]] -- cgit v1.1 From 758fd59d018fe01262dd246e3e1e3d4389cb82e4 Mon Sep 17 00:00:00 2001 From: "S. Bharadwaj Yadavalli" Date: Thu, 8 Feb 2024 13:02:32 -0500 Subject: [DirectX][NFC] Change usage pattern *Dxil* to *DXIL* for uniformity (#80778) Match DXIL TableGen class names with structure names in DXIL Emitter. Delete unnecessary Name field. --- llvm/lib/Target/DirectX/DXIL.td | 89 +++++++++++----------- llvm/lib/Target/DirectX/DXILMetadata.cpp | 8 +- llvm/utils/TableGen/DXILEmitter.cpp | 125 +++++++++++++++---------------- 3 files changed, 107 insertions(+), 115 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index aec6460..3f3ace5 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -14,28 +14,28 @@ include "llvm/IR/Intrinsics.td" // Abstract representation of the class a DXIL Operation belongs to. 
-class DxilOpClass { +class DXILOpClass { string Name = name; } // Abstract representation of the category a DXIL Operation belongs to -class DxilOpCategory { +class DXILOpCategory { string Name = name; } -def UnaryClass : DxilOpClass<"Unary">; -def BinaryClass : DxilOpClass<"Binary">; -def FlattenedThreadIdInGroupClass : DxilOpClass<"FlattenedThreadIdInGroup">; -def ThreadIdInGroupClass : DxilOpClass<"ThreadIdInGroup">; -def ThreadIdClass : DxilOpClass<"ThreadId">; -def GroupIdClass : DxilOpClass<"GroupId">; +def UnaryClass : DXILOpClass<"Unary">; +def BinaryClass : DXILOpClass<"Binary">; +def FlattenedThreadIdInGroupClass : DXILOpClass<"FlattenedThreadIdInGroup">; +def ThreadIdInGroupClass : DXILOpClass<"ThreadIdInGroup">; +def ThreadIdClass : DXILOpClass<"ThreadId">; +def GroupIdClass : DXILOpClass<"GroupId">; -def BinaryUintCategory : DxilOpCategory<"Binary uint">; -def UnaryFloatCategory : DxilOpCategory<"Unary float">; -def ComputeIDCategory : DxilOpCategory<"Compute/Mesh/Amplification shader">; +def BinaryUintCategory : DXILOpCategory<"Binary uint">; +def UnaryFloatCategory : DXILOpCategory<"Unary float">; +def ComputeIDCategory : DXILOpCategory<"Compute/Mesh/Amplification shader">; // The parameter description for a DXIL operation -class DxilOpParameter { int Pos = pos; // Position in parameter list @@ -49,16 +49,13 @@ class DxilOpParameter { - // TODO : Appears redundant. OpName should serve the same purpose - string Name = name; // short, unique name - +class DXILOperationDesc { string OpName = ""; // Name of DXIL operation int OpCode = 0; // Unique non-negative integer associated with the operation - DxilOpClass OpClass; // Class of the operation - DxilOpCategory OpCategory; // Category of the operation + DXILOpClass OpClass; // Class of the operation + DXILOpCategory OpCategory; // Category of the operation string Doc = ""; // Description of the operation - list Params = []; // Parameter list of the operation + list Params = []; // Parameter list of the operation string OverloadTypes = ""; // Overload types, if applicable string Attributes = ""; // Attribute shorthands: rn=does not access // memory,ro=only reads from memory, @@ -73,9 +70,9 @@ class DxilOperationDesc { list StatsGroup = []; } -class DxilOperation params, - list statsGroup = []> : DxilOperationDesc { +class DXILOperation params, + list statsGroup = []> : DXILOperationDesc { let OpName = name; let OpCode = opCode; let Doc = doc; @@ -90,56 +87,56 @@ class DxilOperation { Intrinsic llvm_intrinsic = llvm_intrinsic_; } -def Sin : DxilOperation<"Sin", 13, UnaryClass, UnaryFloatCategory, "returns sine(theta) for theta in radians.", +def Sin : DXILOperation<"Sin", 13, UnaryClass, UnaryFloatCategory, "returns sine(theta) for theta in radians.", "half;float;", "rn", [ - DxilOpParameter<0, "$o", "", "operation result">, - DxilOpParameter<1, "i32", "opcode", "DXIL opcode">, - DxilOpParameter<2, "$o", "value", "input value"> + DXILOpParameter<0, "$o", "", "operation result">, + DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, + DXILOpParameter<2, "$o", "value", "input value"> ], ["floats"]>, LLVMIntrinsic; -def UMax : DxilOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? a : b", +def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? 
a : b", "i16;i32;i64;", "rn", [ - DxilOpParameter<0, "$o", "", "operation result">, - DxilOpParameter<1, "i32", "opcode", "DXIL opcode">, - DxilOpParameter<2, "$o", "a", "input value">, - DxilOpParameter<3, "$o", "b", "input value"> + DXILOpParameter<0, "$o", "", "operation result">, + DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, + DXILOpParameter<2, "$o", "a", "input value">, + DXILOpParameter<3, "$o", "b", "input value"> ], ["uints"]>, LLVMIntrinsic; -def ThreadId : DxilOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", "i32;", "rn", +def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", "i32;", "rn", [ - DxilOpParameter<0, "i32", "", "thread ID component">, - DxilOpParameter<1, "i32", "opcode", "DXIL opcode">, - DxilOpParameter<2, "i32", "component", "component to read (x,y,z)"> + DXILOpParameter<0, "i32", "", "thread ID component">, + DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, + DXILOpParameter<2, "i32", "component", "component to read (x,y,z)"> ]>, LLVMIntrinsic; -def GroupId : DxilOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", "i32;", "rn", +def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", "i32;", "rn", [ - DxilOpParameter<0, "i32", "", "group ID component">, - DxilOpParameter<1, "i32", "opcode", "DXIL opcode">, - DxilOpParameter<2, "i32", "component", "component to read"> + DXILOpParameter<0, "i32", "", "group ID component">, + DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, + DXILOpParameter<2, "i32", "component", "component to read"> ]>, LLVMIntrinsic; -def ThreadIdInGroup : DxilOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory, +def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory, "reads the thread ID within the group (SV_GroupThreadID)", "i32;", "rn", [ - DxilOpParameter<0, "i32", "", "thread ID in group component">, - DxilOpParameter<1, "i32", "opcode", "DXIL opcode">, - DxilOpParameter<2, "i32", "component", "component to read (x,y,z)"> + DXILOpParameter<0, "i32", "", "thread ID in group component">, + DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, + DXILOpParameter<2, "i32", "component", "component to read (x,y,z)"> ]>, LLVMIntrinsic; -def FlattenedThreadIdInGroup : DxilOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory, +def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory, "provides a flattened index for a given thread within a given group (SV_GroupIndex)", "i32;", "rn", [ - DxilOpParameter<0, "i32", "", "result">, - DxilOpParameter<1, "i32", "opcode", "DXIL opcode"> + DXILOpParameter<0, "i32", "", "result">, + DXILOpParameter<1, "i32", "opcode", "DXIL opcode"> ]>, LLVMIntrinsic; diff --git a/llvm/lib/Target/DirectX/DXILMetadata.cpp b/llvm/lib/Target/DirectX/DXILMetadata.cpp index db55f25..2d94490 100644 --- a/llvm/lib/Target/DirectX/DXILMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILMetadata.cpp @@ -213,7 +213,7 @@ public: // FIXME: add signature for profile other than CS. // See https://github.com/llvm/llvm-project/issues/57928. 
MDTuple *Signatures = nullptr; - return emitDxilEntryPointTuple( + return emitDXILEntryPointTuple( &F, F.getName().str(), Signatures, Resources, Props.emitDXILEntryProps(RawShaderFlag, Ctx, /*IsLib*/ false), Ctx); } @@ -222,7 +222,7 @@ public: // FIXME: add signature for profile other than CS. // See https://github.com/llvm/llvm-project/issues/57928. MDTuple *Signatures = nullptr; - return emitDxilEntryPointTuple( + return emitDXILEntryPointTuple( &F, F.getName().str(), Signatures, /*entry in lib doesn't need resources metadata*/ nullptr, Props.emitDXILEntryProps(RawShaderFlag, Ctx, /*IsLib*/ true), Ctx); @@ -233,13 +233,13 @@ public: static MDTuple *emitEmptyEntryForLib(MDTuple *Resources, uint64_t RawShaderFlag, LLVMContext &Ctx) { - return emitDxilEntryPointTuple( + return emitDXILEntryPointTuple( nullptr, "", nullptr, Resources, EntryProps::emitEntryPropsForEmptyEntry(RawShaderFlag, Ctx), Ctx); } private: - static MDTuple *emitDxilEntryPointTuple(Function *Fn, const std::string &Name, + static MDTuple *emitDXILEntryPointTuple(Function *Fn, const std::string &Name, MDTuple *Signatures, MDTuple *Resources, MDTuple *Properties, diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp index 475a57a..cb9f9c6 100644 --- a/llvm/utils/TableGen/DXILEmitter.cpp +++ b/llvm/utils/TableGen/DXILEmitter.cpp @@ -30,7 +30,7 @@ struct DXILShaderModel { int Minor = 0; }; -struct DXILParam { +struct DXILParameter { int Pos; // position in parameter list ParameterKind Kind; StringRef Name; // short, unique name @@ -38,23 +38,21 @@ struct DXILParam { bool IsConst; // whether this argument requires a constant value in the IR StringRef EnumName; // the name of the enum type if applicable int MaxValue; // the maximum value for this parameter if applicable - DXILParam(const Record *R); + DXILParameter(const Record *R); }; -struct DXILOperationData { - StringRef Name; // short, unique name - - StringRef DXILOp; // name of DXIL operation - int DXILOpID; // ID of DXIL operation - StringRef DXILClass; // name of the opcode class +struct DXILOperationDesc { + StringRef OpName; // name of DXIL operation + int OpCode; // ID of DXIL operation + StringRef OpClass; // name of the opcode class StringRef Category; // classification for this instruction StringRef Doc; // the documentation description of this instruction - SmallVector Params; // the operands that this instruction takes + SmallVector Params; // the operands that this instruction takes StringRef OverloadTypes; // overload types if applicable StringRef FnAttr; // attribute shorthands: rn=does not access // memory,ro=only reads from memory - StringRef Intrinsic; // The llvm intrinsic map to DXILOp. Default is "" which + StringRef Intrinsic; // The llvm intrinsic map to OpName. Default is "" which // means no map exist bool IsDeriv = false; // whether this is some kind of derivative bool IsGradient = false; // whether this requires a gradient calculation @@ -71,11 +69,10 @@ struct DXILOperationData { int OverloadParamIndex; // parameter index which control the overload. // When < 0, should be only 1 overload type. SmallVector counters; // counters for this inst. 
- DXILOperationData(const Record *R) { - Name = R->getValueAsString("Name"); - DXILOp = R->getValueAsString("OpName"); - DXILOpID = R->getValueAsInt("OpCode"); - DXILClass = R->getValueAsDef("OpClass")->getValueAsString("Name"); + DXILOperationDesc(const Record *R) { + OpName = R->getValueAsString("OpName"); + OpCode = R->getValueAsInt("OpCode"); + OpClass = R->getValueAsDef("OpClass")->getValueAsString("Name"); Category = R->getValueAsDef("OpCategory")->getValueAsString("Name"); if (R->getValue("llvm_intrinsic")) { @@ -92,7 +89,7 @@ struct DXILOperationData { OverloadParamIndex = -1; for (unsigned I = 0; I < ParamList->size(); ++I) { Record *Param = ParamList->getElementAsRecord(I); - Params.emplace_back(DXILParam(Param)); + Params.emplace_back(DXILParameter(Param)); auto &CurParam = Params.back(); if (CurParam.Kind >= ParameterKind::OVERLOAD) OverloadParamIndex = I; @@ -121,7 +118,7 @@ static ParameterKind parameterTypeNameToKind(StringRef Name) { .Default(ParameterKind::INVALID); } -DXILParam::DXILParam(const Record *R) { +DXILParameter::DXILParameter(const Record *R) { Name = R->getValueAsString("Name"); Pos = R->getValueAsInt("Pos"); Kind = parameterTypeNameToKind(R->getValueAsString("LLVMType")); @@ -166,10 +163,9 @@ static std::string parameterKindToString(ParameterKind Kind) { llvm_unreachable("Unknown llvm::dxil::ParameterKind enum"); } -static void emitDXILOpEnum(DXILOperationData &DXILOp, raw_ostream &OS) { +static void emitDXILOpEnum(DXILOperationDesc &Op, raw_ostream &OS) { // Name = ID, // Doc - OS << DXILOp.Name << " = " << DXILOp.DXILOpID << ", // " << DXILOp.Doc - << "\n"; + OS << Op.OpName << " = " << Op.OpCode << ", // " << Op.Doc << "\n"; } static std::string buildCategoryStr(StringSet<> &Cetegorys) { @@ -182,14 +178,14 @@ static std::string buildCategoryStr(StringSet<> &Cetegorys) { } // Emit enum declaration for DXIL. -static void emitDXILEnums(std::vector &DXILOps, +static void emitDXILEnums(std::vector &Ops, raw_ostream &OS) { // Sort by Category + OpName. - llvm::sort(DXILOps, [](DXILOperationData &A, DXILOperationData &B) { + llvm::sort(Ops, [](DXILOperationDesc &A, DXILOperationDesc &B) { // Group by Category first. if (A.Category == B.Category) // Inside same Category, order by OpName. - return A.DXILOp < B.DXILOp; + return A.OpName < B.OpName; else return A.Category < B.Category; }); @@ -199,18 +195,18 @@ static void emitDXILEnums(std::vector &DXILOps, StringMap> ClassMap; StringRef PrevCategory = ""; - for (auto &DXILOp : DXILOps) { - StringRef Category = DXILOp.Category; + for (auto &Op : Ops) { + StringRef Category = Op.Category; if (Category != PrevCategory) { OS << "\n// " << Category << "\n"; PrevCategory = Category; } - emitDXILOpEnum(DXILOp, OS); - auto It = ClassMap.find(DXILOp.DXILClass); + emitDXILOpEnum(Op, OS); + auto It = ClassMap.find(Op.OpClass); if (It != ClassMap.end()) { - It->second.insert(DXILOp.Category); + It->second.insert(Op.Category); } else { - ClassMap[DXILOp.DXILClass].insert(DXILOp.Category); + ClassMap[Op.OpClass].insert(Op.Category); } } @@ -253,18 +249,18 @@ static void emitDXILEnums(std::vector &DXILOps, } // Emit map from llvm intrinsic to DXIL operation. -static void emitDXILIntrinsicMap(std::vector &DXILOps, +static void emitDXILIntrinsicMap(std::vector &Ops, raw_ostream &OS) { OS << "\n"; // FIXME: use array instead of SmallDenseMap. 
OS << "static const SmallDenseMap LowerMap = " "{\n"; - for (auto &DXILOp : DXILOps) { - if (DXILOp.Intrinsic.empty()) + for (auto &Op : Ops) { + if (Op.Intrinsic.empty()) continue; // {Intrinsic::sin, dxil::OpCode::Sin}, - OS << " { Intrinsic::" << DXILOp.Intrinsic - << ", dxil::OpCode::" << DXILOp.DXILOp << "},\n"; + OS << " { Intrinsic::" << Op.Intrinsic << ", dxil::OpCode::" << Op.OpName + << "},\n"; } OS << "};\n"; OS << "\n"; @@ -315,20 +311,20 @@ static std::string lowerFirstLetter(StringRef Name) { return LowerName; } -static std::string getDXILOpClassName(StringRef DXILOpClass) { +static std::string getDXILOpClassName(StringRef OpClass) { // Lower first letter expect for special case. - return StringSwitch(DXILOpClass) + return StringSwitch(OpClass) .Case("CBufferLoad", "cbufferLoad") .Case("CBufferLoadLegacy", "cbufferLoadLegacy") .Case("GSInstanceID", "gsInstanceID") - .Default(lowerFirstLetter(DXILOpClass)); + .Default(lowerFirstLetter(OpClass)); } -static void emitDXILOperationTable(std::vector &DXILOps, +static void emitDXILOperationTable(std::vector &Ops, raw_ostream &OS) { - // Sort by DXILOpID. - llvm::sort(DXILOps, [](DXILOperationData &A, DXILOperationData &B) { - return A.DXILOpID < B.DXILOpID; + // Sort by OpCode. + llvm::sort(Ops, [](DXILOperationDesc &A, DXILOperationDesc &B) { + return A.OpCode < B.OpCode; }); // Collect Names. @@ -338,18 +334,18 @@ static void emitDXILOperationTable(std::vector &DXILOps, StringMap> ParameterMap; StringSet<> ClassSet; - for (auto &DXILOp : DXILOps) { - OpStrings.add(DXILOp.DXILOp.str()); + for (auto &Op : Ops) { + OpStrings.add(Op.OpName.str()); - if (ClassSet.contains(DXILOp.DXILClass)) + if (ClassSet.contains(Op.OpClass)) continue; - ClassSet.insert(DXILOp.DXILClass); - OpClassStrings.add(getDXILOpClassName(DXILOp.DXILClass)); + ClassSet.insert(Op.OpClass); + OpClassStrings.add(getDXILOpClassName(Op.OpClass)); SmallVector ParamKindVec; - for (auto &Param : DXILOp.Params) { + for (auto &Param : Op.Params) { ParamKindVec.emplace_back(Param.Kind); } - ParameterMap[DXILOp.DXILClass] = ParamKindVec; + ParameterMap[Op.OpClass] = ParamKindVec; Parameters.add(ParamKindVec); } @@ -363,26 +359,25 @@ static void emitDXILOperationTable(std::vector &DXILOps, // OpCodeClassNameIndex, // OverloadKind::FLOAT | OverloadKind::HALF, Attribute::AttrKind::ReadNone, 0, // 3, ParameterTableOffset}, - OS << "static const OpCodeProperty *getOpCodeProperty(dxil::OpCode DXILOp) " + OS << "static const OpCodeProperty *getOpCodeProperty(dxil::OpCode Op) " "{\n"; OS << " static const OpCodeProperty OpCodeProps[] = {\n"; - for (auto &DXILOp : DXILOps) { - OS << " { dxil::OpCode::" << DXILOp.DXILOp << ", " - << OpStrings.get(DXILOp.DXILOp.str()) - << ", OpCodeClass::" << DXILOp.DXILClass << ", " - << OpClassStrings.get(getDXILOpClassName(DXILOp.DXILClass)) << ", " - << getDXILOperationOverload(DXILOp.OverloadTypes) << ", " - << emitDXILOperationFnAttr(DXILOp.FnAttr) << ", " - << DXILOp.OverloadParamIndex << ", " << DXILOp.Params.size() << ", " - << Parameters.get(ParameterMap[DXILOp.DXILClass]) << " },\n"; + for (auto &Op : Ops) { + OS << " { dxil::OpCode::" << Op.OpName << ", " + << OpStrings.get(Op.OpName.str()) << ", OpCodeClass::" << Op.OpClass + << ", " << OpClassStrings.get(getDXILOpClassName(Op.OpClass)) << ", " + << getDXILOperationOverload(Op.OverloadTypes) << ", " + << emitDXILOperationFnAttr(Op.FnAttr) << ", " << Op.OverloadParamIndex + << ", " << Op.Params.size() << ", " + << Parameters.get(ParameterMap[Op.OpClass]) << " },\n"; } OS << " };\n"; OS << 
" // FIXME: change search to indexing with\n"; - OS << " // DXILOp once all DXIL op is added.\n"; + OS << " // Op once all DXIL operations are added.\n"; OS << " OpCodeProperty TmpProp;\n"; - OS << " TmpProp.OpCode = DXILOp;\n"; + OS << " TmpProp.OpCode = Op;\n"; OS << " const OpCodeProperty *Prop =\n"; OS << " llvm::lower_bound(OpCodeProps, TmpProp,\n"; OS << " [](const OpCodeProperty &A, const " @@ -394,12 +389,12 @@ static void emitDXILOperationTable(std::vector &DXILOps, OS << "}\n\n"; // Emit the string tables. - OS << "static const char *getOpCodeName(dxil::OpCode DXILOp) {\n\n"; + OS << "static const char *getOpCodeName(dxil::OpCode Op) {\n\n"; OpStrings.emitStringLiteralDef(OS, " static const char DXILOpCodeNameTable[]"); - OS << " auto *Prop = getOpCodeProperty(DXILOp);\n"; + OS << " auto *Prop = getOpCodeProperty(Op);\n"; OS << " unsigned Index = Prop->OpCodeNameOffset;\n"; OS << " return DXILOpCodeNameTable + Index;\n"; OS << "}\n\n"; @@ -431,14 +426,14 @@ static void emitDXILOperationTable(std::vector &DXILOps, } static void EmitDXILOperation(RecordKeeper &Records, raw_ostream &OS) { - std::vector Ops = Records.getAllDerivedDefinitions("DxilOperation"); + std::vector Ops = Records.getAllDerivedDefinitions("DXILOperation"); OS << "// Generated code, do not edit.\n"; OS << "\n"; - std::vector DXILOps; + std::vector DXILOps; DXILOps.reserve(Ops.size()); for (auto *Record : Ops) { - DXILOps.emplace_back(DXILOperationData(Record)); + DXILOps.emplace_back(DXILOperationDesc(Record)); } OS << "#ifdef DXIL_OP_ENUM\n"; -- cgit v1.1 From abc4f74df7ab3b324b7bf9d171e8a22a92d7dda5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 8 Feb 2024 10:03:08 -0800 Subject: [flang][cuda] Lower attribute for local variable (#81076) This is a first simple patch to introduce a new FIR attribute to carry the CUDA variable attribute information to hlfir.declare and fir.declare operations. It currently lowers this information for local variables. The texture attribute is omitted since it is rejected by semantic and will not make its way to MLIR. This new attribute is added as optional attribute to the hlfir.declare and fir.declare operations. 
--- flang/include/flang/Lower/ConvertVariable.h | 6 ++++ flang/include/flang/Optimizer/Builder/HLFIRTools.h | 10 +++--- flang/include/flang/Optimizer/Dialect/FIRAttr.td | 23 +++++++++++- flang/include/flang/Optimizer/Dialect/FIROps.td | 3 +- flang/include/flang/Optimizer/HLFIR/HLFIROps.td | 6 ++-- flang/lib/Lower/ConvertVariable.cpp | 41 ++++++++++++++++++++-- flang/lib/Optimizer/Builder/HLFIRTools.cpp | 5 +-- flang/lib/Optimizer/Dialect/FIRAttr.cpp | 3 +- flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp | 5 +-- .../Optimizer/HLFIR/Transforms/ConvertToFIR.cpp | 6 +++- flang/test/Lower/CUDA/cuda-data-attribute.cuf | 22 ++++++++++++ flang/unittests/Optimizer/FortranVariableTest.cpp | 12 ++++--- 12 files changed, 121 insertions(+), 21 deletions(-) create mode 100644 flang/test/Lower/CUDA/cuda-data-attribute.cuf diff --git a/flang/include/flang/Lower/ConvertVariable.h b/flang/include/flang/Lower/ConvertVariable.h index 0ff3ca9..cdbf050 100644 --- a/flang/include/flang/Lower/ConvertVariable.h +++ b/flang/include/flang/Lower/ConvertVariable.h @@ -137,6 +137,12 @@ translateSymbolAttributes(mlir::MLIRContext *mlirContext, fir::FortranVariableFlagsEnum extraFlags = fir::FortranVariableFlagsEnum::None); +/// Translate the CUDA Fortran attributes of \p sym into the FIR CUDA attribute +/// representation. +fir::CUDAAttributeAttr +translateSymbolCUDAAttribute(mlir::MLIRContext *mlirContext, + const Fortran::semantics::Symbol &sym); + /// Map a symbol to a given fir::ExtendedValue. This will generate an /// hlfir.declare when lowering to HLFIR and map the hlfir.declare result to the /// symbol. diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h index efbd57c..fe69ffa 100644 --- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h +++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h @@ -233,11 +233,11 @@ translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder, fir::FortranVariableOpInterface fortranVariable); /// Generate declaration for a fir::ExtendedValue in memory. -fir::FortranVariableOpInterface genDeclare(mlir::Location loc, - fir::FirOpBuilder &builder, - const fir::ExtendedValue &exv, - llvm::StringRef name, - fir::FortranVariableFlagsAttr flags); +fir::FortranVariableOpInterface +genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, + const fir::ExtendedValue &exv, llvm::StringRef name, + fir::FortranVariableFlagsAttr flags, + fir::CUDAAttributeAttr cudaAttr = {}); /// Generate an hlfir.associate to build a variable from an expression value. 
/// The type of the variable must be provided so that scalar logicals are diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td index 114bf7d..bc73124 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td +++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td @@ -55,7 +55,28 @@ def fir_FortranVariableFlagsAttr : fir_Attr<"FortranVariableFlags"> { let returnType = "::fir::FortranVariableFlagsEnum"; let convertFromStorage = "$_self.getFlags()"; let constBuilderCall = - "::fir::FortranVariableFlagsAttr::get($_builder.getContext(), $0)"; + "::fir::FortranVariableFlagsAttr::get($_builder.getContext(), $0)"; +} + +def CUDAconstant : I32EnumAttrCase<"Constant", 0, "constant">; +def CUDAdevice : I32EnumAttrCase<"Device", 1, "device">; +def CUDAmanaged : I32EnumAttrCase<"Managed", 2, "managed">; +def CUDApinned : I32EnumAttrCase<"Pinned", 3, "pinned">; +def CUDAshared : I32EnumAttrCase<"Shared", 4, "shared">; +def CUDAunified : I32EnumAttrCase<"Unified", 5, "unified">; +// Texture is omitted since it is obsolete and rejected by semantic. + +def fir_CUDAAttribute : I32EnumAttr< + "CUDAAttribute", + "CUDA Fortran variable attributes", + [CUDAconstant, CUDAdevice, CUDAmanaged, CUDApinned, CUDAshared, + CUDAunified]> { + let genSpecializedAttr = 0; + let cppNamespace = "::fir"; +} + +def fir_CUDAAttributeAttr : EnumAttr { + let assemblyFormat = [{ ```<` $value `>` }]; } def fir_BoxFieldAttr : I32EnumAttr< diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index fcecc60..b954a0c 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -3027,7 +3027,8 @@ def fir_DeclareOp : fir_Op<"declare", [AttrSizedOperandSegments, Optional:$shape, Variadic:$typeparams, Builtin_StringAttr:$uniq_name, - OptionalAttr:$fortran_attrs + OptionalAttr:$fortran_attrs, + OptionalAttr:$cuda_attr ); let results = (outs AnyRefOrBox); diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td index 753ede2..f22e9a7 100644 --- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td +++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td @@ -88,7 +88,8 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments, Optional:$shape, Variadic:$typeparams, Builtin_StringAttr:$uniq_name, - OptionalAttr:$fortran_attrs + OptionalAttr:$fortran_attrs, + OptionalAttr:$cuda_attr ); let results = (outs AnyFortranVariable, AnyRefOrBoxLike); @@ -101,7 +102,8 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments, let builders = [ OpBuilder<(ins "mlir::Value":$memref, "llvm::StringRef":$uniq_name, CArg<"mlir::Value", "{}">:$shape, CArg<"mlir::ValueRange", "{}">:$typeparams, - CArg<"fir::FortranVariableFlagsAttr", "{}">:$fortran_attrs)>]; + CArg<"fir::FortranVariableFlagsAttr", "{}">:$fortran_attrs, + CArg<"fir::CUDAAttributeAttr", "{}">:$cuda_attr)>]; let extraClassDeclaration = [{ /// Get the variable original base (same as input). 
It lacks diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 8ea2557..f761e14 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -1579,6 +1579,38 @@ fir::FortranVariableFlagsAttr Fortran::lower::translateSymbolAttributes( return fir::FortranVariableFlagsAttr::get(mlirContext, flags); } +fir::CUDAAttributeAttr Fortran::lower::translateSymbolCUDAAttribute( + mlir::MLIRContext *mlirContext, const Fortran::semantics::Symbol &sym) { + std::optional cudaAttr = + Fortran::semantics::GetCUDADataAttr(&sym); + if (cudaAttr) { + fir::CUDAAttribute attr; + switch (*cudaAttr) { + case Fortran::common::CUDADataAttr::Constant: + attr = fir::CUDAAttribute::Constant; + break; + case Fortran::common::CUDADataAttr::Device: + attr = fir::CUDAAttribute::Device; + break; + case Fortran::common::CUDADataAttr::Managed: + attr = fir::CUDAAttribute::Managed; + break; + case Fortran::common::CUDADataAttr::Pinned: + attr = fir::CUDAAttribute::Pinned; + break; + case Fortran::common::CUDADataAttr::Shared: + attr = fir::CUDAAttribute::Shared; + break; + case Fortran::common::CUDADataAttr::Texture: + // Obsolete attribute + break; + } + + return fir::CUDAAttributeAttr::get(mlirContext, attr); + } + return {}; +} + /// Map a symbol to its FIR address and evaluated specification expressions. /// Not for symbols lowered to fir.box. /// Will optionally create fir.declare. @@ -1618,6 +1650,8 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter, auto name = converter.mangleName(sym); fir::FortranVariableFlagsAttr attributes = Fortran::lower::translateSymbolAttributes(builder.getContext(), sym); + fir::CUDAAttributeAttr cudaAttr = + Fortran::lower::translateSymbolCUDAAttribute(builder.getContext(), sym); if (isCrayPointee) { mlir::Type baseType = @@ -1664,7 +1698,7 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter, return; } auto newBase = builder.create( - loc, base, name, shapeOrShift, lenParams, attributes); + loc, base, name, shapeOrShift, lenParams, attributes, cudaAttr); symMap.addVariableDefinition(sym, newBase, force); return; } @@ -1709,9 +1743,12 @@ void Fortran::lower::genDeclareSymbol( fir::FortranVariableFlagsAttr attributes = Fortran::lower::translateSymbolAttributes( builder.getContext(), sym.GetUltimate(), extraFlags); + fir::CUDAAttributeAttr cudaAttr = + Fortran::lower::translateSymbolCUDAAttribute(builder.getContext(), + sym.GetUltimate()); auto name = converter.mangleName(sym); hlfir::EntityWithAttributes declare = - hlfir::genDeclare(loc, builder, exv, name, attributes); + hlfir::genDeclare(loc, builder, exv, name, attributes, cudaAttr); symMap.addVariableDefinition(sym, declare.getIfVariableInterface(), force); return; } diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index 94f723b..61e5311 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -198,7 +198,8 @@ mlir::Value hlfir::Entity::getFirBase() const { fir::FortranVariableOpInterface hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, const fir::ExtendedValue &exv, llvm::StringRef name, - fir::FortranVariableFlagsAttr flags) { + fir::FortranVariableFlagsAttr flags, + fir::CUDAAttributeAttr cudaAttr) { mlir::Value base = fir::getBase(exv); assert(fir::conformsWithPassByRef(base.getType()) && @@ -228,7 +229,7 @@ hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, }, [](const auto &) {}); 
auto declareOp = builder.create( - loc, base, name, shapeOrShift, lenParams, flags); + loc, base, name, shapeOrShift, lenParams, flags, cudaAttr); return mlir::cast(declareOp.getOperation()); } diff --git a/flang/lib/Optimizer/Dialect/FIRAttr.cpp b/flang/lib/Optimizer/Dialect/FIRAttr.cpp index 4871091..04431b6 100644 --- a/flang/lib/Optimizer/Dialect/FIRAttr.cpp +++ b/flang/lib/Optimizer/Dialect/FIRAttr.cpp @@ -14,6 +14,7 @@ #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/Support/KindMapping.h" #include "mlir/IR/AttributeSupport.h" +#include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/DialectImplementation.h" #include "llvm/ADT/SmallString.h" @@ -297,5 +298,5 @@ void fir::printFirAttribute(FIROpsDialect *dialect, mlir::Attribute attr, void FIROpsDialect::registerAttributes() { addAttributes(); + UpperBoundAttr, CUDAAttributeAttr>(); } diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp index ce12e6f..85644c1 100644 --- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp +++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp @@ -123,14 +123,15 @@ void hlfir::DeclareOp::build(mlir::OpBuilder &builder, mlir::OperationState &result, mlir::Value memref, llvm::StringRef uniq_name, mlir::Value shape, mlir::ValueRange typeparams, - fir::FortranVariableFlagsAttr fortran_attrs) { + fir::FortranVariableFlagsAttr fortran_attrs, + fir::CUDAAttributeAttr cuda_attr) { auto nameAttr = builder.getStringAttr(uniq_name); mlir::Type inputType = memref.getType(); bool hasExplicitLbs = hasExplicitLowerBounds(shape); mlir::Type hlfirVariableType = getHLFIRVariableType(inputType, hasExplicitLbs); build(builder, result, {hlfirVariableType, inputType}, memref, shape, - typeparams, nameAttr, fortran_attrs); + typeparams, nameAttr, fortran_attrs, cuda_attr); } mlir::LogicalResult hlfir::DeclareOp::verify() { diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp index b690185..b15fb59 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp @@ -320,12 +320,16 @@ public: mlir::Location loc = declareOp->getLoc(); mlir::Value memref = declareOp.getMemref(); fir::FortranVariableFlagsAttr fortranAttrs; + fir::CUDAAttributeAttr cudaAttr; if (auto attrs = declareOp.getFortranAttrs()) fortranAttrs = fir::FortranVariableFlagsAttr::get(rewriter.getContext(), *attrs); + if (auto attr = declareOp.getCudaAttr()) + cudaAttr = fir::CUDAAttributeAttr::get(rewriter.getContext(), *attr); auto firDeclareOp = rewriter.create( loc, memref.getType(), memref, declareOp.getShape(), - declareOp.getTypeparams(), declareOp.getUniqName(), fortranAttrs); + declareOp.getTypeparams(), declareOp.getUniqName(), fortranAttrs, + cudaAttr); // Propagate other attributes from hlfir.declare to fir.declare. // OpenACC's acc.declare is one example. Right now, the propagation diff --git a/flang/test/Lower/CUDA/cuda-data-attribute.cuf b/flang/test/Lower/CUDA/cuda-data-attribute.cuf new file mode 100644 index 0000000..caa8ac7 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-data-attribute.cuf @@ -0,0 +1,22 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s +! RUN: bbc -emit-hlfir -fcuda %s -o - | fir-opt -convert-hlfir-to-fir | FileCheck %s --check-prefix=FIR + +! Test lowering of CUDA attribute on local variables. 
+
+subroutine local_var_attrs
+  real, constant :: rc
+  real, device :: rd
+  real, allocatable, managed :: rm
+  real, allocatable, pinned :: rp
+end subroutine
+
+! CHECK-LABEL: func.func @_QPlocal_var_attrs()
+! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<constant>, uniq_name = "_QFlocal_var_attrsErc"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QFlocal_var_attrsErd"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFlocal_var_attrsErm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFlocal_var_attrsErp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+
+! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<constant>, uniq_name = "_QFlocal_var_attrsErc"} : (!fir.ref<f32>) -> !fir.ref<f32>
+! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QFlocal_var_attrsErd"} : (!fir.ref<f32>) -> !fir.ref<f32>
+! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFlocal_var_attrsErm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<f32>>>
+! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFlocal_var_attrsErp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<f32>>>
diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp
index 42ed225..4b101ce 100644
--- a/flang/unittests/Optimizer/FortranVariableTest.cpp
+++ b/flang/unittests/Optimizer/FortranVariableTest.cpp
@@ -49,7 +49,8 @@ TEST_F(FortranVariableTest, SimpleScalar) {
   auto name = mlir::StringAttr::get(&context, "x");
   auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
       /*shape=*/mlir::Value{}, /*typeParams=*/std::nullopt, name,
-      /*fortran_attrs=*/fir::FortranVariableFlagsAttr{});
+      /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
+      /*cuda_attr=*/fir::CUDAAttributeAttr{});
 
   fir::FortranVariableOpInterface fortranVariable = declare;
   EXPECT_FALSE(fortranVariable.isArray());
@@ -74,7 +75,8 @@ TEST_F(FortranVariableTest, CharacterScalar) {
   auto name = mlir::StringAttr::get(&context, "x");
   auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
       /*shape=*/mlir::Value{}, typeParams, name,
-      /*fortran_attrs=*/fir::FortranVariableFlagsAttr{});
+      /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
+      /*cuda_attr=*/fir::CUDAAttributeAttr{});
 
   fir::FortranVariableOpInterface fortranVariable = declare;
   EXPECT_FALSE(fortranVariable.isArray());
@@ -104,7 +106,8 @@ TEST_F(FortranVariableTest, SimpleArray) {
   auto name = mlir::StringAttr::get(&context, "x");
   auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
       shape, /*typeParams*/ std::nullopt, name,
-      /*fortran_attrs=*/fir::FortranVariableFlagsAttr{});
+      /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
+      /*cuda_attr=*/fir::CUDAAttributeAttr{});
 
   fir::FortranVariableOpInterface fortranVariable = declare;
   EXPECT_TRUE(fortranVariable.isArray());
@@ -134,7 +137,8 @@ TEST_F(FortranVariableTest, CharacterArray) {
   auto name = mlir::StringAttr::get(&context, "x");
   auto declare = builder->create<fir::DeclareOp>(
       loc, addr.getType(), addr, shape, typeParams, name,
-      /*fortran_attrs=*/fir::FortranVariableFlagsAttr{});
+      /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
+      /*cuda_attr=*/fir::CUDAAttributeAttr{});
 
   fir::FortranVariableOpInterface fortranVariable = declare;
   EXPECT_TRUE(fortranVariable.isArray());
-- cgit v1.1
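Taken together, this commit threads one optional attribute from Fortran semantics down to FIR: semantics yields the CUDA data attribute, lowering attaches it to hlfir.declare, and the HLFIR-to-FIR conversion copies it onto fir.declare only when present. A stand-alone model of that last propagation step, which the FIR check lines above verify (plain C++ stand-ins, not the real MLIR classes):

```cpp
// Illustrative model of DeclareOpConversion's attribute propagation:
// copy the optional CUDA attribute only when the source op carries one,
// mirroring `if (auto attr = declareOp.getCudaAttr())` above.
#include <iostream>
#include <optional>
#include <string>

enum class CUDAAttribute { Constant, Device, Managed, Pinned, Shared, Unified };

struct HLFIRDeclare { std::string uniqName; std::optional<CUDAAttribute> cudaAttr; };
struct FIRDeclare   { std::string uniqName; std::optional<CUDAAttribute> cudaAttr; };

FIRDeclare convertToFIR(const HLFIRDeclare &op) {
  FIRDeclare fir{op.uniqName, {}};
  if (op.cudaAttr) // absent attribute stays absent on the converted op
    fir.cudaAttr = op.cudaAttr;
  return fir;
}

int main() {
  HLFIRDeclare rd{"_QFlocal_var_attrsErd", CUDAAttribute::Device};
  FIRDeclare fir = convertToFIR(rd);
  std::cout << fir.uniqName << (fir.cudaAttr ? " has cuda_attr\n" : "\n");
}
```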
From 17f0680f69f44d340fd0205f7763b2830357c0d5 Mon Sep 17 00:00:00 2001
From: Krystian Stasiowski
Date: Thu, 8 Feb 2024 13:04:10 -0500
Subject: [Clang][Sema] Abbreviated function templates do not append invented
 parameters to empty template parameter lists (#80864)

According to [dcl.fct] p23:
> An abbreviated function template can have a _template-head_. The invented
> _template-parameters_ are appended to the _template-parameter-list_ after
> the explicitly declared _template-parameters_.

`template<>` is not a _template-head_ -- a _template-head_ must have at least
one _template-parameter_. This patch corrects our current behavior of
appending the invented template parameters to the innermost template
parameter list, regardless of whether it is empty. Example:
```
template<typename T>
struct A {
  void f(auto);
};

template<>
void A<int>::f(auto); // ok

template<>
template<> // warning: extraneous template parameter list in template specialization
void A<int>::f(auto);
```
---
 clang/docs/ReleaseNotes.rst                        |  2 ++
 clang/include/clang/AST/DeclTemplate.h             |  1 +
 clang/lib/AST/DeclPrinter.cpp                      |  4 ++++
 clang/lib/Sema/SemaDecl.cpp                        |  2 +-
 clang/lib/Sema/SemaDeclCXX.cpp                     | 11 +++++++++-
 clang/test/AST/ast-print-method-decl.cpp           |  3 +--
 .../test/CXX/dcl.decl/dcl.meaning/dcl.fct/p23.cpp  | 24 ++++++++++++++++++++++
 clang/test/OpenMP/for_loop_auto.cpp                |  2 +-
 8 files changed, 44 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p23.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index e158284..32440ee 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -215,6 +215,8 @@ Bug Fixes to C++ Support
   Fixes (`#68490 `_)
 - Fix a crash when trying to call a varargs function that also has an explicit object parameter.
   Fixes (`#80971 ICE when explicit object parameter be a function parameter pack`)
+- Fixed a bug where abbreviated function templates would append their invented template parameters to
+  an empty template parameter list.
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h
index baf7114..e3b6a7e 100644
--- a/clang/include/clang/AST/DeclTemplate.h
+++ b/clang/include/clang/AST/DeclTemplate.h
@@ -134,6 +134,7 @@ public:
   const_iterator end() const { return begin() + NumParams; }
 
   unsigned size() const { return NumParams; }
+  bool empty() const { return NumParams == 0; }
 
   ArrayRef<NamedDecl *> asArray() { return llvm::ArrayRef(begin(), end()); }
   ArrayRef<const NamedDecl *> asArray() const {
diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp
index 822ac12..43d2219 100644
--- a/clang/lib/AST/DeclPrinter.cpp
+++ b/clang/lib/AST/DeclPrinter.cpp
@@ -1215,6 +1215,10 @@ void DeclPrinter::printTemplateParameters(const TemplateParameterList *Params,
                                           bool OmitTemplateKW) {
   assert(Params);
 
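The fix itself is the extra `empty()` guard in the SemaDecl.cpp hunk just below: invented parameters may only replace or extend the innermost explicit list when that list is a genuine template-head, i.e. non-empty. A small self-contained model of that decision (hypothetical types, mirroring the guard's logic rather than clang's real data structures):

```cpp
// Sketch of the corrected merge decision: an empty list (`template<>`)
// is not a template-head, so the invented list is kept separate.
#include <vector>

struct ParamList {
  std::vector<int> Params;
  unsigned Depth = 0;
  bool empty() const { return Params.empty(); }
};

void mergeInvented(std::vector<ParamList *> &Lists, ParamList *Invented) {
  // Merge only into a non-empty innermost list at the same depth.
  if (!Lists.empty() && !Lists.back()->empty() &&
      Invented->Depth == Lists.back()->Depth)
    Lists.back() = Invented;   // replace the innermost explicit list
  else
    Lists.push_back(Invented); // otherwise the invented list stands alone
}

int main() {
  ParamList empty;              // models `template<>`
  ParamList invented{{1}, 0};   // models the invented parameter list
  std::vector<ParamList *> lists{&empty};
  mergeInvented(lists, &invented); // appended, not merged: lists.size() == 2
}
```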
+  // Don't print invented template parameter lists.
+  if (!Params->empty() && Params->getParam(0)->isImplicit())
+    return;
+
   if (!OmitTemplateKW)
     Out << "template ";
   Out << '<';
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 18a5d93..2c526cd 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -9759,7 +9759,7 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
   SmallVector<TemplateParameterList *, 4> TemplateParamLists;
   llvm::append_range(TemplateParamLists, TemplateParamListsRef);
   if (TemplateParameterList *Invented = D.getInventedTemplateParameterList()) {
-    if (!TemplateParamLists.empty() &&
+    if (!TemplateParamLists.empty() && !TemplateParamLists.back()->empty() &&
         Invented->getDepth() == TemplateParamLists.back()->getDepth())
       TemplateParamLists.back() = Invented;
     else
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index ab8a967..fea8c50 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -19294,7 +19294,16 @@ void Sema::ActOnStartFunctionDeclarationDeclarator(
             ExplicitLists, /*IsFriend=*/false, IsMemberSpecialization,
             IsInvalid, /*SuppressDiagnostic=*/true);
   }
-  if (ExplicitParams) {
+  // C++23 [dcl.fct]p23:
+  //   An abbreviated function template can have a template-head. The invented
+  //   template-parameters are appended to the template-parameter-list after
+  //   the explicitly declared template-parameters.
+  //
+  // A template-head must have one or more template-parameters (read:
+  // 'template<>' is *not* a template-head). Only append the invented
+  // template parameters if we matched the nested-name-specifier to a non-empty
+  // TemplateParameterList.
+  if (ExplicitParams && !ExplicitParams->empty()) {
     Info.AutoTemplateParameterDepth = ExplicitParams->getDepth();
     llvm::append_range(Info.TemplateParams, *ExplicitParams);
     Info.NumExplicitTemplateParams = ExplicitParams->size();
diff --git a/clang/test/AST/ast-print-method-decl.cpp b/clang/test/AST/ast-print-method-decl.cpp
index 9f5d112..75dea0c 100644
--- a/clang/test/AST/ast-print-method-decl.cpp
+++ b/clang/test/AST/ast-print-method-decl.cpp
@@ -32,8 +32,7 @@ struct DelegatingCtor2 {
 
 // CHECK: struct DelegatingCtor3 {
 struct DelegatingCtor3 {
-  // FIXME: template <> should not be output
-  // CHECK: template <> DelegatingCtor3(auto);
+  // CHECK: DelegatingCtor3(auto);
   DelegatingCtor3(auto);
 
   // FIXME: Implicitly specialized method should not be output
diff --git a/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p23.cpp b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p23.cpp
new file mode 100644
index 0000000..469c4e0
--- /dev/null
+++ b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p23.cpp
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -std=c++20 -pedantic-errors -verify %s
+
+// FIXME: This should be an error with -pedantic-errors.
+template<> // expected-warning {{extraneous template parameter list in template specialization}}
+void f(auto);
+
+template<typename T>
+void f(auto);
+
+template<typename T>
+struct A {
+  void g(auto);
+};
+
+template<typename T>
+void A<T>::g(auto) { }
+
+template<>
+void A<int>::g(auto) { }
+
+// FIXME: This should be an error with -pedantic-errors.
+template<> +template<> // expected-warning {{extraneous template parameter list in template specialization}} +void A::g(auto) { } diff --git a/clang/test/OpenMP/for_loop_auto.cpp b/clang/test/OpenMP/for_loop_auto.cpp index b2c5540..4467de6 100644 --- a/clang/test/OpenMP/for_loop_auto.cpp +++ b/clang/test/OpenMP/for_loop_auto.cpp @@ -10,7 +10,7 @@ #ifndef HEADER #define HEADER -// CHECK: template <> void do_loop(const auto &v) { +// CHECK: void do_loop(const auto &v) { // CHECK-NEXT: #pragma omp parallel for // CHECK-NEXT: for (const auto &i : v) // CHECK-NEXT: ; -- cgit v1.1 From 35fae044c5faf8ddb9be7b47bb7573e839f77472 Mon Sep 17 00:00:00 2001 From: Peiming Liu <36770114+PeimingLiu@users.noreply.github.com> Date: Thu, 8 Feb 2024 12:12:24 -0600 Subject: [mlir][sparse] using non-static field to avoid data races. (#81165) --- .../SparseTensor/Transforms/Utils/LoopEmitter.cpp | 15 +++---- .../SparseTensor/Transforms/Utils/LoopEmitter.h | 1 + .../Transforms/Utils/SparseTensorLevel.cpp | 48 ++++++++++++++-------- .../Transforms/Utils/SparseTensorLevel.h | 20 +++++---- 4 files changed, 50 insertions(+), 34 deletions(-) diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp index 1c2857d..0ead135 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp @@ -94,7 +94,7 @@ void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput, this->loopTag = loopTag; this->hasOutput = hasOutput; this->isSparseOut = isSparseOut; - SparseIterator::setSparseEmitStrategy(emitStrategy); + this->emitStrategy = emitStrategy; const unsigned numManifestTensors = ts.size(); const unsigned synTensorId = numManifestTensors; @@ -166,13 +166,13 @@ void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput, std::unique_ptr LoopEmitter::makeLevelIterator(OpBuilder &builder, Location loc, TensorId t, Level l) { - auto it = makeSimpleIterator(*lvls[t][l]); + auto it = makeSimpleIterator(*lvls[t][l], emitStrategy); auto stt = getSparseTensorType(tensors[t]); if (stt.hasEncoding() && stt.getEncoding().isSlice()) { Value offset = genSliceOffset(builder, loc, tensors[t], l); Value stride = genSliceStride(builder, loc, tensors[t], l); - auto slicedIt = makeSlicedLevelIterator(std::move(it), offset, stride, - lvls[t][l]->getSize()); + auto slicedIt = makeSlicedLevelIterator( + std::move(it), offset, stride, lvls[t][l]->getSize(), emitStrategy); return slicedIt; } return it; @@ -186,7 +186,7 @@ void LoopEmitter::initializeLoopEmit( TensorId synId = getSynTensorId(); for (unsigned i = 0, e = loopHighs.size(); i < e; i++) { Value sz = loopHighs[i] = synSetter(builder, loc, i); - auto [stl, it] = makeSynLevelAndIterator(sz, synId, i); + auto [stl, it] = makeSynLevelAndIterator(sz, synId, i, emitStrategy); lvls[synId][i] = std::move(stl); iters[synId][i].emplace_back(std::move(it)); } @@ -317,12 +317,13 @@ void LoopEmitter::initSubSectIterator(OpBuilder &builder, Location loc) { size = ADDI(size, ADDI(MULI(idxMax, C_IDX(stride)), C_IDX(1))); } it = makeNonEmptySubSectIterator(builder, loc, parent, loopHighs[loop], - std::move(lvlIt), size, curDep.second); + std::move(lvlIt), size, curDep.second, + emitStrategy); } else { const SparseIterator &subSectIter = *iters[t][lvl].back(); it = makeTraverseSubSectIterator(builder, loc, subSectIter, *parent, std::move(lvlIt), loopHighs[loop], - curDep.second); + curDep.second, 
+                                       emitStrategy);
     }
     lastIter[t] = it.get();
     iters[t][lvl].emplace_back(std::move(it));
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
index 5bab2c6..7bfe713 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
@@ -380,6 +380,7 @@ private:
   /// tensor.
   bool hasOutput;
   bool isSparseOut;
+  SparseEmitStrategy emitStrategy;
 
   //
   // Fields which have `numTensor` many entries.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp
index 04b49c3..4ba9ecb 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp
@@ -773,9 +773,6 @@ public:
 // SparseIterator derived classes implementation.
 //===----------------------------------------------------------------------===//
 
-SparseEmitStrategy SparseIterator::emitStrategy =
-    SparseEmitStrategy::kFunctional;
-
 void SparseIterator::genInit(OpBuilder &b, Location l,
                              const SparseIterator *p) {
   if (emitStrategy == SparseEmitStrategy::kDebugInterface) {
@@ -1303,27 +1300,38 @@ sparse_tensor::makeSparseTensorLevel(OpBuilder &b, Location l, Value t,
 }
 
 std::pair<std::unique_ptr<SparseTensorLevel>, std::unique_ptr<SparseIterator>>
-sparse_tensor::makeSynLevelAndIterator(Value sz, unsigned tid, unsigned lvl) {
+sparse_tensor::makeSynLevelAndIterator(Value sz, unsigned tid, unsigned lvl,
+                                       SparseEmitStrategy strategy) {
   auto stl = std::make_unique<DenseLevel>(tid, lvl, sz, /*encoded=*/false);
   auto it = std::make_unique<TrivialIterator>(*stl);
+  it->setSparseEmitStrategy(strategy);
   return std::make_pair(std::move(stl), std::move(it));
 }
 
 std::unique_ptr<SparseIterator>
-sparse_tensor::makeSimpleIterator(const SparseTensorLevel &stl) {
+sparse_tensor::makeSimpleIterator(const SparseTensorLevel &stl,
+                                  SparseEmitStrategy strategy) {
+  std::unique_ptr<SparseIterator> ret;
   if (!isUniqueLT(stl.getLT())) {
    // We always deduplicate the non-unique level, but we should optimize it away
    // if possible.
-    return std::make_unique<DedupIterator>(stl);
+    ret = std::make_unique<DedupIterator>(stl);
+  } else {
+    ret = std::make_unique<TrivialIterator>(stl);
   }
-  return std::make_unique<TrivialIterator>(stl);
+  ret->setSparseEmitStrategy(strategy);
+  return ret;
 }
 
 std::unique_ptr<SparseIterator>
 sparse_tensor::makeSlicedLevelIterator(std::unique_ptr<SparseIterator> &&sit,
-                                       Value offset, Value stride, Value size) {
+                                       Value offset, Value stride, Value size,
+                                       SparseEmitStrategy strategy) {
 
-  return std::make_unique<FilterIterator>(std::move(sit), offset, stride, size);
+  auto ret =
+      std::make_unique<FilterIterator>(std::move(sit), offset, stride, size);
+  ret->setSparseEmitStrategy(strategy);
+  return ret;
 }
 
 static const SparseIterator *tryUnwrapFilter(const SparseIterator *it) {
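Moving `emitStrategy` from a static member to an instance field, set through the factory functions, is what removes the data race: a static field is one process-wide knob that every compilation thread writes, while a per-instance field is owned by the iterator that was configured. A compilable miniature of the before/after (the names echo the patch; everything else is illustrative):

```cpp
// Before: one shared setting; threads configuring different strategies race.
// After: each iterator owns its strategy, fixed when it is created.
#include <thread>

enum class SparseEmitStrategy { kFunctional, kDebugInterface };

struct SharedConfigIterator {               // the old shape: racy
  static SparseEmitStrategy emitStrategy;
  static void setSparseEmitStrategy(SparseEmitStrategy s) { emitStrategy = s; }
};
SparseEmitStrategy SharedConfigIterator::emitStrategy =
    SparseEmitStrategy::kFunctional;

struct OwnedConfigIterator {                // the new shape: thread-safe
  SparseEmitStrategy emitStrategy = SparseEmitStrategy::kFunctional;
  void setSparseEmitStrategy(SparseEmitStrategy s) { emitStrategy = s; }
};

int main() {
  // Two concurrent compilations can now use different strategies safely.
  std::thread a([] {
    OwnedConfigIterator it;
    it.setSparseEmitStrategy(SparseEmitStrategy::kFunctional);
  });
  std::thread b([] {
    OwnedConfigIterator it;
    it.setSparseEmitStrategy(SparseEmitStrategy::kDebugInterface);
  });
  a.join();
  b.join();
}
```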
@@ -1335,38 +1343,42 @@ static const SparseIterator *tryUnwrapFilter(const SparseIterator *it) {
 std::unique_ptr<SparseIterator> sparse_tensor::makeNonEmptySubSectIterator(
     OpBuilder &b, Location l, const SparseIterator *parent, Value loopBound,
-    std::unique_ptr<SparseIterator> &&delegate, Value size, unsigned stride) {
+    std::unique_ptr<SparseIterator> &&delegate, Value size, unsigned stride,
+    SparseEmitStrategy strategy) {
   // Try unwrap the NonEmptySubSectIterator from a filter parent.
   parent = tryUnwrapFilter(parent);
-  auto it = std::make_unique<NonEmptySubSectIterator>(
-      b, l, parent, std::move(delegate), size);
+  std::unique_ptr<SparseIterator> it =
+      std::make_unique<NonEmptySubSectIterator>(b, l, parent,
+                                                std::move(delegate), size);
 
   if (stride != 1) {
     // TODO: We can safely skip bound checking on sparse levels, but for dense
     // iteration space, we need the bound to infer the dense loop range.
-    return std::make_unique<FilterIterator>(std::move(it), /*offset=*/C_IDX(0),
-                                            C_IDX(stride), /*size=*/loopBound);
+    it = std::make_unique<FilterIterator>(std::move(it), /*offset=*/C_IDX(0),
+                                          C_IDX(stride), /*size=*/loopBound);
   }
+  it->setSparseEmitStrategy(strategy);
   return it;
 }
 
 std::unique_ptr<SparseIterator> sparse_tensor::makeTraverseSubSectIterator(
     OpBuilder &b, Location l, const SparseIterator &subSectIter,
     const SparseIterator &parent, std::unique_ptr<SparseIterator> &&wrap,
-    Value loopBound, unsigned stride) {
+    Value loopBound, unsigned stride, SparseEmitStrategy strategy) {
   // This must be a subsection iterator or a filtered subsection iterator.
   auto &subSect =
       llvm::cast<NonEmptySubSectIterator>(*tryUnwrapFilter(&subSectIter));
 
-  auto it = std::make_unique<SubSectIterator>(
+  std::unique_ptr<SparseIterator> it = std::make_unique<SubSectIterator>(
       subSect, *tryUnwrapFilter(&parent), std::move(wrap));
 
   if (stride != 1) {
-    return std::make_unique<FilterIterator>(std::move(it), /*offset=*/C_IDX(0),
-                                            C_IDX(stride), /*size=*/loopBound);
+    it = std::make_unique<FilterIterator>(std::move(it), /*offset=*/C_IDX(0),
+                                          C_IDX(stride), /*size=*/loopBound);
   }
+  it->setSparseEmitStrategy(strategy);
   return it;
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.h
index fc2d9de..d1e94b7 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.h
@@ -111,8 +111,8 @@ protected:
 public:
   virtual ~SparseIterator() = default;
 
-  static void setSparseEmitStrategy(SparseEmitStrategy strategy) {
-    SparseIterator::emitStrategy = strategy;
+  void setSparseEmitStrategy(SparseEmitStrategy strategy) {
+    emitStrategy = strategy;
   }
 
   virtual std::string getDebugInterfacePrefix() const = 0;
@@ -248,7 +248,7 @@ protected:
     return ref.take_front(cursorValsCnt);
   }
 
-  static SparseEmitStrategy emitStrategy;
+  SparseEmitStrategy emitStrategy;
 
 public:
   const IterKind kind; // For LLVM-style RTTI.
@@ -277,32 +277,34 @@ std::unique_ptr<SparseTensorLevel> makeSparseTensorLevel(OpBuilder &builder,
 
 /// Helper function to create a simple SparseIterator object that iterates over
 /// the SparseTensorLevel.
-std::unique_ptr<SparseIterator>
-makeSimpleIterator(const SparseTensorLevel &stl);
+std::unique_ptr<SparseIterator> makeSimpleIterator(const SparseTensorLevel &stl,
+                                                   SparseEmitStrategy strategy);
 
 /// Helper function to create a synthetic SparseIterator object that iterates
 /// over a dense space specified by [0,`sz`).
 std::pair<std::unique_ptr<SparseTensorLevel>, std::unique_ptr<SparseIterator>>
-makeSynLevelAndIterator(Value sz, unsigned tid, unsigned lvl);
+makeSynLevelAndIterator(Value sz, unsigned tid, unsigned lvl,
+                        SparseEmitStrategy strategy);
 
 /// Helper function to create a SparseIterator object that iterates over a
 /// sliced space; the original space (before slicing) is traversed by `sit`.
 std::unique_ptr<SparseIterator>
 makeSlicedLevelIterator(std::unique_ptr<SparseIterator> &&sit, Value offset,
-                        Value stride, Value size);
+                        Value stride, Value size, SparseEmitStrategy strategy);
 
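Note the shape of the fix in both factories above: the early `return std::make_unique<FilterIterator>(...)` became an assignment, so that `setSparseEmitStrategy` now runs on whichever iterator, wrapped or not, is finally returned. A reduced sketch of that wrap-then-configure order (hypothetical types, not the real iterator hierarchy):

```cpp
// The strided path wraps the base iterator in a filter (a decorator), and
// only after any wrapping is the per-instance strategy applied, so it
// always lands on the outermost object.
#include <memory>
#include <utility>

enum class Strategy { kFunctional, kDebugInterface };

struct Iterator {
  virtual ~Iterator() = default;
  void setStrategy(Strategy s) { strategy = s; }
  Strategy strategy = Strategy::kFunctional;
};
struct BaseIterator : Iterator {};
struct FilterIterator : Iterator { // decorator over another iterator
  explicit FilterIterator(std::unique_ptr<Iterator> inner)
      : inner(std::move(inner)) {}
  std::unique_ptr<Iterator> inner;
};

std::unique_ptr<Iterator> makeIterator(unsigned stride, Strategy s) {
  std::unique_ptr<Iterator> it = std::make_unique<BaseIterator>();
  if (stride != 1)
    it = std::make_unique<FilterIterator>(std::move(it)); // wrap, no early return
  it->setStrategy(s); // always reaches the outermost wrapper
  return it;
}

int main() {
  auto it = makeIterator(/*stride=*/2, Strategy::kFunctional);
  (void)it;
}
```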
/// Helper function to create a SparseIterator object that iterates over the
/// non-empty subsections set.
 std::unique_ptr<SparseIterator> makeNonEmptySubSectIterator(
     OpBuilder &b, Location l, const SparseIterator *parent, Value loopBound,
-    std::unique_ptr<SparseIterator> &&delegate, Value size, unsigned stride);
+    std::unique_ptr<SparseIterator> &&delegate, Value size, unsigned stride,
+    SparseEmitStrategy strategy);
 
 /// Helper function to create a SparseIterator object that iterates over a
 /// non-empty subsection created by NonEmptySubSectIterator.
 std::unique_ptr<SparseIterator> makeTraverseSubSectIterator(
     OpBuilder &b, Location l, const SparseIterator &subsectIter,
     const SparseIterator &parent, std::unique_ptr<SparseIterator> &&wrap,
-    Value loopBound, unsigned stride);
+    Value loopBound, unsigned stride, SparseEmitStrategy strategy);
 
 } // namespace sparse_tensor
 } // namespace mlir
-- cgit v1.1

From da95d926f6fce4ed9707c77908ad96624268f134 Mon Sep 17 00:00:00 2001
From: Jan Svoboda
Date: Thu, 8 Feb 2024 19:19:18 +0100
Subject: [clang][lex] Always pass suggested module to `InclusionDirective()`
 callback (#81061)

This patch provides more information to the `PPCallbacks::InclusionDirective()`
hook. We now always pass the suggested module, regardless of whether it was
actually imported or not. The extra `bool ModuleImported` parameter then
denotes whether the header `#include` will be automatically translated into an
import of the module.

The main change is in `clang/lib/Lex/PPDirectives.cpp`, where we take care not
to modify `SuggestedModule` after it's been populated by
`LookupHeaderIncludeOrImport()`. We now exclusively use the `SM`
(`ModuleToImport`) variable instead, which has been equivalent to
`SuggestedModule` until now. This allows us to use the original, unmodified
`SuggestedModule` for the callback itself.

(This patch turns out to be necessary for
https://github.com/apple/llvm-project/pull/8011).
---
 clang-tools-extra/clang-move/Move.cpp              |  3 +-
 .../clang-tidy/ExpandModularHeadersPPCallbacks.cpp |  6 +-
 .../clang-tidy/ExpandModularHeadersPPCallbacks.h   |  2 +-
 .../altera/KernelNameRestrictionCheck.cpp          |  5 +-
 .../clang-tidy/bugprone/SuspiciousIncludeCheck.cpp |  7 ++-
 .../clang-tidy/llvm/IncludeOrderCheck.cpp          |  7 ++-
 .../llvmlibc/RestrictSystemLibcHeadersCheck.cpp    |  9 +--
 .../clang-tidy/misc/HeaderIncludeCycleCheck.cpp    |  2 +-
 .../modernize/DeprecatedHeadersCheck.cpp           |  7 ++-
 .../clang-tidy/modernize/MacroToEnumCheck.cpp      |  3 +-
 .../portability/RestrictSystemIncludesCheck.cpp    |  4 +-
 .../portability/RestrictSystemIncludesCheck.h      |  3 +-
 .../readability/DuplicateIncludeCheck.cpp          |  7 ++-
 .../clang-tidy/utils/IncludeInserter.cpp           |  3 +-
 clang-tools-extra/clangd/Headers.cpp               |  3 +-
 clang-tools-extra/clangd/ParsedAST.cpp             |  2 +-
 clang-tools-extra/clangd/index/IndexAction.cpp     |  3 +-
 .../clangd/unittests/ReplayPeambleTests.cpp        |  2 +-
 clang-tools-extra/include-cleaner/lib/Record.cpp   |  6 +-
 clang-tools-extra/modularize/CoverageChecker.cpp   |  3 +-
 .../modularize/PreprocessorTracker.cpp             | 20 +++----
 clang-tools-extra/pp-trace/PPCallbacksTracker.cpp  |  6 +-
 clang-tools-extra/pp-trace/PPCallbacksTracker.h    |  3 +-
 .../test/pp-trace/pp-trace-include.cpp             | 12 ++--
 clang/include/clang/Lex/PPCallbacks.h              | 16 +++--
 clang/include/clang/Lex/PreprocessingRecord.h      |  3 +-
 .../DependencyScanning/ModuleDepCollector.h        |  3 +-
 clang/lib/CodeGen/MacroPPCallbacks.cpp             |  4 +-
 clang/lib/CodeGen/MacroPPCallbacks.h               |  3 +-
 clang/lib/Frontend/DependencyFile.cpp              |  3 +-
 clang/lib/Frontend/DependencyGraph.cpp             |  7 ++-
 clang/lib/Frontend/ModuleDependencyCollector.cpp   |  3 +-
 clang/lib/Frontend/PrecompiledPreamble.cpp         |  3 +-
 clang/lib/Frontend/PrintPreprocessedOutput.cpp     | 11 ++--
clang/lib/Frontend/Rewrite/InclusionRewriter.cpp | 10 ++-- clang/lib/Lex/PPDirectives.cpp | 70 +++++++++++----------- clang/lib/Lex/PreprocessingRecord.cpp | 11 ++-- .../DependencyScanning/ModuleDepCollector.cpp | 8 +-- clang/tools/libclang/Indexing.cpp | 5 +- clang/unittests/Lex/PPCallbacksTest.cpp | 9 ++- 40 files changed, 168 insertions(+), 129 deletions(-) diff --git a/clang-tools-extra/clang-move/Move.cpp b/clang-tools-extra/clang-move/Move.cpp index 1d10348..ac16803 100644 --- a/clang-tools-extra/clang-move/Move.cpp +++ b/clang-tools-extra/clang-move/Move.cpp @@ -133,7 +133,8 @@ public: CharSourceRange FilenameRange, OptionalFileEntryRef /*File*/, StringRef SearchPath, StringRef /*RelativePath*/, - const Module * /*Imported*/, + const Module * /*SuggestedModule*/, + bool /*ModuleImported*/, SrcMgr::CharacteristicKind /*FileType*/) override { if (auto FileEntry = SM.getFileEntryRefForID(SM.getFileID(HashLoc))) MoveTool->addIncludes(FileName, IsAngled, SearchPath, diff --git a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp index 5ecd4fb..5e2cc20 100644 --- a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp +++ b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp @@ -166,12 +166,12 @@ void ExpandModularHeadersPPCallbacks::InclusionDirective( SourceLocation DirectiveLoc, const Token &IncludeToken, StringRef IncludedFilename, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef IncludedFile, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, bool ModuleImported, SrcMgr::CharacteristicKind FileType) { - if (Imported) { + if (ModuleImported) { serialization::ModuleFile *MF = Compiler.getASTReader()->getModuleManager().lookup( - *Imported->getASTFile()); + *SuggestedModule->getASTFile()); handleModuleFile(MF); } parseToLocation(DirectiveLoc); diff --git a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h index 3f6abc3..0742c21 100644 --- a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h +++ b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h @@ -69,7 +69,7 @@ private: bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef IncludedFile, StringRef SearchPath, StringRef RelativePath, - const Module *Imported, + const Module *SuggestedModule, bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void EndOfMainFile() override; diff --git a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp index 084e44a..fb1e0e8 100644 --- a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp @@ -29,7 +29,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FileNameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void EndOfMainFile() override; @@ -61,7 +62,7 @@ void KernelNameRestrictionCheck::registerPPCallbacks(const SourceManager &SM, void KernelNameRestrictionPPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &, StringRef FileName, bool, CharSourceRange, OptionalFileEntryRef, StringRef, StringRef, 
const Module *, - SrcMgr::CharacteristicKind) { + bool, SrcMgr::CharacteristicKind) { IncludeDirective ID = {HashLoc, FileName}; IncludeDirectives.push_back(std::move(ID)); } diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp index 61d89cf..09ba79f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp @@ -26,7 +26,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; private: @@ -51,8 +52,8 @@ void SuspiciousIncludeCheck::registerPPCallbacks( void SuspiciousIncludePPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { if (IncludeTok.getIdentifierInfo()->getPPKeywordID() == tok::pp_import) return; diff --git a/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp b/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp index bdd72f8..4246c8c5 100644 --- a/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp @@ -27,7 +27,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void EndOfMainFile() override; @@ -81,8 +82,8 @@ static int getPriority(StringRef Filename, bool IsAngled, bool IsMainModule) { void IncludeOrderPPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { // We recognize the first include as a special main module header and want // to leave it in the top position. 
IncludeDirective ID = {HashLoc, FilenameRange, std::string(FileName), diff --git a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp index 3451d34..b656917 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp @@ -33,7 +33,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; private: @@ -45,14 +46,14 @@ private: void RestrictedIncludesPPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { // Compiler provided headers are allowed (e.g stddef.h). if (SrcMgr::isSystem(FileType) && SearchPath == CompilerIncudeDir) return; portability::RestrictedIncludesPPCallbacks::InclusionDirective( HashLoc, IncludeTok, FileName, IsAngled, FilenameRange, File, SearchPath, - RelativePath, Imported, FileType); + RelativePath, SuggestedModule, ModuleImported, FileType); } void RestrictSystemLibcHeadersCheck::registerPPCallbacks( diff --git a/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp b/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp index bebd6e39..fadfdc8 100644 --- a/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp @@ -83,7 +83,7 @@ public: void InclusionDirective(SourceLocation, const Token &, StringRef FilePath, bool, CharSourceRange Range, OptionalFileEntryRef File, StringRef, StringRef, - const Module *, + const Module *, bool, SrcMgr::CharacteristicKind FileType) override { if (FileType != clang::SrcMgr::C_User) return; diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp index 030a781..6d287eb 100644 --- a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp @@ -32,7 +32,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; private: @@ -178,8 +179,8 @@ IncludeModernizePPCallbacks::IncludeModernizePPCallbacks( void IncludeModernizePPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { // If we don't want to warn for non-main file reports and this is one, skip // it. 
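All of the clang-tidy changes in this commit are the same mechanical signature update. For out-of-tree `PPCallbacks` subclasses, the port looks like the following sketch, written against the new declaration shown later in `clang/include/clang/Lex/PPCallbacks.h` (the logging itself is illustrative only):

```cpp
// Sketch of adapting a downstream PPCallbacks subclass: the old single
// `const Module *Imported` parameter becomes the (SuggestedModule,
// ModuleImported) pair, and code that used to test the pointer to mean
// "this #include became an import" should now test the bool.
#include "clang/Basic/Module.h"
#include "clang/Lex/PPCallbacks.h"
#include "llvm/Support/raw_ostream.h"

class MyIncludeLogger : public clang::PPCallbacks {
  void InclusionDirective(clang::SourceLocation HashLoc,
                          const clang::Token &IncludeTok,
                          llvm::StringRef FileName, bool IsAngled,
                          clang::CharSourceRange FilenameRange,
                          clang::OptionalFileEntryRef File,
                          llvm::StringRef SearchPath,
                          llvm::StringRef RelativePath,
                          const clang::Module *SuggestedModule,
                          bool ModuleImported,
                          clang::SrcMgr::CharacteristicKind FileType) override {
    if (ModuleImported) // previously: if (Imported)
      llvm::outs() << FileName << " -> import "
                   << SuggestedModule->getFullModuleName() << "\n";
    else if (SuggestedModule) // newly observable: suggested but not imported
      llvm::outs() << FileName << " (module suggested, not imported)\n";
  }
};
```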
diff --git a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp index b197c22..0b47ed3 100644 --- a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp @@ -117,7 +117,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { clearCurrentEnum(HashLoc); } diff --git a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp index 9ee0b4e..db5693e 100644 --- a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp +++ b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp @@ -21,8 +21,8 @@ namespace clang::tidy::portability { void RestrictedIncludesPPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { if (!Check.contains(FileName) && SrcMgr::isSystem(FileType)) { SmallString<256> FullPath; llvm::sys::path::append(FullPath, SearchPath); diff --git a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h index ad18e6f..60fae5e 100644 --- a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h +++ b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h @@ -50,7 +50,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void EndOfMainFile() override; diff --git a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp index d1f41e0..6714716 100644 --- a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp @@ -47,7 +47,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void MacroDefined(const Token &MacroNameTok, @@ -76,8 +77,8 @@ void DuplicateIncludeCallbacks::FileChanged(SourceLocation Loc, void DuplicateIncludeCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { if 
(llvm::is_contained(Files.back(), FileName)) { // We want to delete the entire line, so make sure that [Start,End] covers // everything. diff --git a/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp b/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp index d0b7474..b53016f 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp +++ b/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp @@ -25,7 +25,8 @@ public: bool IsAngled, CharSourceRange FileNameRange, OptionalFileEntryRef /*IncludedFile*/, StringRef /*SearchPath*/, StringRef /*RelativePath*/, - const Module * /*ImportedModule*/, + const Module * /*SuggestedModule*/, + bool /*ModuleImported*/, SrcMgr::CharacteristicKind /*FileType*/) override { Inserter->addInclude(FileNameRef, IsAngled, HashLocation, IncludeToken.getEndLoc()); diff --git a/clang-tools-extra/clangd/Headers.cpp b/clang-tools-extra/clangd/Headers.cpp index 076e636..75f8668 100644 --- a/clang-tools-extra/clangd/Headers.cpp +++ b/clang-tools-extra/clangd/Headers.cpp @@ -41,7 +41,8 @@ public: OptionalFileEntryRef File, llvm::StringRef /*SearchPath*/, llvm::StringRef /*RelativePath*/, - const clang::Module * /*Imported*/, + const clang::Module * /*SuggestedModule*/, + bool /*ModuleImported*/, SrcMgr::CharacteristicKind FileKind) override { auto MainFID = SM.getMainFileID(); // If an include is part of the preamble patch, translate #line directives. diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp index 14a9179..bbb0e2c 100644 --- a/clang-tools-extra/clangd/ParsedAST.cpp +++ b/clang-tools-extra/clangd/ParsedAST.cpp @@ -244,7 +244,7 @@ private: SynthesizedFilenameTok.getEndLoc()) .toCharRange(SM), File, "SearchPath", "RelPath", - /*Imported=*/nullptr, Inc.FileKind); + /*SuggestedModule=*/nullptr, /*ModuleImported=*/false, Inc.FileKind); if (File) Delegate->FileSkipped(*File, SynthesizedFilenameTok, Inc.FileKind); } diff --git a/clang-tools-extra/clangd/index/IndexAction.cpp b/clang-tools-extra/clangd/index/IndexAction.cpp index 5d56285..ed56c2a 100644 --- a/clang-tools-extra/clangd/index/IndexAction.cpp +++ b/clang-tools-extra/clangd/index/IndexAction.cpp @@ -89,7 +89,8 @@ public: llvm::StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, llvm::StringRef SearchPath, - llvm::StringRef RelativePath, const Module *Imported, + llvm::StringRef RelativePath, + const Module *SuggestedModule, bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { auto IncludeURI = toURI(File); if (!IncludeURI) diff --git a/clang-tools-extra/clangd/unittests/ReplayPeambleTests.cpp b/clang-tools-extra/clangd/unittests/ReplayPeambleTests.cpp index 472fe30..147d9ab 100644 --- a/clang-tools-extra/clangd/unittests/ReplayPeambleTests.cpp +++ b/clang-tools-extra/clangd/unittests/ReplayPeambleTests.cpp @@ -72,7 +72,7 @@ struct ReplayPreamblePPCallback : public PPCallbacks { void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef, - StringRef, StringRef, const clang::Module *, + StringRef, StringRef, const clang::Module *, bool, SrcMgr::CharacteristicKind) override { Includes.emplace_back(SM, HashLoc, IncludeTok, FileName, IsAngled, FilenameRange); diff --git a/clang-tools-extra/include-cleaner/lib/Record.cpp b/clang-tools-extra/include-cleaner/lib/Record.cpp index c93c56a..78a4df6 100644 --- a/clang-tools-extra/include-cleaner/lib/Record.cpp +++ 
b/clang-tools-extra/include-cleaner/lib/Record.cpp @@ -65,7 +65,8 @@ public: StringRef SpelledFilename, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind) override { if (!Active) return; @@ -214,7 +215,8 @@ public: OptionalFileEntryRef File, llvm::StringRef /*SearchPath*/, llvm::StringRef /*RelativePath*/, - const clang::Module * /*Imported*/, + const clang::Module * /*SuggestedModule*/, + bool /*ModuleImported*/, SrcMgr::CharacteristicKind FileKind) override { FileID HashFID = SM.getFileID(HashLoc); int HashLine = SM.getLineNumber(HashFID, SM.getFileOffset(HashLoc)); diff --git a/clang-tools-extra/modularize/CoverageChecker.cpp b/clang-tools-extra/modularize/CoverageChecker.cpp index 1e8b0aa..0e76c53 100644 --- a/clang-tools-extra/modularize/CoverageChecker.cpp +++ b/clang-tools-extra/modularize/CoverageChecker.cpp @@ -90,7 +90,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { Checker.collectUmbrellaHeaderHeader(File->getName()); } diff --git a/clang-tools-extra/modularize/PreprocessorTracker.cpp b/clang-tools-extra/modularize/PreprocessorTracker.cpp index 7557fb1..85e3aab 100644 --- a/clang-tools-extra/modularize/PreprocessorTracker.cpp +++ b/clang-tools-extra/modularize/PreprocessorTracker.cpp @@ -730,15 +730,14 @@ public: ~PreprocessorCallbacks() override {} // Overridden handlers. - void InclusionDirective(clang::SourceLocation HashLoc, - const clang::Token &IncludeTok, - llvm::StringRef FileName, bool IsAngled, - clang::CharSourceRange FilenameRange, - clang::OptionalFileEntryRef File, - llvm::StringRef SearchPath, - llvm::StringRef RelativePath, - const clang::Module *Imported, - clang::SrcMgr::CharacteristicKind FileType) override; + void + InclusionDirective(clang::SourceLocation HashLoc, + const clang::Token &IncludeTok, llvm::StringRef FileName, + bool IsAngled, clang::CharSourceRange FilenameRange, + clang::OptionalFileEntryRef File, + llvm::StringRef SearchPath, llvm::StringRef RelativePath, + const clang::Module *SuggestedModule, bool ModuleImported, + clang::SrcMgr::CharacteristicKind FileType) override; void FileChanged(clang::SourceLocation Loc, clang::PPCallbacks::FileChangeReason Reason, clang::SrcMgr::CharacteristicKind FileType, @@ -1275,7 +1274,8 @@ void PreprocessorCallbacks::InclusionDirective( llvm::StringRef FileName, bool IsAngled, clang::CharSourceRange FilenameRange, clang::OptionalFileEntryRef File, llvm::StringRef SearchPath, llvm::StringRef RelativePath, - const clang::Module *Imported, clang::SrcMgr::CharacteristicKind FileType) { + const clang::Module *SuggestedModule, bool ModuleImported, + clang::SrcMgr::CharacteristicKind FileType) { int DirectiveLine, DirectiveColumn; std::string HeaderPath = getSourceLocationFile(PP, HashLoc); getSourceLocationLineAndColumn(PP, HashLoc, DirectiveLine, DirectiveColumn); diff --git a/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp b/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp index a59a827..3bb30fd 100644 --- a/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp +++ b/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp @@ -135,7 +135,8 @@ void 
PPCallbacksTracker::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, llvm::StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, llvm::StringRef SearchPath, llvm::StringRef RelativePath, - const Module *Imported, SrcMgr::CharacteristicKind FileType) { + const Module *SuggestedModule, bool ModuleImported, + SrcMgr::CharacteristicKind FileType) { beginCallback("InclusionDirective"); appendArgument("HashLoc", HashLoc); appendArgument("IncludeTok", IncludeTok); @@ -145,7 +146,8 @@ void PPCallbacksTracker::InclusionDirective( appendArgument("File", File); appendFilePathArgument("SearchPath", SearchPath); appendFilePathArgument("RelativePath", RelativePath); - appendArgument("Imported", Imported); + appendArgument("SuggestedModule", SuggestedModule); + appendArgument("ModuleImported", ModuleImported); } // Callback invoked whenever there was an explicit module-import diff --git a/clang-tools-extra/pp-trace/PPCallbacksTracker.h b/clang-tools-extra/pp-trace/PPCallbacksTracker.h index c195a72..04590a9 100644 --- a/clang-tools-extra/pp-trace/PPCallbacksTracker.h +++ b/clang-tools-extra/pp-trace/PPCallbacksTracker.h @@ -95,7 +95,8 @@ public: llvm::StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, llvm::StringRef SearchPath, - llvm::StringRef RelativePath, const Module *Imported, + llvm::StringRef RelativePath, + const Module *SuggestedModule, bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void moduleImport(SourceLocation ImportLoc, ModuleIdPath Path, const Module *Imported) override; diff --git a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp index db0b2c8..ea9896e 100644 --- a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp +++ b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp @@ -59,7 +59,8 @@ // CHECK-NEXT: File: "{{.*}}{{[/\\]}}Inputs/Level1A.h" // CHECK-NEXT: SearchPath: "{{.*}}{{[/\\]}}pp-trace" // CHECK-NEXT: RelativePath: "Inputs/Level1A.h" -// CHECK-NEXT: Imported: (null) +// CHECK-NEXT: SuggestedModule: (null) +// CHECK-NEXT: ModuleImported: false // CHECK-NEXT: - Callback: FileChanged // CHECK-NEXT: Loc: "{{.*}}{{[/\\]}}Inputs/Level1A.h:1:1" // CHECK-NEXT: Reason: EnterFile @@ -74,7 +75,8 @@ // CHECK-NEXT: File: "{{.*}}{{[/\\]}}Inputs/Level2A.h" // CHECK-NEXT: SearchPath: "{{.*}}{{[/\\]}}Inputs" // CHECK-NEXT: RelativePath: "Level2A.h" -// CHECK-NEXT: Imported: (null) +// CHECK-NEXT: SuggestedModule: (null) +// CHECK-NEXT: ModuleImported: false // CHECK-NEXT: - Callback: FileChanged // CHECK-NEXT: Loc: "{{.*}}{{[/\\]}}Inputs/Level2A.h:1:1" // CHECK-NEXT: Reason: EnterFile @@ -105,7 +107,8 @@ // CHECK-NEXT: File: "{{.*}}{{[/\\]}}Inputs/Level1B.h" // CHECK-NEXT: SearchPath: "{{.*}}{{[/\\]}}pp-trace" // CHECK-NEXT: RelativePath: "Inputs/Level1B.h" -// CHECK-NEXT: Imported: (null) +// CHECK-NEXT: SuggestedModule: (null) +// CHECK-NEXT: ModuleImported: false // CHECK-NEXT: - Callback: FileChanged // CHECK-NEXT: Loc: "{{.*}}{{[/\\]}}Inputs/Level1B.h:1:1" // CHECK-NEXT: Reason: EnterFile @@ -120,7 +123,8 @@ // CHECK-NEXT: File: "{{.*}}{{[/\\]}}Inputs/Level2B.h" // CHECK-NEXT: SearchPath: "{{.*}}{{[/\\]}}Inputs" // CHECK-NEXT: RelativePath: "Level2B.h" -// CHECK-NEXT: Imported: (null) +// CHECK-NEXT: SuggestedModule: (null) +// CHECK-NEXT: ModuleImported: false // CHECK-NEXT: - Callback: FileChanged // CHECK-NEXT: Loc: "{{.*}}{{[/\\]}}Inputs/Level2B.h:1:1" // CHECK-NEXT: Reason: EnterFile diff --git 
a/clang/include/clang/Lex/PPCallbacks.h b/clang/include/clang/Lex/PPCallbacks.h index e3942af..dfc74b5 100644 --- a/clang/include/clang/Lex/PPCallbacks.h +++ b/clang/include/clang/Lex/PPCallbacks.h @@ -127,8 +127,10 @@ public: /// \param RelativePath The path relative to SearchPath, at which the include /// file was found. This is equal to FileName except for framework includes. /// - /// \param Imported The module, whenever an inclusion directive was - /// automatically turned into a module import or null otherwise. + /// \param SuggestedModule The module suggested for this header, if any. + /// + /// \param ModuleImported Whether this include was translated into import of + /// \p SuggestedModule. /// /// \param FileType The characteristic kind, indicates whether a file or /// directory holds normal user code, system code, or system code which is @@ -139,7 +141,8 @@ public: bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, StringRef RelativePath, - const Module *Imported, + const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) {} /// Callback invoked whenever a submodule was entered. @@ -473,14 +476,15 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { First->InclusionDirective(HashLoc, IncludeTok, FileName, IsAngled, FilenameRange, File, SearchPath, RelativePath, - Imported, FileType); + SuggestedModule, ModuleImported, FileType); Second->InclusionDirective(HashLoc, IncludeTok, FileName, IsAngled, FilenameRange, File, SearchPath, RelativePath, - Imported, FileType); + SuggestedModule, ModuleImported, FileType); } void EnteredSubmodule(Module *M, SourceLocation ImportLoc, diff --git a/clang/include/clang/Lex/PreprocessingRecord.h b/clang/include/clang/Lex/PreprocessingRecord.h index 5ddf024..437d8e4c 100644 --- a/clang/include/clang/Lex/PreprocessingRecord.h +++ b/clang/include/clang/Lex/PreprocessingRecord.h @@ -532,7 +532,8 @@ class Token; StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, + const Module *SuggestedModule, bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void Ifdef(SourceLocation Loc, const Token &MacroNameTok, const MacroDefinition &MD) override; diff --git a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h index 051363b..13ad253 100644 --- a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h +++ b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h @@ -166,7 +166,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void moduleImport(SourceLocation ImportLoc, ModuleIdPath Path, const Module *Imported) override; diff --git a/clang/lib/CodeGen/MacroPPCallbacks.cpp b/clang/lib/CodeGen/MacroPPCallbacks.cpp index 8589869..c5d1e3a 100644 --- a/clang/lib/CodeGen/MacroPPCallbacks.cpp +++ 
b/clang/lib/CodeGen/MacroPPCallbacks.cpp @@ -168,8 +168,8 @@ void MacroPPCallbacks::FileChanged(SourceLocation Loc, FileChangeReason Reason, void MacroPPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { // Record the line location of the current included file. LastHashLoc = HashLoc; diff --git a/clang/lib/CodeGen/MacroPPCallbacks.h b/clang/lib/CodeGen/MacroPPCallbacks.h index 5af177d..5f46864 100644 --- a/clang/lib/CodeGen/MacroPPCallbacks.h +++ b/clang/lib/CodeGen/MacroPPCallbacks.h @@ -102,7 +102,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; /// Hook called whenever a macro definition is seen. diff --git a/clang/lib/Frontend/DependencyFile.cpp b/clang/lib/Frontend/DependencyFile.cpp index 19abcac..369816e 100644 --- a/clang/lib/Frontend/DependencyFile.cpp +++ b/clang/lib/Frontend/DependencyFile.cpp @@ -66,7 +66,8 @@ struct DepCollectorPPCallbacks : public PPCallbacks { StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { if (!File) DepCollector.maybeAddDependency(FileName, /*FromModule*/ false, diff --git a/clang/lib/Frontend/DependencyGraph.cpp b/clang/lib/Frontend/DependencyGraph.cpp index b471471..20e5f23 100644 --- a/clang/lib/Frontend/DependencyGraph.cpp +++ b/clang/lib/Frontend/DependencyGraph.cpp @@ -49,7 +49,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void EndOfMainFile() override { @@ -68,8 +69,8 @@ void clang::AttachDependencyGraphGen(Preprocessor &PP, StringRef OutputFile, void DependencyGraphCallback::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { if (!File) return; diff --git a/clang/lib/Frontend/ModuleDependencyCollector.cpp b/clang/lib/Frontend/ModuleDependencyCollector.cpp index 939e611..b88cb60 100644 --- a/clang/lib/Frontend/ModuleDependencyCollector.cpp +++ b/clang/lib/Frontend/ModuleDependencyCollector.cpp @@ -55,7 +55,8 @@ struct ModuleDependencyPPCallbacks : public PPCallbacks { StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module 
*SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { if (!File) return; diff --git a/clang/lib/Frontend/PrecompiledPreamble.cpp b/clang/lib/Frontend/PrecompiledPreamble.cpp index 62373b2..9b0ef30 100644 --- a/clang/lib/Frontend/PrecompiledPreamble.cpp +++ b/clang/lib/Frontend/PrecompiledPreamble.cpp @@ -98,7 +98,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { // File is std::nullopt if it wasn't found. // (We have some false negatives if PP recovered e.g. -> "foo") diff --git a/clang/lib/Frontend/PrintPreprocessedOutput.cpp b/clang/lib/Frontend/PrintPreprocessedOutput.cpp index 7f5f669..a26d2c3 100644 --- a/clang/lib/Frontend/PrintPreprocessedOutput.cpp +++ b/clang/lib/Frontend/PrintPreprocessedOutput.cpp @@ -153,7 +153,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void Ident(SourceLocation Loc, StringRef str) override; void PragmaMessage(SourceLocation Loc, StringRef Namespace, @@ -401,8 +402,8 @@ void PrintPPOutputPPCallbacks::FileChanged(SourceLocation Loc, void PrintPPOutputPPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { // In -dI mode, dump #include directives prior to dumping their content or // interpretation. Similar for -fkeep-system-includes. if (DumpIncludeDirectives || (KeepSystemIncludes && isSystem(FileType))) { @@ -418,14 +419,14 @@ void PrintPPOutputPPCallbacks::InclusionDirective( } // When preprocessing, turn implicit imports into module import pragmas. - if (Imported) { + if (ModuleImported) { switch (IncludeTok.getIdentifierInfo()->getPPKeywordID()) { case tok::pp_include: case tok::pp_import: case tok::pp_include_next: MoveToLine(HashLoc, /*RequireStartOfLine=*/true); *OS << "#pragma clang module import " - << Imported->getFullModuleName(true) + << SuggestedModule->getFullModuleName(true) << " /* clang -E: implicit import for " << "#" << PP.getSpelling(IncludeTok) << " " << (IsAngled ? '<' : '"') << FileName << (IsAngled ? 
'>' : '"') diff --git a/clang/lib/Frontend/Rewrite/InclusionRewriter.cpp b/clang/lib/Frontend/Rewrite/InclusionRewriter.cpp index b6b3746..1462058 100644 --- a/clang/lib/Frontend/Rewrite/InclusionRewriter.cpp +++ b/clang/lib/Frontend/Rewrite/InclusionRewriter.cpp @@ -75,7 +75,8 @@ private: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void If(SourceLocation Loc, SourceRange ConditionRange, ConditionValueKind ConditionValue) override; @@ -189,9 +190,10 @@ void InclusionRewriter::InclusionDirective( StringRef /*FileName*/, bool /*IsAngled*/, CharSourceRange /*FilenameRange*/, OptionalFileEntryRef /*File*/, StringRef /*SearchPath*/, StringRef /*RelativePath*/, - const Module *Imported, SrcMgr::CharacteristicKind FileType) { - if (Imported) { - auto P = ModuleIncludes.insert(std::make_pair(HashLoc, Imported)); + const Module *SuggestedModule, bool ModuleImported, + SrcMgr::CharacteristicKind FileType) { + if (ModuleImported) { + auto P = ModuleIncludes.insert(std::make_pair(HashLoc, SuggestedModule)); (void)P; assert(P.second && "Unexpected revisitation of the same include directive"); } else diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index a980f4b..97f9c0a 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -2253,26 +2253,27 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // FIXME: We do not have a good way to disambiguate C++ clang modules from // C++ standard modules (other than use/non-use of Header Units). - Module *SM = SuggestedModule.getModule(); - bool MaybeTranslateInclude = - Action == Enter && File && SM && !SM->isForBuilding(getLangOpts()); + Module *ModuleToImport = SuggestedModule.getModule(); + + bool MaybeTranslateInclude = Action == Enter && File && ModuleToImport && + !ModuleToImport->isForBuilding(getLangOpts()); // Maybe a usable Header Unit bool UsableHeaderUnit = false; - if (getLangOpts().CPlusPlusModules && SM && SM->isHeaderUnit()) { + if (getLangOpts().CPlusPlusModules && ModuleToImport && + ModuleToImport->isHeaderUnit()) { if (TrackGMFState.inGMF() || IsImportDecl) UsableHeaderUnit = true; else if (!IsImportDecl) { // This is a Header Unit that we do not include-translate - SuggestedModule = ModuleMap::KnownHeader(); - SM = nullptr; + ModuleToImport = nullptr; } } // Maybe a usable clang header module. bool UsableClangHeaderModule = - (getLangOpts().CPlusPlusModules || getLangOpts().Modules) && SM && - !SM->isHeaderUnit(); + (getLangOpts().CPlusPlusModules || getLangOpts().Modules) && + ModuleToImport && !ModuleToImport->isHeaderUnit(); // Determine whether we should try to import the module for this #include, if // there is one. Don't do so if precompiled module support is disabled or we @@ -2282,12 +2283,11 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // unavailable, diagnose the situation and bail out. // FIXME: Remove this; loadModule does the same check (but produces // slightly worse diagnostics). 
- if (checkModuleIsAvailable(getLangOpts(), getTargetInfo(), - *SuggestedModule.getModule(), + if (checkModuleIsAvailable(getLangOpts(), getTargetInfo(), *ModuleToImport, getDiagnostics())) { Diag(FilenameTok.getLocation(), diag::note_implicit_top_level_module_import_here) - << SuggestedModule.getModule()->getTopLevelModuleName(); + << ModuleToImport->getTopLevelModuleName(); return {ImportAction::None}; } @@ -2295,7 +2295,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // FIXME: Should we have a second loadModule() overload to avoid this // extra lookup step? SmallVector, 2> Path; - for (Module *Mod = SM; Mod; Mod = Mod->Parent) + for (Module *Mod = ModuleToImport; Mod; Mod = Mod->Parent) Path.push_back(std::make_pair(getIdentifierInfo(Mod->Name), FilenameTok.getLocation())); std::reverse(Path.begin(), Path.end()); @@ -2306,12 +2306,12 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // Load the module to import its macros. We'll make the declarations // visible when the parser gets here. - // FIXME: Pass SuggestedModule in here rather than converting it to a path - // and making the module loader convert it back again. + // FIXME: Pass SM in here rather than converting it to a path and making the + // module loader convert it back again. ModuleLoadResult Imported = TheModuleLoader.loadModule( IncludeTok.getLocation(), Path, Module::Hidden, /*IsInclusionDirective=*/true); - assert((Imported == nullptr || Imported == SuggestedModule.getModule()) && + assert((Imported == nullptr || Imported == SM) && "the imported module is different than the suggested one"); if (Imported) { @@ -2323,8 +2323,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // was in the directory of an umbrella header, for instance), but no // actual module containing it exists (because the umbrella header is // incomplete). Treat this as a textual inclusion. - SuggestedModule = ModuleMap::KnownHeader(); - SM = nullptr; + ModuleToImport = nullptr; } else if (Imported.isConfigMismatch()) { // On a configuration mismatch, enter the header textually. We still know // that it's part of the corresponding module. @@ -2365,7 +2364,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // this file will have no effect. if (Action == Enter && File && !HeaderInfo.ShouldEnterIncludeFile(*this, *File, EnterOnce, - getLangOpts().Modules, SM, + getLangOpts().Modules, ModuleToImport, IsFirstIncludeOfFile)) { // C++ standard modules: // If we are not in the GMF, then we textually include only @@ -2380,7 +2379,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( if (UsableHeaderUnit && !getLangOpts().CompilingPCH) Action = TrackGMFState.inGMF() ? Import : Skip; else - Action = (SuggestedModule && !getLangOpts().CompilingPCH) ? Import : Skip; + Action = (ModuleToImport && !getLangOpts().CompilingPCH) ? Import : Skip; } // Check for circular inclusion of the main file. @@ -2400,8 +2399,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // FIXME: Use a different callback for a pp-import? Callbacks->InclusionDirective(HashLoc, IncludeTok, LookupFilename, isAngled, FilenameRange, File, SearchPath, RelativePath, - Action == Import ? 
SuggestedModule.getModule() - : nullptr, + SuggestedModule.getModule(), Action == Import, FileCharacter); if (Action == Skip && File) Callbacks->FileSkipped(*File, FilenameTok, FileCharacter); @@ -2412,7 +2410,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // If this is a C++20 pp-import declaration, diagnose if we didn't find any // module corresponding to the named header. - if (IsImportDecl && !SuggestedModule) { + if (IsImportDecl && !ModuleToImport) { Diag(FilenameTok, diag::err_header_import_not_header_unit) << OriginalFilename << File->getName(); return {ImportAction::None}; @@ -2517,8 +2515,8 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( switch (Action) { case Skip: // If we don't need to enter the file, stop now. - if (SM) - return {ImportAction::SkippedModuleImport, SM}; + if (ModuleToImport) + return {ImportAction::SkippedModuleImport, ModuleToImport}; return {ImportAction::None}; case IncludeLimitReached: @@ -2530,13 +2528,13 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // If this is a module import, make it visible if needed. assert(SM && "no module to import"); - makeModuleVisible(SM, EndLoc); + makeModuleVisible(ModuleToImport, EndLoc); if (IncludeTok.getIdentifierInfo()->getPPKeywordID() == tok::pp___include_macros) return {ImportAction::None}; - return {ImportAction::ModuleImport, SM}; + return {ImportAction::ModuleImport, ModuleToImport}; } case Enter: @@ -2573,13 +2571,14 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // Determine if we're switching to building a new submodule, and which one. // This does not apply for C++20 modules header units. - if (SM && !SM->isHeaderUnit()) { - if (SM->getTopLevelModule()->ShadowingModule) { + if (ModuleToImport && !ModuleToImport->isHeaderUnit()) { + if (ModuleToImport->getTopLevelModule()->ShadowingModule) { // We are building a submodule that belongs to a shadowed module. This // means we find header files in the shadowed module. - Diag(SM->DefinitionLoc, diag::err_module_build_shadowed_submodule) - << SM->getFullModuleName(); - Diag(SM->getTopLevelModule()->ShadowingModule->DefinitionLoc, + Diag(ModuleToImport->DefinitionLoc, + diag::err_module_build_shadowed_submodule) + << ModuleToImport->getFullModuleName(); + Diag(ModuleToImport->getTopLevelModule()->ShadowingModule->DefinitionLoc, diag::note_previous_definition); return {ImportAction::None}; } @@ -2591,21 +2590,22 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // that behaves the same as the header would behave in a compilation using // that PCH, which means we should enter the submodule. We need to teach // the AST serialization layer to deal with the resulting AST. - if (getLangOpts().CompilingPCH && SM->isForBuilding(getLangOpts())) + if (getLangOpts().CompilingPCH && + ModuleToImport->isForBuilding(getLangOpts())) return {ImportAction::None}; assert(!CurLexerSubmodule && "should not have marked this as a module yet"); - CurLexerSubmodule = SM; + CurLexerSubmodule = ModuleToImport; // Let the macro handling code know that any future macros are within // the new submodule. - EnterSubmodule(SM, EndLoc, /*ForPragma*/ false); + EnterSubmodule(ModuleToImport, EndLoc, /*ForPragma*/ false); // Let the parser know that any future declarations are within the new // submodule. // FIXME: There's no point doing this if we're handling a #__include_macros // directive. 
- return {ImportAction::ModuleBegin, SM}; + return {ImportAction::ModuleBegin, ModuleToImport}; } assert(!IsImportDecl && "failed to diagnose missing module for import decl"); diff --git a/clang/lib/Lex/PreprocessingRecord.cpp b/clang/lib/Lex/PreprocessingRecord.cpp index aab6a2b..be5aac7 100644 --- a/clang/lib/Lex/PreprocessingRecord.cpp +++ b/clang/lib/Lex/PreprocessingRecord.cpp @@ -472,8 +472,8 @@ void PreprocessingRecord::MacroUndefined(const Token &Id, void PreprocessingRecord::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { InclusionDirective::InclusionKind Kind = InclusionDirective::Include; switch (IncludeTok.getIdentifierInfo()->getPPKeywordID()) { @@ -506,10 +506,9 @@ void PreprocessingRecord::InclusionDirective( EndLoc = EndLoc.getLocWithOffset(-1); // the InclusionDirective expects // a token range. } - clang::InclusionDirective *ID = - new (*this) clang::InclusionDirective(*this, Kind, FileName, !IsAngled, - (bool)Imported, File, - SourceRange(HashLoc, EndLoc)); + clang::InclusionDirective *ID = new (*this) clang::InclusionDirective( + *this, Kind, FileName, !IsAngled, ModuleImported, File, + SourceRange(HashLoc, EndLoc)); addPreprocessedEntity(ID); } diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp index 995d8b2..5a9e563 100644 --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp @@ -430,14 +430,14 @@ void ModuleDepCollectorPP::LexedFileChanged(FileID FID, void ModuleDepCollectorPP::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { - if (!File && !Imported) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { + if (!File && !ModuleImported) { // This is a non-modular include that HeaderSearch failed to find. Add it // here as `FileChanged` will never see it. MDC.addFileDep(FileName); } - handleImport(Imported); + handleImport(SuggestedModule); } void ModuleDepCollectorPP::moduleImport(SourceLocation ImportLoc, diff --git a/clang/tools/libclang/Indexing.cpp b/clang/tools/libclang/Indexing.cpp index 17d393e..05d88452 100644 --- a/clang/tools/libclang/Indexing.cpp +++ b/clang/tools/libclang/Indexing.cpp @@ -261,12 +261,13 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { bool isImport = (IncludeTok.is(tok::identifier) && IncludeTok.getIdentifierInfo()->getPPKeywordID() == tok::pp_import); DataConsumer.ppIncludedFile(HashLoc, FileName, File, isImport, IsAngled, - Imported); + ModuleImported); } /// MacroDefined - This hook is called whenever a macro definition is seen. 
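Across the files in this commit the new contract is uniform: the old `Imported` pointer is split into `SuggestedModule` (the module suggested for the header, if any) and `ModuleImported` (whether the inclusion was actually translated into an import), so clients can now distinguish "a module exists for this header" from "the include became an import". A minimal out-of-tree override of the updated hook might look like the following sketch; the `IncludeLogger` class and its output format are invented for illustration and are not part of the patch.

```cpp
#include "clang/Basic/Module.h"
#include "clang/Lex/PPCallbacks.h"
#include "llvm/Support/raw_ostream.h"

#include <string>

namespace {
// Hypothetical client of the updated hook: logs every inclusion directive,
// the module suggested for it (if any), and whether the include was
// actually translated into a module import.
class IncludeLogger : public clang::PPCallbacks {
  void InclusionDirective(clang::SourceLocation HashLoc,
                          const clang::Token &IncludeTok,
                          llvm::StringRef FileName, bool IsAngled,
                          clang::CharSourceRange FilenameRange,
                          clang::OptionalFileEntryRef File,
                          llvm::StringRef SearchPath,
                          llvm::StringRef RelativePath,
                          const clang::Module *SuggestedModule,
                          bool ModuleImported,
                          clang::SrcMgr::CharacteristicKind FileType) override {
    llvm::errs() << FileName << ": suggested module = "
                 << (SuggestedModule ? SuggestedModule->getFullModuleName()
                                     : std::string("(null)"))
                 << ", imported = " << (ModuleImported ? "true" : "false")
                 << "\n";
  }
};
} // namespace
```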
diff --git a/clang/unittests/Lex/PPCallbacksTest.cpp b/clang/unittests/Lex/PPCallbacksTest.cpp index e0a27b5..f3cdb1d 100644 --- a/clang/unittests/Lex/PPCallbacksTest.cpp +++ b/clang/unittests/Lex/PPCallbacksTest.cpp @@ -37,7 +37,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { this->HashLoc = HashLoc; this->IncludeTok = IncludeTok; @@ -47,7 +48,8 @@ public: this->File = File; this->SearchPath = SearchPath.str(); this->RelativePath = RelativePath.str(); - this->Imported = Imported; + this->SuggestedModule = SuggestedModule; + this->ModuleImported = ModuleImported; this->FileType = FileType; } @@ -59,7 +61,8 @@ public: OptionalFileEntryRef File; SmallString<16> SearchPath; SmallString<16> RelativePath; - const Module* Imported; + const Module *SuggestedModule; + bool ModuleImported; SrcMgr::CharacteristicKind FileType; }; -- cgit v1.1 From 13c14ad42c65e154dc079332dd5dd58e8925d26c Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 8 Feb 2024 18:14:07 +0000 Subject: Revert "[DebugInfo][RemoveDIs] Turn on non-instrinsic debug-info by default" This reverts commit bdde5f9bea75e897bcc31a95b9c3376988c211cc. Two situations that are tripping a few buildbots: https://lab.llvm.org/buildbot/#/builders/205/builds/25126 Here, polly is currently presenting a DebugLoc attached to a debugging intrinsic as a "true" source location in a user report, something that's unreliable. https://lab.llvm.org/buildbot/#/builders/184/builds/10242 These HWAsan failures are probably (97% confidence) because in StackInfoBuilder::visit we're not observing DPValues attached to lifetime intrinsics because they're dealt with higher up the function. But it's late-o'clock here, so revert for now.
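For anyone tracking the flag itself: this revert only flips the default back, it does not remove the option. Assuming a standard tool that links LLVMCore and parses the usual command-line options (an assumption, not something this commit states), the new representation can presumably still be opted into explicitly with `opt -experimental-debuginfo-iterators=true -S input.ll` while the buildbot issues are investigated.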
--- llvm/lib/IR/BasicBlock.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index bf02eba..fe9d0d0 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -34,7 +34,7 @@ cl::opt<bool> UseNewDbgInfoFormat("experimental-debuginfo-iterators", cl::desc("Enable communicating debuginfo positions " "through iterators, eliminating intrinsics"), - cl::init(true)); + cl::init(false)); DPMarker *BasicBlock::createMarker(Instruction *I) { assert(IsNewDbgInfoFormat && -- cgit v1.1 From 544f610d5310e1c1e7dd7a081d5a2a2607225740 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 8 Feb 2024 19:22:16 +0100 Subject: [libc++] Use __is_pointer_in_range inside vector::insert (#80624) --- libcxx/include/vector | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libcxx/include/vector b/libcxx/include/vector index 3934361..ce7df7a 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -351,6 +351,7 @@ template requires is-vector-bool-reference // Since C++ #include <__type_traits/type_identity.h> #include <__utility/exception_guard.h> #include <__utility/forward.h> +#include <__utility/is_pointer_in_range.h> #include <__utility/move.h> #include <__utility/pair.h> #include <__utility/swap.h> @@ -1580,14 +1581,13 @@ template <class _Tp, class _Allocator> _LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::iterator vector<_Tp, _Allocator>::insert(const_iterator __position, const_reference __x) { pointer __p = this->__begin_ + (__position - begin()); - // We can't compare unrelated pointers inside constant expressions - if (!__libcpp_is_constant_evaluated() && this->__end_ < this->__end_cap()) { + if (this->__end_ < this->__end_cap()) { if (__p == this->__end_) { __construct_one_at_end(__x); } else { __move_range(__p, this->__end_, __p + 1); const_pointer __xr = pointer_traits<const_pointer>::pointer_to(__x); - if (__p <= __xr && __xr < this->__end_) + if (std::__is_pointer_in_range(std::__to_address(__p), std::__to_address(__end_), std::addressof(__x))) ++__xr; *__p = *__xr; } -- cgit v1.1 From d272d944de9f0cb274752f77e97d4ceab2401ec5 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 8 Feb 2024 19:22:49 +0100 Subject: [libc++][NFC] Simplify the implementation of `numeric_limits` (#80425) The cv specializations for `numeric_limits` inherited privately for some reason. We can simplify the implementation by inheriting publicly and removing the members that just replicate the values from the base class.
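Because the specializations now inherit publicly, every member of `numeric_limits<_Tp>` remains visible through the cv-qualified specializations, which is why the redeclarations deleted below are pure boilerplate. A standalone sanity check of that equivalence (a sketch, not part of the patch):

```cpp
#include <limits>

// Public inheritance forwards every member, so the cv-qualified
// specializations behave exactly like the unqualified one.
static_assert(std::numeric_limits<const int>::is_specialized,
              "const specialization forwards to the base");
static_assert(std::numeric_limits<volatile double>::max_exponent ==
                  std::numeric_limits<double>::max_exponent,
              "members replicate the base values");
static_assert(std::numeric_limits<const volatile unsigned>::digits ==
                  std::numeric_limits<unsigned>::digits,
              "same for const volatile");

int main() { return 0; }
```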
--- libcxx/include/limits | 283 +------------------------------------------------- 1 file changed, 5 insertions(+), 278 deletions(-) diff --git a/libcxx/include/limits b/libcxx/include/limits index a240580..c704b4d 100644 --- a/libcxx/include/limits +++ b/libcxx/include/limits @@ -436,8 +436,8 @@ protected: }; template -class _LIBCPP_TEMPLATE_VIS numeric_limits : private __libcpp_numeric_limits<__remove_cv_t<_Tp> > { - typedef __libcpp_numeric_limits<__remove_cv_t<_Tp> > __base; +class _LIBCPP_TEMPLATE_VIS numeric_limits : private __libcpp_numeric_limits<_Tp> { + typedef __libcpp_numeric_limits<_Tp> __base; typedef typename __base::type type; public: @@ -530,286 +530,13 @@ template _LIBCPP_CONSTEXPR const float_round_style numeric_limits<_Tp>::round_style; template -class _LIBCPP_TEMPLATE_VIS numeric_limits : private numeric_limits<_Tp> { - typedef numeric_limits<_Tp> __base; - typedef _Tp type; - -public: - static _LIBCPP_CONSTEXPR const bool is_specialized = __base::is_specialized; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __base::min(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __base::max(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return __base::lowest(); } - - static _LIBCPP_CONSTEXPR const int digits = __base::digits; - static _LIBCPP_CONSTEXPR const int digits10 = __base::digits10; - static _LIBCPP_CONSTEXPR const int max_digits10 = __base::max_digits10; - static _LIBCPP_CONSTEXPR const bool is_signed = __base::is_signed; - static _LIBCPP_CONSTEXPR const bool is_integer = __base::is_integer; - static _LIBCPP_CONSTEXPR const bool is_exact = __base::is_exact; - static _LIBCPP_CONSTEXPR const int radix = __base::radix; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __base::epsilon(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return __base::round_error(); } +class _LIBCPP_TEMPLATE_VIS numeric_limits : public numeric_limits<_Tp> {}; - static _LIBCPP_CONSTEXPR const int min_exponent = __base::min_exponent; - static _LIBCPP_CONSTEXPR const int min_exponent10 = __base::min_exponent10; - static _LIBCPP_CONSTEXPR const int max_exponent = __base::max_exponent; - static _LIBCPP_CONSTEXPR const int max_exponent10 = __base::max_exponent10; - - static _LIBCPP_CONSTEXPR const bool has_infinity = __base::has_infinity; - static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = __base::has_quiet_NaN; - static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = __base::has_signaling_NaN; - _LIBCPP_SUPPRESS_DEPRECATED_PUSH - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss; - _LIBCPP_SUPPRESS_DEPRECATED_POP - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __base::infinity(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __base::quiet_NaN(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __base::signaling_NaN(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __base::denorm_min(); } - - static _LIBCPP_CONSTEXPR const bool is_iec559 = __base::is_iec559; - static _LIBCPP_CONSTEXPR const bool is_bounded = __base::is_bounded; - static _LIBCPP_CONSTEXPR const bool is_modulo = __base::is_modulo; - - static 
_LIBCPP_CONSTEXPR const bool traps = __base::traps; - static _LIBCPP_CONSTEXPR const bool tinyness_before = __base::tinyness_before; - static _LIBCPP_CONSTEXPR const float_round_style round_style = __base::round_style; -}; - -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_specialized; -template -_LIBCPP_CONSTEXPR const int numeric_limits::digits; -template -_LIBCPP_CONSTEXPR const int numeric_limits::digits10; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_digits10; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_signed; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_integer; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_exact; -template -_LIBCPP_CONSTEXPR const int numeric_limits::radix; template -_LIBCPP_CONSTEXPR const int numeric_limits::min_exponent; -template -_LIBCPP_CONSTEXPR const int numeric_limits::min_exponent10; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_exponent; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_exponent10; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_infinity; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_quiet_NaN; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_signaling_NaN; -template -_LIBCPP_CONSTEXPR const float_denorm_style numeric_limits::has_denorm; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_denorm_loss; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_iec559; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_bounded; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_modulo; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::traps; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::tinyness_before; -template -_LIBCPP_CONSTEXPR const float_round_style numeric_limits::round_style; - -template -class _LIBCPP_TEMPLATE_VIS numeric_limits : private numeric_limits<_Tp> { - typedef numeric_limits<_Tp> __base; - typedef _Tp type; - -public: - static _LIBCPP_CONSTEXPR const bool is_specialized = __base::is_specialized; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __base::min(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __base::max(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return __base::lowest(); } - - static _LIBCPP_CONSTEXPR const int digits = __base::digits; - static _LIBCPP_CONSTEXPR const int digits10 = __base::digits10; - static _LIBCPP_CONSTEXPR const int max_digits10 = __base::max_digits10; - static _LIBCPP_CONSTEXPR const bool is_signed = __base::is_signed; - static _LIBCPP_CONSTEXPR const bool is_integer = __base::is_integer; - static _LIBCPP_CONSTEXPR const bool is_exact = __base::is_exact; - static _LIBCPP_CONSTEXPR const int radix = __base::radix; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __base::epsilon(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return __base::round_error(); } - - static _LIBCPP_CONSTEXPR const int min_exponent = __base::min_exponent; - static _LIBCPP_CONSTEXPR const int min_exponent10 = __base::min_exponent10; - static _LIBCPP_CONSTEXPR const int max_exponent = __base::max_exponent; - static _LIBCPP_CONSTEXPR const int max_exponent10 = __base::max_exponent10; - - static _LIBCPP_CONSTEXPR const bool has_infinity = __base::has_infinity; - static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = __base::has_quiet_NaN; - static _LIBCPP_CONSTEXPR const bool 
has_signaling_NaN = __base::has_signaling_NaN; - _LIBCPP_SUPPRESS_DEPRECATED_PUSH - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss; - _LIBCPP_SUPPRESS_DEPRECATED_POP - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __base::infinity(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __base::quiet_NaN(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __base::signaling_NaN(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __base::denorm_min(); } +class _LIBCPP_TEMPLATE_VIS numeric_limits : public numeric_limits<_Tp> {}; - static _LIBCPP_CONSTEXPR const bool is_iec559 = __base::is_iec559; - static _LIBCPP_CONSTEXPR const bool is_bounded = __base::is_bounded; - static _LIBCPP_CONSTEXPR const bool is_modulo = __base::is_modulo; - - static _LIBCPP_CONSTEXPR const bool traps = __base::traps; - static _LIBCPP_CONSTEXPR const bool tinyness_before = __base::tinyness_before; - static _LIBCPP_CONSTEXPR const float_round_style round_style = __base::round_style; -}; - -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_specialized; -template -_LIBCPP_CONSTEXPR const int numeric_limits::digits; -template -_LIBCPP_CONSTEXPR const int numeric_limits::digits10; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_digits10; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_signed; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_integer; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_exact; -template -_LIBCPP_CONSTEXPR const int numeric_limits::radix; -template -_LIBCPP_CONSTEXPR const int numeric_limits::min_exponent; -template -_LIBCPP_CONSTEXPR const int numeric_limits::min_exponent10; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_exponent; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_exponent10; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_infinity; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_quiet_NaN; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_signaling_NaN; -template -_LIBCPP_CONSTEXPR const float_denorm_style numeric_limits::has_denorm; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_denorm_loss; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_iec559; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_bounded; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_modulo; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::traps; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::tinyness_before; -template -_LIBCPP_CONSTEXPR const float_round_style numeric_limits::round_style; - -template -class _LIBCPP_TEMPLATE_VIS numeric_limits : private numeric_limits<_Tp> { - typedef numeric_limits<_Tp> __base; - typedef _Tp type; - -public: - static _LIBCPP_CONSTEXPR const bool is_specialized = __base::is_specialized; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __base::min(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __base::max(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return __base::lowest(); } - - static _LIBCPP_CONSTEXPR const int digits = __base::digits; - static _LIBCPP_CONSTEXPR const int digits10 = 
__base::digits10; - static _LIBCPP_CONSTEXPR const int max_digits10 = __base::max_digits10; - static _LIBCPP_CONSTEXPR const bool is_signed = __base::is_signed; - static _LIBCPP_CONSTEXPR const bool is_integer = __base::is_integer; - static _LIBCPP_CONSTEXPR const bool is_exact = __base::is_exact; - static _LIBCPP_CONSTEXPR const int radix = __base::radix; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __base::epsilon(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return __base::round_error(); } - - static _LIBCPP_CONSTEXPR const int min_exponent = __base::min_exponent; - static _LIBCPP_CONSTEXPR const int min_exponent10 = __base::min_exponent10; - static _LIBCPP_CONSTEXPR const int max_exponent = __base::max_exponent; - static _LIBCPP_CONSTEXPR const int max_exponent10 = __base::max_exponent10; - - static _LIBCPP_CONSTEXPR const bool has_infinity = __base::has_infinity; - static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = __base::has_quiet_NaN; - static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = __base::has_signaling_NaN; - _LIBCPP_SUPPRESS_DEPRECATED_PUSH - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss; - _LIBCPP_SUPPRESS_DEPRECATED_POP - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __base::infinity(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __base::quiet_NaN(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __base::signaling_NaN(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __base::denorm_min(); } - - static _LIBCPP_CONSTEXPR const bool is_iec559 = __base::is_iec559; - static _LIBCPP_CONSTEXPR const bool is_bounded = __base::is_bounded; - static _LIBCPP_CONSTEXPR const bool is_modulo = __base::is_modulo; - - static _LIBCPP_CONSTEXPR const bool traps = __base::traps; - static _LIBCPP_CONSTEXPR const bool tinyness_before = __base::tinyness_before; - static _LIBCPP_CONSTEXPR const float_round_style round_style = __base::round_style; -}; - -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_specialized; -template -_LIBCPP_CONSTEXPR const int numeric_limits::digits; -template -_LIBCPP_CONSTEXPR const int numeric_limits::digits10; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_digits10; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_signed; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_integer; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_exact; -template -_LIBCPP_CONSTEXPR const int numeric_limits::radix; -template -_LIBCPP_CONSTEXPR const int numeric_limits::min_exponent; -template -_LIBCPP_CONSTEXPR const int numeric_limits::min_exponent10; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_exponent; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_exponent10; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_infinity; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_quiet_NaN; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_signaling_NaN; -template -_LIBCPP_CONSTEXPR const float_denorm_style numeric_limits::has_denorm; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_denorm_loss; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_iec559; 
-template <class _Tp> -_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::is_bounded; -template <class _Tp> -_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::is_modulo; -template <class _Tp> -_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::traps; -template <class _Tp> -_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::tinyness_before; template <class _Tp> -_LIBCPP_CONSTEXPR const float_round_style numeric_limits<const volatile _Tp>::round_style; +class _LIBCPP_TEMPLATE_VIS numeric_limits<const volatile _Tp> : public numeric_limits<_Tp> {}; _LIBCPP_END_NAMESPACE_STD -- cgit v1.1 From 1b5f6916199ce09244cdb52c6911f2028e6ca95a Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 8 Feb 2024 19:23:10 +0100 Subject: [libc++] Avoid including <cmath> in <compare> (#80418) This reduces the time to include `<compare>` from 84ms to 36ms. --- libcxx/include/__compare/strong_order.h | 23 +++++++++++++---------- libcxx/include/__compare/weak_order.h | 12 +++++++----- libcxx/include/compare | 1 + libcxx/test/libcxx/transitive_includes/cxx23.csv | 1 - libcxx/test/libcxx/transitive_includes/cxx26.csv | 1 - 5 files changed, 21 insertions(+), 17 deletions(-) diff --git a/libcxx/include/__compare/strong_order.h b/libcxx/include/__compare/strong_order.h index 5f6ade5..3dc819e 100644 --- a/libcxx/include/__compare/strong_order.h +++ b/libcxx/include/__compare/strong_order.h @@ -13,11 +13,14 @@ #include <__compare/compare_three_way.h> #include <__compare/ordering.h> #include <__config> +#include <__math/exponential_functions.h> +#include <__math/traits.h> #include <__type_traits/conditional.h> #include <__type_traits/decay.h> +#include <__type_traits/is_floating_point.h> +#include <__type_traits/is_same.h> #include <__utility/forward.h> #include <__utility/priority_tag.h> -#include <cmath> #include <cstdint> #include <limits> @@ -66,27 +69,27 @@ struct __fn { return strong_ordering::greater; } else if (__t == __u) { if constexpr (numeric_limits<_Dp>::radix == 2) { - return std::signbit(__u) <=> std::signbit(__t); + return __math::signbit(__u) <=> __math::signbit(__t); } else { // This is bullet 3 of the IEEE754 algorithm, relevant // only for decimal floating-point; // see https://stackoverflow.com/questions/69068075/ - if (__t == 0 || std::isinf(__t)) { - return std::signbit(__u) <=> std::signbit(__t); + if (__t == 0 || __math::isinf(__t)) { + return __math::signbit(__u) <=> __math::signbit(__t); } else { int __texp, __uexp; - (void)std::frexp(__t, &__texp); - (void)std::frexp(__u, &__uexp); + (void)__math::frexp(__t, &__texp); + (void)__math::frexp(__u, &__uexp); return (__t < 0) ? (__texp <=> __uexp) : (__uexp <=> __texp); } } } else { // They're unordered, so one of them must be a NAN. // The order is -QNAN, -SNAN, numbers, +SNAN, +QNAN.
- bool __t_is_nan = std::isnan(__t); - bool __u_is_nan = std::isnan(__u); - bool __t_is_negative = std::signbit(__t); - bool __u_is_negative = std::signbit(__u); + bool __t_is_nan = __math::isnan(__t); + bool __u_is_nan = __math::isnan(__u); + bool __t_is_negative = __math::signbit(__t); + bool __u_is_negative = __math::signbit(__u); using _IntType = conditional_t< sizeof(__t) == sizeof(int32_t), int32_t, diff --git a/libcxx/include/__compare/weak_order.h b/libcxx/include/__compare/weak_order.h index 9f719eb..b82a708 100644 --- a/libcxx/include/__compare/weak_order.h +++ b/libcxx/include/__compare/weak_order.h @@ -13,10 +13,12 @@ #include <__compare/ordering.h> #include <__compare/strong_order.h> #include <__config> +#include <__math/traits.h> #include <__type_traits/decay.h> +#include <__type_traits/is_floating_point.h> +#include <__type_traits/is_same.h> #include <__utility/forward.h> #include <__utility/priority_tag.h> -#include #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header @@ -51,10 +53,10 @@ struct __fn { return weak_ordering::greater; } else { // Otherwise, at least one of them is a NaN. - bool __t_is_nan = std::isnan(__t); - bool __u_is_nan = std::isnan(__u); - bool __t_is_negative = std::signbit(__t); - bool __u_is_negative = std::signbit(__u); + bool __t_is_nan = __math::isnan(__t); + bool __u_is_nan = __math::isnan(__u); + bool __t_is_negative = __math::signbit(__t); + bool __u_is_negative = __math::signbit(__u); if (__t_is_nan && __u_is_nan) { return (__u_is_negative <=> __t_is_negative); } else if (__t_is_nan) { diff --git a/libcxx/include/compare b/libcxx/include/compare index 626c743..cc0cae8 100644 --- a/libcxx/include/compare +++ b/libcxx/include/compare @@ -162,6 +162,7 @@ namespace std { #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include # include #endif diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv index 7c7099d..bd82411 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx23.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv @@ -105,7 +105,6 @@ codecvt string codecvt tuple codecvt typeinfo codecvt version -compare cmath compare cstddef compare cstdint compare limits diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index 7c7099d..bd82411 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -105,7 +105,6 @@ codecvt string codecvt tuple codecvt typeinfo codecvt version -compare cmath compare cstddef compare cstdint compare limits -- cgit v1.1 From b92e0a31dab5917f31b4672430004add34b5e775 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Thu, 8 Feb 2024 10:23:20 -0800 Subject: [flang][cuda] Fix warning in switch --- flang/lib/Lower/ConvertVariable.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index f761e14..d57bdd4 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -1603,7 +1603,7 @@ fir::CUDAAttributeAttr Fortran::lower::translateSymbolCUDAAttribute( break; case Fortran::common::CUDADataAttr::Texture: // Obsolete attribute - break; + return {}; } return fir::CUDAAttributeAttr::get(mlirContext, attr); -- cgit v1.1 From c0ff10814fb056369cd2bbf0e672498b4cc9c1d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Thu, 8 Feb 2024 19:24:55 +0100 
Subject: docs/GettingStarted: document linker-related cmake options (#80932) Both LLVM_LINK_LLVM_DYLIB and LLVM_PARALLEL_LINK_JOBS help with some common gotchas. It seems worth documenting them here explicitly. Based on a review comment, also "refactor" the documentation to avoid duplication. --- llvm/docs/CMake.rst | 2 ++ llvm/docs/GettingStarted.rst | 86 +++++++++----------------------------------- 2 files changed, 19 insertions(+), 69 deletions(-) diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst index 13d1912c..20f73c9 100644 --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -178,6 +178,8 @@ variable and type on the CMake command line: $ cmake -DVARIABLE:TYPE=value path/to/llvm/source +.. _cmake_frequently_used_variables: + Frequently-used CMake variables ------------------------------- diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst index 316fc6a..687d1f2 100644 --- a/llvm/docs/GettingStarted.rst +++ b/llvm/docs/GettingStarted.rst @@ -540,75 +540,23 @@ Variables are passed to ``cmake`` on the command line using the format ``-D=``. The following variables are some common options used by people developing LLVM. -+-------------------------+----------------------------------------------------+ -| Variable | Purpose | -+=========================+====================================================+ -| CMAKE_C_COMPILER | Tells ``cmake`` which C compiler to use. By | -| | default, this will be /usr/bin/cc. | -+-------------------------+----------------------------------------------------+ -| CMAKE_CXX_COMPILER | Tells ``cmake`` which C++ compiler to use. By | -| | default, this will be /usr/bin/c++. | -+-------------------------+----------------------------------------------------+ -| CMAKE_BUILD_TYPE | Tells ``cmake`` what type of build you are trying | -| | to generate files for. Valid options are Debug, | -| | Release, RelWithDebInfo, and MinSizeRel. Default | -| | is Debug. | -+-------------------------+----------------------------------------------------+ -| CMAKE_INSTALL_PREFIX | Specifies the install directory to target when | -| | running the install action of the build files. | -+-------------------------+----------------------------------------------------+ -| Python3_EXECUTABLE | Forces CMake to use a specific Python version by | -| | passing a path to a Python interpreter. By default | -| | the Python version of the interpreter in your PATH | -| | is used. | -+-------------------------+----------------------------------------------------+ -| LLVM_TARGETS_TO_BUILD | A semicolon delimited list controlling which | -| | targets will be built and linked into llvm. | -| | The default list is defined as | -| | ``LLVM_ALL_TARGETS``, and can be set to include | -| | out-of-tree targets. The default value includes: | -| | ``AArch64, AMDGPU, ARM, AVR, BPF, Hexagon, Lanai, | -| | Mips, MSP430, NVPTX, PowerPC, RISCV, Sparc, | -| | SystemZ, WebAssembly, X86, XCore``. Setting this | -| | to ``"host"`` will only compile the host | -| | architecture (e.g. equivalent to specifying ``X86``| -| | on an x86 host machine) can | -| | significantly speed up compile and test times. | -+-------------------------+----------------------------------------------------+ -| LLVM_ENABLE_DOXYGEN | Build doxygen-based documentation from the source | -| | code This is disabled by default because it is | -| | slow and generates a lot of output. 
| -+-------------------------+----------------------------------------------------+ -| LLVM_ENABLE_PROJECTS | A semicolon-delimited list selecting which of the | -| | other LLVM subprojects to additionally build. (Only| -| | effective when using a side-by-side project layout | -| | e.g. via git). The default list is empty. Can | -| | include: clang, clang-tools-extra, | -| | cross-project-tests, flang, libc, libclc, lld, | -| | lldb, mlir, openmp, polly, or pstl. | -+-------------------------+----------------------------------------------------+ -| LLVM_ENABLE_RUNTIMES | A semicolon-delimited list selecting which of the | -| | runtimes to build. (Only effective when using the | -| | full monorepo layout). The default list is empty. | -| | Can include: compiler-rt, libc, libcxx, libcxxabi, | -| | libunwind, or openmp. | -+-------------------------+----------------------------------------------------+ -| LLVM_ENABLE_SPHINX | Build sphinx-based documentation from the source | -| | code. This is disabled by default because it is | -| | slow and generates a lot of output. Sphinx version | -| | 1.5 or later recommended. | -+-------------------------+----------------------------------------------------+ -| LLVM_BUILD_LLVM_DYLIB | Generate libLLVM.so. This library contains a | -| | default set of LLVM components that can be | -| | overridden with ``LLVM_DYLIB_COMPONENTS``. The | -| | default contains most of LLVM and is defined in | -| | ``tools/llvm-shlib/CMakelists.txt``. This option is| -| | not available on Windows. | -+-------------------------+----------------------------------------------------+ -| LLVM_OPTIMIZED_TABLEGEN | Builds a release tablegen that gets used during | -| | the LLVM build. This can dramatically speed up | -| | debug builds. | -+-------------------------+----------------------------------------------------+ +* ``CMAKE_C_COMPILER`` +* ``CMAKE_CXX_COMPILER`` +* ``CMAKE_BUILD_TYPE`` +* ``CMAKE_INSTALL_PREFIX`` +* ``Python3_EXECUTABLE`` +* ``LLVM_TARGETS_TO_BUILD`` +* ``LLVM_ENABLE_PROJECTS`` +* ``LLVM_ENABLE_RUNTIMES`` +* ``LLVM_ENABLE_DOXYGEN`` +* ``LLVM_ENABLE_SPHINX`` +* ``LLVM_BUILD_LLVM_DYLIB`` +* ``LLVM_LINK_LLVM_DYLIB`` +* ``LLVM_PARALLEL_LINK_JOBS`` +* ``LLVM_OPTIMIZED_TABLEGEN`` + +See :ref:`the list of frequently-used CMake variables ` +for more information. To configure LLVM, follow these steps: -- cgit v1.1 From 687304a018d36c4b0def4618a98fee6975172453 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Thu, 8 Feb 2024 10:26:06 -0800 Subject: [clang][lex] Fix build failure after da95d926 --- clang/lib/Lex/PPDirectives.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 97f9c0a..0b22139 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -2306,12 +2306,12 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // Load the module to import its macros. We'll make the declarations // visible when the parser gets here. - // FIXME: Pass SM in here rather than converting it to a path and making the - // module loader convert it back again. + // FIXME: Pass ModuleToImport in here rather than converting it to a path + // and making the module loader convert it back again. 
ModuleLoadResult Imported = TheModuleLoader.loadModule( IncludeTok.getLocation(), Path, Module::Hidden, /*IsInclusionDirective=*/true); - assert((Imported == nullptr || Imported == SM) && + assert((Imported == nullptr || Imported == ModuleToImport) && "the imported module is different than the suggested one"); if (Imported) { @@ -2526,7 +2526,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( case Import: { // If this is a module import, make it visible if needed. - assert(SM && "no module to import"); + assert(ModuleToImport && "no module to import"); makeModuleVisible(ModuleToImport, EndLoc); -- cgit v1.1 From ab4a793e8bc78f50f9f104c9c732e2dd91bf70a2 Mon Sep 17 00:00:00 2001 From: Chelsea Cassanova Date: Thu, 8 Feb 2024 10:33:37 -0800 Subject: [lldb][debugger][NFC] Add broadcast bit for category-based progress events. (#81169) This commit adds a new broadcast bit to the debugger. When in use, it will be listened to for progress events that will be delivered and kept track of by category as opposed to the current behaviour of coming in one by one. --- lldb/include/lldb/API/SBDebugger.h | 1 + lldb/include/lldb/Core/Debugger.h | 1 + 2 files changed, 2 insertions(+) diff --git a/lldb/include/lldb/API/SBDebugger.h b/lldb/include/lldb/API/SBDebugger.h index 218113a..62b2f91 100644 --- a/lldb/include/lldb/API/SBDebugger.h +++ b/lldb/include/lldb/API/SBDebugger.h @@ -46,6 +46,7 @@ public: eBroadcastBitProgress = (1 << 0), eBroadcastBitWarning = (1 << 1), eBroadcastBitError = (1 << 2), + eBroadcastBitProgressCategory = (1 << 3), }; SBDebugger(); diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h index c6d603c..6ba90eb 100644 --- a/lldb/include/lldb/Core/Debugger.h +++ b/lldb/include/lldb/Core/Debugger.h @@ -84,6 +84,7 @@ public: eBroadcastBitWarning = (1 << 1), eBroadcastBitError = (1 << 2), eBroadcastSymbolChange = (1 << 3), + eBroadcastBitProgressCategory = (1 << 4), }; using DebuggerList = std::vector; -- cgit v1.1 From a1ed821b49d9a189c3a0a11228c0de517020feca Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 8 Feb 2024 10:56:00 -0800 Subject: [TableGen] Simplify prepSkipToLineEnd for preprocessing The MemoryBuffer is created using `RequiresNullTerminator`, so we can safely skip the `CurPtr != CurBuf.end()` check. The redundant check causes a cppcheck report. In addition, elsewhere, including `*CurPtr == '#'` below, makes the null terminator assumption as well. Close #81120 --- llvm/lib/TableGen/TGLexer.cpp | 8 ++------ llvm/lib/TableGen/TGLexer.h | 5 ----- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index 5456432..99d866a 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -849,7 +849,8 @@ bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) { do { // Skip all symbols to the line end. - prepSkipToLineEnd(); + while (*CurPtr != '\n') + ++CurPtr; // Find the first non-whitespace symbol in the next line(s). 
if (!prepSkipLineBegin()) @@ -1032,11 +1033,6 @@ bool TGLexer::prepSkipDirectiveEnd() { return true; } -void TGLexer::prepSkipToLineEnd() { - while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) - ++CurPtr; -} - bool TGLexer::prepIsProcessingEnabled() { for (const PreprocessorControlDesc &I : llvm::reverse(*PrepIncludeStack.back())) diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index 25dcd9f..9adc03c 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -467,11 +467,6 @@ private: // directive. bool prepSkipDirectiveEnd(); - // Skip all symbols to the end of the line/file. - // The method adjusts CurPtr, so that it points to either new line - // symbol in the current line or the buffer end. - void prepSkipToLineEnd(); - // Return true, if the current preprocessor control stack is such that // we should allow lexer to process the next token, false - otherwise. // -- cgit v1.1 From a56fa161ab2617fa3aab3f91285fc757b6a8e09b Mon Sep 17 00:00:00 2001 From: Krystian Stasiowski Date: Thu, 8 Feb 2024 13:59:47 -0500 Subject: [clang-tidy] Fix failing test after #80864 (#81171) The following test case in `clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp` is failing: ``` #ifdef PR64602 // Should not crash template struct S { auto foo(auto); }; template <> auto S<>::foo(auto) { return 1; } // CHECK8: error: template parameter list matching the non-templated nested type 'S<>' should be empty ('template<>') [clang-diagnostic-error] #endif ``` #80864 fixes a bug where we would (incorrectly) append invented template parameters to empty template parameter lists, which causes this test to fail. --- clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp index 547f634..d0efc5c 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp @@ -68,5 +68,6 @@ auto S<>::foo(auto) { return 1; } -// CHECK8: error: template parameter list matching the non-templated nested type 'S<>' should be empty ('template<>') [clang-diagnostic-error] +// CHECK8: error: conflicting types for 'foo' [clang-diagnostic-error] +// CHECK8: note: previous declaration is here #endif -- cgit v1.1 From 3d71e4166de81bc3b86d127d9ac6607bda2b2755 Mon Sep 17 00:00:00 2001 From: Jeremy Kun <2467754+j2kun@users.noreply.github.com> Date: Thu, 8 Feb 2024 11:06:43 -0800 Subject: [docs]: Add a note about using custom types with diagnostics (#73818) --- mlir/docs/Diagnostics.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mlir/docs/Diagnostics.md b/mlir/docs/Diagnostics.md index 9819843..82bc61d 100644 --- a/mlir/docs/Diagnostics.md +++ b/mlir/docs/Diagnostics.md @@ -119,6 +119,14 @@ op->emitError() << anotherOp; op->emitRemark() << anotherOp; ``` +To make a custom type compatible with Diagnostics, one must implement the +following friend function. + +```c++ +friend mlir::Diagnostic &operator<<( + mlir::Diagnostic &diagnostic, const MyType &foo); +``` + ### Attaching notes Unlike many other compiler frameworks, notes in MLIR cannot be emitted directly. 
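The note added to the MLIR docs above shows only the required declaration; a minimal sketch of a complete overload for some custom `MyType` (the type and its rendering are invented here for illustration) could be:

```c++
#include "mlir/IR/Diagnostics.h"
#include "llvm/ADT/StringRef.h"

struct MyType {
  llvm::StringRef name;
};

// Renders MyType into an in-flight diagnostic, enabling `diag << myValue`.
mlir::Diagnostic &operator<<(mlir::Diagnostic &diagnostic, const MyType &foo) {
  return diagnostic << "MyType(" << foo.name << ")";
}
```

With this in place, `op->emitError() << myTypeValue;` composes the same way as the built-in argument types; the friend form shown in the docs is only needed when the operator must reach private members.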
-- cgit v1.1 From 74fc16aaaa227b84e22706d2c5e376287f560b9e Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 8 Feb 2024 11:24:07 -0800 Subject: [lldb] Expand background symbol download (#80890) LLDB has a setting (symbols.enable-background-lookup) that calls dsymForUUID on a background thread for images as they appear in the current backtrace. Originally, the laziness of only looking up symbols for images in the backtrace existed to bring the number of dsymForUUID calls down to a manageable number. Users have requested the same functionality, but in a blocking fashion. This gives them the same user experience as enabling dsymForUUID globally, but without the massive upfront cost of having to download all the images, the majority of which they'll likely not need. This patch renames the setting to have a more generic name (symbols.auto-download) and changes its values from a boolean to an enum. Users can now specify "off", "background" and "foreground". The default remains "off" although I'll probably change that in the near future. --- lldb/include/lldb/Core/ModuleList.h | 23 ++++++++++++++++++++++- lldb/include/lldb/lldb-enumerations.h | 6 ++++++ lldb/source/Core/CoreProperties.td | 7 ++++++- lldb/source/Core/ModuleList.cpp | 13 +++++++++---- lldb/source/Host/common/Host.cpp | 2 ++ lldb/source/Symbol/SymbolLocator.cpp | 22 ++++++++++++++++------ 6 files changed, 61 insertions(+), 12 deletions(-) diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h index d78f7c5..43d931a 100644 --- a/lldb/include/lldb/Core/ModuleList.h +++ b/lldb/include/lldb/Core/ModuleList.h @@ -47,6 +47,26 @@ class UUID; class VariableList; struct ModuleFunctionSearchOptions; +static constexpr OptionEnumValueElement g_auto_download_enum_values[] = { + { + lldb::eSymbolDownloadOff, + "off", + "Disable automatically downloading symbols.", + }, + { + lldb::eSymbolDownloadBackground, + "background", + "Download symbols in the background for images as they appear in the " + "backtrace.", + }, + { + lldb::eSymbolDownloadForeground, + "foreground", + "Download symbols in the foreground for images as they appear in the " + "backtrace.", + }, +}; + class ModuleListProperties : public Properties { mutable llvm::sys::RWMutex m_symlink_paths_mutex; PathMappingList m_symlink_paths; @@ -60,7 +80,6 @@ public: bool SetClangModulesCachePath(const FileSpec &path); bool GetEnableExternalLookup() const; bool SetEnableExternalLookup(bool new_value); - bool GetEnableBackgroundLookup() const; bool GetEnableLLDBIndexCache() const; bool SetEnableLLDBIndexCache(bool new_value); uint64_t GetLLDBIndexCacheMaxByteSize(); @@ -71,6 +90,8 @@ public: bool GetLoadSymbolOnDemand(); + lldb::SymbolDownload GetSymbolAutoDownload() const; + PathMappingList GetSymlinkMappings() const; }; diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 7e9b538..4640533 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -1314,6 +1314,12 @@ enum class ChildCacheState { ///< re-use what we computed the last time we called Update.
}; +enum SymbolDownload { + eSymbolDownloadOff = 0, + eSymbolDownloadBackground = 1, + eSymbolDownloadForeground = 2, +}; + } // namespace lldb #endif // LLDB_LLDB_ENUMERATIONS_H diff --git a/lldb/source/Core/CoreProperties.td b/lldb/source/Core/CoreProperties.td index 8d81967..9c4aa2d 100644 --- a/lldb/source/Core/CoreProperties.td +++ b/lldb/source/Core/CoreProperties.td @@ -8,7 +8,12 @@ let Definition = "modulelist" in { def EnableBackgroundLookup: Property<"enable-background-lookup", "Boolean">, Global, DefaultFalse, - Desc<"On macOS, enable calling dsymForUUID (or an equivalent script/binary) in the background to locate symbol files that weren't found.">; + Desc<"Alias for backward compatibility: when enabled this is equivalent to 'symbols.auto-download background'.">; + def AutoDownload: Property<"auto-download", "Enum">, + Global, + DefaultEnumValue<"eSymbolDownloadOff">, + EnumValues<"OptionEnumValues(g_auto_download_enum_values)">, + Desc<"On macOS, automatically download symbols with dsymForUUID (or an equivalent script/binary) for relevant images in the debug session.">; def ClangModulesCachePath: Property<"clang-modules-cache-path", "FileSpec">, Global, DefaultStringValue<"">, diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp index b7f3936..b03490b 100644 --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -104,10 +104,15 @@ bool ModuleListProperties::SetEnableExternalLookup(bool new_value) { return SetPropertyAtIndex(ePropertyEnableExternalLookup, new_value); } -bool ModuleListProperties::GetEnableBackgroundLookup() const { - const uint32_t idx = ePropertyEnableBackgroundLookup; - return GetPropertyAtIndexAs<bool>( - idx, g_modulelist_properties[idx].default_uint_value != 0); +SymbolDownload ModuleListProperties::GetSymbolAutoDownload() const { + // Backward compatibility alias.
+ if (GetPropertyAtIndexAs<bool>(ePropertyEnableBackgroundLookup, false)) + return eSymbolDownloadBackground; + + const uint32_t idx = ePropertyAutoDownload; + return GetPropertyAtIndexAs<lldb::SymbolDownload>( + idx, static_cast<lldb::SymbolDownload>( + g_modulelist_properties[idx].default_uint_value)); } FileSpec ModuleListProperties::GetClangModulesCachePath() const { diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp index f4cec97..b72ba7e 100644 --- a/lldb/source/Host/common/Host.cpp +++ b/lldb/source/Host/common/Host.cpp @@ -550,6 +550,8 @@ llvm::Error Host::OpenFileInExternalEditor(llvm::StringRef editor, } bool Host::IsInteractiveGraphicSession() { return false; } + +bool Host::IsNetworkLimited() { return false; } #endif std::unique_ptr<Connection> Host::CreateDefaultConnection(llvm::StringRef url) { diff --git a/lldb/source/Symbol/SymbolLocator.cpp b/lldb/source/Symbol/SymbolLocator.cpp index 918f13ed..93a5bc4 100644 --- a/lldb/source/Symbol/SymbolLocator.cpp +++ b/lldb/source/Symbol/SymbolLocator.cpp @@ -10,6 +10,7 @@ #include "lldb/Core/Debugger.h" #include "lldb/Core/PluginManager.h" +#include "lldb/Host/Host.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Support/ThreadPool.h" @@ -18,12 +19,10 @@ using namespace lldb; using namespace lldb_private; void SymbolLocator::DownloadSymbolFileAsync(const UUID &uuid) { - if (!ModuleList::GetGlobalModuleListProperties().GetEnableBackgroundLookup()) - return; - static llvm::SmallSet<UUID, 8> g_seen_uuids; static std::mutex g_mutex; - Debugger::GetThreadPool().async([=]() { + + auto lookup = [=]() { { std::lock_guard<std::mutex> guard(g_mutex); if (g_seen_uuids.count(uuid)) @@ -36,12 +35,23 @@ void SymbolLocator::DownloadSymbolFileAsync(const UUID &uuid) { module_spec.GetUUID() = uuid; if (!PluginManager::DownloadObjectAndSymbolFile(module_spec, error, /*force_lookup=*/true, - /*copy_executable=*/false)) + /*copy_executable=*/true)) return; if (error.Fail()) return; Debugger::ReportSymbolChange(module_spec); - }); + }; + + switch (ModuleList::GetGlobalModuleListProperties().GetSymbolAutoDownload()) { + case eSymbolDownloadOff: + break; + case eSymbolDownloadBackground: + Debugger::GetThreadPool().async(lookup); + break; + case eSymbolDownloadForeground: + lookup(); + break; + }; } -- cgit v1.1 From 88e52511ca71165f1ff3d7c42229aeacb2c16db3 Mon Sep 17 00:00:00 2001 From: alex-t Date: Thu, 8 Feb 2024 20:27:36 +0100 Subject: [AMDGPU] Compiler should synthesize private buffer resource descriptor from flat_scratch_init (#79586) This change implements synthesizing the private buffer resource descriptor in the kernel prolog instead of using the preloaded kernel argument.
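To make the effect concrete, here is a representative before/after of a kernel prologue, excerpted from the updated tests below (the exact constant written to s3 is target-dependent):

```
; before: add the scratch wave offset into the preloaded descriptor
s_add_u32 s0, s0, s7
s_addc_u32 s1, s1, 0

; after: synthesize the descriptor from flat_scratch plus known constants
s_mov_b32 s2, -1
s_mov_b32 s3, 0xe00000
s_mov_b64 s[0:1], flat_scratch
```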
--- llvm/docs/AMDGPUUsage.rst | 10 +- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 108 ++++--- llvm/lib/Target/AMDGPU/SIFrameLowering.h | 14 +- .../AMDGPU/GlobalISel/call-outgoing-stack-args.ll | 10 +- .../abi-attribute-hints-undefined-behavior.ll | 18 +- .../blender-no-live-segment-at-def-implicit-def.ll | 5 +- .../AMDGPU/branch-folding-implicit-def-subreg.ll | 7 +- llvm/test/CodeGen/AMDGPU/call-argument-types.ll | 329 ++++++++++++--------- llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll | 30 +- llvm/test/CodeGen/AMDGPU/call-waitcnt.ll | 29 +- .../CodeGen/AMDGPU/callee-special-input-vgprs.ll | 6 +- llvm/test/CodeGen/AMDGPU/cc-update.ll | 84 +++--- .../AMDGPU/cross-block-use-is-not-abi-copy.ll | 10 +- .../CodeGen/AMDGPU/indirect-call-known-callees.ll | 9 +- llvm/test/CodeGen/AMDGPU/indirect-call.ll | 20 +- .../AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll | 5 +- llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 60 ++-- .../CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 5 +- .../CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll | 15 +- .../CodeGen/AMDGPU/lower-module-lds-via-table.ll | 15 +- ...machine-sink-temporal-divergence-swdev407790.ll | 14 +- .../CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll | 15 +- llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll | 7 +- .../CodeGen/AMDGPU/tuple-allocation-failure.ll | 14 +- llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll | 5 +- 25 files changed, 494 insertions(+), 350 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 6b24171..3019968 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -5530,9 +5530,13 @@ If the *Target Properties* column of :ref:`amdgpu-processor-table` specifies Instead the flat SCRATCH instructions are used. Otherwise, Private Segment Buffer SGPR register is used to initialize 4 SGPRs -that are used as a V# to access scratch. CP uses the value provided by the -runtime. It is used, together with Scratch Wavefront Offset as an offset, to -access the private memory space using a segment address. See +that are used as a V# to access scratch. +The compiler synthesizes the initialization value for the Private Segment +Buffer in the kernel prologue, using the Flat Scratch Init to initialize the +low 64 bits and a known constant for the high ones. If the Flat Scratch Init is not +available, CP uses the value provided by the runtime. It is used, together with +Scratch Wavefront Offset as an offset, to access the private memory space using +a segment address. See :ref:`amdgpu-amdhsa-initial-kernel-execution-state`.
The scratch V# is a four-aligned SGPR and always selected for the kernel as diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index d02aee7..6327a81 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -379,7 +379,8 @@ public: } // namespace llvm // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` -void SIFrameLowering::emitEntryFunctionFlatScratchInit( +// and return the FlatScratchInit Register used +Register SIFrameLowering::emitEntryFunctionFlatScratchInit( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register ScratchWaveOffsetReg) const { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -399,6 +400,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( Register FlatScrInitLo; Register FlatScrInitHi; + Register FlatScratchInitReg; if (ST.isAmdPalOS()) { // Extract the scratch offset from the descriptor in the GIT @@ -408,7 +410,6 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( // Find unused reg to load flat scratch init into MachineRegisterInfo &MRI = MF.getRegInfo(); - Register FlatScrInit = AMDGPU::NoRegister; ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF); unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2; AllSGPR64s = AllSGPR64s.slice( @@ -417,16 +418,28 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( for (MCPhysReg Reg : AllSGPR64s) { if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) && MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { - FlatScrInit = Reg; + FlatScratchInitReg = Reg; break; } } - assert(FlatScrInit && "Failed to find free register for scratch init"); - FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0); - FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1); + } else { + FlatScratchInitReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); + + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(FlatScratchInitReg); + MBB.addLiveIn(FlatScratchInitReg); + } + + assert(FlatScratchInitReg && "Failed to find free register for scratch init"); + + FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); + + if (ST.isAmdPalOS()) { - buildGitPtr(MBB, I, DL, TII, FlatScrInit); + buildGitPtr(MBB, I, DL, TII, FlatScratchInitReg); // We now have the GIT ptr - now get the scratch descriptor from the entry // at offset 0 (or offset 16 for a compute shader). @@ -441,8 +454,8 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); - BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit) - .addReg(FlatScrInit) + BuildMI(MBB, I, DL, LoadDwordX2, FlatScratchInitReg) + .addReg(FlatScratchInitReg) .addImm(EncodedOffset) // offset .addImm(0) // cpol .addMemOperand(MMO); @@ -450,20 +463,9 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( // Mask the offset in [47:0] of the descriptor const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32); auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi) - .addReg(FlatScrInitHi) - .addImm(0xffff); + .addReg(FlatScrInitHi) + .addImm(0xffff); And->getOperand(3).setIsDead(); // Mark SCC as dead.
- } else { - Register FlatScratchInitReg = - MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); - assert(FlatScratchInitReg); - - MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.addLiveIn(FlatScratchInitReg); - MBB.addLiveIn(FlatScratchInitReg); - - FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); - FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); } // Do a 64-bit pointer add. @@ -486,20 +488,21 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( addReg(FlatScrInitHi). addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI | (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); - return; + return FlatScratchInitReg; } - // For GFX9. + assert(ST.getGeneration() == AMDGPUSubtarget::GFX9); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) - .addReg(FlatScrInitLo) - .addReg(ScratchWaveOffsetReg); + .addReg(FlatScrInitLo) + .addReg(ScratchWaveOffsetReg); auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI) .addReg(FlatScrInitHi) .addImm(0); Addc->getOperand(3).setIsDead(); // Mark SCC as dead. - return; + return AMDGPU::FLAT_SCR; } assert(ST.getGeneration() < AMDGPUSubtarget::GFX9); @@ -520,6 +523,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( .addReg(FlatScrInitLo, RegState::Kill) .addImm(8); LShr->getOperand(3).setIsDead(); // Mark SCC as dead. + return AMDGPU::FLAT_SCR; } // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not @@ -611,11 +615,15 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - const Function &F = MF.getFunction(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); assert(MFI->isEntryFunction()); + bool NeedsFlatScratchInit = + MFI->getUserSGPRInfo().hasFlatScratchInit() && + (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || + (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); + Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); @@ -641,7 +649,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // Now that we have fixed the reserved SRSRC we need to locate the // (potentially) preloaded SRSRC. 
Register PreloadedScratchRsrcReg; - if (ST.isAmdHsaOrMesa(F)) { + if (ST.isAmdHsaOrMesa(MF.getFunction()) && !NeedsFlatScratchInit) { PreloadedScratchRsrcReg = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); if (ScratchRsrcReg && PreloadedScratchRsrcReg) { @@ -697,33 +705,30 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); } - bool NeedsFlatScratchInit = - MFI->getUserSGPRInfo().hasFlatScratchInit() && - (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || - (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); - if ((NeedsFlatScratchInit || ScratchRsrcReg) && PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) { MRI.addLiveIn(PreloadedScratchWaveOffsetReg); MBB.addLiveIn(PreloadedScratchWaveOffsetReg); } + Register FlatScratchInit; if (NeedsFlatScratchInit) { - emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); + FlatScratchInit = + emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); } if (ScratchRsrcReg) { - emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL, - PreloadedScratchRsrcReg, - ScratchRsrcReg, ScratchWaveOffsetReg); + emitEntryFunctionScratchRsrcRegSetup( + MF, MBB, I, DL, FlatScratchInit, ScratchRsrcReg, + PreloadedScratchRsrcReg, ScratchWaveOffsetReg); } } // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, Register PreloadedScratchRsrcReg, - Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const { + const DebugLoc &DL, Register FlatScratchInit, Register ScratchRsrcReg, + Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -771,7 +776,8 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( .addImm(21) .addReg(Rsrc03); } - } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) { + } else if (ST.isMesaGfxShader(Fn) || + (!FlatScratchInit.isValid() && !PreloadedScratchRsrcReg)) { assert(!ST.isAmdHsaOrMesa(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); @@ -830,6 +836,26 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( .addImm(Rsrc23 >> 32) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); } else if (ST.isAmdHsaOrMesa(Fn)) { + + if (FlatScratchInit) { + const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); + Register Lo_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + Register Hi_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + uint64_t Rsrc23 = TII->getScratchRsrcWords23(); + I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), + TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1)) + .addReg(FlatScratchInit) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + BuildMI(MBB, I, DL, SMovB32, Lo_32) + .addImm(Rsrc23 & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Hi_32) + .addImm(Rsrc23 >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + return; + } + assert(PreloadedScratchRsrcReg); if (ScratchRsrcReg != PreloadedScratchRsrcReg) { diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index b3feb75..f706d48 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -67,19 +67,19 @@ public:
MachineBasicBlock::iterator MI) const override; private: - void emitEntryFunctionFlatScratchInit(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const DebugLoc &DL, - Register ScratchWaveOffsetReg) const; + Register + emitEntryFunctionFlatScratchInit(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + Register ScratchWaveOffsetReg) const; Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const; void emitEntryFunctionScratchRsrcRegSetup( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - Register PreloadedPrivateBufferReg, Register ScratchRsrcReg, - Register ScratchWaveOffsetReg) const; + Register FlatScratchInit, Register ScratchRsrcReg, + Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const; public: bool hasFP(const MachineFunction &MF) const override; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index e597ce6..6e49a5a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -13,10 +13,11 @@ define amdgpu_kernel void @kernel_caller_stack() { ; MUBUF-LABEL: kernel_caller_stack: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; MUBUF-NEXT: s_mov_b32 s2, -1 ; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; MUBUF-NEXT: s_add_u32 s0, s0, s7 +; MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; MUBUF-NEXT: s_mov_b32 s32, 0 -; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; MUBUF-NEXT: v_mov_b32_e32 v0, 10 @@ -61,9 +62,10 @@ define amdgpu_kernel void @kernel_caller_byval() { ; MUBUF-LABEL: kernel_caller_byval: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; MUBUF-NEXT: s_mov_b32 s2, -1 ; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; MUBUF-NEXT: s_add_u32 s0, s0, s7 -; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: s_mov_b32 s3, 0xe00000 +; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll index a439c0f..609b5e6 100644 --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -48,19 +48,20 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 { ; FIXEDABI-SDAG-LABEL: parent_kernel_missing_inputs: ; FIXEDABI-SDAG: ; %bb.0: ; FIXEDABI-SDAG-NEXT: s_add_i32 s4, s4, s9 -; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; FIXEDABI-SDAG-NEXT: s_mov_b32 s2, -1 ; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-SDAG-NEXT: s_add_u32 s0, s0, s9 +; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5 +; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; FIXEDABI-SDAG-NEXT: s_mov_b32 s3, 0x11e80000 ; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; FIXEDABI-SDAG-NEXT: s_mov_b64 s[0:1], flat_scratch ; FIXEDABI-SDAG-NEXT: s_mov_b32 s14, s8 ; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v31, v0, v2 ; 
FIXEDABI-SDAG-NEXT: s_mov_b64 s[8:9], 0 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s12, s6 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s13, s7 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s32, 0 -; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5] ; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4 ; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12 @@ -70,19 +71,20 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 { ; FIXEDABI-GISEL-LABEL: parent_kernel_missing_inputs: ; FIXEDABI-GISEL: ; %bb.0: ; FIXEDABI-GISEL-NEXT: s_add_i32 s4, s4, s9 -; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; FIXEDABI-GISEL-NEXT: s_mov_b32 s2, -1 ; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s0, s9 +; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5 +; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; FIXEDABI-GISEL-NEXT: s_mov_b32 s3, 0x11e80000 ; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2 -; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; FIXEDABI-GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch ; FIXEDABI-GISEL-NEXT: s_mov_b32 s14, s8 ; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v31, v0, v1 ; FIXEDABI-GISEL-NEXT: s_mov_b64 s[8:9], 0 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s12, s6 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s13, s7 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s32, 0 -; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5] ; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4 ; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll index 7c8d40c..74c6bb5 100644 --- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll +++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll @@ -10,8 +10,9 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; CHECK-NEXT: s_load_dwordx8 s[36:43], s[6:7], 0x0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 5a128c7..c06f213 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -5,13 +5,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-LABEL: name: f1 ; GFX90A: bb.0.bb: ; GFX90A-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr15, $sgpr10_sgpr11 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr15, $sgpr10_sgpr11 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr32 = S_MOV_B32 0 ; GFX90A-NEXT: $flat_scr_lo = S_ADD_U32 $sgpr10, $sgpr15, implicit-def $scc ; GFX90A-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr11, 0, 
implicit-def dead $scc, implicit $scc - ; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr15, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: $sgpr2 = S_MOV_B32 4294967295, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: $sgpr3 = S_MOV_B32 14680064, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: $sgpr0_sgpr1 = COPY $flat_scr, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9 ; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 87e17a1..381fb98 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -129,12 +129,13 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; HSA-LABEL: test_call_external_void_func_i1_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 @@ -234,8 +235,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 @@ -339,8 +341,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 @@ -422,12 +425,13 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; HSA-LABEL: test_call_external_void_func_i8_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, 
external_void_func_i8@rel32@hi+12 @@ -525,8 +529,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 @@ -625,8 +630,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 @@ -707,12 +713,13 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_i16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 @@ -809,8 +816,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_sshort v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 @@ -909,8 +917,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 @@ -991,12 +1000,13 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; HSA-LABEL: test_call_external_void_func_i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 42 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, 
external_void_func_i32@rel32@hi+12 @@ -1078,13 +1088,14 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_i64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: v_mov_b32_e32 v1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 @@ -1182,12 +1193,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_mov_b32 s4, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_mov_b32 s5, s4 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 @@ -1278,15 +1290,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2i64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 @@ -1391,12 +1404,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_mov_b32 s4, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_mov_b32 s5, s4 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v4, 1 ; HSA-NEXT: v_mov_b32_e32 v5, 2 ; HSA-NEXT: s_mov_b32 s32, 0 @@ -1514,12 +1528,13 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_mov_b32 s4, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_mov_b32 s5, s4 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v4, 1 ; HSA-NEXT: v_mov_b32_e32 v5, 2 ; HSA-NEXT: 
v_mov_b32_e32 v6, 3 @@ -1605,12 +1620,13 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_f16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0x4400 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 @@ -1689,12 +1705,13 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 4.0 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 @@ -1776,13 +1793,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 @@ -1868,14 +1886,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 4.0 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 @@ -1968,16 +1987,17 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v5f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: 
v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 4.0 ; HSA-NEXT: v_mov_b32_e32 v3, -1.0 ; HSA-NEXT: v_mov_b32_e32 v4, 0.5 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 @@ -2059,13 +2079,14 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_f64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 0x40100000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 @@ -2154,15 +2175,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2f64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 ; HSA-NEXT: v_mov_b32_e32 v3, 0x40100000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 @@ -2258,9 +2280,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3f64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 @@ -2268,7 +2292,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; HSA-NEXT: v_mov_b32_e32 v4, 0 ; HSA-NEXT: v_mov_b32_e32 v5, 0x40200000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 @@ -2357,14 +2380,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; HSA-LABEL: test_call_external_void_func_v2i16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 s2, -1 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; 
HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 @@ -2456,14 +2480,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; HSA-LABEL: test_call_external_void_func_v3i16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 s2, -1 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 @@ -2556,14 +2581,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; HSA-LABEL: test_call_external_void_func_v3f16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 s2, -1 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 @@ -2647,13 +2673,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3i16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 ; HSA-NEXT: v_mov_b32_e32 v1, 3 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 @@ -2737,13 +2764,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3f16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; HSA-NEXT: v_mov_b32_e32 v1, 0x4400 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 @@ -2835,14 
+2863,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; HSA-LABEL: test_call_external_void_func_v4i16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 s2, -1 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 @@ -2928,13 +2957,14 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v4i16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 ; HSA-NEXT: v_mov_b32_e32 v1, 0x40003 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 @@ -3025,14 +3055,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; HSA-LABEL: test_call_external_void_func_v2f16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 s2, -1 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 @@ -3120,14 +3151,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; HSA-LABEL: test_call_external_void_func_v2i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 s2, -1 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 @@ -3210,13 +3242,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; 
HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 @@ -3302,14 +3335,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; HSA-LABEL: test_call_external_void_func_v3i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: v_mov_b32_e32 v1, 4 ; HSA-NEXT: v_mov_b32_e32 v2, 5 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 @@ -3398,15 +3432,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; HSA-LABEL: test_call_external_void_func_v3i32_i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: v_mov_b32_e32 v1, 4 ; HSA-NEXT: v_mov_b32_e32 v2, 5 ; HSA-NEXT: v_mov_b32_e32 v3, 6 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 @@ -3493,14 +3528,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; HSA-LABEL: test_call_external_void_func_v4i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 s2, -1 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 @@ -3590,15 +3626,16 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v4i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: 
v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 @@ -3691,16 +3728,17 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v5i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: v_mov_b32_e32 v4, 5 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 @@ -3803,13 +3841,14 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 @@ -3915,9 +3954,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v8i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 @@ -3927,7 +3968,6 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; HSA-NEXT: v_mov_b32_e32 v6, 7 ; HSA-NEXT: v_mov_b32_e32 v7, 8 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 @@ -4038,7 +4078,6 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4046,7 +4085,9 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 ; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; 
HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 @@ -4183,7 +4224,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4195,8 +4235,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 @@ -4359,9 +4401,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 @@ -4466,14 +4509,15 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; ; HSA-LABEL: test_call_external_i32_func_i32_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 42 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_mov_b32 s39, 0x1100f000 ; HSA-NEXT: s_mov_b32 s38, -1 ; HSA-NEXT: s_getpc_b64 s[4:5] @@ -4581,13 +4625,14 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 ; HSA-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 @@ -4702,9 +4747,11 @@ define amdgpu_kernel void 
@test_call_external_void_func_byval_struct_i8_i32() #0 ; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8 ; HSA-NEXT: v_mov_b32_e32 v0, 8 @@ -4712,7 +4759,6 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 ; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; HSA-NEXT: s_movk_i32 s32, 0x400 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 @@ -4877,9 +4923,11 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8 ; HSA-NEXT: v_mov_b32_e32 v0, 8 @@ -4887,7 +4935,6 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 ; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; HSA-NEXT: s_movk_i32 s32, 0x800 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 @@ -5085,12 +5132,13 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 @@ -5339,14 +5387,15 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; HSA-LABEL: stack_passed_arg_alignment_v32i32_f64: ; HSA: ; %bb.0: ; %entry ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; HSA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x80 ; HSA-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: v_mov_b32_e32 v0, s23 ; HSA-NEXT: v_mov_b32_e32 v1, s6 diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll index c62a082..8e2fca5 100644 --- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll @@ -11,10 +11,11 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_siz ; CHECK-LABEL: known_x_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 20, v2 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -31,9 +32,10 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_siz ; CHECK-LABEL: known_y_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: v_lshl_or_b32 v31, v2, 20, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -50,9 +52,10 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_siz ; CHECK-LABEL: known_z_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -69,9 +72,10 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si ; CHECK-LABEL: known_yz_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -88,9 +92,10 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_si ; CHECK-LABEL: known_xz_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: v_lshlrev_b32_e32 v31, 10, v1 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -108,9 +113,10 @@ define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_s ; CHECK-LABEL: known_xyz_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; 
CHECK-NEXT: v_mov_b32_e32 v31, 0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll index 616e5f0..6db5eff 100644 --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -7,12 +7,13 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s3, 0xe00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 @@ -30,10 +31,11 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_mov_b32 s3, 0xe00000 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_store_dword v0, v0, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -52,11 +54,12 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 { define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s3, 0xe00000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -74,11 +77,12 @@ define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) # define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call_return_val: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s3, 0xe00000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -99,12 +103,13 @@ define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 { ; GCN: ; %bb.0: ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xe00000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: 
s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index 6d603ef..49bf48a 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -165,7 +165,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 { ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 ; FIXEDABI: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1 +; FIXEDABI: v_or_b32_e32 v31, v0, v1 ; FIXEDABI-NOT: v0 ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 @@ -181,7 +181,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 { ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 ; FIXEDABI: v_lshlrev_b32_e32 v1, 20, v2 -; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1 +; FIXEDABI: v_or_b32_e32 v31, v0, v1 ; FIXEDABI-NOT: v0 ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 @@ -198,7 +198,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 { ; FIXEDABI-NOT: v2 ; FIXEDABI:v_lshlrev_b32_e32 v0, 20, v2 ; FIXEDABI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-NEXT: v_or_b32_e32 v31, v1, v0 +; FIXEDABI: v_or_b32_e32 v31, v1, v0 ; FIXEDABI-NOT: v0 ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll index ca09163b2..42beb1c 100644 --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -68,13 +68,14 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_call: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_mov_b32 s2, -1 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_mov_b32 s3, 0x11e80000 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -88,11 +89,12 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX900-LABEL: test_kern_call: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX900-NEXT: s_mov_b32 s2, -1 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_mov_b32 s3, 0xe00000 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -112,11 +114,12 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 -; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: s_mov_b32 s2, -1 +; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000 +; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11] ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_mov_b64 
s[8:9], s[6:7] ; GFX1010-NEXT: s_getpc_b64 s[16:17] ; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 ; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 @@ -148,13 +151,14 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_stack_and_call: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_mov_b32 s2, -1 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_mov_b32 s3, 0x11e80000 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 @@ -171,11 +175,12 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX900-LABEL: test_kern_stack_and_call: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX900-NEXT: s_mov_b32 s2, -1 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_mov_b32 s3, 0xe00000 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 @@ -199,10 +204,11 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1010-NEXT: v_mov_b32_e32 v3, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 -; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX1010-NEXT: s_mov_b32 s2, -1 +; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000 +; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11] ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 @@ -311,13 +317,14 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX803-LABEL: test_force_fp_kern_call: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_mov_b32 s2, -1 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_mov_b32 s3, 0x11e80000 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -332,11 +339,12 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX900-LABEL: test_force_fp_kern_call: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX900-NEXT: s_mov_b32 s2, -1 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_add_u32 s0, s0, 
s15 +; GFX900-NEXT: s_mov_b32 s3, 0xe00000 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -358,11 +366,12 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 -; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: s_mov_b32 s2, -1 +; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000 +; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11] ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX1010-NEXT: s_getpc_b64 s[16:17] ; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 ; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 @@ -413,14 +422,15 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX803-LABEL: test_force_fp_kern_stack_and_call: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_mov_b32 s2, -1 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_mov_b32 s3, 0x11e80000 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: s_mov_b32 s33, 0 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 @@ -437,12 +447,13 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX900-LABEL: test_force_fp_kern_stack_and_call: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX900-NEXT: s_mov_b32 s2, -1 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_mov_b32 s3, 0xe00000 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX900-NEXT: s_mov_b32 s33, 0 -; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 @@ -467,10 +478,11 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1010-NEXT: v_mov_b32_e32 v3, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 -; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX1010-NEXT: s_mov_b32 s2, -1 +; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000 +; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11] ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll index 
11871db..68c632a 100644 --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -180,8 +180,9 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 { ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_load_dword s8, s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s0, s0, s15 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xe00000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s8, 0 @@ -229,8 +230,9 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 { ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_load_dword s8, s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s0, s0, s15 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xe00000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s8, 0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index 47110d9..2d019ef 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -13,8 +13,6 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; GFX9-NEXT: s_add_u32 s0, s0, s7 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -25,14 +23,17 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX9-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, 1, s7 +; GFX9-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-NEXT: s_cmp_eq_u32 s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX9-NEXT: s_cselect_b32 s5, s13, s11 ; GFX9-NEXT: s_cselect_b32 s4, s12, s10 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 408199b..a66ed93 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -12,8 +12,9 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_add_i32 s12, s12, s17 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b32 s13, s15 ; GCN-NEXT: s_mov_b32 s12, s14 ; GCN-NEXT: s_getpc_b64 s[14:15] @@ -37,8 +38,9 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) { ; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GISEL-NEXT: s_add_i32 s12, s12, s17 ; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GISEL-NEXT: s_add_u32 s0, s0, s17 -; GISEL-NEXT: 
s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0x1e8f000 +; GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch ; GISEL-NEXT: s_mov_b32 s13, s15 ; GISEL-NEXT: s_mov_b32 s12, s14 ; GISEL-NEXT: s_getpc_b64 s[14:15] @@ -67,8 +69,9 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_add_i32 s12, s12, s17 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b32 s13, s15 ; GCN-NEXT: s_mov_b32 s12, s14 ; GCN-NEXT: s_getpc_b64 s[14:15] @@ -93,8 +96,9 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) { ; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GISEL-NEXT: s_add_i32 s12, s12, s17 ; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GISEL-NEXT: s_add_u32 s0, s0, s17 -; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0x1e8f000 +; GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch ; GISEL-NEXT: s_mov_b32 s13, s15 ; GISEL-NEXT: s_mov_b32 s12, s14 ; GISEL-NEXT: s_getpc_b64 s[14:15] diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index 6e90554..8843efd 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -11,8 +11,9 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: s_mov_b32 s33, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index 66f31bb..4851c4f 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -118,10 +118,11 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 @@ -177,10 +178,11 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, 
use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 @@ -236,10 +238,11 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 @@ -295,10 +298,11 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 @@ -341,8 +345,6 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -351,6 +353,9 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s15, 0 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -370,14 +375,15 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -410,8 +416,6 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -420,6 +424,9 @@ define amdgpu_kernel void 
@module_0_kernel_overalign_indirect_extern_normal(i32 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s15, 2 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -439,14 +446,15 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -479,8 +487,6 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -489,6 +495,9 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s15, 1 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -508,14 +517,15 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -548,8 +558,6 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -558,6 +566,9 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b32 s2, -1 +; 
CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s15, 3 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -577,14 +588,15 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll index 61818da..26271a0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll @@ -45,8 +45,9 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, function_lds_id@gotpcrel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll index bb7c43f..f780188 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -164,8 +164,9 @@ define amdgpu_kernel void @k01() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 @@ -198,8 +199,9 @@ define amdgpu_kernel void @k23() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 @@ -240,8 +242,9 @@ define amdgpu_kernel void @k123() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 diff --git 
a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll index 4d73436..fa4b93f 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -229,8 +229,9 @@ define amdgpu_kernel void @k01() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 @@ -268,8 +269,9 @@ define amdgpu_kernel void @k23() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 @@ -310,8 +312,9 @@ define amdgpu_kernel void @k123() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index 138a6a8..e17f311 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -44,17 +44,18 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; CHECK-NEXT: s_load_dwordx8 s[44:51], s[6:7], 0x0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v40, v0 ; CHECK-NEXT: s_add_u32 s42, s34, 40 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_addc_u32 s43, s35, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] ; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] ; CHECK-NEXT: s_mov_b32 s33, s14 ; CHECK-NEXT: s_mov_b32 s40, s13 ; CHECK-NEXT: s_mov_b32 s41, s12 @@ -781,17 +782,18 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; CHECK-NEXT: s_load_dwordx2 s[44:45], s[6:7], 0x10 -; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7] -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v40, v0 ; CHECK-NEXT: s_add_u32 s42, s36, 40 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: 
s_mov_b64 s[34:35], s[8:9] ; CHECK-NEXT: s_addc_u32 s43, s37, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] ; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] ; CHECK-NEXT: s_mov_b32 s33, s14 ; CHECK-NEXT: s_mov_b32 s40, s13 ; CHECK-NEXT: s_mov_b32 s41, s12 diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll index f70441e..70a9bbb 100644 --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -69,8 +69,9 @@ define amdgpu_kernel void @kernel_call() { ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -128,8 +129,9 @@ define amdgpu_kernel void @kernel_tailcall() { ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -240,8 +242,9 @@ define protected amdgpu_kernel void @kernel() { ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[16:17] diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index e7c5aaf..e6d9c0d 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -45,8 +45,8 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX9-NEXT: s_add_u32 s0, s0, s15 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: s_mul_i32 s4, s4, s5 @@ -55,8 +55,9 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; GFX9-NEXT: s_add_u32 s6, s6, indirect@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, indirect@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX9-NEXT: v_mad_u32_u24 v0, v1, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_add_lshl_u32 v0, v0, v2, 3 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: ds_write_b64 v0, v[3:4] diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 1118cc3..8d8459f 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ 
b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -45,10 +45,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[38:39], 0x18 ; GLOBALNESS1-NEXT: s_load_dword s7, s[38:39], 0x20 ; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s15 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 -; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 +; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s74, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) @@ -73,7 +71,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 +; GLOBALNESS1-NEXT: s_mov_b32 s2, -1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3 +; GLOBALNESS1-NEXT: s_mov_b32 s3, 0xe00000 +; GLOBALNESS1-NEXT: s_mov_b64 s[0:1], flat_scratch ; GLOBALNESS1-NEXT: s_mov_b32 s68, s14 ; GLOBALNESS1-NEXT: s_mov_b32 s69, s13 ; GLOBALNESS1-NEXT: s_mov_b32 s70, s12 @@ -332,10 +333,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[38:39], 0x18 ; GLOBALNESS0-NEXT: s_load_dword s7, s[38:39], 0x20 ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s15 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 -; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 +; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) @@ -360,7 +359,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 +; GLOBALNESS0-NEXT: s_mov_b32 s2, -1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3 +; GLOBALNESS0-NEXT: s_mov_b32 s3, 0xe00000 +; GLOBALNESS0-NEXT: s_mov_b64 s[0:1], flat_scratch ; GLOBALNESS0-NEXT: s_mov_b32 s66, s14 ; GLOBALNESS0-NEXT: s_mov_b32 s67, s13 ; GLOBALNESS0-NEXT: s_mov_b32 s68, s12 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll index 7840559..7d759089 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll @@ -14,8 +14,9 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id ; CHECK-NEXT: s_addc_u32 s11, s11, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v5, 42 -- cgit v1.1 From d0f72f88606b78447fb7b61214651854c787c26f Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 8 Feb 2024 11:28:06 -0800 Subject: [RISCV] Consider truncate semantics in performBUILD_VECTORCombine (#81168) Fixes 
https://github.com/llvm/llvm-project/issues/80910. Per the documentation in ISDOpcodes.h, for BUILD_VECTOR "The types of the operands must match the vector element type, except that integer types are allowed to be larger than the element type, in which case the operands are implicitly truncated." This transform was assuming that the scalar operand type matched the result type. This resulted in essentially performing a truncate before a binop, instead of after. As demonstrated by the test case changes, this is often not legal. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 + .../RISCV/rvv/fixed-vectors-buildvec-of-binop.ll | 6 +- .../CodeGen/RISCV/rvv/fixed-vectors-vselect.ll | 513 +++++++++++++++------ .../test/CodeGen/RISCV/urem-seteq-illegal-types.ll | 53 +-- 4 files changed, 399 insertions(+), 179 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 27037f4..0799cc2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -14956,6 +14956,11 @@ static SDValue performBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, if (!TLI.isOperationLegalOrCustom(Opcode, VT) || !TLI.isTypeLegal(VT)) return SDValue(); + // This BUILD_VECTOR involves an implicit truncation, and sinking + // truncates through binops is non-trivial. + if (N->op_begin()->getValueType() != VT.getVectorElementType()) + return SDValue(); + SmallVector LHSOps; SmallVector RHSOps; for (SDValue Op : N->ops()) { @@ -14983,6 +14988,7 @@ static SDValue performBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, // have different LHS and RHS types. if (Op.getOperand(0).getValueType() != Op.getOperand(1).getValueType()) return SDValue(); + RHSOps.push_back(Op.getOperand(1)); } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll index e376688..af7d7f7a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll @@ -589,7 +589,8 @@ define <8 x i32> @add_constant_rhs_8xi32_partial(<8 x i32> %vin, i32 %a, i32 %b, ret <8 x i32> %v3 } -; FIXME: This is currently showing a miscompile, we effectively +; Here we can not pull the ashr through into the vector domain due to +; the truncate semantics of the build_vector. Doing so would ; truncate before the ashr instead of after it, so if %a or %b ; is e.g. UINT32_MAX+1 we get different result. 
define <2 x i32> @build_vec_of_trunc_op(i64 %a, i64 %b) { @@ -608,10 +609,11 @@ define <2 x i32> @build_vec_of_trunc_op(i64 %a, i64 %b) { ; ; RV64-LABEL: build_vec_of_trunc_op: ; RV64: # %bb.0: # %entry +; RV64-NEXT: srli a0, a0, 1 +; RV64-NEXT: srli a1, a1, 1 ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV64-NEXT: vmv.v.x v8, a0 ; RV64-NEXT: vslide1down.vx v8, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 1 ; RV64-NEXT: ret entry: %conv11.i = ashr i64 %a, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll index cd47720..ead41b0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll @@ -3,30 +3,65 @@ ; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64 define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { -; CHECK-LABEL: vselect_vv_v6i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: lbu a2, 0(a2) -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: srli a1, a2, 1 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a2 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 2 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 3 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 4 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a2, a2, 5 -; CHECK-NEXT: vslide1down.vx v10, v10, a2 -; CHECK-NEXT: vslidedown.vi v10, v10, 2 -; CHECK-NEXT: vand.vi v10, v10, 1 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; CHECK-NEXT: vle32.v v8, (a0), v0.t -; CHECK-NEXT: vse32.v v8, (a3) -; CHECK-NEXT: ret +; RV32-LABEL: vselect_vv_v6i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: lbu a2, 0(a2) +; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: slli a1, a2, 30 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: andi a4, a2, 1 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v10, a4 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 29 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 28 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 27 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: srli a2, a2, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a2 +; RV32-NEXT: vslidedown.vi v10, v10, 2 +; RV32-NEXT: vand.vi v10, v10, 1 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV32-NEXT: vle32.v v8, (a0), v0.t +; RV32-NEXT: vse32.v v8, (a3) +; RV32-NEXT: ret +; +; RV64-LABEL: vselect_vv_v6i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: lbu a2, 0(a2) +; RV64-NEXT: vle32.v v8, (a1) +; RV64-NEXT: slli a1, a2, 62 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: andi a4, a2, 1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a4 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 61 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 60 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 59 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: srli a2, a2, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a2 +; 
RV64-NEXT: vslidedown.vi v10, v10, 2 +; RV64-NEXT: vand.vi v10, v10, 1 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64-NEXT: vle32.v v8, (a0), v0.t +; RV64-NEXT: vse32.v v8, (a3) +; RV64-NEXT: ret %va = load <6 x i32>, ptr %a %vb = load <6 x i32>, ptr %b %vcc = load <6 x i1>, ptr %cc @@ -36,31 +71,67 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { } define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { -; CHECK-LABEL: vselect_vx_v6i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: lbu a2, 0(a2) -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: srli a1, a2, 1 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a2 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 2 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 3 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 4 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a2, a2, 5 -; CHECK-NEXT: vslide1down.vx v10, v10, a2 -; CHECK-NEXT: vslidedown.vi v10, v10, 2 -; CHECK-NEXT: vand.vi v10, v10, 1 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: vse32.v v8, (a3) -; CHECK-NEXT: ret +; RV32-LABEL: vselect_vx_v6i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: lbu a2, 0(a2) +; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: slli a1, a2, 30 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: andi a4, a2, 1 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v10, a4 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 29 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 28 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 27 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: srli a2, a2, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a2 +; RV32-NEXT: vslidedown.vi v10, v10, 2 +; RV32-NEXT: vand.vi v10, v10, 1 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: vse32.v v8, (a3) +; RV32-NEXT: ret +; +; RV64-LABEL: vselect_vx_v6i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: lbu a2, 0(a2) +; RV64-NEXT: vle32.v v8, (a1) +; RV64-NEXT: slli a1, a2, 62 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: andi a4, a2, 1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a4 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 61 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 60 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 59 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: srli a2, a2, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a2 +; RV64-NEXT: vslidedown.vi v10, v10, 2 +; RV64-NEXT: vand.vi v10, v10, 1 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: vse32.v v8, (a3) +; RV64-NEXT: ret %vb = load <6 x i32>, ptr %b %ahead = insertelement <6 x i32> poison, i32 %a, i32 0 
%va = shufflevector <6 x i32> %ahead, <6 x i32> poison, <6 x i32> zeroinitializer @@ -71,31 +142,67 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { } define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { -; CHECK-LABEL: vselect_vi_v6i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: lbu a1, 0(a1) -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: srli a0, a1, 1 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 2 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 3 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 4 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a1, a1, 5 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: vslidedown.vi v10, v10, 2 -; CHECK-NEXT: vand.vi v10, v10, 1 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmerge.vim v8, v8, -1, v0 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: vse32.v v8, (a2) -; CHECK-NEXT: ret +; RV32-LABEL: vselect_vi_v6i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: slli a0, a1, 30 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: andi a3, a1, 1 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v10, a3 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 29 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 28 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 27 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: srli a1, a1, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: vslidedown.vi v10, v10, 2 +; RV32-NEXT: vand.vi v10, v10, 1 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmerge.vim v8, v8, -1, v0 +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: vse32.v v8, (a2) +; RV32-NEXT: ret +; +; RV64-LABEL: vselect_vi_v6i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: slli a0, a1, 62 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: andi a3, a1, 1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a3 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 61 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 60 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 59 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: srli a1, a1, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: vslidedown.vi v10, v10, 2 +; RV64-NEXT: vand.vi v10, v10, 1 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmerge.vim v8, v8, -1, v0 +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: vse32.v v8, (a2) +; RV64-NEXT: ret %vb = load <6 x i32>, ptr %b %a = insertelement <6 x i32> poison, i32 -1, i32 0 %va = shufflevector <6 x i32> %a, <6 x i32> poison, <6 x i32> zeroinitializer @@ -107,30 +214,65 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { -; CHECK-LABEL: vselect_vv_v6f32: 
-; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: lbu a2, 0(a2) -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: srli a1, a2, 1 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a2 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 2 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 3 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 4 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a2, a2, 5 -; CHECK-NEXT: vslide1down.vx v10, v10, a2 -; CHECK-NEXT: vslidedown.vi v10, v10, 2 -; CHECK-NEXT: vand.vi v10, v10, 1 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; CHECK-NEXT: vle32.v v8, (a0), v0.t -; CHECK-NEXT: vse32.v v8, (a3) -; CHECK-NEXT: ret +; RV32-LABEL: vselect_vv_v6f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: lbu a2, 0(a2) +; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: slli a1, a2, 30 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: andi a4, a2, 1 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v10, a4 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 29 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 28 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 27 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: srli a2, a2, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a2 +; RV32-NEXT: vslidedown.vi v10, v10, 2 +; RV32-NEXT: vand.vi v10, v10, 1 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV32-NEXT: vle32.v v8, (a0), v0.t +; RV32-NEXT: vse32.v v8, (a3) +; RV32-NEXT: ret +; +; RV64-LABEL: vselect_vv_v6f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: lbu a2, 0(a2) +; RV64-NEXT: vle32.v v8, (a1) +; RV64-NEXT: slli a1, a2, 62 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: andi a4, a2, 1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a4 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 61 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 60 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 59 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: srli a2, a2, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a2 +; RV64-NEXT: vslidedown.vi v10, v10, 2 +; RV64-NEXT: vand.vi v10, v10, 1 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64-NEXT: vle32.v v8, (a0), v0.t +; RV64-NEXT: vse32.v v8, (a3) +; RV64-NEXT: ret %va = load <6 x float>, ptr %a %vb = load <6 x float>, ptr %b %vcc = load <6 x i1>, ptr %cc @@ -140,31 +282,67 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { } define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { -; CHECK-LABEL: vselect_vx_v6f32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: lbu a1, 0(a1) -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: srli a0, a1, 1 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 2 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 3 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 4 -; CHECK-NEXT: 
vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a1, a1, 5 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: vslidedown.vi v10, v10, 2 -; CHECK-NEXT: vand.vi v10, v10, 1 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: vse32.v v8, (a2) -; CHECK-NEXT: ret +; RV32-LABEL: vselect_vx_v6f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: slli a0, a1, 30 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: andi a3, a1, 1 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v10, a3 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 29 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 28 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 27 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: srli a1, a1, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: vslidedown.vi v10, v10, 2 +; RV32-NEXT: vand.vi v10, v10, 1 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vfmerge.vfm v8, v8, fa0, v0 +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: vse32.v v8, (a2) +; RV32-NEXT: ret +; +; RV64-LABEL: vselect_vx_v6f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: slli a0, a1, 62 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: andi a3, a1, 1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a3 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 61 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 60 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 59 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: srli a1, a1, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: vslidedown.vi v10, v10, 2 +; RV64-NEXT: vand.vi v10, v10, 1 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vfmerge.vfm v8, v8, fa0, v0 +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: vse32.v v8, (a2) +; RV64-NEXT: ret %vb = load <6 x float>, ptr %b %ahead = insertelement <6 x float> poison, float %a, i32 0 %va = shufflevector <6 x float> %ahead, <6 x float> poison, <6 x i32> zeroinitializer @@ -175,31 +353,67 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { } define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) { -; CHECK-LABEL: vselect_vfpzero_v6f32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: lbu a1, 0(a1) -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: srli a0, a1, 1 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 2 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 3 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 4 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a1, a1, 5 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: vslidedown.vi v10, v10, 2 -; CHECK-NEXT: vand.vi v10, v10, 1 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; 
CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmerge.vim v8, v8, 0, v0 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: vse32.v v8, (a2) -; CHECK-NEXT: ret +; RV32-LABEL: vselect_vfpzero_v6f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: slli a0, a1, 30 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: andi a3, a1, 1 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v10, a3 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 29 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 28 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 27 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: srli a1, a1, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: vslidedown.vi v10, v10, 2 +; RV32-NEXT: vand.vi v10, v10, 1 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmerge.vim v8, v8, 0, v0 +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: vse32.v v8, (a2) +; RV32-NEXT: ret +; +; RV64-LABEL: vselect_vfpzero_v6f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: slli a0, a1, 62 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: andi a3, a1, 1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a3 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 61 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 60 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 59 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: srli a1, a1, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: vslidedown.vi v10, v10, 2 +; RV64-NEXT: vand.vi v10, v10, 1 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmerge.vim v8, v8, 0, v0 +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: vse32.v v8, (a2) +; RV64-NEXT: ret %vb = load <6 x float>, ptr %b %a = insertelement <6 x float> poison, float 0.0, i32 0 %va = shufflevector <6 x float> %a, <6 x float> poison, <6 x i32> zeroinitializer @@ -497,6 +711,3 @@ define <64 x i1> @vselect_v64i1(<64 x i1> %a, <64 x i1> %b, <64 x i1> %cc) { %v = select <64 x i1> %cc, <64 x i1> %a, <64 x i1> %b ret <64 x i1> %v } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; RV32: {{.*}} -; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll index 4544cba..c016e8f 100644 --- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -521,33 +521,35 @@ define void @test_urem_vec(ptr %X) nounwind { ; ; RV32MV-LABEL: test_urem_vec: ; RV32MV: # %bb.0: -; RV32MV-NEXT: lbu a1, 4(a0) -; RV32MV-NEXT: lw a2, 0(a0) -; RV32MV-NEXT: slli a1, a1, 10 -; RV32MV-NEXT: srli a3, a2, 22 -; RV32MV-NEXT: or a1, a3, a1 -; RV32MV-NEXT: srli a3, a2, 11 +; RV32MV-NEXT: lw a1, 0(a0) +; RV32MV-NEXT: andi a2, a1, 2047 ; RV32MV-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; RV32MV-NEXT: vmv.v.x v8, a2 +; RV32MV-NEXT: lbu a2, 4(a0) +; RV32MV-NEXT: slli a3, a1, 10 +; RV32MV-NEXT: srli a3, a3, 21 ; RV32MV-NEXT: vslide1down.vx v8, v8, a3 +; RV32MV-NEXT: slli a2, a2, 10 +; RV32MV-NEXT: srli a1, a1, 22 +; RV32MV-NEXT: or a1, a1, a2 +; RV32MV-NEXT: andi a1, a1, 2047 ; RV32MV-NEXT: vslide1down.vx v8, v8, a1 +; RV32MV-NEXT: lui a1, %hi(.LCPI4_0) +; RV32MV-NEXT: addi a1, a1, %lo(.LCPI4_0) +; RV32MV-NEXT: vle16.v v9, (a1) ; RV32MV-NEXT: vslidedown.vi v8, v8, 1 -; RV32MV-NEXT: li a1, 2047 -; RV32MV-NEXT: lui a2, %hi(.LCPI4_0) -; RV32MV-NEXT: addi a2, a2, %lo(.LCPI4_0) -; RV32MV-NEXT: vle16.v v9, (a2) -; RV32MV-NEXT: vand.vx v8, v8, a1 ; RV32MV-NEXT: vid.v v10 ; RV32MV-NEXT: vsub.vv v8, v8, v10 ; RV32MV-NEXT: vmul.vv v8, v8, v9 ; RV32MV-NEXT: vadd.vv v9, v8, v8 -; RV32MV-NEXT: lui a2, 41121 -; RV32MV-NEXT: addi a2, a2, -1527 +; RV32MV-NEXT: lui a1, 41121 +; RV32MV-NEXT: addi a1, a1, -1527 ; RV32MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32MV-NEXT: vmv.s.x v10, a2 +; RV32MV-NEXT: vmv.s.x v10, a1 ; RV32MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV32MV-NEXT: vsext.vf2 v11, v10 ; RV32MV-NEXT: vsll.vv v9, v9, v11 +; RV32MV-NEXT: li a1, 2047 ; RV32MV-NEXT: vand.vx v8, v8, a1 ; RV32MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32MV-NEXT: vmv.v.i v10, 1 @@ -585,31 +587,30 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV64MV-NEXT: lwu a2, 0(a0) ; RV64MV-NEXT: slli a1, a1, 32 ; RV64MV-NEXT: or a1, a2, a1 +; RV64MV-NEXT: slli a2, a1, 42 +; RV64MV-NEXT: srli a2, a2, 53 +; RV64MV-NEXT: andi a3, a1, 2047 ; RV64MV-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64MV-NEXT: vmv.v.x v8, a1 -; RV64MV-NEXT: slli a1, a1, 24 -; RV64MV-NEXT: srli a1, a1, 24 -; RV64MV-NEXT: srli a2, a1, 11 +; RV64MV-NEXT: vmv.v.x v8, a3 ; RV64MV-NEXT: vslide1down.vx v8, v8, a2 ; RV64MV-NEXT: srli a1, a1, 22 ; RV64MV-NEXT: vslide1down.vx v8, v8, a1 +; RV64MV-NEXT: lui a1, %hi(.LCPI4_0) +; RV64MV-NEXT: addi a1, a1, %lo(.LCPI4_0) +; RV64MV-NEXT: vle16.v v9, (a1) ; RV64MV-NEXT: vslidedown.vi v8, v8, 1 -; RV64MV-NEXT: li a1, 2047 -; RV64MV-NEXT: lui a2, %hi(.LCPI4_0) -; RV64MV-NEXT: addi a2, a2, %lo(.LCPI4_0) -; RV64MV-NEXT: vle16.v v9, (a2) -; RV64MV-NEXT: vand.vx v8, v8, a1 ; RV64MV-NEXT: vid.v v10 ; RV64MV-NEXT: vsub.vv v8, v8, v10 ; RV64MV-NEXT: vmul.vv v8, v8, v9 ; RV64MV-NEXT: vadd.vv v9, v8, v8 -; RV64MV-NEXT: lui a2, 41121 -; RV64MV-NEXT: addi a2, a2, -1527 +; RV64MV-NEXT: lui a1, 41121 +; RV64MV-NEXT: addi a1, a1, -1527 ; RV64MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64MV-NEXT: vmv.s.x v10, a2 +; RV64MV-NEXT: vmv.s.x v10, a1 ; RV64MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64MV-NEXT: vsext.vf2 v11, v10 ; RV64MV-NEXT: vsll.vv v9, v9, v11 +; RV64MV-NEXT: li a1, 2047 ; RV64MV-NEXT: vand.vx v8, v8, a1 ; RV64MV-NEXT: vsetvli zero, zero, e32, 
m1, ta, ma
; RV64MV-NEXT:    vmv.v.i v10, 1
-- 
cgit v1.1 

From 35c4f025f9d4c398eff0c8e49a47a5c7067939ba Mon Sep 17 00:00:00 2001
From: Jacob Lambert
Date: Thu, 8 Feb 2024 11:35:04 -0800
Subject: [NFC][clang][Driver] Specify options for <arg> with -save-temps= (#80921)

---
 clang/include/clang/Driver/Options.td | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 4b232b8..4f498db 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5392,7 +5392,9 @@ def regcall4 : Flag<["-"], "regcall4">, Group, MarshallingInfoFlag>;
 def save_temps_EQ : Joined<["-", "--"], "save-temps=">, Flags<[NoXarchOption]>,
   Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
-  HelpText<"Save intermediate compilation results.">;
+  HelpText<"Save intermediate compilation results. <arg> can be set to cwd for "
+  "current working directory, or obj which will save temporary files in the "
+  "same directory as the final output file">;
 def save_temps : Flag<["-", "--"], "save-temps">, Flags<[NoXarchOption]>,
   Visibility<[ClangOption, FlangOption, FC1Option]>,
   Alias<save_temps_EQ>, AliasArgs<["cwd"]>,
-- 
cgit v1.1 

From 3b57b647a9bb821137f91dfbc2172a9947f620cc Mon Sep 17 00:00:00 2001
From: Natalie Chouinard
Date: Thu, 8 Feb 2024 14:35:44 -0500
Subject: [HLSL][SPIR-V] Add create.handle intrinsic (#81038)

Add a SPIR-V target-specific intrinsic for creating handles, which is
used for lowering HLSL resource types like RWBuffer.

`llvm/lib/TargetParser/Triple.cpp`: SPIR-V intrinsics use "spv" as the
target prefix, not "spirv". As far as I can tell, this is the first one
that is used via the `CGBuiltin` codepath, which relies on
`getArchTypePrefix`, so I've corrected it here.

`clang/lib/Basic/Targets/SPIR.h`: When records are laid out in the
lowering from AST to IR, they were incorrectly offset because these
Pointer attributes were defaulting to 32.

Related to #81036
---
 clang/lib/Basic/Targets/SPIR.h                            | 1 +
 clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl | 4 ++++
 llvm/include/llvm/IR/IntrinsicsSPIRV.td                   | 4 ++++
 llvm/lib/IR/Function.cpp                                  | 1 +
 llvm/lib/TargetParser/Triple.cpp                          | 2 +-
 5 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h
index e6235f3..e25991e 100644
--- a/clang/lib/Basic/Targets/SPIR.h
+++ b/clang/lib/Basic/Targets/SPIR.h
@@ -310,6 +310,7 @@ public:
     assert(Triple.getEnvironment() >= llvm::Triple::Pixel &&
            Triple.getEnvironment() <= llvm::Triple::Amplification &&
            "Logical SPIR-V environment must be a valid shader stage.");
+    PointerWidth = PointerAlign = 64;
 
     // SPIR-V IDs are represented with a single 32-bit word.
    SizeType = TargetInfo::UnsignedInt;

diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
index 2b9c66d..74b3f59 100644
--- a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV
 
 RWBuffer<float> Buf;
 
@@ -7,3 +8,6 @@ RWBuffer<float> Buf;
 
 // CHECK: %[[HandleRes:[0-9]+]] = call ptr @llvm.dx.create.handle(i8 1)
 // CHECK: store ptr %[[HandleRes]], ptr %h, align 4
+
+// CHECK-SPIRV: %[[HandleRes:[0-9]+]] = call ptr @llvm.spv.create.handle(i8 1)
+// CHECK-SPIRV: store ptr %[[HandleRes]], ptr %h, align 8

diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index ea0074d..057dc64 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -38,4 +38,8 @@ let TargetPrefix = "spv" in {
   // Expect, Assume Intrinsics
   def int_spv_assume : Intrinsic<[], [llvm_i1_ty]>;
   def int_spv_expect : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>;
+
+  // The following intrinsic(s) are mirrored from IntrinsicsDirectX.td for HLSL support.
+  def int_spv_create_handle : ClangBuiltin<"__builtin_hlsl_create_handle">,
+      Intrinsic<[ llvm_ptr_ty ], [llvm_i8_ty], [IntrWillReturn]>;
 }

diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 22e2455..fceffbc 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -44,6 +44,7 @@
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/IR/IntrinsicsRISCV.h"
 #include "llvm/IR/IntrinsicsS390.h"
+#include "llvm/IR/IntrinsicsSPIRV.h"
 #include "llvm/IR/IntrinsicsVE.h"
 #include "llvm/IR/IntrinsicsWebAssembly.h"
 #include "llvm/IR/IntrinsicsX86.h"
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index 3494ae5..96dbd5c 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -188,7 +188,7 @@ StringRef Triple::getArchTypePrefix(ArchType Kind) {
 
   case spirv:
   case spirv32:
-  case spirv64:     return "spirv";
+  case spirv64:     return "spv";
 
   case kalimba:     return "kalimba";
   case lanai:       return "lanai";
-- 
cgit v1.1 

From e5924d64991abb4da111317ff5e8d9147265354a Mon Sep 17 00:00:00 2001
From: Yinying Li <107574043+yinying-lisa-li@users.noreply.github.com>
Date: Thu, 8 Feb 2024 19:38:42 +0000
Subject: [mlir][sparse] Implement parsing n out of m (#79935)

1. Add parsing methods for block[n, m].
2. Encode n and m with the newly extended 64-bit LevelType enum.
3. Update 2:4 method names/comments to n:m.
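As a minimal sketch of the new encoding (assuming only the constants this
patch introduces below: LevelFormat in bits 16-31, properties in bits 0-15,
n in bits 32-39 via nToBits, m in bits 40-47 via mToBits), a
structured[2, 4] level type packs and unpacks as follows; the variable
names are illustrative, not from the patch:

// Sketch of the 64-bit LevelType packing for structured[2, 4], i.e. 2:4.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t FormatNOutOfM = 0x000000100000; // LevelFormat::NOutOfM
  const uint64_t n = 2, m = 4;
  // nToBits(n) | mToBits(m) | format; no property bits set, so the
  // level is ordered and unique.
  const uint64_t lt = FormatNOutOfM | (n << 32) | (m << 40);
  assert(((lt >> 32) & 0xff) == 2); // what getN(lt) recovers
  assert(((lt >> 40) & 0xff) == 4); // what getM(lt) recovers
  assert((lt & 0xffff) == 0);       // ordered (bit 1 clear), unique (bit 0 clear)
  return 0;
}

Because the property bits stay in the low word, the pre-existing
ordered/unique predicates, which mask with 2 and 1, carry over unchanged.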
--- mlir/include/mlir-c/Dialect/SparseTensor.h | 28 +-- mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h | 205 +++++++++++++-------- .../SparseTensor/IR/SparseTensorAttrDefs.td | 5 +- .../Dialect/SparseTensor/IR/SparseTensorType.h | 2 +- .../mlir/Dialect/SparseTensor/Utils/Merger.h | 2 +- .../mlir/ExecutionEngine/SparseTensor/Storage.h | 20 +- mlir/lib/Bindings/Python/DialectSparseTensor.cpp | 2 +- mlir/lib/CAPI/Dialect/SparseTensor.cpp | 49 +++-- .../SparseTensor/IR/Detail/LvlTypeParser.cpp | 54 ++++-- .../Dialect/SparseTensor/IR/Detail/LvlTypeParser.h | 6 +- .../SparseTensor/IR/SparseTensorDialect.cpp | 16 +- .../SparseTensor/Transforms/SparseGPUCodegen.cpp | 2 +- .../Transforms/SparseTensorCodegen.cpp | 6 +- .../SparseTensor/Transforms/Sparsification.cpp | 2 +- .../Transforms/Utils/SparseTensorLevel.cpp | 19 +- mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp | 4 +- mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp | 2 +- mlir/test/CAPI/sparse_tensor.c | 6 +- .../Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir | 2 +- .../Dialect/SparseTensor/roundtrip_encoding.mlir | 30 ++- .../Dialect/SparseTensor/sparse_fill_zero.mlir | 2 +- .../SparseTensor/CPU/sparse_block_matmul.mlir | 2 +- .../Dialect/SparseTensor/CPU/sparse_ds.mlir | 2 +- .../GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir | 2 +- .../GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir | 2 +- mlir/test/python/dialects/sparse_tensor/dialect.py | 4 +- 26 files changed, 302 insertions(+), 174 deletions(-) diff --git a/mlir/include/mlir-c/Dialect/SparseTensor.h b/mlir/include/mlir-c/Dialect/SparseTensor.h index 42d8400..2c71b00 100644 --- a/mlir/include/mlir-c/Dialect/SparseTensor.h +++ b/mlir/include/mlir-c/Dialect/SparseTensor.h @@ -28,20 +28,20 @@ MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(SparseTensor, sparse_tensor); typedef uint64_t MlirSparseTensorLevelType; enum MlirBaseSparseTensorLevelType { - MLIR_SPARSE_TENSOR_LEVEL_DENSE = 4, // 0b00001_00 - MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED = 8, // 0b00010_00 - MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU = 9, // 0b00010_01 - MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NO = 10, // 0b00010_10 - MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU_NO = 11, // 0b00010_11 - MLIR_SPARSE_TENSOR_LEVEL_SINGLETON = 16, // 0b00100_00 - MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU = 17, // 0b00100_01 - MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NO = 18, // 0b00100_10 - MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU_NO = 19, // 0b00100_11 - MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED = 32, // 0b01000_00 - MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU = 33, // 0b01000_01 - MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NO = 34, // 0b01000_10 - MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU_NO = 35, // 0b01000_11 - MLIR_SPARSE_TENSOR_LEVEL_TWO_OUT_OF_FOUR = 64, // 0b10000_00 + MLIR_SPARSE_TENSOR_LEVEL_DENSE = 0x000000010000, + MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED = 0x000000020000, + MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU = 0x000000020001, + MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NO = 0x000000020002, + MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU_NO = 0x000000020003, + MLIR_SPARSE_TENSOR_LEVEL_SINGLETON = 0x000000040000, + MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU = 0x000000040001, + MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NO = 0x000000040002, + MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU_NO = 0x000000040003, + MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED = 0x000000080000, + MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU = 0x000000080001, + MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NO = 0x000000080002, + MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU_NO = 0x000000080003, + MLIR_SPARSE_TENSOR_LEVEL_N_OUT_OF_M 
= 0x000000100000, }; //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h index 86c52bf..e940d20 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h @@ -154,9 +154,10 @@ enum class Action : uint32_t { /// This enum defines all the sparse representations supportable by /// the SparseTensor dialect. We use a lightweight encoding to encode -/// both the "format" per se (dense, compressed, singleton, loose_compressed, -/// two-out-of-four) as well as the "properties" (ordered, unique). The -/// encoding is chosen for performance of the runtime library, and thus may +/// the "format" per se (dense, compressed, singleton, loose_compressed, +/// n-out-of-m), the "properties" (ordered, unique) as well as n and m when +/// the format is NOutOfM. +/// The encoding is chosen for performance of the runtime library, and thus may /// change in future versions; consequently, client code should use the /// predicate functions defined below, rather than relying on knowledge /// about the particular binary encoding. @@ -165,41 +166,75 @@ enum class Action : uint32_t { /// where we need to store an undefined or indeterminate `LevelType`. /// It should not be used externally, since it does not indicate an /// actual/representable format. +/// +/// Bit manipulations for LevelType: +/// +/// | 8-bit n | 8-bit m | 16-bit LevelFormat | 16-bit LevelProperty | +/// enum class LevelType : uint64_t { - Undef = 0, // 0b00000_00 - Dense = 4, // 0b00001_00 - Compressed = 8, // 0b00010_00 - CompressedNu = 9, // 0b00010_01 - CompressedNo = 10, // 0b00010_10 - CompressedNuNo = 11, // 0b00010_11 - Singleton = 16, // 0b00100_00 - SingletonNu = 17, // 0b00100_01 - SingletonNo = 18, // 0b00100_10 - SingletonNuNo = 19, // 0b00100_11 - LooseCompressed = 32, // 0b01000_00 - LooseCompressedNu = 33, // 0b01000_01 - LooseCompressedNo = 34, // 0b01000_10 - LooseCompressedNuNo = 35, // 0b01000_11 - TwoOutOfFour = 64, // 0b10000_00 + Undef = 0x000000000000, + Dense = 0x000000010000, + Compressed = 0x000000020000, + CompressedNu = 0x000000020001, + CompressedNo = 0x000000020002, + CompressedNuNo = 0x000000020003, + Singleton = 0x000000040000, + SingletonNu = 0x000000040001, + SingletonNo = 0x000000040002, + SingletonNuNo = 0x000000040003, + LooseCompressed = 0x000000080000, + LooseCompressedNu = 0x000000080001, + LooseCompressedNo = 0x000000080002, + LooseCompressedNuNo = 0x000000080003, + NOutOfM = 0x000000100000, }; /// This enum defines all supported storage format without the level properties. enum class LevelFormat : uint64_t { - Dense = 4, // 0b00001_00 - Compressed = 8, // 0b00010_00 - Singleton = 16, // 0b00100_00 - LooseCompressed = 32, // 0b01000_00 - TwoOutOfFour = 64, // 0b10000_00 + Dense = 0x00010000, + Compressed = 0x00020000, + Singleton = 0x00040000, + LooseCompressed = 0x00080000, + NOutOfM = 0x00100000, }; /// This enum defines all the nondefault properties for storage formats. enum class LevelPropertyNondefault : uint64_t { - Nonunique = 1, // 0b00000_01 - Nonordered = 2, // 0b00000_10 + Nonunique = 0x0001, + Nonordered = 0x0002, }; +/// Get N of NOutOfM level type. +constexpr uint64_t getN(LevelType lt) { + return (static_cast(lt) >> 32) & 0xff; +} + +/// Get M of NOutOfM level type. 
+constexpr uint64_t getM(LevelType lt) { + return (static_cast(lt) >> 40) & 0xff; +} + +/// Convert N of NOutOfM level type to the stored bits. +constexpr uint64_t nToBits(uint64_t n) { return n << 32; } + +/// Convert M of NOutOfM level type to the stored bits. +constexpr uint64_t mToBits(uint64_t m) { return m << 40; } + +/// Check if the `LevelType` is NOutOfM (regardless of +/// properties and block sizes). +constexpr bool isNOutOfMLT(LevelType lt) { + return ((static_cast(lt) & 0x100000) == + static_cast(LevelType::NOutOfM)); +} + +/// Check if the `LevelType` is NOutOfM with the correct block sizes. +constexpr bool isValidNOutOfMLT(LevelType lt, uint64_t n, uint64_t m) { + return isNOutOfMLT(lt) && getN(lt) == n && getM(lt) == m; +} + /// Returns string representation of the given dimension level type. -constexpr const char *toMLIRString(LevelType lt) { +constexpr const char *toMLIRString(LevelType lvlType) { + auto lt = static_cast(static_cast(lvlType) & 0xffffffff); switch (lt) { case LevelType::Undef: return "undef"; @@ -229,21 +264,22 @@ constexpr const char *toMLIRString(LevelType lt) { return "loose_compressed(nonordered)"; case LevelType::LooseCompressedNuNo: return "loose_compressed(nonunique, nonordered)"; - case LevelType::TwoOutOfFour: - return "block2_4"; + case LevelType::NOutOfM: + return "structured"; } return ""; } /// Check that the `LevelType` contains a valid (possibly undefined) value. constexpr bool isValidLT(LevelType lt) { - const uint64_t formatBits = static_cast(lt) >> 2; - const uint64_t propertyBits = static_cast(lt) & 3; - // If undefined or dense, then must be unique and ordered. + const uint64_t formatBits = static_cast(lt) & 0xffff0000; + const uint64_t propertyBits = static_cast(lt) & 0xffff; + // If undefined/dense/NOutOfM, then must be unique and ordered. // Otherwise, the format must be one of the known ones. - return (formatBits <= 1 || formatBits == 16) + return (formatBits <= 0x10000 || formatBits == 0x100000) ? (propertyBits == 0) - : (formatBits == 2 || formatBits == 4 || formatBits == 8); + : (formatBits == 0x20000 || formatBits == 0x40000 || + formatBits == 0x80000); } /// Check if the `LevelType` is the special undefined value. @@ -251,34 +287,28 @@ constexpr bool isUndefLT(LevelType lt) { return lt == LevelType::Undef; } /// Check if the `LevelType` is dense (regardless of properties). constexpr bool isDenseLT(LevelType lt) { - return (static_cast(lt) & ~3) == + return (static_cast(lt) & ~0xffff) == static_cast(LevelType::Dense); } /// Check if the `LevelType` is compressed (regardless of properties). constexpr bool isCompressedLT(LevelType lt) { - return (static_cast(lt) & ~3) == + return (static_cast(lt) & ~0xffff) == static_cast(LevelType::Compressed); } /// Check if the `LevelType` is singleton (regardless of properties). constexpr bool isSingletonLT(LevelType lt) { - return (static_cast(lt) & ~3) == + return (static_cast(lt) & ~0xffff) == static_cast(LevelType::Singleton); } /// Check if the `LevelType` is loose compressed (regardless of properties). constexpr bool isLooseCompressedLT(LevelType lt) { - return (static_cast(lt) & ~3) == + return (static_cast(lt) & ~0xffff) == static_cast(LevelType::LooseCompressed); } -/// Check if the `LevelType` is 2OutOf4 (regardless of properties). -constexpr bool is2OutOf4LT(LevelType lt) { - return (static_cast(lt) & ~3) == - static_cast(LevelType::TwoOutOfFour); -} - /// Check if the `LevelType` needs positions array. 
constexpr bool isWithPosLT(LevelType lt) { return isCompressedLT(lt) || isLooseCompressedLT(lt); @@ -287,17 +317,19 @@ constexpr bool isWithPosLT(LevelType lt) { /// Check if the `LevelType` needs coordinates array. constexpr bool isWithCrdLT(LevelType lt) { return isCompressedLT(lt) || isSingletonLT(lt) || isLooseCompressedLT(lt) || - is2OutOf4LT(lt); + isNOutOfMLT(lt); } /// Check if the `LevelType` is ordered (regardless of storage format). constexpr bool isOrderedLT(LevelType lt) { return !(static_cast(lt) & 2); + return !(static_cast(lt) & 2); } /// Check if the `LevelType` is unique (regardless of storage format). constexpr bool isUniqueLT(LevelType lt) { return !(static_cast(lt) & 1); + return !(static_cast(lt) & 1); } /// Convert a LevelType to its corresponding LevelFormat. @@ -305,21 +337,25 @@ constexpr bool isUniqueLT(LevelType lt) { constexpr std::optional getLevelFormat(LevelType lt) { if (lt == LevelType::Undef) return std::nullopt; - return static_cast(static_cast(lt) & ~3); + return static_cast(static_cast(lt) & 0xffff0000); } /// Convert a LevelFormat to its corresponding LevelType with the given /// properties. Returns std::nullopt when the properties are not applicable /// for the input level format. constexpr std::optional buildLevelType(LevelFormat lf, bool ordered, - bool unique) { - auto lt = static_cast(static_cast(lf) | - (ordered ? 0 : 2) | (unique ? 0 : 1)); + bool unique, uint64_t n = 0, + uint64_t m = 0) { + uint64_t newN = n << 32; + uint64_t newM = m << 40; + auto lt = + static_cast(static_cast(lf) | (ordered ? 0 : 2) | + (unique ? 0 : 1) | newN | newM); return isValidLT(lt) ? std::optional(lt) : std::nullopt; } // -// Ensure the above methods work as indended. +// Ensure the above methods work as intended. // static_assert( @@ -341,7 +377,7 @@ static_assert( LevelFormat::LooseCompressed && *getLevelFormat(LevelType::LooseCompressedNuNo) == LevelFormat::LooseCompressed && - *getLevelFormat(LevelType::TwoOutOfFour) == LevelFormat::TwoOutOfFour), + *getLevelFormat(LevelType::NOutOfM) == LevelFormat::NOutOfM), "getLevelFormat conversion is broken"); static_assert( @@ -373,14 +409,29 @@ static_assert( LevelType::LooseCompressedNo && *buildLevelType(LevelFormat::LooseCompressed, false, false) == LevelType::LooseCompressedNuNo && - buildLevelType(LevelFormat::TwoOutOfFour, false, true) == std::nullopt && - buildLevelType(LevelFormat::TwoOutOfFour, true, false) == std::nullopt && - buildLevelType(LevelFormat::TwoOutOfFour, false, false) == std::nullopt && - *buildLevelType(LevelFormat::TwoOutOfFour, true, true) == - LevelType::TwoOutOfFour), + buildLevelType(LevelFormat::NOutOfM, false, true) == std::nullopt && + buildLevelType(LevelFormat::NOutOfM, true, false) == std::nullopt && + buildLevelType(LevelFormat::NOutOfM, false, false) == std::nullopt && + *buildLevelType(LevelFormat::NOutOfM, true, true) == LevelType::NOutOfM), "buildLevelType conversion is broken"); static_assert( + (getN(*buildLevelType(LevelFormat::NOutOfM, true, true, 2, 4)) == 2 && + getM(*buildLevelType(LevelFormat::NOutOfM, true, true, 2, 4)) == 4 && + getN(*buildLevelType(LevelFormat::NOutOfM, true, true, 8, 10)) == 8 && + getM(*buildLevelType(LevelFormat::NOutOfM, true, true, 8, 10)) == 10), + "getN/M conversion is broken"); + +static_assert( + (isValidNOutOfMLT(*buildLevelType(LevelFormat::NOutOfM, true, true, 2, 4), + 2, 4) && + isValidNOutOfMLT(*buildLevelType(LevelFormat::NOutOfM, true, true, 8, 10), + 8, 10) && + !isValidNOutOfMLT(*buildLevelType(LevelFormat::NOutOfM, true, true, 3, 4), 
+ 2, 4)), + "isValidNOutOfMLT definition is broken"); + +static_assert( (isValidLT(LevelType::Undef) && isValidLT(LevelType::Dense) && isValidLT(LevelType::Compressed) && isValidLT(LevelType::CompressedNu) && isValidLT(LevelType::CompressedNo) && @@ -391,7 +442,7 @@ static_assert( isValidLT(LevelType::LooseCompressedNu) && isValidLT(LevelType::LooseCompressedNo) && isValidLT(LevelType::LooseCompressedNuNo) && - isValidLT(LevelType::TwoOutOfFour)), + isValidLT(LevelType::NOutOfM)), "isValidLT definition is broken"); static_assert((isDenseLT(LevelType::Dense) && @@ -407,7 +458,7 @@ static_assert((isDenseLT(LevelType::Dense) && !isDenseLT(LevelType::LooseCompressedNu) && !isDenseLT(LevelType::LooseCompressedNo) && !isDenseLT(LevelType::LooseCompressedNuNo) && - !isDenseLT(LevelType::TwoOutOfFour)), + !isDenseLT(LevelType::NOutOfM)), "isDenseLT definition is broken"); static_assert((!isCompressedLT(LevelType::Dense) && @@ -423,7 +474,7 @@ static_assert((!isCompressedLT(LevelType::Dense) && !isCompressedLT(LevelType::LooseCompressedNu) && !isCompressedLT(LevelType::LooseCompressedNo) && !isCompressedLT(LevelType::LooseCompressedNuNo) && - !isCompressedLT(LevelType::TwoOutOfFour)), + !isCompressedLT(LevelType::NOutOfM)), "isCompressedLT definition is broken"); static_assert((!isSingletonLT(LevelType::Dense) && @@ -439,7 +490,7 @@ static_assert((!isSingletonLT(LevelType::Dense) && !isSingletonLT(LevelType::LooseCompressedNu) && !isSingletonLT(LevelType::LooseCompressedNo) && !isSingletonLT(LevelType::LooseCompressedNuNo) && - !isSingletonLT(LevelType::TwoOutOfFour)), + !isSingletonLT(LevelType::NOutOfM)), "isSingletonLT definition is broken"); static_assert((!isLooseCompressedLT(LevelType::Dense) && @@ -455,24 +506,24 @@ static_assert((!isLooseCompressedLT(LevelType::Dense) && isLooseCompressedLT(LevelType::LooseCompressedNu) && isLooseCompressedLT(LevelType::LooseCompressedNo) && isLooseCompressedLT(LevelType::LooseCompressedNuNo) && - !isLooseCompressedLT(LevelType::TwoOutOfFour)), + !isLooseCompressedLT(LevelType::NOutOfM)), "isLooseCompressedLT definition is broken"); -static_assert((!is2OutOf4LT(LevelType::Dense) && - !is2OutOf4LT(LevelType::Compressed) && - !is2OutOf4LT(LevelType::CompressedNu) && - !is2OutOf4LT(LevelType::CompressedNo) && - !is2OutOf4LT(LevelType::CompressedNuNo) && - !is2OutOf4LT(LevelType::Singleton) && - !is2OutOf4LT(LevelType::SingletonNu) && - !is2OutOf4LT(LevelType::SingletonNo) && - !is2OutOf4LT(LevelType::SingletonNuNo) && - !is2OutOf4LT(LevelType::LooseCompressed) && - !is2OutOf4LT(LevelType::LooseCompressedNu) && - !is2OutOf4LT(LevelType::LooseCompressedNo) && - !is2OutOf4LT(LevelType::LooseCompressedNuNo) && - is2OutOf4LT(LevelType::TwoOutOfFour)), - "is2OutOf4LT definition is broken"); +static_assert((!isNOutOfMLT(LevelType::Dense) && + !isNOutOfMLT(LevelType::Compressed) && + !isNOutOfMLT(LevelType::CompressedNu) && + !isNOutOfMLT(LevelType::CompressedNo) && + !isNOutOfMLT(LevelType::CompressedNuNo) && + !isNOutOfMLT(LevelType::Singleton) && + !isNOutOfMLT(LevelType::SingletonNu) && + !isNOutOfMLT(LevelType::SingletonNo) && + !isNOutOfMLT(LevelType::SingletonNuNo) && + !isNOutOfMLT(LevelType::LooseCompressed) && + !isNOutOfMLT(LevelType::LooseCompressedNu) && + !isNOutOfMLT(LevelType::LooseCompressedNo) && + !isNOutOfMLT(LevelType::LooseCompressedNuNo) && + isNOutOfMLT(LevelType::NOutOfM)), + "isNOutOfMLT definition is broken"); static_assert((isOrderedLT(LevelType::Dense) && isOrderedLT(LevelType::Compressed) && @@ -487,7 +538,7 @@ 
static_assert((isOrderedLT(LevelType::Dense) && isOrderedLT(LevelType::LooseCompressedNu) && !isOrderedLT(LevelType::LooseCompressedNo) && !isOrderedLT(LevelType::LooseCompressedNuNo) && - isOrderedLT(LevelType::TwoOutOfFour)), + isOrderedLT(LevelType::NOutOfM)), "isOrderedLT definition is broken"); static_assert((isUniqueLT(LevelType::Dense) && @@ -503,7 +554,7 @@ static_assert((isUniqueLT(LevelType::Dense) && !isUniqueLT(LevelType::LooseCompressedNu) && isUniqueLT(LevelType::LooseCompressedNo) && !isUniqueLT(LevelType::LooseCompressedNuNo) && - isUniqueLT(LevelType::TwoOutOfFour)), + isUniqueLT(LevelType::NOutOfM)), "isUniqueLT definition is broken"); /// Bit manipulations for affine encoding. diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td index 12c1068..5b3b971 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td @@ -145,7 +145,8 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", - **compressed** : only nonzeros along this level are stored - **loose_compressed** : as compressed, but allows for free space between regions - **singleton** : a variant of the compressed format, where coordinates have no siblings - - **block2_4** : the compression uses a 2:4 encoding per 1x4 block + - **structured[n, m]** : the compression uses a n:m encoding + (viz. n out of m consecutive elements are nonzero) For a compressed level, each position interval is represented in a compact way with a lowerbound `pos(i)` and an upperbound `pos(i+1) - 1`, which implies @@ -374,7 +375,7 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", bool isCompressedLvl(::mlir::sparse_tensor::Level l) const { return isCompressedLT(getLvlType(l)); } bool isSingletonLvl(::mlir::sparse_tensor::Level l) const { return isSingletonLT(getLvlType(l)); } bool isLooseCompressedLvl(::mlir::sparse_tensor::Level l) const { return isLooseCompressedLT(getLvlType(l)); } - bool isTwoOutOfFourLvl(::mlir::sparse_tensor::Level l) const { return is2OutOf4LT(getLvlType(l)); } + bool isNOutOfMLvl(::mlir::sparse_tensor::Level l) const { return isNOutOfMLT(getLvlType(l)); } bool isOrderedLvl(::mlir::sparse_tensor::Level l) const { return isOrderedLT(getLvlType(l)); } bool isUniqueLvl(::mlir::sparse_tensor::Level l) const { return isUniqueLT(getLvlType(l)); } diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h index 4c98129..4e2b85d 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h @@ -291,7 +291,7 @@ public: return isLooseCompressedLT(getLvlType(l)); } bool isSingletonLvl(Level l) const { return isSingletonLT(getLvlType(l)); } - bool is2OutOf4Lvl(Level l) const { return is2OutOf4LT(getLvlType(l)); } + bool isNOutOfMLvl(Level l) const { return isNOutOfMLT(getLvlType(l)); } bool isOrderedLvl(Level l) const { return isOrderedLT(getLvlType(l)); } bool isUniqueLvl(Level l) const { return isUniqueLT(getLvlType(l)); } bool isWithPos(Level l) const { return isWithPosLT(getLvlType(l)); } diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h index 4a34bb2..490ef30 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h +++ 
b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h @@ -510,7 +510,7 @@ public: if (isLvlWithNonTrivialIdxExp(b)) { auto lt = getLoopDependentLevelType(b); return isCompressedLT(lt) || isSingletonLT(lt) || - isLooseCompressedLT(lt) || is2OutOf4LT(lt); + isLooseCompressedLT(lt) || isNOutOfMLT(lt); } return false; } diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h index 01c5f23..1418217 100644 --- a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h +++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h @@ -123,8 +123,8 @@ public: /// Safely checks if the level uses singleton storage. bool isSingletonLvl(uint64_t l) const { return isSingletonLT(getLvlType(l)); } - /// Safely checks if the level uses 2 out of 4 storage. - bool is2OutOf4Lvl(uint64_t l) const { return is2OutOf4LT(getLvlType(l)); } + /// Safely checks if the level uses n out of m storage. + bool isNOutOfMLvl(uint64_t l) const { return isNOutOfMLT(getLvlType(l)); } /// Safely checks if the level is ordered. bool isOrderedLvl(uint64_t l) const { return isOrderedLT(getLvlType(l)); } @@ -450,7 +450,7 @@ private: void appendCrd(uint64_t lvl, uint64_t full, uint64_t crd) { if (!isDenseLvl(lvl)) { assert(isCompressedLvl(lvl) || isLooseCompressedLvl(lvl) || - isSingletonLvl(lvl) || is2OutOf4Lvl(lvl)); + isSingletonLvl(lvl) || isNOutOfMLvl(lvl)); coordinates[lvl].push_back(detail::checkOverflowCast(crd)); } else { // Dense level. assert(crd >= full && "Coordinate was already filled"); @@ -473,7 +473,7 @@ private: return positions[l][parentSz]; if (isLooseCompressedLvl(l)) return positions[l][2 * parentSz - 1]; - if (isSingletonLvl(l) || is2OutOf4Lvl(l)) + if (isSingletonLvl(l) || isNOutOfMLvl(l)) return parentSz; // new size same as the parent assert(isDenseLvl(l)); return parentSz * getLvlSize(l); @@ -527,7 +527,7 @@ private: uint64_t pos = coordinates[l].size(); positions[l].insert(positions[l].end(), 2 * count, detail::checkOverflowCast

(pos)); - } else if (isSingletonLvl(l) || is2OutOf4Lvl(l)) { + } else if (isSingletonLvl(l) || isNOutOfMLvl(l)) { return; // Nothing to finalize. } else { // Dense dimension. assert(isDenseLvl(l)); @@ -624,7 +624,7 @@ private: lvlCursor[l] = static_cast(coordinatesL[pos]); toCOO(pos, l + 1, dimCoords); } - } else if (isSingletonLvl(l) || is2OutOf4Lvl(l)) { + } else if (isSingletonLvl(l) || isNOutOfMLvl(l)) { assert(parentPos < coordinates[l].size()); lvlCursor[l] = static_cast(coordinates[l][parentPos]); toCOO(parentPos, l + 1, dimCoords); @@ -721,8 +721,8 @@ SparseTensorStorage::SparseTensorStorage( } else if (isSingletonLvl(l)) { coordinates[l].reserve(sz); sz = 1; - } else if (is2OutOf4Lvl(l)) { - assert(l == lvlRank - 1 && "unexpected 2:4 usage"); + } else if (isNOutOfMLvl(l)) { + assert(l == lvlRank - 1 && "unexpected n:m usage"); sz = detail::checkedMul(sz, lvlSizes[l]) / 2; coordinates[l].reserve(sz); values.reserve(sz); @@ -791,8 +791,8 @@ SparseTensorStorage::SparseTensorStorage( } } else if (isSingletonLvl(l)) { assert(0 && "general singleton not supported yet"); - } else if (is2OutOf4Lvl(l)) { - assert(0 && "2Out4 not supported yet"); + } else if (isNOutOfMLvl(l)) { + assert(0 && "n ouf of m not supported yet"); } else { assert(isDenseLvl(l)); } diff --git a/mlir/lib/Bindings/Python/DialectSparseTensor.cpp b/mlir/lib/Bindings/Python/DialectSparseTensor.cpp index 698367a..607534c 100644 --- a/mlir/lib/Bindings/Python/DialectSparseTensor.cpp +++ b/mlir/lib/Bindings/Python/DialectSparseTensor.cpp @@ -25,7 +25,7 @@ using namespace mlir::python::adaptors; static void populateDialectSparseTensorSubmodule(const py::module &m) { py::enum_(m, "LevelType", py::module_local()) .value("dense", MLIR_SPARSE_TENSOR_LEVEL_DENSE) - .value("compressed24", MLIR_SPARSE_TENSOR_LEVEL_TWO_OUT_OF_FOUR) + .value("n_out_of_m", MLIR_SPARSE_TENSOR_LEVEL_N_OUT_OF_M) .value("compressed", MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED) .value("compressed_nu", MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU) .value("compressed_no", MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NO) diff --git a/mlir/lib/CAPI/Dialect/SparseTensor.cpp b/mlir/lib/CAPI/Dialect/SparseTensor.cpp index e4534ad..a34b9a29 100644 --- a/mlir/lib/CAPI/Dialect/SparseTensor.cpp +++ b/mlir/lib/CAPI/Dialect/SparseTensor.cpp @@ -20,25 +20,36 @@ MLIR_DEFINE_CAPI_DIALECT_REGISTRATION(SparseTensor, sparse_tensor, mlir::sparse_tensor::SparseTensorDialect) // Ensure the C-API enums are int-castable to C++ equivalents. 
-static_assert(static_cast(MLIR_SPARSE_TENSOR_LEVEL_DENSE) == - static_cast(LevelType::Dense) && - static_cast(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED) == - static_cast(LevelType::Compressed) && - static_cast(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU) == - static_cast(LevelType::CompressedNu) && - static_cast(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NO) == - static_cast(LevelType::CompressedNo) && - static_cast(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU_NO) == - static_cast(LevelType::CompressedNuNo) && - static_cast(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON) == - static_cast(LevelType::Singleton) && - static_cast(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU) == - static_cast(LevelType::SingletonNu) && - static_cast(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NO) == - static_cast(LevelType::SingletonNo) && - static_cast(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU_NO) == - static_cast(LevelType::SingletonNuNo), - "MlirSparseTensorLevelType (C-API) and LevelType (C++) mismatch"); +static_assert( + static_cast(MLIR_SPARSE_TENSOR_LEVEL_DENSE) == + static_cast(LevelType::Dense) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED) == + static_cast(LevelType::Compressed) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU) == + static_cast(LevelType::CompressedNu) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NO) == + static_cast(LevelType::CompressedNo) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU_NO) == + static_cast(LevelType::CompressedNuNo) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON) == + static_cast(LevelType::Singleton) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU) == + static_cast(LevelType::SingletonNu) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NO) == + static_cast(LevelType::SingletonNo) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU_NO) == + static_cast(LevelType::SingletonNuNo) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED) == + static_cast(LevelType::LooseCompressed) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU) == + static_cast(LevelType::LooseCompressedNu) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NO) == + static_cast(LevelType::LooseCompressedNo) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU_NO) == + static_cast(LevelType::LooseCompressedNuNo) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_N_OUT_OF_M) == + static_cast(LevelType::NOutOfM), + "MlirSparseTensorLevelType (C-API) and LevelType (C++) mismatch"); bool mlirAttributeIsASparseTensorEncodingAttr(MlirAttribute attr) { return isa(unwrap(attr)); diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp index eb7ea63..752d6e6 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp @@ -29,12 +29,21 @@ using namespace mlir::sparse_tensor::ir_detail; // `LvlTypeParser` implementation. //===----------------------------------------------------------------------===// -FailureOr LvlTypeParser::parseLvlType(AsmParser &parser) const { +FailureOr LvlTypeParser::parseLvlType(AsmParser &parser) const { StringRef base; const auto loc = parser.getCurrentLocation(); ERROR_IF(failed(parser.parseOptionalKeyword(&base)), "expected valid level format (e.g. 
dense, compressed or singleton)") - uint8_t properties = 0; + uint64_t properties = 0; + SmallVector structure; + + if (base.compare("structured") == 0) { + ParseResult res = parser.parseCommaSeparatedList( + mlir::OpAsmParser::Delimiter::OptionalSquare, + [&]() -> ParseResult { return parseStructure(parser, &structure); }, + " in block n out of m"); + FAILURE_IF_FAILED(res) + } ParseResult res = parser.parseCommaSeparatedList( mlir::OpAsmParser::Delimiter::OptionalParen, @@ -44,15 +53,20 @@ FailureOr LvlTypeParser::parseLvlType(AsmParser &parser) const { // Set the base bit for properties. if (base.compare("dense") == 0) { - properties |= static_cast(LevelFormat::Dense); + properties |= static_cast(LevelFormat::Dense); } else if (base.compare("compressed") == 0) { - properties |= static_cast(LevelFormat::Compressed); - } else if (base.compare("block2_4") == 0) { - properties |= static_cast(LevelFormat::TwoOutOfFour); + properties |= static_cast(LevelFormat::Compressed); + } else if (base.compare("structured") == 0) { + if (structure.size() != 2) { + parser.emitError(loc, "expected exactly 2 structure sizes"); + return failure(); + } + properties |= static_cast(LevelFormat::NOutOfM); + properties |= nToBits(structure[0]) | mToBits(structure[1]); } else if (base.compare("loose_compressed") == 0) { - properties |= static_cast(LevelFormat::LooseCompressed); + properties |= static_cast(LevelFormat::LooseCompressed); } else if (base.compare("singleton") == 0) { - properties |= static_cast(LevelFormat::Singleton); + properties |= static_cast(LevelFormat::Singleton); } else { parser.emitError(loc, "unknown level format: ") << base; return failure(); @@ -64,15 +78,15 @@ FailureOr LvlTypeParser::parseLvlType(AsmParser &parser) const { } ParseResult LvlTypeParser::parseProperty(AsmParser &parser, - uint8_t *properties) const { + uint64_t *properties) const { StringRef strVal; auto loc = parser.getCurrentLocation(); ERROR_IF(failed(parser.parseOptionalKeyword(&strVal)), "expected valid level property (e.g. 
nonordered, nonunique or high)") if (strVal.compare("nonunique") == 0) { - *properties |= static_cast(LevelPropertyNondefault::Nonunique); + *properties |= static_cast(LevelPropertyNondefault::Nonunique); } else if (strVal.compare("nonordered") == 0) { - *properties |= static_cast(LevelPropertyNondefault::Nonordered); + *properties |= static_cast(LevelPropertyNondefault::Nonordered); } else { parser.emitError(loc, "unknown level property: ") << strVal; return failure(); @@ -80,4 +94,22 @@ ParseResult LvlTypeParser::parseProperty(AsmParser &parser, return success(); } +ParseResult +LvlTypeParser::parseStructure(AsmParser &parser, + SmallVector *structure) const { + int intVal; + auto loc = parser.getCurrentLocation(); + OptionalParseResult intValParseResult = parser.parseOptionalInteger(intVal); + if (intValParseResult.has_value()) { + if (failed(*intValParseResult)) { + parser.emitError(loc, "failed to parse block size"); + return failure(); + } + structure->push_back(intVal); + return success(); + } + parser.emitError(loc, "expected valid integer for block size"); + return failure(); +} + //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h index 5e2f11b..6a13112 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h +++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h @@ -18,10 +18,12 @@ namespace ir_detail { class LvlTypeParser { public: LvlTypeParser() = default; - FailureOr parseLvlType(AsmParser &parser) const; + FailureOr parseLvlType(AsmParser &parser) const; private: - ParseResult parseProperty(AsmParser &parser, uint8_t *properties) const; + ParseResult parseProperty(AsmParser &parser, uint64_t *properties) const; + ParseResult parseStructure(AsmParser &parser, + SmallVector *structure) const; }; } // namespace ir_detail diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index 27125bc..67b1d79 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -613,16 +613,28 @@ void SparseTensorEncodingAttr::printDimensions( } } +std::string getNOutOfMString(LevelType lt) { + if (isNOutOfMLT(lt)) { + unsigned n = getN(lt); + unsigned m = getM(lt); + auto output = "[" + std::to_string(n) + ", " + std::to_string(m) + "]"; + return output; + } + return ""; +} + void SparseTensorEncodingAttr::printLevels(AffineMap &map, AsmPrinter &printer, ArrayRef lvlTypes) const { for (unsigned i = 0, n = map.getNumResults() - 1; i < n; i++) { map.getResult(i).print(printer.getStream()); - printer << " : " << toMLIRString(lvlTypes[i]) << ", "; + printer << " : " << toMLIRString(lvlTypes[i]) + << getNOutOfMString(lvlTypes[i]) << ", "; } if (map.getNumResults() >= 1) { auto lastIndex = map.getNumResults() - 1; map.getResult(lastIndex).print(printer.getStream()); - printer << " : " << toMLIRString(lvlTypes[lastIndex]); + printer << " : " << toMLIRString(lvlTypes[lastIndex]) + << getNOutOfMString(lvlTypes[lastIndex]); } } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp index dd3af9d..3f352c8 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp @@ -451,7 +451,7 @@ static bool 
isAdmissibleBSR(SparseTensorType &aTp) { /// Test for 2:4 matrix with suitable metadata. static bool isAdmissible24(SparseTensorType &aTp) { return aTp.getDimRank() == 2 && aTp.getLvlRank() == 3 && aTp.isDenseLvl(0) && - aTp.isDenseLvl(1) && aTp.is2OutOf4Lvl(2) && isAdmissibleMetaData(aTp); + aTp.isDenseLvl(1) && aTp.isNOutOfMLvl(2) && isAdmissibleMetaData(aTp); } /// Test for conversion into 2:4 matrix. diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp index 491501a..d4459c6 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp @@ -130,7 +130,7 @@ static void allocSchemeForRank(OpBuilder &builder, Location loc, createPushback(builder, loc, desc, SparseTensorFieldKind::PosMemRef, lvl, /*value=*/posZero, /*repeat=*/linear); return; - } else if (isSingletonLT(lt) || is2OutOf4LT(lt)) { + } else if (isSingletonLT(lt) || isNOutOfMLT(lt)) { return; // nothing to do } // Keep compounding the size, but nothing needs to be initialized @@ -409,7 +409,7 @@ static void genEndInsert(OpBuilder &builder, Location loc, } } else { assert(isDenseLT(lt) || isLooseCompressedLT(lt) || isSingletonLT(lt) || - is2OutOf4LT(lt)); + isNOutOfMLT(lt)); } } } @@ -488,7 +488,7 @@ public: } parentPos = genCompressed(builder, loc, desc, coords, value, parentPos, lvl); - } else if (isSingletonLT(lt) || is2OutOf4LT(lt)) { + } else if (isSingletonLT(lt) || isNOutOfMLT(lt)) { // Create: // coordinates[lvl].push_back(coords[lvl]) // positions[lvl] = positions[lvl-1] diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp index ab38ab5..8f2ae60 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -891,7 +891,7 @@ static scf::IfOp genIf(CodegenEnv &env, OpBuilder &builder, LoopId curr, assert(curr == env.merger().loop(b)); Value clause; if (isCompressedLT(lt) || isSingletonLT(lt) || - isLooseCompressedLT(lt) || is2OutOf4LT(lt)) { + isLooseCompressedLT(lt) || isNOutOfMLT(lt)) { assert(lvl.has_value()); const Value crd = env.emitter().getCoord(tid, *lvl); const Value lvar = env.getLoopVar(curr); diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp index 4ba9ecb..c85f820 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp @@ -139,18 +139,19 @@ public: } }; -class TwoOutFourLevel : public SparseLevel { +class NOutOfMLevel : public SparseLevel { public: - TwoOutFourLevel(unsigned tid, Level lvl, LevelType lt, Value lvlSize, - Value crdBuffer) + NOutOfMLevel(unsigned tid, Level lvl, LevelType lt, Value lvlSize, + Value crdBuffer) : SparseLevel(tid, lvl, lt, lvlSize, crdBuffer) {} ValuePair peekRangeAt(OpBuilder &b, Location l, Value p, Value max) const override { - assert(max == nullptr && isUnique() && "2:4 level can not be non-unique."); - // Each 2:4 blk has exactly two specified elements. - Value posLo = MULI(p, C_IDX(2)); - return {posLo, ADDI(posLo, C_IDX(2))}; + assert(max == nullptr && isUnique() && "n:m level can not be non-unique."); + // Each n:m blk has exactly n specified elements. 
+ auto n = getN(lt); + Value posLo = MULI(p, C_IDX(n)); + return {posLo, ADDI(posLo, C_IDX(n))}; } }; @@ -1291,9 +1292,9 @@ sparse_tensor::makeSparseTensorLevel(OpBuilder &b, Location l, Value t, Value crd = genToCoordinates(b, l, t, lvl); return std::make_unique(tid, lvl, lt, sz, crd); } - case LevelFormat::TwoOutOfFour: { + case LevelFormat::NOutOfM: { Value crd = genToCoordinates(b, l, t, lvl); - return std::make_unique(tid, lvl, lt, sz, crd); + return std::make_unique(tid, lvl, lt, sz, crd); } } llvm_unreachable("unrecognizable level format"); diff --git a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp index 6cdf5f8..96537cb 100644 --- a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp +++ b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp @@ -489,7 +489,7 @@ BitVector Merger::simplifyCond(LatSetId s0, LatPointId p0) { if (simple[b] && !isSparseLvlWithNonTrivialIdxExp(b)) { const auto lt = getLvlType(b); if (!isCompressedLT(lt) && !isSingletonLT(lt) && - !isLooseCompressedLT(lt) && !is2OutOf4LT(lt)) { + !isLooseCompressedLT(lt) && !isNOutOfMLT(lt)) { if (reset) simple.reset(b); reset = true; @@ -670,7 +670,7 @@ bool Merger::hasAnySparse(const BitVector &bits) const { for (TensorLoopId b : bits.set_bits()) { const auto lt = getLvlType(b); if (isCompressedLT(lt) || isSingletonLT(lt) || isLooseCompressedLT(lt) || - is2OutOf4LT(lt)) + isNOutOfMLT(lt)) return true; } return hasSparseIdxReduction(bits); diff --git a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp index 0c7b3a2..9e8b240 100644 --- a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp +++ b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp @@ -45,7 +45,7 @@ SparseTensorStorageBase::SparseTensorStorageBase( // NOLINT for (uint64_t l = 0; l < lvlRank; l++) { assert(lvlSizes[l] > 0 && "Level size zero has trivial storage"); assert(isDenseLvl(l) || isCompressedLvl(l) || isLooseCompressedLvl(l) || - isSingletonLvl(l) || is2OutOf4Lvl(l)); + isSingletonLvl(l) || isNOutOfMLvl(l)); } } diff --git a/mlir/test/CAPI/sparse_tensor.c b/mlir/test/CAPI/sparse_tensor.c index 2c6ad55..a8b9f90 100644 --- a/mlir/test/CAPI/sparse_tensor.c +++ b/mlir/test/CAPI/sparse_tensor.c @@ -38,9 +38,9 @@ static int testRoundtripEncoding(MlirContext ctx) { mlirSparseTensorEncodingAttrGetDimToLvl(originalAttr); // CHECK: (d0, d1)[s0] -> (s0, d0, d1) mlirAffineMapDump(dimToLvl); - // CHECK: level_type: 4 - // CHECK: level_type: 8 - // CHECK: level_type: 8 + // CHECK: level_type: 65536 + // CHECK: level_type: 131072 + // CHECK: level_type: 131072 MlirAffineMap lvlToDim = mlirSparseTensorEncodingAttrGetLvlToDim(originalAttr); int lvlRank = mlirSparseTensorEncodingGetLvlRank(originalAttr); diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir index 6fe7ec9..8293169 100644 --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir @@ -4,7 +4,7 @@ map = ( i, j ) -> ( i : dense, j floordiv 4 : dense, - j mod 4 : block2_4 + j mod 4 : structured[2, 4] ) }> diff --git a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir index 20702bb..6452063 100644 --- a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir +++ b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir @@ -207,12 +207,12 @@ func.func private @BSR_explicit(%arg0: tensor) { map = ( i, j ) -> ( i : dense, j 
floordiv 4 : dense, - j mod 4 : block2_4 + j mod 4 : structured[2, 4] ), crdWidth = 8 // we would even like just 2-bits }> -// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : block2_4), crdWidth = 8 }> +// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : structured[2, 4]), crdWidth = 8 }> // CHECK-LABEL: func private @NV_24( // CHECK-SAME: tensor func.func private @NV_24(%arg0: tensor) { @@ -226,11 +226,11 @@ func.func private @NV_24(%arg0: tensor) { ( i : dense, j : dense, k floordiv 4 : dense, - k mod 4 : block2_4 + k mod 4 : structured[2, 4] ) }> -// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : dense, d1 : dense, d2 floordiv 4 : dense, d2 mod 4 : block2_4) }> +// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : dense, d1 : dense, d2 floordiv 4 : dense, d2 mod 4 : structured[2, 4]) }> // CHECK-LABEL: func private @NV_24( // CHECK-SAME: tensor func.func private @NV_24(%arg0: tensor) { @@ -244,13 +244,31 @@ func.func private @NV_24(%arg0: tensor) { ( i : dense, k floordiv 4 : dense, j : dense, - k mod 4 : block2_4 + k mod 4 : structured[2, 4] ) }> -// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : dense, d2 floordiv 4 : dense, d1 : dense, d2 mod 4 : block2_4) }> +// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : dense, d2 floordiv 4 : dense, d1 : dense, d2 mod 4 : structured[2, 4]) }> // CHECK-LABEL: func private @NV_24( // CHECK-SAME: tensor func.func private @NV_24(%arg0: tensor) { return } + +// ----- + +#NOutOfM = #sparse_tensor.encoding<{ + map = ( i, j, k ) -> + ( i : dense, + k floordiv 8 : dense, + j : dense, + k mod 8 : structured[5, 8] + ) +}> + +// CHECK-DAG: #[[$NOutOfM:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : dense, d2 floordiv 8 : dense, d1 : dense, d2 mod 8 : structured[5, 8]) }> +// CHECK-LABEL: func private @NOutOfM( +// CHECK-SAME: tensor +func.func private @NOutOfM(%arg0: tensor) { + return +} diff --git a/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir b/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir index 7c494b2..d04fbe8 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir @@ -14,7 +14,7 @@ // CHECK-DAG: %[[VAL_8:.*]] = arith.constant true // CHECK-DAG: %[[VAL_9:.*]] = arith.constant 100 : index // CHECK-DAG: %[[VAL_10:.*]] = arith.constant 300 : index -// CHECK-DAG: %[[VAL_11:.*]] = arith.constant 8 : i64 +// CHECK-DAG: %[[VAL_11:.*]] = arith.constant 131072 : i64 // CHECK: %[[VAL_12:.*]] = memref.alloca() : memref<2xi64> // CHECK: %[[VAL_13:.*]] = memref.cast %[[VAL_12]] : memref<2xi64> to memref // CHECK: memref.store %[[VAL_11]], %[[VAL_12]]{{\[}}%[[VAL_5]]] : memref<2xi64> diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir index 4bc080f..e47ac46 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir @@ -59,7 +59,7 @@ map = ( i, j ) -> ( i : dense, j floordiv 4 : dense, - j mod 4 : block2_4 + j mod 4 : structured[2, 4] ), }> diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir index 
df5b48a..ec5c758 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir @@ -41,7 +41,7 @@ #NV_24 = #sparse_tensor.encoding<{ map = ( i, j ) -> ( i : dense, j floordiv 4 : dense, - j mod 4 : block2_4), + j mod 4 : structured[2, 4]), crdWidth = 8 }> diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir index 17b50b4..b0f63f1 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir @@ -20,7 +20,7 @@ map = ( i, j ) -> ( i : dense, j floordiv 4 : dense, - j mod 4 : block2_4 + j mod 4 : structured[2, 4] ) }> diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir index eb99a02..311cb60 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir @@ -20,7 +20,7 @@ map = ( i, j ) -> ( i : dense, j floordiv 4 : dense, - j mod 4 : block2_4 + j mod 4 : structured[2, 4] ) }> diff --git a/mlir/test/python/dialects/sparse_tensor/dialect.py b/mlir/test/python/dialects/sparse_tensor/dialect.py index 946a224..412c579 100644 --- a/mlir/test/python/dialects/sparse_tensor/dialect.py +++ b/mlir/test/python/dialects/sparse_tensor/dialect.py @@ -28,7 +28,7 @@ def testEncodingAttr1D(): # CHECK: equal: True print(f"equal: {casted == parsed}") - # CHECK: lvl_types: [8] + # CHECK: lvl_types: [131072] print(f"lvl_types: {casted.lvl_types}") # CHECK: dim_to_lvl: (d0) -> (d0) print(f"dim_to_lvl: {casted.dim_to_lvl}") @@ -70,7 +70,7 @@ def testEncodingAttr2D(): # CHECK: equal: True print(f"equal: {casted == parsed}") - # CHECK: lvl_types: [4, 8] + # CHECK: lvl_types: [65536, 131072] print(f"lvl_types: {casted.lvl_types}") # CHECK: dim_to_lvl: (d0, d1) -> (d1, d0) print(f"dim_to_lvl: {casted.dim_to_lvl}") -- cgit v1.1 From d1fdb416299c0efa5979ed989f7c1f39973dcb73 Mon Sep 17 00:00:00 2001 From: John Demme Date: Thu, 8 Feb 2024 11:39:06 -0800 Subject: [MLIR][Python] Add method for getting the live operation objects (#78663) Currently, a method exists to get the count of the operation objects which are still alive. This helps for sanity checking, but isn't terribly useful for debugging. This new method returns the actual operation objects which are still alive. This allows Python code like the following: ``` gc.collect() live_ops = ir.Context.current._get_live_operation_objects() for op in live_ops: print(f"Warning: {op} is still live. 
Referrers:") for referrer in gc.get_referrers(op)[0]: print(f" {referrer}") ``` --- mlir/lib/Bindings/Python/IRCore.cpp | 9 +++++++++ mlir/lib/Bindings/Python/IRModule.h | 3 +++ mlir/python/mlir/_mlir_libs/_mlir/ir.pyi | 1 + mlir/test/python/ir/module.py | 4 ++++ 4 files changed, 17 insertions(+) diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 5412c3d..8a7951d 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -636,6 +636,13 @@ size_t PyMlirContext::getLiveCount() { return getLiveContexts().size(); } size_t PyMlirContext::getLiveOperationCount() { return liveOperations.size(); } +std::vector PyMlirContext::getLiveOperationObjects() { + std::vector liveObjects; + for (auto &entry : liveOperations) + liveObjects.push_back(entry.second.second); + return liveObjects; +} + size_t PyMlirContext::clearLiveOperations() { for (auto &op : liveOperations) op.second.second->setInvalid(); @@ -2546,6 +2553,8 @@ void mlir::python::populateIRCore(py::module &m) { return ref.releaseObject(); }) .def("_get_live_operation_count", &PyMlirContext::getLiveOperationCount) + .def("_get_live_operation_objects", + &PyMlirContext::getLiveOperationObjects) .def("_clear_live_operations", &PyMlirContext::clearLiveOperations) .def("_get_live_module_count", &PyMlirContext::getLiveModuleCount) .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index 79b7e0c..48f39c9 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -201,6 +201,9 @@ public: /// Gets the count of live context objects. Used for testing. static size_t getLiveCount(); + /// Get a list of Python objects which are still in the live context map. + std::vector getLiveOperationObjects(); + /// Gets the count of live operations associated with this context. /// Used for testing. size_t getLiveOperationCount(); diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi index 57a8599..344abb6 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi @@ -985,6 +985,7 @@ class Context: def _get_context_again(self) -> Context: ... def _get_live_module_count(self) -> int: ... def _get_live_operation_count(self) -> int: ... + def _get_live_operation_objects(self) -> List[Operation]: ... def append_dialect_registry(self, registry: DialectRegistry) -> None: ... 
def attach_diagnostic_handler( self, callback: Callable[[Diagnostic], bool] diff --git a/mlir/test/python/ir/module.py b/mlir/test/python/ir/module.py index a5c38a6..ecafcb4 100644 --- a/mlir/test/python/ir/module.py +++ b/mlir/test/python/ir/module.py @@ -105,6 +105,10 @@ def testModuleOperation(): assert ctx._get_live_module_count() == 1 op1 = module.operation assert ctx._get_live_operation_count() == 1 + live_ops = ctx._get_live_operation_objects() + assert len(live_ops) == 1 + assert live_ops[0] is op1 + live_ops = None # CHECK: module @successfulParse print(op1) -- cgit v1.1 From 705fcd4e0addee6e9e13541dbcbc81cec9748a83 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 8 Feb 2024 11:50:53 -0800 Subject: Revert "[lldb] Expand background symbol lookup" (#81182) Reverts llvm/llvm-project#80890 --- lldb/include/lldb/Core/ModuleList.h | 23 +---------------------- lldb/include/lldb/lldb-enumerations.h | 6 ------ lldb/source/Core/CoreProperties.td | 7 +------ lldb/source/Core/ModuleList.cpp | 13 ++++--------- lldb/source/Host/common/Host.cpp | 2 -- lldb/source/Symbol/SymbolLocator.cpp | 22 ++++++---------------- 6 files changed, 12 insertions(+), 61 deletions(-) diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h index 43d931a..d78f7c5 100644 --- a/lldb/include/lldb/Core/ModuleList.h +++ b/lldb/include/lldb/Core/ModuleList.h @@ -47,26 +47,6 @@ class UUID; class VariableList; struct ModuleFunctionSearchOptions; -static constexpr OptionEnumValueElement g_auto_download_enum_values[] = { - { - lldb::eSymbolDownloadOff, - "off", - "Disable automatically downloading symbols.", - }, - { - lldb::eSymbolDownloadBackground, - "background", - "Download symbols in the background for images as they appear in the " - "backtrace.", - }, - { - lldb::eSymbolDownloadForeground, - "foreground", - "Download symbols in the foreground for images as they appear in the " - "backtrace.", - }, -}; - class ModuleListProperties : public Properties { mutable llvm::sys::RWMutex m_symlink_paths_mutex; PathMappingList m_symlink_paths; @@ -80,6 +60,7 @@ public: bool SetClangModulesCachePath(const FileSpec &path); bool GetEnableExternalLookup() const; bool SetEnableExternalLookup(bool new_value); + bool GetEnableBackgroundLookup() const; bool GetEnableLLDBIndexCache() const; bool SetEnableLLDBIndexCache(bool new_value); uint64_t GetLLDBIndexCacheMaxByteSize(); @@ -90,8 +71,6 @@ public: bool GetLoadSymbolOnDemand(); - lldb::SymbolDownload GetSymbolAutoDownload() const; - PathMappingList GetSymlinkMappings() const; }; diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 4640533..7e9b538 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -1314,12 +1314,6 @@ enum class ChildCacheState { ///< re-use what we computed the last time we called Update. 
}; -enum SymbolDownload { - eSymbolDownloadOff = 0, - eSymbolDownloadBackground = 1, - eSymbolDownloadForeground = 2, -}; - } // namespace lldb #endif // LLDB_LLDB_ENUMERATIONS_H diff --git a/lldb/source/Core/CoreProperties.td b/lldb/source/Core/CoreProperties.td index 9c4aa2d..8d81967 100644 --- a/lldb/source/Core/CoreProperties.td +++ b/lldb/source/Core/CoreProperties.td @@ -8,12 +8,7 @@ let Definition = "modulelist" in { def EnableBackgroundLookup: Property<"enable-background-lookup", "Boolean">, Global, DefaultFalse, - Desc<"Alias for backward compatibility: when enabled this is the equivalent to 'symbols.download background'.">; - def AutoDownload: Property<"auto-download", "Enum">, - Global, - DefaultEnumValue<"eSymbolDownloadOff">, - EnumValues<"OptionEnumValues(g_auto_download_enum_values)">, - Desc<"On macOS, automatically download symbols with dsymForUUID (or an equivalent script/binary) for relevant images in the debug session.">; + Desc<"On macOS, enable calling dsymForUUID (or an equivalent script/binary) in the background to locate symbol files that weren't found.">; def ClangModulesCachePath: Property<"clang-modules-cache-path", "FileSpec">, Global, DefaultStringValue<"">, diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp index b03490b..b7f3936 100644 --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -104,15 +104,10 @@ bool ModuleListProperties::SetEnableExternalLookup(bool new_value) { return SetPropertyAtIndex(ePropertyEnableExternalLookup, new_value); } -SymbolDownload ModuleListProperties::GetSymbolAutoDownload() const { - // Backward compatibility alias. - if (GetPropertyAtIndexAs(ePropertyEnableBackgroundLookup, false)) - return eSymbolDownloadBackground; - - const uint32_t idx = ePropertyAutoDownload; - return GetPropertyAtIndexAs( - idx, static_cast( - g_modulelist_properties[idx].default_uint_value)); +bool ModuleListProperties::GetEnableBackgroundLookup() const { + const uint32_t idx = ePropertyEnableBackgroundLookup; + return GetPropertyAtIndexAs( + idx, g_modulelist_properties[idx].default_uint_value != 0); } FileSpec ModuleListProperties::GetClangModulesCachePath() const { diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp index b72ba7e..f4cec97 100644 --- a/lldb/source/Host/common/Host.cpp +++ b/lldb/source/Host/common/Host.cpp @@ -550,8 +550,6 @@ llvm::Error Host::OpenFileInExternalEditor(llvm::StringRef editor, } bool Host::IsInteractiveGraphicSession() { return false; } - -bool Host::IsNetworkLimited() { return false; } #endif std::unique_ptr Host::CreateDefaultConnection(llvm::StringRef url) { diff --git a/lldb/source/Symbol/SymbolLocator.cpp b/lldb/source/Symbol/SymbolLocator.cpp index 93a5bc4..918f13ed 100644 --- a/lldb/source/Symbol/SymbolLocator.cpp +++ b/lldb/source/Symbol/SymbolLocator.cpp @@ -10,7 +10,6 @@ #include "lldb/Core/Debugger.h" #include "lldb/Core/PluginManager.h" -#include "lldb/Host/Host.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Support/ThreadPool.h" @@ -19,10 +18,12 @@ using namespace lldb; using namespace lldb_private; void SymbolLocator::DownloadSymbolFileAsync(const UUID &uuid) { + if (!ModuleList::GetGlobalModuleListProperties().GetEnableBackgroundLookup()) + return; + static llvm::SmallSet g_seen_uuids; static std::mutex g_mutex; - - auto lookup = [=]() { + Debugger::GetThreadPool().async([=]() { { std::lock_guard guard(g_mutex); if (g_seen_uuids.count(uuid)) @@ -35,23 +36,12 @@ void SymbolLocator::DownloadSymbolFileAsync(const UUID &uuid) 
{ module_spec.GetUUID() = uuid; if (!PluginManager::DownloadObjectAndSymbolFile(module_spec, error, /*force_lookup=*/true, - /*copy_executable=*/true)) + /*copy_executable=*/false)) return; if (error.Fail()) return; Debugger::ReportSymbolChange(module_spec); - }; - - switch (ModuleList::GetGlobalModuleListProperties().GetSymbolAutoDownload()) { - case eSymbolDownloadOff: - break; - case eSymbolDownloadBackground: - Debugger::GetThreadPool().async(lookup); - break; - case eSymbolDownloadForeground: - lookup(); - break; - }; + }); } -- cgit v1.1 From b8545e1ece271df16185d446503474c105d6398a Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 8 Feb 2024 12:15:33 -0800 Subject: [RISCV] Consider all subvector extracts within a single VREG cheap (#81032) This adjusts the isSubVectorExtractCheap callback to consider any extract which fits entirely within the first VLEN bits of the src vector (and uses a 5 bit immediate for the slide) as cheap. These can be done via a single m1 vslide1down.vi instruction. This allows our generic DAG combine logic to kick in and recognize a few more cases where shuffle source is longer than the dest, but that using a wider shuffle is still profitable. (Or as shown in the test diff, we can split the wider source and do two narrower shuffles.) --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 27 +++-- .../RISCV/rvv/fixed-vectors-int-shuffles.ll | 110 ++++----------------- 2 files changed, 40 insertions(+), 97 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 0799cc2..a62610b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2173,19 +2173,34 @@ bool RISCVTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, if (ResVT.isScalableVector() || SrcVT.isScalableVector()) return false; + EVT EltVT = ResVT.getVectorElementType(); + assert(EltVT == SrcVT.getVectorElementType() && "Should hold for node"); + + // The smallest type we can slide is i8. + // TODO: We can extract index 0 from a mask vector without a slide. + if (EltVT == MVT::i1) + return false; + unsigned ResElts = ResVT.getVectorNumElements(); unsigned SrcElts = SrcVT.getVectorNumElements(); + unsigned MinVLen = Subtarget.getRealMinVLen(); + unsigned MinVLMAX = MinVLen / EltVT.getSizeInBits(); + + // If we're extracting only data from the first VLEN bits of the source + // then we can always do this with an m1 vslidedown.vx. Restricting the + // Index ensures we can use a vslidedown.vi. + // TODO: We can generalize this when the exact VLEN is known. + if (Index + ResElts <= MinVLMAX && Index < 31) + return true; + // Convervatively only handle extracting half of a vector. - // TODO: Relax this. + // TODO: For sizes which aren't multiples of VLEN sizes, this may not be + // a cheap extract. However, this case is important in practice for + // shuffled extracts of longer vectors. How resolve? if ((ResElts * 2) != SrcElts) return false; - // The smallest type we can slide is i8. - // TODO: We can extract index 0 from a mask vector without a slide. - if (ResVT.getVectorElementType() == MVT::i1) - return false; - // Slide can support arbitrary index, but we only treat vslidedown.vi as // cheap. 
if (Index >= 32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index acad71b..0e8d9cf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -722,97 +722,25 @@ define <8 x i32> @shuffle_v8i32_2(<8 x i32> %x, <8 x i32> %y) { ; FIXME: This could be expressed as a vrgather.vv define <8 x i8> @shuffle_v64i8_v8i8(<64 x i8> %wide.vec) { -; RV32-LABEL: shuffle_v64i8_v8i8: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -128 -; RV32-NEXT: .cfi_def_cfa_offset 128 -; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 128 -; RV32-NEXT: .cfi_def_cfa s0, 0 -; RV32-NEXT: andi sp, sp, -64 -; RV32-NEXT: li a0, 64 -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; RV32-NEXT: vse8.v v8, (a1) -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 8 -; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: vmv.x.s a1, v8 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 16 -; RV32-NEXT: vmv.x.s a0, v12 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 24 -; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v8, v10, a0 -; RV32-NEXT: lbu a0, 32(sp) -; RV32-NEXT: lbu a1, 40(sp) -; RV32-NEXT: lbu a2, 48(sp) -; RV32-NEXT: lbu a3, 56(sp) -; RV32-NEXT: vslide1down.vx v8, v8, a0 -; RV32-NEXT: vslide1down.vx v8, v8, a1 -; RV32-NEXT: vslide1down.vx v8, v8, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a3 -; RV32-NEXT: addi sp, s0, -128 -; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 128 -; RV32-NEXT: ret -; -; RV64-LABEL: shuffle_v64i8_v8i8: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -128 -; RV64-NEXT: .cfi_def_cfa_offset 128 -; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 128 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -64 -; RV64-NEXT: li a0, 64 -; RV64-NEXT: mv a1, sp -; RV64-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; RV64-NEXT: vse8.v v8, (a1) -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 8 -; RV64-NEXT: vmv.x.s a0, v10 -; RV64-NEXT: vmv.x.s a1, v8 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vmv.v.x v10, a1 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64-NEXT: vslidedown.vi v12, v8, 16 -; RV64-NEXT: vmv.x.s a0, v12 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 24 -; RV64-NEXT: vmv.x.s a0, v8 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v8, v10, a0 -; RV64-NEXT: lbu a0, 32(sp) -; RV64-NEXT: lbu a1, 40(sp) -; RV64-NEXT: lbu a2, 48(sp) -; RV64-NEXT: lbu a3, 56(sp) -; RV64-NEXT: vslide1down.vx v8, v8, a0 -; RV64-NEXT: vslide1down.vx v8, v8, a1 -; RV64-NEXT: 
vslide1down.vx v8, v8, a2 -; RV64-NEXT: vslide1down.vx v8, v8, a3 -; RV64-NEXT: addi sp, s0, -128 -; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 128 -; RV64-NEXT: ret +; CHECK-LABEL: shuffle_v64i8_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vsll.vi v14, v12, 3 +; CHECK-NEXT: vrgather.vv v12, v8, v14 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: li a1, 240 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: lui a1, 98561 +; CHECK-NEXT: addi a1, a1, -2048 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu +; CHECK-NEXT: vrgather.vv v12, v8, v10, v0.t +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: ret %s = shufflevector <64 x i8> %wide.vec, <64 x i8> poison, <8 x i32> ret <8 x i8> %s } -- cgit v1.1 From 5f4b40c90a51248b097de7b5bc89c6976d4c3298 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 8 Feb 2024 12:39:04 -0800 Subject: [lldb] Expand background symbol download (#80890) LLDB has a setting (symbols.enable-background-lookup) that calls dsymForUUID on a background thread for images as they appear in the current backtrace. Originally, the laziness of only looking up symbols for images in the backtrace only existed to bring the number of dsymForUUID calls down to a manageable number. Users have been requesting the same functionality, but blocking. This gives them the same user experience as enabling dsymForUUID globally, but without the massive upfront cost of having to download all the images, the majority of which they'll likely not need. This patch renames the setting to have a more generic name (symbols.auto-download) and changes its values from a boolean to an enum. Users can now specify "off", "background" and "foreground". The default remains "off" although I'll probably change that in the near future. 
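A minimal usage sketch (assuming the renamed setting is surfaced under the existing `symbols.` prefix, like `symbols.enable-background-lookup`; the exact path may differ once this lands):

```
(lldb) settings set symbols.auto-download foreground
```

The old boolean stays behind as a backward-compatible alias for the background mode:

```
(lldb) settings set symbols.enable-background-lookup true
```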
--- lldb/include/lldb/Core/ModuleList.h | 23 ++++++++++++++++++++++- lldb/include/lldb/lldb-enumerations.h | 6 ++++++ lldb/source/Core/CoreProperties.td | 7 ++++++- lldb/source/Core/ModuleList.cpp | 13 +++++++++---- lldb/source/Symbol/SymbolLocator.cpp | 22 ++++++++++++++++------ 5 files changed, 59 insertions(+), 12 deletions(-) diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h index d78f7c5..43d931a 100644 --- a/lldb/include/lldb/Core/ModuleList.h +++ b/lldb/include/lldb/Core/ModuleList.h @@ -47,6 +47,26 @@ class UUID; class VariableList; struct ModuleFunctionSearchOptions; +static constexpr OptionEnumValueElement g_auto_download_enum_values[] = { + { + lldb::eSymbolDownloadOff, + "off", + "Disable automatically downloading symbols.", + }, + { + lldb::eSymbolDownloadBackground, + "background", + "Download symbols in the background for images as they appear in the " + "backtrace.", + }, + { + lldb::eSymbolDownloadForeground, + "foreground", + "Download symbols in the foreground for images as they appear in the " + "backtrace.", + }, +}; + class ModuleListProperties : public Properties { mutable llvm::sys::RWMutex m_symlink_paths_mutex; PathMappingList m_symlink_paths; @@ -60,7 +80,6 @@ public: bool SetClangModulesCachePath(const FileSpec &path); bool GetEnableExternalLookup() const; bool SetEnableExternalLookup(bool new_value); - bool GetEnableBackgroundLookup() const; bool GetEnableLLDBIndexCache() const; bool SetEnableLLDBIndexCache(bool new_value); uint64_t GetLLDBIndexCacheMaxByteSize(); @@ -71,6 +90,8 @@ public: bool GetLoadSymbolOnDemand(); + lldb::SymbolDownload GetSymbolAutoDownload() const; + PathMappingList GetSymlinkMappings() const; }; diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 7e9b538..4640533 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -1314,6 +1314,12 @@ enum class ChildCacheState { ///< re-use what we computed the last time we called Update. 
}; +enum SymbolDownload { + eSymbolDownloadOff = 0, + eSymbolDownloadBackground = 1, + eSymbolDownloadForeground = 2, +}; + } // namespace lldb #endif // LLDB_LLDB_ENUMERATIONS_H diff --git a/lldb/source/Core/CoreProperties.td b/lldb/source/Core/CoreProperties.td index 8d81967..9c4aa2d 100644 --- a/lldb/source/Core/CoreProperties.td +++ b/lldb/source/Core/CoreProperties.td @@ -8,7 +8,12 @@ let Definition = "modulelist" in { def EnableBackgroundLookup: Property<"enable-background-lookup", "Boolean">, Global, DefaultFalse, - Desc<"On macOS, enable calling dsymForUUID (or an equivalent script/binary) in the background to locate symbol files that weren't found.">; + Desc<"Alias for backward compatibility: when enabled this is the equivalent to 'symbols.download background'.">; + def AutoDownload: Property<"auto-download", "Enum">, + Global, + DefaultEnumValue<"eSymbolDownloadOff">, + EnumValues<"OptionEnumValues(g_auto_download_enum_values)">, + Desc<"On macOS, automatically download symbols with dsymForUUID (or an equivalent script/binary) for relevant images in the debug session.">; def ClangModulesCachePath: Property<"clang-modules-cache-path", "FileSpec">, Global, DefaultStringValue<"">, diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp index b7f3936..b03490b 100644 --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -104,10 +104,15 @@ bool ModuleListProperties::SetEnableExternalLookup(bool new_value) { return SetPropertyAtIndex(ePropertyEnableExternalLookup, new_value); } -bool ModuleListProperties::GetEnableBackgroundLookup() const { - const uint32_t idx = ePropertyEnableBackgroundLookup; - return GetPropertyAtIndexAs( - idx, g_modulelist_properties[idx].default_uint_value != 0); +SymbolDownload ModuleListProperties::GetSymbolAutoDownload() const { + // Backward compatibility alias. 
+ if (GetPropertyAtIndexAs(ePropertyEnableBackgroundLookup, false)) + return eSymbolDownloadBackground; + + const uint32_t idx = ePropertyAutoDownload; + return GetPropertyAtIndexAs( + idx, static_cast( + g_modulelist_properties[idx].default_uint_value)); } FileSpec ModuleListProperties::GetClangModulesCachePath() const { diff --git a/lldb/source/Symbol/SymbolLocator.cpp b/lldb/source/Symbol/SymbolLocator.cpp index 918f13ed..93a5bc4 100644 --- a/lldb/source/Symbol/SymbolLocator.cpp +++ b/lldb/source/Symbol/SymbolLocator.cpp @@ -10,6 +10,7 @@ #include "lldb/Core/Debugger.h" #include "lldb/Core/PluginManager.h" +#include "lldb/Host/Host.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Support/ThreadPool.h" @@ -18,12 +19,10 @@ using namespace lldb; using namespace lldb_private; void SymbolLocator::DownloadSymbolFileAsync(const UUID &uuid) { - if (!ModuleList::GetGlobalModuleListProperties().GetEnableBackgroundLookup()) - return; - static llvm::SmallSet g_seen_uuids; static std::mutex g_mutex; - Debugger::GetThreadPool().async([=]() { + + auto lookup = [=]() { { std::lock_guard guard(g_mutex); if (g_seen_uuids.count(uuid)) @@ -36,12 +35,23 @@ void SymbolLocator::DownloadSymbolFileAsync(const UUID &uuid) { module_spec.GetUUID() = uuid; if (!PluginManager::DownloadObjectAndSymbolFile(module_spec, error, /*force_lookup=*/true, - /*copy_executable=*/false)) + /*copy_executable=*/true)) return; if (error.Fail()) return; Debugger::ReportSymbolChange(module_spec); - }); + }; + + switch (ModuleList::GetGlobalModuleListProperties().GetSymbolAutoDownload()) { + case eSymbolDownloadOff: + break; + case eSymbolDownloadBackground: + Debugger::GetThreadPool().async(lookup); + break; + case eSymbolDownloadForeground: + lookup(); + break; + }; } -- cgit v1.1 From 3f9d8d892e2de2ac2542cb8e88ae5317f3282244 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Fri, 9 Feb 2024 06:11:20 +0900 Subject: [Coverage] MCDCRecordProcessor: Find `ExecVectors` directly (#80816) Deprecate `TestVectors`, since no one uses it. This affects the output order of ExecVectors. The current impl emits sorted by binary value of ExecVector. This impl emits along the traversal of `buildTestVector()`. --- llvm/lib/ProfileData/Coverage/CoverageMapping.cpp | 31 ++++++++--------------- llvm/test/tools/llvm-cov/mcdc-const.test | 28 ++++++++++---------- llvm/test/tools/llvm-cov/mcdc-general.test | 16 ++++++------ 3 files changed, 33 insertions(+), 42 deletions(-) diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index 6b189c3..eb0996e 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -253,9 +253,6 @@ class MCDCRecordProcessor { /// Mapping of calculated MC/DC Independence Pairs for each condition. MCDCRecord::TVPairMap IndependencePairs; - /// Total number of possible Test Vectors for the boolean expression. - MCDCRecord::TestVectors TestVectors; - /// Actual executed Test Vectors for the boolean expression, based on /// ExecutedTestVectorBitmap. 
MCDCRecord::TestVectors ExecVectors; @@ -267,18 +264,20 @@ public: : Bitmap(Bitmap), Region(Region), Branches(Branches), NumConditions(Region.MCDCParams.NumConditions), BitmapIdx(Region.MCDCParams.BitmapIdx * CHAR_BIT), - Folded(NumConditions, false), IndependencePairs(NumConditions), - TestVectors((size_t)1 << NumConditions) {} + Folded(NumConditions, false), IndependencePairs(NumConditions) {} private: void recordTestVector(MCDCRecord::TestVector &TV, unsigned Index, MCDCRecord::CondState Result) { + if (!Bitmap[BitmapIdx + Index]) + return; + // Copy the completed test vector to the vector of testvectors. - TestVectors[Index] = TV; + ExecVectors.push_back(TV); // The final value (T,F) is equal to the last non-dontcare state on the // path (in a short-circuiting system). - TestVectors[Index].push_back(Result); + ExecVectors.back().push_back(Result); } // Walk the binary decision diagram and try assigning both false and true to @@ -308,13 +307,11 @@ private: /// Walk the bits in the bitmap. A bit set to '1' indicates that the test /// vector at the corresponding index was executed during a test run. void findExecutedTestVectors() { - for (unsigned Idx = 0; Idx < (1u << NumConditions); ++Idx) { - assert(BitmapIdx + Idx < Bitmap.size() && "Bitmap overrun"); - if (Bitmap[BitmapIdx + Idx] == 0) - continue; - assert(!TestVectors[Idx].empty() && "Test Vector doesn't exist."); - ExecVectors.push_back(TestVectors[Idx]); - } + // Walk the binary decision diagram to enumerate all possible test vectors. + // We start at the root node (ID == 1) with all values being DontCare. + // `Index` encodes the bitmask of true values and is initially 0. + MCDCRecord::TestVector TV(NumConditions, MCDCRecord::MCDC_DontCare); + buildTestVector(TV, 1, 0); } // Find an independence pair for each condition: @@ -380,12 +377,6 @@ public: Folded[I++] = (B->Count.isZero() && B->FalseCount.isZero()); } - // Walk the binary decision diagram to enumerate all possible test vectors. - // We start at the root node (ID == 1) with all values being DontCare. - // `Index` encodes the bitmask of true values and is initially 0. - MCDCRecord::TestVector TV(NumConditions, MCDCRecord::MCDC_DontCare); - buildTestVector(TV, 1, 0); - // Using Profile Bitmap from runtime, mark the executed test vectors. 
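    // Sketch of the bitmap layout, inferred from the code above: each decision
    // owns a 2^NumConditions-bit region starting at BitmapIdx, and a set bit
    // at offset Index marks the test vector whose true-condition mask is
    // Index as executed.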
findExecutedTestVectors(); diff --git a/llvm/test/tools/llvm-cov/mcdc-const.test b/llvm/test/tools/llvm-cov/mcdc-const.test index 0b2c9c9..5424625 100644 --- a/llvm/test/tools/llvm-cov/mcdc-const.test +++ b/llvm/test/tools/llvm-cov/mcdc-const.test @@ -61,8 +61,8 @@ // CHECKFULLCASE: | C1-Pair: constant folded // CHECKFULLCASE-NEXT: | C2-Pair: not covered // CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00% -// CHECKFULLCASE: | 1 { T, C = T } -// CHECKFULLCASE-NEXT: | 2 { F, C = T } +// CHECKFULLCASE: | 1 { F, C = T } +// CHECKFULLCASE-NEXT: | 2 { T, C = T } // CHECKFULLCASE: | C1-Pair: not covered // CHECKFULLCASE-NEXT: | C2-Pair: constant folded // CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00% @@ -106,8 +106,8 @@ // CHECKFULLCASE-NEXT: | C2-Pair: not covered // CHECKFULLCASE-NEXT: | C3-Pair: not covered // CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00% -// CHECKFULLCASE: | 1 { T, C, - = T } -// CHECKFULLCASE-NEXT: | 2 { F, C, - = T } +// CHECKFULLCASE: | 1 { F, C, - = T } +// CHECKFULLCASE-NEXT: | 2 { T, C, - = T } // CHECKFULLCASE: | C1-Pair: not covered // CHECKFULLCASE-NEXT: | C2-Pair: constant folded // CHECKFULLCASE-NEXT: | C3-Pair: not covered @@ -118,8 +118,8 @@ // CHECKFULLCASE-NEXT: | C2-Pair: not covered // CHECKFULLCASE-NEXT: | C3-Pair: not covered // CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00% -// CHECKFULLCASE: | 1 { T, C, - = T } -// CHECKFULLCASE-NEXT: | 2 { F, C, T = T } +// CHECKFULLCASE: | 1 { F, C, T = T } +// CHECKFULLCASE-NEXT: | 2 { T, C, - = T } // CHECKFULLCASE: | C1-Pair: not covered // CHECKFULLCASE-NEXT: | C2-Pair: constant folded // CHECKFULLCASE-NEXT: | C3-Pair: not covered @@ -151,26 +151,26 @@ // CHECKFULLCASE-NEXT: | C2-Pair: constant folded // CHECKFULLCASE-NEXT: | C3-Pair: covered: (2,3) // CHECKFULLCASE: | MC/DC Coverage for Decision: 100.00% -// CHECKFULLCASE: | 1 { T, -, C = T } -// CHECKFULLCASE-NEXT: | 2 { F, T, C = T } +// CHECKFULLCASE: | 1 { F, T, C = T } +// CHECKFULLCASE-NEXT: | 2 { T, -, C = T } // CHECKFULLCASE: | C1-Pair: not covered // CHECKFULLCASE-NEXT: | C2-Pair: not covered // CHECKFULLCASE-NEXT: | C3-Pair: constant folded // CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00% -// CHECKFULLCASE: | 1 { T, C, - = T } -// CHECKFULLCASE-NEXT: | 2 { F, C, - = T } +// CHECKFULLCASE: | 1 { F, C, - = T } +// CHECKFULLCASE-NEXT: | 2 { T, C, - = T } // CHECKFULLCASE: | C1-Pair: not covered // CHECKFULLCASE-NEXT: | C2-Pair: constant folded // CHECKFULLCASE-NEXT: | C3-Pair: not covered // CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00% -// CHECKFULLCASE: | 1 { T, -, C = T } -// CHECKFULLCASE-NEXT: | 2 { F, T, C = T } +// CHECKFULLCASE: | 1 { F, T, C = T } +// CHECKFULLCASE-NEXT: | 2 { T, -, C = T } // CHECKFULLCASE: | C1-Pair: not covered // CHECKFULLCASE-NEXT: | C2-Pair: not covered // CHECKFULLCASE-NEXT: | C3-Pair: constant folded // CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00% -// CHECKFULLCASE: | 1 { T, C, - = T } -// CHECKFULLCASE-NEXT: | 2 { F, C, T = T } +// CHECKFULLCASE: | 1 { F, C, T = T } +// CHECKFULLCASE-NEXT: | 2 { T, C, - = T } // CHECKFULLCASE: | C1-Pair: not covered // CHECKFULLCASE-NEXT: | C2-Pair: constant folded // CHECKFULLCASE-NEXT: | C3-Pair: not covered diff --git a/llvm/test/tools/llvm-cov/mcdc-general.test b/llvm/test/tools/llvm-cov/mcdc-general.test index 753036b..4b59ce5 100644 --- a/llvm/test/tools/llvm-cov/mcdc-general.test +++ b/llvm/test/tools/llvm-cov/mcdc-general.test @@ -19,16 +19,16 @@ // CHECK-NEXT: | // CHECK-NEXT: | C1, C2, C3, C4 Result // CHECK-NEXT: | 1 { F, -, F, - = F } -// 
CHECK-NEXT: | 2 { T, F, F, - = F } -// CHECK-NEXT: | 3 { F, -, T, F = F } +// CHECK-NEXT: | 2 { F, -, T, F = F } +// CHECK-NEXT: | 3 { T, F, F, - = F } // CHECK-NEXT: | 4 { T, F, T, F = F } -// CHECK-NEXT: | 5 { T, T, -, - = T } -// CHECK-NEXT: | 6 { T, F, T, T = T } +// CHECK-NEXT: | 5 { T, F, T, T = T } +// CHECK-NEXT: | 6 { T, T, -, - = T } // CHECK-NEXT: | -// CHECK-NEXT: | C1-Pair: covered: (1,5) -// CHECK-NEXT: | C2-Pair: covered: (2,5) -// CHECK-NEXT: | C3-Pair: covered: (2,6) -// CHECK-NEXT: | C4-Pair: covered: (4,6) +// CHECK-NEXT: | C1-Pair: covered: (1,6) +// CHECK-NEXT: | C2-Pair: covered: (3,6) +// CHECK-NEXT: | C3-Pair: covered: (3,5) +// CHECK-NEXT: | C4-Pair: covered: (4,5) // CHECK-NEXT: | MC/DC Coverage for Decision: 100.00% // CHECK-NEXT: | // CHECK-NEXT: ------------------ -- cgit v1.1 From 581857278961b41bc1676499f92167b97a5e4c58 Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Thu, 8 Feb 2024 13:20:47 -0800 Subject: [Object][Wasm] Generate symbol info from name section names (#81063) Currently symbol info is generated from a linking section or from export names. This PR generates symbols in a WasmObjectFile from the name section as well, which allows tools like objdump and nm to show useful information for more linked binaries. There are some limitations: most notably that we don't assume any particular ABI, so we don't get detailed information about data symbols if the segments are merged (which is the default). Covers most of the desired functionality from #76107 --- llvm/lib/Object/WasmObjectFile.cpp | 49 +++++++++++- .../wasm-linked-namesec-with-linkingsec.yaml | 40 ++++++++++ llvm/test/Object/wasm-linked-symbol-table.yaml | 75 +++++++++++++++++++ .../wasm/linked-symbol-table-namesec.yaml | 87 ++++++++++++++++++++++ .../llvm-objdump/wasm/linked-symbol-table.yaml | 75 ------------------- 5 files changed, 247 insertions(+), 79 deletions(-) create mode 100644 llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml create mode 100644 llvm/test/Object/wasm-linked-symbol-table.yaml create mode 100644 llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml delete mode 100644 llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index 8c1bbe9..ea17154 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ b/llvm/lib/Object/WasmObjectFile.cpp @@ -508,10 +508,17 @@ Error WasmObjectFile::parseNameSection(ReadContext &Ctx) { llvm::DenseSet SeenGlobals; llvm::DenseSet SeenSegments; + // If there is symbol info from the export section, this info will supersede + // it, but not info from a linking section + if (!HasLinkingSection) { + Symbols.clear(); + } + while (Ctx.Ptr < Ctx.End) { uint8_t Type = readUint8(Ctx); uint32_t Size = readVaruint32(Ctx); const uint8_t *SubSectionEnd = Ctx.Ptr + Size; + switch (Type) { case wasm::WASM_NAMES_FUNCTION: case wasm::WASM_NAMES_GLOBAL: @@ -521,6 +528,16 @@ Error WasmObjectFile::parseNameSection(ReadContext &Ctx) { uint32_t Index = readVaruint32(Ctx); StringRef Name = readString(Ctx); wasm::NameType nameType = wasm::NameType::FUNCTION; + wasm::WasmSymbolInfo Info{Name, + /*Kind */ wasm::WASM_SYMBOL_TYPE_FUNCTION, + /* Flags */ 0, + /* ImportModule */ std::nullopt, + /* ImportName */ std::nullopt, + /* ExportName */ std::nullopt, + {/* ElementIndex */ Index}}; + const wasm::WasmSignature *Signature = nullptr; + const wasm::WasmGlobalType *GlobalType = nullptr; + const wasm::WasmTableType *TableType = nullptr; if (Type == 
wasm::WASM_NAMES_FUNCTION) { if (!SeenFunctions.insert(Index).second) return make_error( @@ -529,26 +546,50 @@ Error WasmObjectFile::parseNameSection(ReadContext &Ctx) { return make_error("invalid function name entry", object_error::parse_failed); - if (isDefinedFunctionIndex(Index)) - getDefinedFunction(Index).DebugName = Name; + if (isDefinedFunctionIndex(Index)) { + wasm::WasmFunction &F = getDefinedFunction(Index); + F.DebugName = Name; + Signature = &Signatures[F.SigIndex]; + if (F.ExportName) { + Info.ExportName = F.ExportName; + Info.Flags |= wasm::WASM_SYMBOL_BINDING_GLOBAL; + } else { + Info.Flags |= wasm::WASM_SYMBOL_BINDING_LOCAL; + } + } else { + Info.Flags |= wasm::WASM_SYMBOL_UNDEFINED; + } } else if (Type == wasm::WASM_NAMES_GLOBAL) { - nameType = wasm::NameType::GLOBAL; if (!SeenGlobals.insert(Index).second) return make_error("global named more than once", object_error::parse_failed); if (!isValidGlobalIndex(Index) || Name.empty()) return make_error("invalid global name entry", object_error::parse_failed); + nameType = wasm::NameType::GLOBAL; + Info.Kind = wasm::WASM_SYMBOL_TYPE_GLOBAL; + if (isDefinedGlobalIndex(Index)) { + GlobalType = &getDefinedGlobal(Index).Type; + } else { + Info.Flags |= wasm::WASM_SYMBOL_UNDEFINED; + } } else { - nameType = wasm::NameType::DATA_SEGMENT; if (!SeenSegments.insert(Index).second) return make_error( "segment named more than once", object_error::parse_failed); if (Index > DataSegments.size()) return make_error("invalid data segment name entry", object_error::parse_failed); + nameType = wasm::NameType::DATA_SEGMENT; + Info.Kind = wasm::WASM_SYMBOL_TYPE_DATA; + Info.Flags |= wasm::WASM_SYMBOL_BINDING_LOCAL; + assert(Index < DataSegments.size()); + Info.DataRef = wasm::WasmDataReference{ + Index, 0, DataSegments[Index].Data.Content.size()}; } DebugNames.push_back(wasm::WasmDebugName{nameType, Index, Name}); + if (!HasLinkingSection) + Symbols.emplace_back(Info, GlobalType, TableType, Signature); } break; } diff --git a/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml b/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml new file mode 100644 index 0000000..c730417 --- /dev/null +++ b/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml @@ -0,0 +1,40 @@ +# RUN: yaml2obj %s -o %t.wasm +# RUN: llvm-nm -P %t.wasm | FileCheck %s +# +# Test that names from the linking section override those from the name section +# CHECK: foo T 1 0 +# CHECK-NOT: my_func_local_name + +--- !WASM +FileHeader: + Version: 0x1 +Sections: + - Type: TYPE + Signatures: + - Index: 0 + ParamTypes: [] + ReturnTypes: [] + - Type: FUNCTION + FunctionTypes: [ 0, 0 ] + - Type: CODE + Functions: + - Index: 0 + Locals: + Body: 00 + - Index: 1 + Locals: + Body: 00 + - Type: CUSTOM + Name: linking + Version: 2 + SymbolTable: + - Index: 0 + Kind: FUNCTION + Name: foo + Flags: [ VISIBILITY_HIDDEN ] + Function: 0 + - Type: CUSTOM + Name: name + FunctionNames: + - Index: 1 + Name: my_func_local_name diff --git a/llvm/test/Object/wasm-linked-symbol-table.yaml b/llvm/test/Object/wasm-linked-symbol-table.yaml new file mode 100644 index 0000000..6dd949a --- /dev/null +++ b/llvm/test/Object/wasm-linked-symbol-table.yaml @@ -0,0 +1,75 @@ +# RUN: yaml2obj %s -o %t.wasm +# RUN: llvm-objdump -t %t.wasm | FileCheck %s +# +# CHECK: SYMBOL TABLE: +# CHECK-NEXT: 0000009f g F CODE my_func_export +# CHECK-NEXT: 0000002a g O DATA my_global_export +# CHECK-NEXT: 00000000 g TABLE my_table_export + +--- !WASM +FileHeader: + Version: 0x1 +Sections: + - Type: TYPE + Signatures: + - Index: 
0 + ParamTypes: [] + ReturnTypes: [] + - Type: IMPORT + Imports: + - Module: env + Field: foo + Kind: FUNCTION + SigIndex: 0 + - Module: env + Field: bar + Kind: GLOBAL + GlobalType: I32 + GlobalMutable: true + - Module: env + Field: memory + Kind: MEMORY + Memory: + Minimum: 0x1 + - Type: FUNCTION + FunctionTypes: [ 0 ] + - Type: TABLE + Tables: + - Index: 0 + ElemType: FUNCREF + Limits: + Flags: [ HAS_MAX ] + Minimum: 0x1 + Maximum: 0x1 + - Type: GLOBAL + Globals: + - Index: 1 + Mutable: false + Type: I32 + InitExpr: + Opcode: I32_CONST + Value: 42 + - Type: EXPORT + Exports: + - Name: my_func_export + Kind: FUNCTION + Index: 1 + - Name: my_global_export + Kind: GLOBAL + Index: 1 + - Name: my_table_export + Kind: TABLE + Index: 0 + - Type: CODE + Functions: + - Index: 1 + Locals: + Body: 00 + - Type: DATA + Segments: + - SectionOffset: 0 + InitFlags: 0 + Offset: + Opcode: I32_CONST + Value: 0 + Content: '' diff --git a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml new file mode 100644 index 0000000..622a606 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml @@ -0,0 +1,87 @@ +# RUN: yaml2obj %s -o %t.wasm +# RUN: llvm-objdump -t %t.wasm | FileCheck %s +# +# CHECK: SYMBOL TABLE: +# CHECK-NEXT: 00000000 F *UND* my_func_import_name +# CHECK-NEXT: 00000083 g F CODE my_func_export_name +# CHECK-NEXT: 00000086 l F CODE my_func_local_name +# CHECK-NEXT: 00000000 *UND* my_global_import_name +# CHECK-NEXT: 00000001 g GLOBAL my_global_export_name +# CHECK-NEXT: 00000000 l O DATA my_datasegment_name + +--- !WASM +FileHeader: + Version: 0x1 +Sections: + - Type: TYPE + Signatures: + - Index: 0 + ParamTypes: [] + ReturnTypes: [] + - Type: IMPORT + Imports: + - Module: env + Field: foo + Kind: FUNCTION + SigIndex: 0 + - Module: env + Field: bar + Kind: GLOBAL + GlobalType: I32 + GlobalMutable: true + - Module: env + Field: memory + Kind: MEMORY + Memory: + Minimum: 0x1 + - Type: FUNCTION + FunctionTypes: [ 0, 0 ] + - Type: GLOBAL + Globals: + - Index: 1 + Mutable: false + Type: I32 + InitExpr: + Opcode: I32_CONST + Value: 42 + - Type: EXPORT + Exports: + - Name: my_func_export + Kind: FUNCTION + Index: 1 + - Name: my_global_export + Kind: GLOBAL + Index: 1 + - Type: CODE + Functions: + - Index: 1 + Locals: + Body: 00 + - Index: 2 + Locals: + Body: 00 + - Type: DATA + Segments: + - SectionOffset: 0 + InitFlags: 0 + Offset: + Opcode: I32_CONST + Value: 0 + Content: 'abcd1234' + - Type: CUSTOM + Name: name + FunctionNames: + - Index: 0 + Name: my_func_import_name + - Index: 1 + Name: my_func_export_name + - Index: 2 + Name: my_func_local_name + GlobalNames: + - Index: 0 + Name: my_global_import_name + - Index: 1 + Name: my_global_export_name + DataSegmentNames: + - Index: 0 + Name: my_datasegment_name diff --git a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml deleted file mode 100644 index 6dd949a..0000000 --- a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml +++ /dev/null @@ -1,75 +0,0 @@ -# RUN: yaml2obj %s -o %t.wasm -# RUN: llvm-objdump -t %t.wasm | FileCheck %s -# -# CHECK: SYMBOL TABLE: -# CHECK-NEXT: 0000009f g F CODE my_func_export -# CHECK-NEXT: 0000002a g O DATA my_global_export -# CHECK-NEXT: 00000000 g TABLE my_table_export - ---- !WASM -FileHeader: - Version: 0x1 -Sections: - - Type: TYPE - Signatures: - - Index: 0 - ParamTypes: [] - ReturnTypes: [] - - Type: IMPORT - Imports: - - 
Module: env - Field: foo - Kind: FUNCTION - SigIndex: 0 - - Module: env - Field: bar - Kind: GLOBAL - GlobalType: I32 - GlobalMutable: true - - Module: env - Field: memory - Kind: MEMORY - Memory: - Minimum: 0x1 - - Type: FUNCTION - FunctionTypes: [ 0 ] - - Type: TABLE - Tables: - - Index: 0 - ElemType: FUNCREF - Limits: - Flags: [ HAS_MAX ] - Minimum: 0x1 - Maximum: 0x1 - - Type: GLOBAL - Globals: - - Index: 1 - Mutable: false - Type: I32 - InitExpr: - Opcode: I32_CONST - Value: 42 - - Type: EXPORT - Exports: - - Name: my_func_export - Kind: FUNCTION - Index: 1 - - Name: my_global_export - Kind: GLOBAL - Index: 1 - - Name: my_table_export - Kind: TABLE - Index: 0 - - Type: CODE - Functions: - - Index: 1 - Locals: - Body: 00 - - Type: DATA - Segments: - - SectionOffset: 0 - InitFlags: 0 - Offset: - Opcode: I32_CONST - Value: 0 - Content: '' -- cgit v1.1 From cdde0d9602217eb0bc091b4de16197e6aa5bb132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 8 Feb 2024 16:03:22 +0100 Subject: [clang][Interp][NFC] Make a local variable const --- clang/lib/AST/Interp/Interp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index a76e633..1299a70 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -811,7 +811,7 @@ bool CMP3(InterpState &S, CodePtr OpPC, const ComparisonCategoryInfo *CmpInfo) { const auto *CmpValueInfo = CmpInfo->getValueInfo(CmpResult); assert(CmpValueInfo); assert(CmpValueInfo->hasValidIntValue()); - APSInt IntValue = CmpValueInfo->getIntValue(); + const APSInt &IntValue = CmpValueInfo->getIntValue(); return SetThreeWayComparisonField(S, OpPC, P, IntValue); } -- cgit v1.1 From 7c9c4983b1d493c2fdea76f99591f9ab49877306 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 8 Feb 2024 13:43:11 -0800 Subject: [DWARFLinkerParallel] Fix member initialization order (#81179) DWARFLinkerImpl::DWARFLinkerImpl initializes DebugStrStrings/DebugLineStrStrings/CommonSections using GlobalData but GlobalData is initialized after the three members. Move GlobalData before. Fix #81110 --- llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h index 527c7a0..7c17c5b 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h @@ -351,6 +351,9 @@ protected: /// \defgroup Data members accessed sequentially. /// /// @{ + /// Data global for the whole linking process. + LinkingGlobalData GlobalData; + /// DwarfStringPoolEntries for .debug_str section. StringEntryToDwarfStringPoolEntryMap DebugStrStrings; @@ -368,9 +371,6 @@ protected: /// Overall compile units number. uint64_t OverallNumberOfCU = 0; - - /// Data global for the whole linking process. - LinkingGlobalData GlobalData; /// @} }; -- cgit v1.1 From f78c9b88b7a1a54cf67037f9088a3e48779b1e44 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 8 Feb 2024 13:39:58 -0800 Subject: [RISCV] Use MCPhysReg for AllPopRegs. NFC MCPhysReg is 2 bytes, while Register is 4 bytes. 
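The size difference is easy to check directly. A minimal sketch, assuming the usual LLVM typedefs (`MCPhysReg` is a `uint16_t`, `llvm::Register` wraps an `unsigned`) on a target where `unsigned` is 4 bytes:

```
#include "llvm/CodeGen/Register.h"
#include "llvm/MC/MCRegister.h"

// Each AllPopRegs entry now occupies 2 bytes instead of 4.
static_assert(sizeof(llvm::MCPhysReg) == 2, "MCPhysReg is a uint16_t");
static_assert(sizeof(llvm::Register) == 4, "Register wraps an unsigned");
```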
--- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index e5b5103..b12b497 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -42,7 +42,7 @@ RISCVFrameLowering::RISCVFrameLowering(const RISCVSubtarget &STI) /*TransientStackAlignment=*/Align(16)), STI(STI) {} -static const Register AllPopRegs[] = { +static const MCPhysReg AllPopRegs[] = { RISCV::X1, RISCV::X8, RISCV::X9, RISCV::X18, RISCV::X19, RISCV::X20, RISCV::X21, RISCV::X22, RISCV::X23, RISCV::X24, RISCV::X25, RISCV::X26, RISCV::X27}; -- cgit v1.1 From 4b54b474aa0ffb355faa63cc2d8f95fd321c887f Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Thu, 8 Feb 2024 13:53:18 -0800 Subject: [NVPTX][NFC] cleanup dead vars, use MAKE_CASE (#81161) Cleanup some dead variables. In addition, switch to a `MAKE_CASE` macro, similar to other targets, to reduce boilerplate. --- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 1 - llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 1048 +++++++++++---------------- 2 files changed, 428 insertions(+), 621 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 6c4879b..cdfc288 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1530,7 +1530,6 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { if (isKernelFunction(*F)) { if (isSampler(*I) || isImage(*I)) { if (isImage(*I)) { - std::string sname = std::string(I->getName()); if (isImageWriteOnly(*I) || isImageReadWrite(*I)) { if (hasImageHandles) O << "\t.param .u64 .ptr .surfref "; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 24e0be2..c7bc623 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -858,623 +858,432 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, } const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { + +#define MAKE_CASE(V) \ + case V: \ + return #V; + switch ((NVPTXISD::NodeType)Opcode) { case NVPTXISD::FIRST_NUMBER: break; - case NVPTXISD::CALL: - return "NVPTXISD::CALL"; - case NVPTXISD::RET_GLUE: - return "NVPTXISD::RET_GLUE"; - case NVPTXISD::LOAD_PARAM: - return "NVPTXISD::LOAD_PARAM"; - case NVPTXISD::Wrapper: - return "NVPTXISD::Wrapper"; - case NVPTXISD::DeclareParam: - return "NVPTXISD::DeclareParam"; - case NVPTXISD::DeclareScalarParam: - return "NVPTXISD::DeclareScalarParam"; - case NVPTXISD::DeclareRet: - return "NVPTXISD::DeclareRet"; - case NVPTXISD::DeclareScalarRet: - return "NVPTXISD::DeclareScalarRet"; - case NVPTXISD::DeclareRetParam: - return "NVPTXISD::DeclareRetParam"; - case NVPTXISD::PrintCall: - return "NVPTXISD::PrintCall"; - case NVPTXISD::PrintConvergentCall: - return "NVPTXISD::PrintConvergentCall"; - case NVPTXISD::PrintCallUni: - return "NVPTXISD::PrintCallUni"; - case NVPTXISD::PrintConvergentCallUni: - return "NVPTXISD::PrintConvergentCallUni"; - case NVPTXISD::LoadParam: - return "NVPTXISD::LoadParam"; - case NVPTXISD::LoadParamV2: - return "NVPTXISD::LoadParamV2"; - case NVPTXISD::LoadParamV4: - return "NVPTXISD::LoadParamV4"; - case NVPTXISD::StoreParam: - return "NVPTXISD::StoreParam"; - case NVPTXISD::StoreParamV2: - return "NVPTXISD::StoreParamV2"; - case NVPTXISD::StoreParamV4: - return 
"NVPTXISD::StoreParamV4"; - case NVPTXISD::StoreParamS32: - return "NVPTXISD::StoreParamS32"; - case NVPTXISD::StoreParamU32: - return "NVPTXISD::StoreParamU32"; - case NVPTXISD::CallArgBegin: - return "NVPTXISD::CallArgBegin"; - case NVPTXISD::CallArg: - return "NVPTXISD::CallArg"; - case NVPTXISD::LastCallArg: - return "NVPTXISD::LastCallArg"; - case NVPTXISD::CallArgEnd: - return "NVPTXISD::CallArgEnd"; - case NVPTXISD::CallVoid: - return "NVPTXISD::CallVoid"; - case NVPTXISD::CallVal: - return "NVPTXISD::CallVal"; - case NVPTXISD::CallSymbol: - return "NVPTXISD::CallSymbol"; - case NVPTXISD::Prototype: - return "NVPTXISD::Prototype"; - case NVPTXISD::MoveParam: - return "NVPTXISD::MoveParam"; - case NVPTXISD::StoreRetval: - return "NVPTXISD::StoreRetval"; - case NVPTXISD::StoreRetvalV2: - return "NVPTXISD::StoreRetvalV2"; - case NVPTXISD::StoreRetvalV4: - return "NVPTXISD::StoreRetvalV4"; - case NVPTXISD::PseudoUseParam: - return "NVPTXISD::PseudoUseParam"; - case NVPTXISD::RETURN: - return "NVPTXISD::RETURN"; - case NVPTXISD::CallSeqBegin: - return "NVPTXISD::CallSeqBegin"; - case NVPTXISD::CallSeqEnd: - return "NVPTXISD::CallSeqEnd"; - case NVPTXISD::CallPrototype: - return "NVPTXISD::CallPrototype"; - case NVPTXISD::ProxyReg: - return "NVPTXISD::ProxyReg"; - case NVPTXISD::LoadV2: - return "NVPTXISD::LoadV2"; - case NVPTXISD::LoadV4: - return "NVPTXISD::LoadV4"; - case NVPTXISD::LDGV2: - return "NVPTXISD::LDGV2"; - case NVPTXISD::LDGV4: - return "NVPTXISD::LDGV4"; - case NVPTXISD::LDUV2: - return "NVPTXISD::LDUV2"; - case NVPTXISD::LDUV4: - return "NVPTXISD::LDUV4"; - case NVPTXISD::StoreV2: - return "NVPTXISD::StoreV2"; - case NVPTXISD::StoreV4: - return "NVPTXISD::StoreV4"; - case NVPTXISD::FUN_SHFL_CLAMP: - return "NVPTXISD::FUN_SHFL_CLAMP"; - case NVPTXISD::FUN_SHFR_CLAMP: - return "NVPTXISD::FUN_SHFR_CLAMP"; - case NVPTXISD::IMAD: - return "NVPTXISD::IMAD"; - case NVPTXISD::BFE: - return "NVPTXISD::BFE"; - case NVPTXISD::BFI: - return "NVPTXISD::BFI"; - case NVPTXISD::PRMT: - return "NVPTXISD::PRMT"; - case NVPTXISD::SETP_F16X2: - return "NVPTXISD::SETP_F16X2"; - case NVPTXISD::SETP_BF16X2: - return "NVPTXISD::SETP_BF16X2"; - case NVPTXISD::Dummy: - return "NVPTXISD::Dummy"; - case NVPTXISD::MUL_WIDE_SIGNED: - return "NVPTXISD::MUL_WIDE_SIGNED"; - case NVPTXISD::MUL_WIDE_UNSIGNED: - return "NVPTXISD::MUL_WIDE_UNSIGNED"; - case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32"; - case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; - case NVPTXISD::Tex1DFloatFloatLevel: - return "NVPTXISD::Tex1DFloatFloatLevel"; - case NVPTXISD::Tex1DFloatFloatGrad: - return "NVPTXISD::Tex1DFloatFloatGrad"; - case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32"; - case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float"; - case NVPTXISD::Tex1DS32FloatLevel: - return "NVPTXISD::Tex1DS32FloatLevel"; - case NVPTXISD::Tex1DS32FloatGrad: - return "NVPTXISD::Tex1DS32FloatGrad"; - case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32"; - case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float"; - case NVPTXISD::Tex1DU32FloatLevel: - return "NVPTXISD::Tex1DU32FloatLevel"; - case NVPTXISD::Tex1DU32FloatGrad: - return "NVPTXISD::Tex1DU32FloatGrad"; - case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32"; - case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; - case NVPTXISD::Tex1DArrayFloatFloatLevel: - return "NVPTXISD::Tex1DArrayFloatFloatLevel"; - case NVPTXISD::Tex1DArrayFloatFloatGrad: - return 
"NVPTXISD::Tex1DArrayFloatFloatGrad"; - case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32"; - case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float"; - case NVPTXISD::Tex1DArrayS32FloatLevel: - return "NVPTXISD::Tex1DArrayS32FloatLevel"; - case NVPTXISD::Tex1DArrayS32FloatGrad: - return "NVPTXISD::Tex1DArrayS32FloatGrad"; - case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32"; - case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float"; - case NVPTXISD::Tex1DArrayU32FloatLevel: - return "NVPTXISD::Tex1DArrayU32FloatLevel"; - case NVPTXISD::Tex1DArrayU32FloatGrad: - return "NVPTXISD::Tex1DArrayU32FloatGrad"; - case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32"; - case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; - case NVPTXISD::Tex2DFloatFloatLevel: - return "NVPTXISD::Tex2DFloatFloatLevel"; - case NVPTXISD::Tex2DFloatFloatGrad: - return "NVPTXISD::Tex2DFloatFloatGrad"; - case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32"; - case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float"; - case NVPTXISD::Tex2DS32FloatLevel: - return "NVPTXISD::Tex2DS32FloatLevel"; - case NVPTXISD::Tex2DS32FloatGrad: - return "NVPTXISD::Tex2DS32FloatGrad"; - case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32"; - case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float"; - case NVPTXISD::Tex2DU32FloatLevel: - return "NVPTXISD::Tex2DU32FloatLevel"; - case NVPTXISD::Tex2DU32FloatGrad: - return "NVPTXISD::Tex2DU32FloatGrad"; - case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32"; - case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; - case NVPTXISD::Tex2DArrayFloatFloatLevel: - return "NVPTXISD::Tex2DArrayFloatFloatLevel"; - case NVPTXISD::Tex2DArrayFloatFloatGrad: - return "NVPTXISD::Tex2DArrayFloatFloatGrad"; - case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32"; - case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float"; - case NVPTXISD::Tex2DArrayS32FloatLevel: - return "NVPTXISD::Tex2DArrayS32FloatLevel"; - case NVPTXISD::Tex2DArrayS32FloatGrad: - return "NVPTXISD::Tex2DArrayS32FloatGrad"; - case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32"; - case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float"; - case NVPTXISD::Tex2DArrayU32FloatLevel: - return "NVPTXISD::Tex2DArrayU32FloatLevel"; - case NVPTXISD::Tex2DArrayU32FloatGrad: - return "NVPTXISD::Tex2DArrayU32FloatGrad"; - case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32"; - case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; - case NVPTXISD::Tex3DFloatFloatLevel: - return "NVPTXISD::Tex3DFloatFloatLevel"; - case NVPTXISD::Tex3DFloatFloatGrad: - return "NVPTXISD::Tex3DFloatFloatGrad"; - case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32"; - case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float"; - case NVPTXISD::Tex3DS32FloatLevel: - return "NVPTXISD::Tex3DS32FloatLevel"; - case NVPTXISD::Tex3DS32FloatGrad: - return "NVPTXISD::Tex3DS32FloatGrad"; - case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32"; - case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float"; - case NVPTXISD::Tex3DU32FloatLevel: - return "NVPTXISD::Tex3DU32FloatLevel"; - case NVPTXISD::Tex3DU32FloatGrad: - return "NVPTXISD::Tex3DU32FloatGrad"; - case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat"; - case NVPTXISD::TexCubeFloatFloatLevel: - return "NVPTXISD::TexCubeFloatFloatLevel"; - case 
NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float"; - case NVPTXISD::TexCubeS32FloatLevel: - return "NVPTXISD::TexCubeS32FloatLevel"; - case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float"; - case NVPTXISD::TexCubeU32FloatLevel: - return "NVPTXISD::TexCubeU32FloatLevel"; - case NVPTXISD::TexCubeArrayFloatFloat: - return "NVPTXISD::TexCubeArrayFloatFloat"; - case NVPTXISD::TexCubeArrayFloatFloatLevel: - return "NVPTXISD::TexCubeArrayFloatFloatLevel"; - case NVPTXISD::TexCubeArrayS32Float: - return "NVPTXISD::TexCubeArrayS32Float"; - case NVPTXISD::TexCubeArrayS32FloatLevel: - return "NVPTXISD::TexCubeArrayS32FloatLevel"; - case NVPTXISD::TexCubeArrayU32Float: - return "NVPTXISD::TexCubeArrayU32Float"; - case NVPTXISD::TexCubeArrayU32FloatLevel: - return "NVPTXISD::TexCubeArrayU32FloatLevel"; - case NVPTXISD::Tld4R2DFloatFloat: - return "NVPTXISD::Tld4R2DFloatFloat"; - case NVPTXISD::Tld4G2DFloatFloat: - return "NVPTXISD::Tld4G2DFloatFloat"; - case NVPTXISD::Tld4B2DFloatFloat: - return "NVPTXISD::Tld4B2DFloatFloat"; - case NVPTXISD::Tld4A2DFloatFloat: - return "NVPTXISD::Tld4A2DFloatFloat"; - case NVPTXISD::Tld4R2DS64Float: - return "NVPTXISD::Tld4R2DS64Float"; - case NVPTXISD::Tld4G2DS64Float: - return "NVPTXISD::Tld4G2DS64Float"; - case NVPTXISD::Tld4B2DS64Float: - return "NVPTXISD::Tld4B2DS64Float"; - case NVPTXISD::Tld4A2DS64Float: - return "NVPTXISD::Tld4A2DS64Float"; - case NVPTXISD::Tld4R2DU64Float: - return "NVPTXISD::Tld4R2DU64Float"; - case NVPTXISD::Tld4G2DU64Float: - return "NVPTXISD::Tld4G2DU64Float"; - case NVPTXISD::Tld4B2DU64Float: - return "NVPTXISD::Tld4B2DU64Float"; - case NVPTXISD::Tld4A2DU64Float: - return "NVPTXISD::Tld4A2DU64Float"; - - case NVPTXISD::TexUnified1DFloatS32: - return "NVPTXISD::TexUnified1DFloatS32"; - case NVPTXISD::TexUnified1DFloatFloat: - return "NVPTXISD::TexUnified1DFloatFloat"; - case NVPTXISD::TexUnified1DFloatFloatLevel: - return "NVPTXISD::TexUnified1DFloatFloatLevel"; - case NVPTXISD::TexUnified1DFloatFloatGrad: - return "NVPTXISD::TexUnified1DFloatFloatGrad"; - case NVPTXISD::TexUnified1DS32S32: - return "NVPTXISD::TexUnified1DS32S32"; - case NVPTXISD::TexUnified1DS32Float: - return "NVPTXISD::TexUnified1DS32Float"; - case NVPTXISD::TexUnified1DS32FloatLevel: - return "NVPTXISD::TexUnified1DS32FloatLevel"; - case NVPTXISD::TexUnified1DS32FloatGrad: - return "NVPTXISD::TexUnified1DS32FloatGrad"; - case NVPTXISD::TexUnified1DU32S32: - return "NVPTXISD::TexUnified1DU32S32"; - case NVPTXISD::TexUnified1DU32Float: - return "NVPTXISD::TexUnified1DU32Float"; - case NVPTXISD::TexUnified1DU32FloatLevel: - return "NVPTXISD::TexUnified1DU32FloatLevel"; - case NVPTXISD::TexUnified1DU32FloatGrad: - return "NVPTXISD::TexUnified1DU32FloatGrad"; - case NVPTXISD::TexUnified1DArrayFloatS32: - return "NVPTXISD::TexUnified1DArrayFloatS32"; - case NVPTXISD::TexUnified1DArrayFloatFloat: - return "NVPTXISD::TexUnified1DArrayFloatFloat"; - case NVPTXISD::TexUnified1DArrayFloatFloatLevel: - return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; - case NVPTXISD::TexUnified1DArrayFloatFloatGrad: - return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; - case NVPTXISD::TexUnified1DArrayS32S32: - return "NVPTXISD::TexUnified1DArrayS32S32"; - case NVPTXISD::TexUnified1DArrayS32Float: - return "NVPTXISD::TexUnified1DArrayS32Float"; - case NVPTXISD::TexUnified1DArrayS32FloatLevel: - return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; - case NVPTXISD::TexUnified1DArrayS32FloatGrad: - return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; - case 
NVPTXISD::TexUnified1DArrayU32S32: - return "NVPTXISD::TexUnified1DArrayU32S32"; - case NVPTXISD::TexUnified1DArrayU32Float: - return "NVPTXISD::TexUnified1DArrayU32Float"; - case NVPTXISD::TexUnified1DArrayU32FloatLevel: - return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; - case NVPTXISD::TexUnified1DArrayU32FloatGrad: - return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; - case NVPTXISD::TexUnified2DFloatS32: - return "NVPTXISD::TexUnified2DFloatS32"; - case NVPTXISD::TexUnified2DFloatFloat: - return "NVPTXISD::TexUnified2DFloatFloat"; - case NVPTXISD::TexUnified2DFloatFloatLevel: - return "NVPTXISD::TexUnified2DFloatFloatLevel"; - case NVPTXISD::TexUnified2DFloatFloatGrad: - return "NVPTXISD::TexUnified2DFloatFloatGrad"; - case NVPTXISD::TexUnified2DS32S32: - return "NVPTXISD::TexUnified2DS32S32"; - case NVPTXISD::TexUnified2DS32Float: - return "NVPTXISD::TexUnified2DS32Float"; - case NVPTXISD::TexUnified2DS32FloatLevel: - return "NVPTXISD::TexUnified2DS32FloatLevel"; - case NVPTXISD::TexUnified2DS32FloatGrad: - return "NVPTXISD::TexUnified2DS32FloatGrad"; - case NVPTXISD::TexUnified2DU32S32: - return "NVPTXISD::TexUnified2DU32S32"; - case NVPTXISD::TexUnified2DU32Float: - return "NVPTXISD::TexUnified2DU32Float"; - case NVPTXISD::TexUnified2DU32FloatLevel: - return "NVPTXISD::TexUnified2DU32FloatLevel"; - case NVPTXISD::TexUnified2DU32FloatGrad: - return "NVPTXISD::TexUnified2DU32FloatGrad"; - case NVPTXISD::TexUnified2DArrayFloatS32: - return "NVPTXISD::TexUnified2DArrayFloatS32"; - case NVPTXISD::TexUnified2DArrayFloatFloat: - return "NVPTXISD::TexUnified2DArrayFloatFloat"; - case NVPTXISD::TexUnified2DArrayFloatFloatLevel: - return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; - case NVPTXISD::TexUnified2DArrayFloatFloatGrad: - return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; - case NVPTXISD::TexUnified2DArrayS32S32: - return "NVPTXISD::TexUnified2DArrayS32S32"; - case NVPTXISD::TexUnified2DArrayS32Float: - return "NVPTXISD::TexUnified2DArrayS32Float"; - case NVPTXISD::TexUnified2DArrayS32FloatLevel: - return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; - case NVPTXISD::TexUnified2DArrayS32FloatGrad: - return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; - case NVPTXISD::TexUnified2DArrayU32S32: - return "NVPTXISD::TexUnified2DArrayU32S32"; - case NVPTXISD::TexUnified2DArrayU32Float: - return "NVPTXISD::TexUnified2DArrayU32Float"; - case NVPTXISD::TexUnified2DArrayU32FloatLevel: - return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; - case NVPTXISD::TexUnified2DArrayU32FloatGrad: - return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; - case NVPTXISD::TexUnified3DFloatS32: - return "NVPTXISD::TexUnified3DFloatS32"; - case NVPTXISD::TexUnified3DFloatFloat: - return "NVPTXISD::TexUnified3DFloatFloat"; - case NVPTXISD::TexUnified3DFloatFloatLevel: - return "NVPTXISD::TexUnified3DFloatFloatLevel"; - case NVPTXISD::TexUnified3DFloatFloatGrad: - return "NVPTXISD::TexUnified3DFloatFloatGrad"; - case NVPTXISD::TexUnified3DS32S32: - return "NVPTXISD::TexUnified3DS32S32"; - case NVPTXISD::TexUnified3DS32Float: - return "NVPTXISD::TexUnified3DS32Float"; - case NVPTXISD::TexUnified3DS32FloatLevel: - return "NVPTXISD::TexUnified3DS32FloatLevel"; - case NVPTXISD::TexUnified3DS32FloatGrad: - return "NVPTXISD::TexUnified3DS32FloatGrad"; - case NVPTXISD::TexUnified3DU32S32: - return "NVPTXISD::TexUnified3DU32S32"; - case NVPTXISD::TexUnified3DU32Float: - return "NVPTXISD::TexUnified3DU32Float"; - case NVPTXISD::TexUnified3DU32FloatLevel: - return "NVPTXISD::TexUnified3DU32FloatLevel"; - case 
NVPTXISD::TexUnified3DU32FloatGrad: - return "NVPTXISD::TexUnified3DU32FloatGrad"; - case NVPTXISD::TexUnifiedCubeFloatFloat: - return "NVPTXISD::TexUnifiedCubeFloatFloat"; - case NVPTXISD::TexUnifiedCubeFloatFloatLevel: - return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; - case NVPTXISD::TexUnifiedCubeS32Float: - return "NVPTXISD::TexUnifiedCubeS32Float"; - case NVPTXISD::TexUnifiedCubeS32FloatLevel: - return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; - case NVPTXISD::TexUnifiedCubeU32Float: - return "NVPTXISD::TexUnifiedCubeU32Float"; - case NVPTXISD::TexUnifiedCubeU32FloatLevel: - return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; - case NVPTXISD::TexUnifiedCubeArrayFloatFloat: - return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; - case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: - return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; - case NVPTXISD::TexUnifiedCubeArrayS32Float: - return "NVPTXISD::TexUnifiedCubeArrayS32Float"; - case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: - return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; - case NVPTXISD::TexUnifiedCubeArrayU32Float: - return "NVPTXISD::TexUnifiedCubeArrayU32Float"; - case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: - return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; - case NVPTXISD::TexUnifiedCubeFloatFloatGrad: - return "NVPTXISD::TexUnifiedCubeFloatFloatGrad"; - case NVPTXISD::TexUnifiedCubeS32FloatGrad: - return "NVPTXISD::TexUnifiedCubeS32FloatGrad"; - case NVPTXISD::TexUnifiedCubeU32FloatGrad: - return "NVPTXISD::TexUnifiedCubeU32FloatGrad"; - case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad: - return "NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad"; - case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad: - return "NVPTXISD::TexUnifiedCubeArrayS32FloatGrad"; - case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad: - return "NVPTXISD::TexUnifiedCubeArrayU32FloatGrad"; - case NVPTXISD::Tld4UnifiedR2DFloatFloat: - return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; - case NVPTXISD::Tld4UnifiedG2DFloatFloat: - return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; - case NVPTXISD::Tld4UnifiedB2DFloatFloat: - return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; - case NVPTXISD::Tld4UnifiedA2DFloatFloat: - return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; - case NVPTXISD::Tld4UnifiedR2DS64Float: - return "NVPTXISD::Tld4UnifiedR2DS64Float"; - case NVPTXISD::Tld4UnifiedG2DS64Float: - return "NVPTXISD::Tld4UnifiedG2DS64Float"; - case NVPTXISD::Tld4UnifiedB2DS64Float: - return "NVPTXISD::Tld4UnifiedB2DS64Float"; - case NVPTXISD::Tld4UnifiedA2DS64Float: - return "NVPTXISD::Tld4UnifiedA2DS64Float"; - case NVPTXISD::Tld4UnifiedR2DU64Float: - return "NVPTXISD::Tld4UnifiedR2DU64Float"; - case NVPTXISD::Tld4UnifiedG2DU64Float: - return "NVPTXISD::Tld4UnifiedG2DU64Float"; - case NVPTXISD::Tld4UnifiedB2DU64Float: - return "NVPTXISD::Tld4UnifiedB2DU64Float"; - case NVPTXISD::Tld4UnifiedA2DU64Float: - return "NVPTXISD::Tld4UnifiedA2DU64Float"; - - case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp"; - case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; - case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; - case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; - case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; - case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; - case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp"; - case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; - case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; - case 
NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; - case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; - - case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; - case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; - case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; - case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; - case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; - case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; - case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; - case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; - case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; - case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; - case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; - - case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; - case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; - case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; - case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; - case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; - case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; - case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; - case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp"; - case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; - case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; - case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp"; - - case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; - case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; - case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; - case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; - case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; - case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; - case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; - case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; - case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; - case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; - case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; - - case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; - case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; - case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; - case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; - case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; - case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; - case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp"; - case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; - case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; - case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; - case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; - - case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; - case NVPTXISD::Suld1DI16Trap: return 
"NVPTXISD::Suld1DI16Trap"; - case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; - case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap"; - case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; - case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; - case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; - case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; - case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; - case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; - case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; - - case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; - case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; - case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; - case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; - case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; - case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; - case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; - case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; - case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; - case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; - case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; - - case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; - case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; - case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; - case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap"; - case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; - case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; - case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; - case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; - case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; - case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; - case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; - - case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; - case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; - case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; - case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap"; - case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; - case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; - case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; - case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap"; - case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; - case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; - case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; - - case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; - case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; - case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; - case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap"; - case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; - case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; - case NVPTXISD::Suld3DV2I32Trap: 
return "NVPTXISD::Suld3DV2I32Trap"; - case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap"; - case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; - case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; - case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; - - case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero"; - case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero"; - case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero"; - case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero"; - case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero"; - case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero"; - case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero"; - case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero"; - case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero"; - case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero"; - case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero"; - - case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero"; - case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero"; - case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero"; - case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero"; - case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero"; - case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero"; - case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero"; - case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero"; - case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero"; - case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero"; - case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero"; - - case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero"; - case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero"; - case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero"; - case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero"; - case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero"; - case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero"; - case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero"; - case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero"; - case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero"; - case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero"; - case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero"; - - case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero"; - case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero"; - case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero"; - case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero"; - case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero"; - case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero"; - case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero"; - case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero"; - case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero"; - case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero"; - case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero"; - - case 
NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero"; - case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero"; - case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero"; - case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero"; - case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero"; - case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero"; - case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero"; - case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero"; - case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero"; - case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero"; - case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero"; + + MAKE_CASE(NVPTXISD::CALL) + MAKE_CASE(NVPTXISD::RET_GLUE) + MAKE_CASE(NVPTXISD::LOAD_PARAM) + MAKE_CASE(NVPTXISD::Wrapper) + MAKE_CASE(NVPTXISD::DeclareParam) + MAKE_CASE(NVPTXISD::DeclareScalarParam) + MAKE_CASE(NVPTXISD::DeclareRet) + MAKE_CASE(NVPTXISD::DeclareScalarRet) + MAKE_CASE(NVPTXISD::DeclareRetParam) + MAKE_CASE(NVPTXISD::PrintCall) + MAKE_CASE(NVPTXISD::PrintConvergentCall) + MAKE_CASE(NVPTXISD::PrintCallUni) + MAKE_CASE(NVPTXISD::PrintConvergentCallUni) + MAKE_CASE(NVPTXISD::LoadParam) + MAKE_CASE(NVPTXISD::LoadParamV2) + MAKE_CASE(NVPTXISD::LoadParamV4) + MAKE_CASE(NVPTXISD::StoreParam) + MAKE_CASE(NVPTXISD::StoreParamV2) + MAKE_CASE(NVPTXISD::StoreParamV4) + MAKE_CASE(NVPTXISD::StoreParamS32) + MAKE_CASE(NVPTXISD::StoreParamU32) + MAKE_CASE(NVPTXISD::CallArgBegin) + MAKE_CASE(NVPTXISD::CallArg) + MAKE_CASE(NVPTXISD::LastCallArg) + MAKE_CASE(NVPTXISD::CallArgEnd) + MAKE_CASE(NVPTXISD::CallVoid) + MAKE_CASE(NVPTXISD::CallVal) + MAKE_CASE(NVPTXISD::CallSymbol) + MAKE_CASE(NVPTXISD::Prototype) + MAKE_CASE(NVPTXISD::MoveParam) + MAKE_CASE(NVPTXISD::StoreRetval) + MAKE_CASE(NVPTXISD::StoreRetvalV2) + MAKE_CASE(NVPTXISD::StoreRetvalV4) + MAKE_CASE(NVPTXISD::PseudoUseParam) + MAKE_CASE(NVPTXISD::RETURN) + MAKE_CASE(NVPTXISD::CallSeqBegin) + MAKE_CASE(NVPTXISD::CallSeqEnd) + MAKE_CASE(NVPTXISD::CallPrototype) + MAKE_CASE(NVPTXISD::ProxyReg) + MAKE_CASE(NVPTXISD::LoadV2) + MAKE_CASE(NVPTXISD::LoadV4) + MAKE_CASE(NVPTXISD::LDGV2) + MAKE_CASE(NVPTXISD::LDGV4) + MAKE_CASE(NVPTXISD::LDUV2) + MAKE_CASE(NVPTXISD::LDUV4) + MAKE_CASE(NVPTXISD::StoreV2) + MAKE_CASE(NVPTXISD::StoreV4) + MAKE_CASE(NVPTXISD::FUN_SHFL_CLAMP) + MAKE_CASE(NVPTXISD::FUN_SHFR_CLAMP) + MAKE_CASE(NVPTXISD::IMAD) + MAKE_CASE(NVPTXISD::BFE) + MAKE_CASE(NVPTXISD::BFI) + MAKE_CASE(NVPTXISD::PRMT) + MAKE_CASE(NVPTXISD::SETP_F16X2) + MAKE_CASE(NVPTXISD::SETP_BF16X2) + MAKE_CASE(NVPTXISD::Dummy) + MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED) + MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED) + MAKE_CASE(NVPTXISD::Tex1DFloatS32) + MAKE_CASE(NVPTXISD::Tex1DFloatFloat) + MAKE_CASE(NVPTXISD::Tex1DFloatFloatLevel) + MAKE_CASE(NVPTXISD::Tex1DFloatFloatGrad) + MAKE_CASE(NVPTXISD::Tex1DS32S32) + MAKE_CASE(NVPTXISD::Tex1DS32Float) + MAKE_CASE(NVPTXISD::Tex1DS32FloatLevel) + MAKE_CASE(NVPTXISD::Tex1DS32FloatGrad) + MAKE_CASE(NVPTXISD::Tex1DU32S32) + MAKE_CASE(NVPTXISD::Tex1DU32Float) + MAKE_CASE(NVPTXISD::Tex1DU32FloatLevel) + MAKE_CASE(NVPTXISD::Tex1DU32FloatGrad) + MAKE_CASE(NVPTXISD::Tex1DArrayFloatS32) + MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloat) + MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatLevel) + MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatGrad) + MAKE_CASE(NVPTXISD::Tex1DArrayS32S32) + MAKE_CASE(NVPTXISD::Tex1DArrayS32Float) + MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatLevel) + MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatGrad) + 
MAKE_CASE(NVPTXISD::Tex1DArrayU32S32) + MAKE_CASE(NVPTXISD::Tex1DArrayU32Float) + MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatLevel) + MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatGrad) + MAKE_CASE(NVPTXISD::Tex2DFloatS32) + MAKE_CASE(NVPTXISD::Tex2DFloatFloat) + MAKE_CASE(NVPTXISD::Tex2DFloatFloatLevel) + MAKE_CASE(NVPTXISD::Tex2DFloatFloatGrad) + MAKE_CASE(NVPTXISD::Tex2DS32S32) + MAKE_CASE(NVPTXISD::Tex2DS32Float) + MAKE_CASE(NVPTXISD::Tex2DS32FloatLevel) + MAKE_CASE(NVPTXISD::Tex2DS32FloatGrad) + MAKE_CASE(NVPTXISD::Tex2DU32S32) + MAKE_CASE(NVPTXISD::Tex2DU32Float) + MAKE_CASE(NVPTXISD::Tex2DU32FloatLevel) + MAKE_CASE(NVPTXISD::Tex2DU32FloatGrad) + MAKE_CASE(NVPTXISD::Tex2DArrayFloatS32) + MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloat) + MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatLevel) + MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatGrad) + MAKE_CASE(NVPTXISD::Tex2DArrayS32S32) + MAKE_CASE(NVPTXISD::Tex2DArrayS32Float) + MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatLevel) + MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatGrad) + MAKE_CASE(NVPTXISD::Tex2DArrayU32S32) + MAKE_CASE(NVPTXISD::Tex2DArrayU32Float) + MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatLevel) + MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatGrad) + MAKE_CASE(NVPTXISD::Tex3DFloatS32) + MAKE_CASE(NVPTXISD::Tex3DFloatFloat) + MAKE_CASE(NVPTXISD::Tex3DFloatFloatLevel) + MAKE_CASE(NVPTXISD::Tex3DFloatFloatGrad) + MAKE_CASE(NVPTXISD::Tex3DS32S32) + MAKE_CASE(NVPTXISD::Tex3DS32Float) + MAKE_CASE(NVPTXISD::Tex3DS32FloatLevel) + MAKE_CASE(NVPTXISD::Tex3DS32FloatGrad) + MAKE_CASE(NVPTXISD::Tex3DU32S32) + MAKE_CASE(NVPTXISD::Tex3DU32Float) + MAKE_CASE(NVPTXISD::Tex3DU32FloatLevel) + MAKE_CASE(NVPTXISD::Tex3DU32FloatGrad) + MAKE_CASE(NVPTXISD::TexCubeFloatFloat) + MAKE_CASE(NVPTXISD::TexCubeFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexCubeS32Float) + MAKE_CASE(NVPTXISD::TexCubeS32FloatLevel) + MAKE_CASE(NVPTXISD::TexCubeU32Float) + MAKE_CASE(NVPTXISD::TexCubeU32FloatLevel) + MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloat) + MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexCubeArrayS32Float) + MAKE_CASE(NVPTXISD::TexCubeArrayS32FloatLevel) + MAKE_CASE(NVPTXISD::TexCubeArrayU32Float) + MAKE_CASE(NVPTXISD::TexCubeArrayU32FloatLevel) + MAKE_CASE(NVPTXISD::Tld4R2DFloatFloat) + MAKE_CASE(NVPTXISD::Tld4G2DFloatFloat) + MAKE_CASE(NVPTXISD::Tld4B2DFloatFloat) + MAKE_CASE(NVPTXISD::Tld4A2DFloatFloat) + MAKE_CASE(NVPTXISD::Tld4R2DS64Float) + MAKE_CASE(NVPTXISD::Tld4G2DS64Float) + MAKE_CASE(NVPTXISD::Tld4B2DS64Float) + MAKE_CASE(NVPTXISD::Tld4A2DS64Float) + MAKE_CASE(NVPTXISD::Tld4R2DU64Float) + MAKE_CASE(NVPTXISD::Tld4G2DU64Float) + MAKE_CASE(NVPTXISD::Tld4B2DU64Float) + MAKE_CASE(NVPTXISD::Tld4A2DU64Float) + + MAKE_CASE(NVPTXISD::TexUnified1DFloatS32) + MAKE_CASE(NVPTXISD::TexUnified1DFloatFloat) + MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatGrad) + MAKE_CASE(NVPTXISD::TexUnified1DS32S32) + MAKE_CASE(NVPTXISD::TexUnified1DS32Float) + MAKE_CASE(NVPTXISD::TexUnified1DS32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified1DS32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified1DU32S32) + MAKE_CASE(NVPTXISD::TexUnified1DU32Float) + MAKE_CASE(NVPTXISD::TexUnified1DU32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified1DU32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatS32) + MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloat) + MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatGrad) + MAKE_CASE(NVPTXISD::TexUnified1DArrayS32S32) + MAKE_CASE(NVPTXISD::TexUnified1DArrayS32Float) + 
MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified1DArrayU32S32) + MAKE_CASE(NVPTXISD::TexUnified1DArrayU32Float) + MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified2DFloatS32) + MAKE_CASE(NVPTXISD::TexUnified2DFloatFloat) + MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatGrad) + MAKE_CASE(NVPTXISD::TexUnified2DS32S32) + MAKE_CASE(NVPTXISD::TexUnified2DS32Float) + MAKE_CASE(NVPTXISD::TexUnified2DS32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified2DS32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified2DU32S32) + MAKE_CASE(NVPTXISD::TexUnified2DU32Float) + MAKE_CASE(NVPTXISD::TexUnified2DU32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified2DU32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatS32) + MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloat) + MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatGrad) + MAKE_CASE(NVPTXISD::TexUnified2DArrayS32S32) + MAKE_CASE(NVPTXISD::TexUnified2DArrayS32Float) + MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified2DArrayU32S32) + MAKE_CASE(NVPTXISD::TexUnified2DArrayU32Float) + MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified3DFloatS32) + MAKE_CASE(NVPTXISD::TexUnified3DFloatFloat) + MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatGrad) + MAKE_CASE(NVPTXISD::TexUnified3DS32S32) + MAKE_CASE(NVPTXISD::TexUnified3DS32Float) + MAKE_CASE(NVPTXISD::TexUnified3DS32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified3DS32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified3DU32S32) + MAKE_CASE(NVPTXISD::TexUnified3DU32Float) + MAKE_CASE(NVPTXISD::TexUnified3DU32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified3DU32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloat) + MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexUnifiedCubeS32Float) + MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnifiedCubeU32Float) + MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloat) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32Float) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32Float) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatGrad) + MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatGrad) + MAKE_CASE(NVPTXISD::Tld4UnifiedR2DFloatFloat) + MAKE_CASE(NVPTXISD::Tld4UnifiedG2DFloatFloat) + MAKE_CASE(NVPTXISD::Tld4UnifiedB2DFloatFloat) + MAKE_CASE(NVPTXISD::Tld4UnifiedA2DFloatFloat) + MAKE_CASE(NVPTXISD::Tld4UnifiedR2DS64Float) + MAKE_CASE(NVPTXISD::Tld4UnifiedG2DS64Float) + MAKE_CASE(NVPTXISD::Tld4UnifiedB2DS64Float) + MAKE_CASE(NVPTXISD::Tld4UnifiedA2DS64Float) + MAKE_CASE(NVPTXISD::Tld4UnifiedR2DU64Float) + MAKE_CASE(NVPTXISD::Tld4UnifiedG2DU64Float) + MAKE_CASE(NVPTXISD::Tld4UnifiedB2DU64Float) + MAKE_CASE(NVPTXISD::Tld4UnifiedA2DU64Float) + + 
MAKE_CASE(NVPTXISD::Suld1DI8Clamp) + MAKE_CASE(NVPTXISD::Suld1DI16Clamp) + MAKE_CASE(NVPTXISD::Suld1DI32Clamp) + MAKE_CASE(NVPTXISD::Suld1DI64Clamp) + MAKE_CASE(NVPTXISD::Suld1DV2I8Clamp) + MAKE_CASE(NVPTXISD::Suld1DV2I16Clamp) + MAKE_CASE(NVPTXISD::Suld1DV2I32Clamp) + MAKE_CASE(NVPTXISD::Suld1DV2I64Clamp) + MAKE_CASE(NVPTXISD::Suld1DV4I8Clamp) + MAKE_CASE(NVPTXISD::Suld1DV4I16Clamp) + MAKE_CASE(NVPTXISD::Suld1DV4I32Clamp) + + MAKE_CASE(NVPTXISD::Suld1DArrayI8Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayI16Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayI32Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayI64Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Clamp) + + MAKE_CASE(NVPTXISD::Suld2DI8Clamp) + MAKE_CASE(NVPTXISD::Suld2DI16Clamp) + MAKE_CASE(NVPTXISD::Suld2DI32Clamp) + MAKE_CASE(NVPTXISD::Suld2DI64Clamp) + MAKE_CASE(NVPTXISD::Suld2DV2I8Clamp) + MAKE_CASE(NVPTXISD::Suld2DV2I16Clamp) + MAKE_CASE(NVPTXISD::Suld2DV2I32Clamp) + MAKE_CASE(NVPTXISD::Suld2DV2I64Clamp) + MAKE_CASE(NVPTXISD::Suld2DV4I8Clamp) + MAKE_CASE(NVPTXISD::Suld2DV4I16Clamp) + MAKE_CASE(NVPTXISD::Suld2DV4I32Clamp) + + MAKE_CASE(NVPTXISD::Suld2DArrayI8Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayI16Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayI32Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayI64Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Clamp) + + MAKE_CASE(NVPTXISD::Suld3DI8Clamp) + MAKE_CASE(NVPTXISD::Suld3DI16Clamp) + MAKE_CASE(NVPTXISD::Suld3DI32Clamp) + MAKE_CASE(NVPTXISD::Suld3DI64Clamp) + MAKE_CASE(NVPTXISD::Suld3DV2I8Clamp) + MAKE_CASE(NVPTXISD::Suld3DV2I16Clamp) + MAKE_CASE(NVPTXISD::Suld3DV2I32Clamp) + MAKE_CASE(NVPTXISD::Suld3DV2I64Clamp) + MAKE_CASE(NVPTXISD::Suld3DV4I8Clamp) + MAKE_CASE(NVPTXISD::Suld3DV4I16Clamp) + MAKE_CASE(NVPTXISD::Suld3DV4I32Clamp) + + MAKE_CASE(NVPTXISD::Suld1DI8Trap) + MAKE_CASE(NVPTXISD::Suld1DI16Trap) + MAKE_CASE(NVPTXISD::Suld1DI32Trap) + MAKE_CASE(NVPTXISD::Suld1DI64Trap) + MAKE_CASE(NVPTXISD::Suld1DV2I8Trap) + MAKE_CASE(NVPTXISD::Suld1DV2I16Trap) + MAKE_CASE(NVPTXISD::Suld1DV2I32Trap) + MAKE_CASE(NVPTXISD::Suld1DV2I64Trap) + MAKE_CASE(NVPTXISD::Suld1DV4I8Trap) + MAKE_CASE(NVPTXISD::Suld1DV4I16Trap) + MAKE_CASE(NVPTXISD::Suld1DV4I32Trap) + + MAKE_CASE(NVPTXISD::Suld1DArrayI8Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayI16Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayI32Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayI64Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Trap) + + MAKE_CASE(NVPTXISD::Suld2DI8Trap) + MAKE_CASE(NVPTXISD::Suld2DI16Trap) + MAKE_CASE(NVPTXISD::Suld2DI32Trap) + MAKE_CASE(NVPTXISD::Suld2DI64Trap) + MAKE_CASE(NVPTXISD::Suld2DV2I8Trap) + MAKE_CASE(NVPTXISD::Suld2DV2I16Trap) + MAKE_CASE(NVPTXISD::Suld2DV2I32Trap) + MAKE_CASE(NVPTXISD::Suld2DV2I64Trap) + MAKE_CASE(NVPTXISD::Suld2DV4I8Trap) + MAKE_CASE(NVPTXISD::Suld2DV4I16Trap) + 
MAKE_CASE(NVPTXISD::Suld2DV4I32Trap) + + MAKE_CASE(NVPTXISD::Suld2DArrayI8Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayI16Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayI32Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayI64Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Trap) + + MAKE_CASE(NVPTXISD::Suld3DI8Trap) + MAKE_CASE(NVPTXISD::Suld3DI16Trap) + MAKE_CASE(NVPTXISD::Suld3DI32Trap) + MAKE_CASE(NVPTXISD::Suld3DI64Trap) + MAKE_CASE(NVPTXISD::Suld3DV2I8Trap) + MAKE_CASE(NVPTXISD::Suld3DV2I16Trap) + MAKE_CASE(NVPTXISD::Suld3DV2I32Trap) + MAKE_CASE(NVPTXISD::Suld3DV2I64Trap) + MAKE_CASE(NVPTXISD::Suld3DV4I8Trap) + MAKE_CASE(NVPTXISD::Suld3DV4I16Trap) + MAKE_CASE(NVPTXISD::Suld3DV4I32Trap) + + MAKE_CASE(NVPTXISD::Suld1DI8Zero) + MAKE_CASE(NVPTXISD::Suld1DI16Zero) + MAKE_CASE(NVPTXISD::Suld1DI32Zero) + MAKE_CASE(NVPTXISD::Suld1DI64Zero) + MAKE_CASE(NVPTXISD::Suld1DV2I8Zero) + MAKE_CASE(NVPTXISD::Suld1DV2I16Zero) + MAKE_CASE(NVPTXISD::Suld1DV2I32Zero) + MAKE_CASE(NVPTXISD::Suld1DV2I64Zero) + MAKE_CASE(NVPTXISD::Suld1DV4I8Zero) + MAKE_CASE(NVPTXISD::Suld1DV4I16Zero) + MAKE_CASE(NVPTXISD::Suld1DV4I32Zero) + + MAKE_CASE(NVPTXISD::Suld1DArrayI8Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayI16Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayI32Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayI64Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Zero) + + MAKE_CASE(NVPTXISD::Suld2DI8Zero) + MAKE_CASE(NVPTXISD::Suld2DI16Zero) + MAKE_CASE(NVPTXISD::Suld2DI32Zero) + MAKE_CASE(NVPTXISD::Suld2DI64Zero) + MAKE_CASE(NVPTXISD::Suld2DV2I8Zero) + MAKE_CASE(NVPTXISD::Suld2DV2I16Zero) + MAKE_CASE(NVPTXISD::Suld2DV2I32Zero) + MAKE_CASE(NVPTXISD::Suld2DV2I64Zero) + MAKE_CASE(NVPTXISD::Suld2DV4I8Zero) + MAKE_CASE(NVPTXISD::Suld2DV4I16Zero) + MAKE_CASE(NVPTXISD::Suld2DV4I32Zero) + + MAKE_CASE(NVPTXISD::Suld2DArrayI8Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayI16Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayI32Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayI64Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Zero) + + MAKE_CASE(NVPTXISD::Suld3DI8Zero) + MAKE_CASE(NVPTXISD::Suld3DI16Zero) + MAKE_CASE(NVPTXISD::Suld3DI32Zero) + MAKE_CASE(NVPTXISD::Suld3DI64Zero) + MAKE_CASE(NVPTXISD::Suld3DV2I8Zero) + MAKE_CASE(NVPTXISD::Suld3DV2I16Zero) + MAKE_CASE(NVPTXISD::Suld3DV2I32Zero) + MAKE_CASE(NVPTXISD::Suld3DV2I64Zero) + MAKE_CASE(NVPTXISD::Suld3DV4I8Zero) + MAKE_CASE(NVPTXISD::Suld3DV4I16Zero) + MAKE_CASE(NVPTXISD::Suld3DV4I32Zero) } return nullptr; + +#undef MAKE_CASE } TargetLoweringBase::LegalizeTypeAction @@ -3070,8 +2879,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // See similar issue in LowerCall. 
unsigned InsIdx = 0; - int idx = 0; - for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) { + for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) { Type *Ty = argTypes[i]; if (theArgs[i]->use_empty()) { @@ -3107,10 +2915,10 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( continue; } - // In the following cases, assign a node order of "idx+1" + // In the following cases, assign a node order of "i+1" // to newly created nodes. The SDNodes for params have to // appear in the same order as their order of appearance - // in the original function. "idx+1" holds that order. + // in the original function. "i+1" holds that order. if (!PAL.hasParamAttr(i, Attribute::ByVal)) { bool aggregateIsPacked = false; if (StructType *STy = dyn_cast<StructType>(Ty)) @@ -3125,7 +2933,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty)); - SDValue Arg = getParamSymbol(DAG, idx, PtrVT); + SDValue Arg = getParamSymbol(DAG, i, PtrVT); int VecIdx = -1; // Index of the first element of the current vector. for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { if (VectorInfo[parti] & PVF_FIRST) { @@ -3159,7 +2967,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); if (P.getNode()) - P.getNode()->setIROrder(idx + 1); + P.getNode()->setIROrder(i + 1); for (unsigned j = 0; j < NumElts; ++j) { SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, DAG.getIntPtrConstant(j, dl)); @@ -3208,10 +3016,10 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( EVT ObjectVT = getValueType(DL, Ty); assert(ObjectVT == Ins[InsIdx].VT && "Ins type did not match function type"); - SDValue Arg = getParamSymbol(DAG, idx, PtrVT); + SDValue Arg = getParamSymbol(DAG, i, PtrVT); SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); if (p.getNode()) - p.getNode()->setIROrder(idx + 1); + p.getNode()->setIROrder(i + 1); InVals.push_back(p); } -- cgit v1.1 From 0572dabb71147fdc156d90a3ecd036d1652c2006 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 8 Feb 2024 21:56:57 +0000 Subject: [gn build] Add IntrinsicsSPIRV.h generator This was missing in the gn build for some reason, causing build errors like http://45.33.8.238/linux/130337/step_4.txt after 3b57b647. 
--- llvm/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn index a594d2a..87e5860 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn @@ -85,6 +85,10 @@ gen_arch_intrinsics("IntrinsicsS390") { intrinsic_prefix = "s390" } +gen_arch_intrinsics("IntrinsicsSPIRV") { + intrinsic_prefix = "spv" +} + gen_arch_intrinsics("IntrinsicsVE") { intrinsic_prefix = "ve" } @@ -128,6 +132,7 @@ group("public_tablegen") { ":IntrinsicsR600", ":IntrinsicsRISCV", ":IntrinsicsS390", + ":IntrinsicsSPIRV", ":IntrinsicsVE", ":IntrinsicsWebAssembly", ":IntrinsicsX86", -- cgit v1.1 From a6f42adf9ba03d69e8bf2eaf5af2e7f8f6294b37 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Fri, 9 Feb 2024 07:17:16 +0900 Subject: [Bazel] Update for SPIRV --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index f720c18..6b947d4 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -772,6 +772,10 @@ llvm_target_intrinsics_list = [ "intrinsic_prefix": "s390", }, { + "name": "SPIRV", + "intrinsic_prefix": "spv", + }, + { "name": "VE", "intrinsic_prefix": "ve", }, -- cgit v1.1 From 7fe97f042036407a124bf1646a3f1124ddac3de5 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 8 Feb 2024 14:21:49 -0800 Subject: [BOLT] Always run CheckLargeFunctions in non-relocation mode (#80922) We run CheckLargeFunctions pass in non-relocation mode to prevent the emission of functions that later could not be written to the output due to their large size. The main reason behind the pass is to prevent the emission of metadata for such functions since this metadata becomes incorrect if the function is left unmodified. Currently, the pass is enabled in non-relocation mode only when debug info output is also enabled. As we emit increasingly more kinds of metadata, e.g. for the Linux kernel, it becomes more challenging to track metadata that needs to be fixed. Hence, I'm enabling the pass to always run in non-relocation mode. --- bolt/lib/Passes/BinaryPasses.cpp | 5 +---- bolt/lib/Rewrite/RewriteInstance.cpp | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index bcb1227..08dce2f 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -554,11 +554,8 @@ void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) { if (BC.HasRelocations) return; - if (!opts::UpdateDebugSections) - return; - // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit - // incorrect debug info. + // incorrect meta data. 
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { uint64_t HotSize, ColdSize; std::tie(HotSize, ColdSize) = diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 9a242d9..c909e31 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -3631,6 +3631,7 @@ void RewriteInstance::mapCodeSections(BOLTLinker::SectionMapper MapSection) { Function.setImageAddress(FuncSection->getAllocAddress()); Function.setImageSize(FuncSection->getOutputSize()); if (Function.getImageSize() > Function.getMaxSize()) { + assert(!BC->isX86() && "Unexpected large function."); TooLarge = true; FailedAddresses.emplace_back(Function.getAddress()); } @@ -5367,6 +5368,7 @@ void RewriteInstance::rewriteFile() { continue; if (Function->getImageSize() > Function->getMaxSize()) { + assert(!BC->isX86() && "Unexpected large function."); if (opts::Verbosity >= 1) errs() << "BOLT-WARNING: new function size (0x" << Twine::utohexstr(Function->getImageSize()) -- cgit v1.1 From 7b5a9bb8f0f58b188655252f74b0941512e44389 Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Thu, 8 Feb 2024 14:22:24 -0800 Subject: [github/CODEOWNERS] Add Alexander as BOLT reviewer. --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 767f58e..3fe0cbb 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -103,4 +103,4 @@ /mlir/**/*SparseTensor*/ @aartbik @PeimingLiu @yinying-lisa-li @matthias-springer # BOLT -/bolt/ @aaupov @maksfb @rafaelauler @dcci +/bolt/ @aaupov @maksfb @rafaelauler @ayermolo @dcci -- cgit v1.1 From 3c42e10afdc518f6d8be5620289ef0da0bf03c5f Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Thu, 8 Feb 2024 14:27:14 -0800 Subject: Consider aggregate bases when checking if an InitListExpr is constant (#80519) This code was correct as written prior to C++17, which allowed bases to appear in the initializer list. This was observable by creating non-constant aggregate initialization at file scope in a compound literal, but since that behavior will change soon if we implement support for dynamic initialization, I also added a unit test for `isConstantInitializer`. This fixes at least one part of issue #80510. --------- Co-authored-by: Aaron Ballman --- clang/docs/ReleaseNotes.rst | 4 ++ clang/lib/AST/Expr.cpp | 19 +++++++++ clang/test/SemaCXX/compound-literal.cpp | 21 ++++++++++ clang/unittests/AST/ASTExprTest.cpp | 68 +++++++++++++++++++++++++++++---- 4 files changed, 104 insertions(+), 8 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 32440ee..df3ad20 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -217,6 +217,10 @@ Bug Fixes to C++ Support Fixes (`#80971 ICE when explicit object parameter be a function parameter pack`) - Fixed a bug where abbreviated function templates would append their invented template parameters to an empty template parameter lists. +- Clang now classifies aggregate initialization in C++17 and newer as constant + or non-constant more accurately. Previously, only a subset of the initializer + elements were considered, misclassifying some initializers as constant. Fixes + some of (`#80510 <https://github.com/llvm/llvm-project/issues/80510>`_). 
Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index d665a08..8b10e28 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -3328,6 +3328,12 @@ bool Expr::isConstantInitializer(ASTContext &Ctx, bool IsForRef, DIUE->getUpdater()->isConstantInitializer(Ctx, false, Culprit); } case InitListExprClass: { + // C++ [dcl.init.aggr]p2: + // The elements of an aggregate are: + // - for an array, the array elements in increasing subscript order, or + // - for a class, the direct base classes in declaration order, followed + // by the direct non-static data members (11.4) that are not members of + // an anonymous union, in declaration order. const InitListExpr *ILE = cast<InitListExpr>(this); assert(ILE->isSemanticForm() && "InitListExpr must be in semantic form"); if (ILE->getType()->isArrayType()) { @@ -3342,6 +3348,19 @@ bool Expr::isConstantInitializer(ASTContext &Ctx, bool IsForRef, if (ILE->getType()->isRecordType()) { unsigned ElementNo = 0; RecordDecl *RD = ILE->getType()->castAs<RecordType>()->getDecl(); + + // In C++17, bases were added to the list of members used by aggregate + // initialization. + if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) { + for (unsigned i = 0, e = CXXRD->getNumBases(); i < e; i++) { + if (ElementNo < ILE->getNumInits()) { + const Expr *Elt = ILE->getInit(ElementNo++); + if (!Elt->isConstantInitializer(Ctx, false, Culprit)) + return false; + } + } + } + for (const auto *Field : RD->fields()) { // If this is a union, skip all the fields that aren't being initialized. if (RD->isUnion() && ILE->getInitializedFieldInUnion() != Field) diff --git a/clang/test/SemaCXX/compound-literal.cpp b/clang/test/SemaCXX/compound-literal.cpp index 5957099..a3d3b9f 100644 --- a/clang/test/SemaCXX/compound-literal.cpp +++ b/clang/test/SemaCXX/compound-literal.cpp @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify -ast-dump %s > %t-11 // RUN: FileCheck --input-file=%t-11 %s // RUN: FileCheck --input-file=%t-11 %s --check-prefix=CHECK-CXX11 +// RUN: %clang_cc1 -verify -std=c++17 %s // http://llvm.org/PR7905 namespace PR7905 { @@ -108,3 +109,23 @@ int computed_with_lambda = [] { return result; }(); #endif + +namespace DynamicFileScopeLiteral { +// This covers the case where we have a file-scope compound literal with a +// non-constant initializer in C++. Previously, we had a bug where Clang forgot +// to consider initializer list elements for bases.
+struct Empty {}; +struct Foo : Empty { // expected-note 0+ {{candidate constructor}} + int x; + int y; +}; +int f(); +#if __cplusplus < 201103L +// expected-error@+6 {{non-aggregate type 'Foo' cannot be initialized with an initializer list}} +#elif __cplusplus < 201703L +// expected-error@+4 {{no matching constructor}} +#else +// expected-error@+2 {{initializer element is not a compile-time constant}} +#endif +Foo o = (Foo){ {}, 1, f() }; +} diff --git a/clang/unittests/AST/ASTExprTest.cpp b/clang/unittests/AST/ASTExprTest.cpp index ec75492..5ec6aea 100644 --- a/clang/unittests/AST/ASTExprTest.cpp +++ b/clang/unittests/AST/ASTExprTest.cpp @@ -20,17 +20,37 @@ using namespace clang; +using clang::ast_matchers::cxxRecordDecl; +using clang::ast_matchers::hasName; +using clang::ast_matchers::match; +using clang::ast_matchers::varDecl; +using clang::tooling::buildASTFromCode; + +static IntegerLiteral *createIntLiteral(ASTContext &Ctx, uint32_t Value) { + const int numBits = 32; + return IntegerLiteral::Create(Ctx, llvm::APInt(numBits, Value), Ctx.IntTy, + {}); +} + +const CXXRecordDecl *getCXXRecordDeclNode(ASTUnit *AST, + const std::string &Name) { + auto Result = + match(cxxRecordDecl(hasName(Name)).bind("record"), AST->getASTContext()); + EXPECT_FALSE(Result.empty()); + return Result[0].getNodeAs<CXXRecordDecl>("record"); +} + +const VarDecl *getVariableNode(ASTUnit *AST, const std::string &Name) { + auto Result = match(varDecl(hasName(Name)).bind("var"), AST->getASTContext()); + EXPECT_EQ(Result.size(), 1u); + return Result[0].getNodeAs<VarDecl>("var"); +} + TEST(ASTExpr, IgnoreExprCallbackForwarded) { constexpr char Code[] = ""; auto AST = tooling::buildASTFromCodeWithArgs(Code, /*Args=*/{"-std=c++20"}); ASTContext &Ctx = AST->getASTContext(); - auto createIntLiteral = [&](uint32_t Value) -> IntegerLiteral * { - const int numBits = 32; - return IntegerLiteral::Create(Ctx, llvm::APInt(numBits, Value), - Ctx.UnsignedIntTy, {}); - }; - struct IgnoreParens { Expr *operator()(Expr *E) & { return nullptr; } Expr *operator()(Expr *E) && { @@ -42,7 +62,7 @@ TEST(ASTExpr, IgnoreExprCallbackForwarded) { { - auto *IntExpr = createIntLiteral(10); + auto *IntExpr = createIntLiteral(Ctx, 10); ParenExpr *PE = new (Ctx) ParenExpr(SourceLocation{}, SourceLocation{}, IntExpr); EXPECT_EQ(IntExpr, IgnoreExprNodes(PE, IgnoreParens{})); @@ -50,9 +70,41 @@ TEST(ASTExpr, IgnoreExprCallbackForwarded) { { IgnoreParens CB{}; - auto *IntExpr = createIntLiteral(10); + auto *IntExpr = createIntLiteral(Ctx, 10); ParenExpr *PE = new (Ctx) ParenExpr(SourceLocation{}, SourceLocation{}, IntExpr); EXPECT_EQ(nullptr, IgnoreExprNodes(PE, CB)); } } + +TEST(ASTExpr, InitListIsConstantInitialized) { + auto AST = buildASTFromCode(R"cpp( + struct Empty {}; + struct Foo : Empty { int x, y; }; + int gv; + )cpp"); + ASTContext &Ctx = AST->getASTContext(); + const CXXRecordDecl *Empty = getCXXRecordDeclNode(AST.get(), "Empty"); + const CXXRecordDecl *Foo = getCXXRecordDeclNode(AST.get(), "Foo"); + + SourceLocation Loc{}; + InitListExpr *BaseInit = new (Ctx) InitListExpr(Ctx, Loc, {}, Loc); + BaseInit->setType(Ctx.getRecordType(Empty)); + Expr *Exprs[3] = { + BaseInit, + createIntLiteral(Ctx, 13), + createIntLiteral(Ctx, 42), + }; + InitListExpr *FooInit = new (Ctx) InitListExpr(Ctx, Loc, Exprs, Loc); + FooInit->setType(Ctx.getRecordType(Foo)); + EXPECT_TRUE(FooInit->isConstantInitializer(Ctx, false)); + + // Replace the last initializer with something non-constant and make sure + // this returns false.
Previously we had a bug where we didn't count base + // initializers, and only iterated over fields. + const VarDecl *GV = getVariableNode(AST.get(), "gv"); + auto *Ref = new (Ctx) DeclRefExpr(Ctx, const_cast<VarDecl *>(GV), false, + Ctx.IntTy, VK_LValue, Loc); + (void)FooInit->updateInit(Ctx, 2, Ref); + EXPECT_FALSE(FooInit->isConstantInitializer(Ctx, false)); +} -- cgit v1.1 From 06c89bd59ca2279f76a41e851b7b2df634a6191e Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 9 Feb 2024 06:51:11 +0800 Subject: [RISCV] Check type is legal before combining mgather to vlse intrinsic (#81107) Otherwise we will crash since target intrinsics don't have their types legalized. Let the mgather get legalized first, then do the combine on the legal type. Fixes #81088 Co-authored-by: Craig Topper --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- .../RISCV/rvv/fixed-vectors-masked-gather.ll | 448 +++++++++++++++++++++ 2 files changed, 449 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a62610b..12c0cd5 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -15833,7 +15833,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, MGN->getMemOperand(), IndexType, MGN->getExtensionType()); if (Index.getOpcode() == ISD::BUILD_VECTOR && - MGN->getExtensionType() == ISD::NON_EXTLOAD) { + MGN->getExtensionType() == ISD::NON_EXTLOAD && isTypeLegal(VT)) { if (std::optional<VIDSequence> SimpleVID = isSimpleVIDSequence(Index); SimpleVID && SimpleVID->StepDenominator == 1) { const int64_t StepNumerator = SimpleVID->StepNumerator; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index df41ac1..890707c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -14638,5 +14638,453 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) { %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %allones, <8 x i16> poison) ret <8 x i16> %v } + +; v32i64 is not a legal type, so make sure we don't try to combine the mgather +; to a vlse intrinsic until it is legalized and split.
+define <32 x i64> @mgather_strided_split(ptr %base) { +; RV32V-LABEL: mgather_strided_split: +; RV32V: # %bb.0: +; RV32V-NEXT: li a1, 16 +; RV32V-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32V-NEXT: vlse64.v v8, (a0), a1 +; RV32V-NEXT: addi a0, a0, 256 +; RV32V-NEXT: vlse64.v v16, (a0), a1 +; RV32V-NEXT: ret +; +; RV64V-LABEL: mgather_strided_split: +; RV64V: # %bb.0: +; RV64V-NEXT: li a1, 16 +; RV64V-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64V-NEXT: vlse64.v v8, (a0), a1 +; RV64V-NEXT: addi a0, a0, 256 +; RV64V-NEXT: vlse64.v v16, (a0), a1 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_strided_split: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -512 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 512 +; RV32ZVE32F-NEXT: sw ra, 508(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s0, 504(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 500(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s3, 496(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s4, 492(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s5, 488(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s6, 484(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s7, 480(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s8, 476(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s9, 472(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s10, 468(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s11, 464(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset ra, -4 +; RV32ZVE32F-NEXT: .cfi_offset s0, -8 +; RV32ZVE32F-NEXT: .cfi_offset s2, -12 +; RV32ZVE32F-NEXT: .cfi_offset s3, -16 +; RV32ZVE32F-NEXT: .cfi_offset s4, -20 +; RV32ZVE32F-NEXT: .cfi_offset s5, -24 +; RV32ZVE32F-NEXT: .cfi_offset s6, -28 +; RV32ZVE32F-NEXT: .cfi_offset s7, -32 +; RV32ZVE32F-NEXT: .cfi_offset s8, -36 +; RV32ZVE32F-NEXT: .cfi_offset s9, -40 +; RV32ZVE32F-NEXT: .cfi_offset s10, -44 +; RV32ZVE32F-NEXT: .cfi_offset s11, -48 +; RV32ZVE32F-NEXT: addi s0, sp, 512 +; RV32ZVE32F-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVE32F-NEXT: andi sp, sp, -128 +; RV32ZVE32F-NEXT: li a2, 32 +; RV32ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32ZVE32F-NEXT: vid.v v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 4 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 216(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 208(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 252(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 248(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 244(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 236(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 228(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 220(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 240(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 232(sp) # 4-byte 
Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 224(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 212(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 204(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 200(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 196(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 192(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: addi a1, sp, 256 +; RV32ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32ZVE32F-NEXT: vse32.v v8, (a1) +; RV32ZVE32F-NEXT: lw a1, 288(sp) +; RV32ZVE32F-NEXT: lw a2, 292(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 188(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 184(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 296(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 180(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 176(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 300(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 172(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 168(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 304(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 164(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 160(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 308(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 156(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 152(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 312(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 148(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 144(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 316(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 140(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 136(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 320(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 132(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 128(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 324(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 124(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 120(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 328(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 116(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 112(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 332(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 104(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw ra, 4(a1) +; RV32ZVE32F-NEXT: lw a1, 336(sp) +; RV32ZVE32F-NEXT: lw s10, 0(a2) +; RV32ZVE32F-NEXT: lw s8, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 340(sp) +; RV32ZVE32F-NEXT: lw s6, 0(a1) +; RV32ZVE32F-NEXT: lw s4, 4(a1) +; RV32ZVE32F-NEXT: lw a4, 344(sp) +; RV32ZVE32F-NEXT: lw s2, 0(a2) +; 
RV32ZVE32F-NEXT: lw t5, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 348(sp) +; RV32ZVE32F-NEXT: lw t3, 0(a4) +; RV32ZVE32F-NEXT: lw t2, 4(a4) +; RV32ZVE32F-NEXT: lw a4, 352(sp) +; RV32ZVE32F-NEXT: lw t0, 0(a2) +; RV32ZVE32F-NEXT: lw a7, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 356(sp) +; RV32ZVE32F-NEXT: lw a6, 0(a4) +; RV32ZVE32F-NEXT: lw a5, 4(a4) +; RV32ZVE32F-NEXT: lw a4, 360(sp) +; RV32ZVE32F-NEXT: lw a1, 0(a2) +; RV32ZVE32F-NEXT: sw a1, 108(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a2) +; RV32ZVE32F-NEXT: sw a1, 100(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 364(sp) +; RV32ZVE32F-NEXT: lw s11, 0(a4) +; RV32ZVE32F-NEXT: lw s9, 4(a4) +; RV32ZVE32F-NEXT: lw a1, 368(sp) +; RV32ZVE32F-NEXT: lw s7, 0(a2) +; RV32ZVE32F-NEXT: lw s5, 4(a2) +; RV32ZVE32F-NEXT: lw a3, 372(sp) +; RV32ZVE32F-NEXT: lw s3, 0(a1) +; RV32ZVE32F-NEXT: lw t6, 4(a1) +; RV32ZVE32F-NEXT: lw a2, 376(sp) +; RV32ZVE32F-NEXT: lw t4, 0(a3) +; RV32ZVE32F-NEXT: lw a1, 380(sp) +; RV32ZVE32F-NEXT: lw t1, 4(a3) +; RV32ZVE32F-NEXT: lw a4, 0(a2) +; RV32ZVE32F-NEXT: lw a3, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 0(a1) +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a5, 196(a0) +; RV32ZVE32F-NEXT: sw a6, 192(a0) +; RV32ZVE32F-NEXT: sw a7, 188(a0) +; RV32ZVE32F-NEXT: sw t0, 184(a0) +; RV32ZVE32F-NEXT: sw t2, 180(a0) +; RV32ZVE32F-NEXT: sw t3, 176(a0) +; RV32ZVE32F-NEXT: sw t5, 172(a0) +; RV32ZVE32F-NEXT: sw s2, 168(a0) +; RV32ZVE32F-NEXT: sw s4, 164(a0) +; RV32ZVE32F-NEXT: sw s6, 160(a0) +; RV32ZVE32F-NEXT: sw s8, 156(a0) +; RV32ZVE32F-NEXT: sw s10, 152(a0) +; RV32ZVE32F-NEXT: sw ra, 148(a0) +; RV32ZVE32F-NEXT: lw a5, 104(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 144(a0) +; RV32ZVE32F-NEXT: lw a5, 112(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 140(a0) +; RV32ZVE32F-NEXT: lw a5, 116(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 136(a0) +; RV32ZVE32F-NEXT: lw a5, 120(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 132(a0) +; RV32ZVE32F-NEXT: lw a5, 124(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 128(a0) +; RV32ZVE32F-NEXT: lw a5, 128(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 124(a0) +; RV32ZVE32F-NEXT: lw a5, 132(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 120(a0) +; RV32ZVE32F-NEXT: lw a5, 136(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 116(a0) +; RV32ZVE32F-NEXT: lw a5, 140(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 112(a0) +; RV32ZVE32F-NEXT: lw a5, 144(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 108(a0) +; RV32ZVE32F-NEXT: lw a5, 148(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 104(a0) +; RV32ZVE32F-NEXT: lw a5, 152(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 100(a0) +; RV32ZVE32F-NEXT: lw a5, 156(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 96(a0) +; RV32ZVE32F-NEXT: lw a5, 160(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 92(a0) +; RV32ZVE32F-NEXT: lw a5, 164(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 88(a0) +; RV32ZVE32F-NEXT: lw a5, 168(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 84(a0) +; RV32ZVE32F-NEXT: lw a5, 172(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 80(a0) +; RV32ZVE32F-NEXT: lw a5, 176(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 76(a0) +; RV32ZVE32F-NEXT: lw a5, 180(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 72(a0) +; RV32ZVE32F-NEXT: lw a5, 184(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 68(a0) +; RV32ZVE32F-NEXT: lw a5, 188(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 64(a0) +; RV32ZVE32F-NEXT: lw a5, 208(sp) 
# 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 4(a0) +; RV32ZVE32F-NEXT: lw a5, 216(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 252(a0) +; RV32ZVE32F-NEXT: sw a2, 248(a0) +; RV32ZVE32F-NEXT: sw a3, 244(a0) +; RV32ZVE32F-NEXT: sw a4, 240(a0) +; RV32ZVE32F-NEXT: sw t1, 236(a0) +; RV32ZVE32F-NEXT: sw t4, 232(a0) +; RV32ZVE32F-NEXT: sw t6, 228(a0) +; RV32ZVE32F-NEXT: sw s3, 224(a0) +; RV32ZVE32F-NEXT: sw s5, 220(a0) +; RV32ZVE32F-NEXT: sw s7, 216(a0) +; RV32ZVE32F-NEXT: sw s9, 212(a0) +; RV32ZVE32F-NEXT: sw s11, 208(a0) +; RV32ZVE32F-NEXT: lw a1, 100(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 204(a0) +; RV32ZVE32F-NEXT: lw a1, 108(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 200(a0) +; RV32ZVE32F-NEXT: lw a1, 220(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 28(a0) +; RV32ZVE32F-NEXT: lw a1, 228(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 24(a0) +; RV32ZVE32F-NEXT: lw a1, 236(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 20(a0) +; RV32ZVE32F-NEXT: lw a1, 244(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 16(a0) +; RV32ZVE32F-NEXT: lw a1, 248(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 12(a0) +; RV32ZVE32F-NEXT: lw a1, 252(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 8(a0) +; RV32ZVE32F-NEXT: lw a1, 192(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 60(a0) +; RV32ZVE32F-NEXT: lw a1, 196(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 56(a0) +; RV32ZVE32F-NEXT: lw a1, 200(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 52(a0) +; RV32ZVE32F-NEXT: lw a1, 204(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 48(a0) +; RV32ZVE32F-NEXT: lw a1, 212(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 44(a0) +; RV32ZVE32F-NEXT: lw a1, 224(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 40(a0) +; RV32ZVE32F-NEXT: lw a1, 232(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 36(a0) +; RV32ZVE32F-NEXT: lw a1, 240(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 32(a0) +; RV32ZVE32F-NEXT: addi sp, s0, -512 +; RV32ZVE32F-NEXT: lw ra, 508(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s0, 504(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 500(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s3, 496(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s4, 492(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s5, 488(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s6, 484(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s7, 480(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s8, 476(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s9, 472(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s10, 468(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s11, 464(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 512 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_strided_split: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -144 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 144 +; RV64ZVE32F-NEXT: sd ra, 136(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s0, 128(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s1, 120(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s2, 112(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s3, 104(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s4, 96(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s5, 88(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s6, 80(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s7, 72(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s8, 64(sp) # 8-byte Folded 
Spill +; RV64ZVE32F-NEXT: sd s9, 56(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s10, 48(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s11, 40(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: .cfi_offset ra, -8 +; RV64ZVE32F-NEXT: .cfi_offset s0, -16 +; RV64ZVE32F-NEXT: .cfi_offset s1, -24 +; RV64ZVE32F-NEXT: .cfi_offset s2, -32 +; RV64ZVE32F-NEXT: .cfi_offset s3, -40 +; RV64ZVE32F-NEXT: .cfi_offset s4, -48 +; RV64ZVE32F-NEXT: .cfi_offset s5, -56 +; RV64ZVE32F-NEXT: .cfi_offset s6, -64 +; RV64ZVE32F-NEXT: .cfi_offset s7, -72 +; RV64ZVE32F-NEXT: .cfi_offset s8, -80 +; RV64ZVE32F-NEXT: .cfi_offset s9, -88 +; RV64ZVE32F-NEXT: .cfi_offset s10, -96 +; RV64ZVE32F-NEXT: .cfi_offset s11, -104 +; RV64ZVE32F-NEXT: ld a2, 0(a1) +; RV64ZVE32F-NEXT: sd a2, 32(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a2, 16(a1) +; RV64ZVE32F-NEXT: sd a2, 24(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a2, 32(a1) +; RV64ZVE32F-NEXT: sd a2, 16(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a2, 48(a1) +; RV64ZVE32F-NEXT: sd a2, 8(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a2, 64(a1) +; RV64ZVE32F-NEXT: sd a2, 0(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a7, 80(a1) +; RV64ZVE32F-NEXT: ld t0, 96(a1) +; RV64ZVE32F-NEXT: ld t1, 112(a1) +; RV64ZVE32F-NEXT: ld t2, 128(a1) +; RV64ZVE32F-NEXT: ld t3, 144(a1) +; RV64ZVE32F-NEXT: ld t4, 160(a1) +; RV64ZVE32F-NEXT: ld t5, 176(a1) +; RV64ZVE32F-NEXT: ld t6, 192(a1) +; RV64ZVE32F-NEXT: ld s0, 208(a1) +; RV64ZVE32F-NEXT: ld s1, 224(a1) +; RV64ZVE32F-NEXT: ld s2, 240(a1) +; RV64ZVE32F-NEXT: ld s3, 256(a1) +; RV64ZVE32F-NEXT: ld s4, 272(a1) +; RV64ZVE32F-NEXT: ld s5, 288(a1) +; RV64ZVE32F-NEXT: ld s6, 304(a1) +; RV64ZVE32F-NEXT: ld s7, 320(a1) +; RV64ZVE32F-NEXT: ld s8, 336(a1) +; RV64ZVE32F-NEXT: ld s9, 352(a1) +; RV64ZVE32F-NEXT: ld s10, 368(a1) +; RV64ZVE32F-NEXT: ld s11, 384(a1) +; RV64ZVE32F-NEXT: ld ra, 400(a1) +; RV64ZVE32F-NEXT: ld a6, 416(a1) +; RV64ZVE32F-NEXT: ld a5, 432(a1) +; RV64ZVE32F-NEXT: ld a2, 496(a1) +; RV64ZVE32F-NEXT: ld a3, 480(a1) +; RV64ZVE32F-NEXT: ld a4, 464(a1) +; RV64ZVE32F-NEXT: ld a1, 448(a1) +; RV64ZVE32F-NEXT: sd a2, 248(a0) +; RV64ZVE32F-NEXT: sd a3, 240(a0) +; RV64ZVE32F-NEXT: sd a4, 232(a0) +; RV64ZVE32F-NEXT: sd a1, 224(a0) +; RV64ZVE32F-NEXT: sd a5, 216(a0) +; RV64ZVE32F-NEXT: sd a6, 208(a0) +; RV64ZVE32F-NEXT: sd ra, 200(a0) +; RV64ZVE32F-NEXT: sd s11, 192(a0) +; RV64ZVE32F-NEXT: sd s10, 184(a0) +; RV64ZVE32F-NEXT: sd s9, 176(a0) +; RV64ZVE32F-NEXT: sd s8, 168(a0) +; RV64ZVE32F-NEXT: sd s7, 160(a0) +; RV64ZVE32F-NEXT: sd s6, 152(a0) +; RV64ZVE32F-NEXT: sd s5, 144(a0) +; RV64ZVE32F-NEXT: sd s4, 136(a0) +; RV64ZVE32F-NEXT: sd s3, 128(a0) +; RV64ZVE32F-NEXT: sd s2, 120(a0) +; RV64ZVE32F-NEXT: sd s1, 112(a0) +; RV64ZVE32F-NEXT: sd s0, 104(a0) +; RV64ZVE32F-NEXT: sd t6, 96(a0) +; RV64ZVE32F-NEXT: sd t5, 88(a0) +; RV64ZVE32F-NEXT: sd t4, 80(a0) +; RV64ZVE32F-NEXT: sd t3, 72(a0) +; RV64ZVE32F-NEXT: sd t2, 64(a0) +; RV64ZVE32F-NEXT: sd t1, 56(a0) +; RV64ZVE32F-NEXT: sd t0, 48(a0) +; RV64ZVE32F-NEXT: sd a7, 40(a0) +; RV64ZVE32F-NEXT: ld a1, 0(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 32(a0) +; RV64ZVE32F-NEXT: ld a1, 8(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 24(a0) +; RV64ZVE32F-NEXT: ld a1, 16(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 16(a0) +; RV64ZVE32F-NEXT: ld a1, 24(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 8(a0) +; RV64ZVE32F-NEXT: ld a1, 32(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 0(a0) +; RV64ZVE32F-NEXT: ld ra, 136(sp) # 8-byte Folded Reload +; 
RV64ZVE32F-NEXT: ld s0, 128(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s1, 120(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s2, 112(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s3, 104(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s4, 96(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s5, 88(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s6, 80(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s7, 72(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s8, 64(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s9, 56(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s10, 48(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s11, 40(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: addi sp, sp, 144 +; RV64ZVE32F-NEXT: ret + %ptrs = getelementptr inbounds i64, ptr %base, <32 x i64> <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38, i64 40, i64 42, i64 44, i64 46, i64 48, i64 50, i64 52, i64 54, i64 56, i64 58, i64 60, i64 62> + %x = call <32 x i64> @llvm.masked.gather.v32i64.v32p0(<32 x ptr> %ptrs, i32 8, <32 x i1> shufflevector(<32 x i1> insertelement(<32 x i1> poison, i1 true, i32 0), <32 x i1> poison, <32 x i32> zeroinitializer), <32 x i64> poison) + ret <32 x i64> %x +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; RV64: {{.*}} -- cgit v1.1 From f7201505a6ec7a0f904d2f09cece5c770058a991 Mon Sep 17 00:00:00 2001 From: Jerry Wu Date: Thu, 8 Feb 2024 14:52:09 -0800 Subject: [mlir] Add transformation to wrap scf::while in zero-trip-check (#81050) Add `scf::wrapWhileLoopInZeroTripCheck` to wrap scf while loop in zero-trip-check. --- .../mlir/Dialect/SCF/Transforms/Transforms.h | 41 +++++++ mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt | 1 + .../Dialect/SCF/Transforms/WrapInZeroTripCheck.cpp | 132 +++++++++++++++++++++ .../SCF/wrap-while-loop-in-zero-trip-check.mlir | 130 ++++++++++++++++++++ mlir/test/lib/Dialect/SCF/CMakeLists.txt | 1 + .../lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp | 72 +++++++++++ mlir/tools/mlir-opt/mlir-opt.cpp | 2 + 7 files changed, 379 insertions(+) create mode 100644 mlir/lib/Dialect/SCF/Transforms/WrapInZeroTripCheck.cpp create mode 100644 mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir create mode 100644 mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h b/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h index e91f9e4..690cd14 100644 --- a/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h @@ -30,6 +30,7 @@ namespace scf { class IfOp; class ForOp; class ParallelOp; +class WhileOp; /// Fuses all adjacent scf.parallel operations with identical bounds and step /// into one scf.parallel operation. Uses a naive aliasing and dependency /// analysis. @@ -181,6 +182,46 @@ FailureOr<ForOp> pipelineForLoop(RewriterBase &rewriter, ForOp forOp, const PipeliningOption &options, bool *modifiedIR = nullptr); +/// Create zero-trip-check around a `while` op and return the new loop op in the check. The while loop is rotated to avoid evaluating the condition twice. +/// +/// By default the check won't be created for a do-while loop as it is not +/// required. `forceCreateCheck` can force the creation.
+/// +/// It turns: +/// +/// scf.while (%arg0 = %init) : (i32) -> i64 { +/// %val = .., %arg0 : i64 +/// %cond = arith.cmpi .., %arg0 : i32 +/// scf.condition(%cond) %val : i64 +/// } do { +/// ^bb0(%arg1: i64): +/// %next = .., %arg1 : i32 +/// scf.yield %next : i32 +/// } +/// +/// into: +/// +/// %pre_val = .., %init : i64 +/// %pre_cond = arith.cmpi .., %init : i32 +/// scf.if %pre_cond -> i64 { +/// %res = scf.while (%arg1 = %pre_val) : (i64) -> i64 { +/// %next = .., %arg1 : i32 +/// %val = .., %next : i64 +/// %cond = arith.cmpi .., %next : i32 +/// scf.condition(%cond) %val : i64 +/// } do { +/// ^bb0(%arg2: i64): +/// scf.yield %arg2 : i32 +/// } +/// scf.yield %res : i64 +/// } else { +/// scf.yield %pre_val : i64 +/// } +FailureOr<WhileOp> wrapWhileLoopInZeroTripCheck(WhileOp whileOp, + RewriterBase &rewriter, + bool forceCreateCheck = false); + } // namespace scf } // namespace mlir diff --git a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt index fdaeb2f..e549420 100644 --- a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt @@ -13,6 +13,7 @@ add_mlir_dialect_library(MLIRSCFTransforms ParallelLoopTiling.cpp StructuralTypeConversions.cpp TileUsingInterface.cpp + WrapInZeroTripCheck.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/SCF diff --git a/mlir/lib/Dialect/SCF/Transforms/WrapInZeroTripCheck.cpp b/mlir/lib/Dialect/SCF/Transforms/WrapInZeroTripCheck.cpp new file mode 100644 index 0000000..f829208 --- /dev/null +++ b/mlir/lib/Dialect/SCF/Transforms/WrapInZeroTripCheck.cpp @@ -0,0 +1,132 @@ +//===- WrapInZeroTripCheck.cpp - Loop transforms to add zero-trip-check ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Transforms.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/PatternMatch.h" + +using namespace mlir; + +/// Create zero-trip-check around a `while` op and return the new loop op in the check. The while loop is rotated to avoid evaluating the condition twice.
+/// +/// Given an example below: +/// +/// scf.while (%arg0 = %init) : (i32) -> i64 { +/// %val = .., %arg0 : i64 +/// %cond = arith.cmpi .., %arg0 : i32 +/// scf.condition(%cond) %val : i64 +/// } do { +/// ^bb0(%arg1: i64): +/// %next = .., %arg1 : i32 +/// scf.yield %next : i32 +/// } +/// +/// First, clone the before block to the front of the loop: +/// +/// %pre_val = .., %init : i64 +/// %pre_cond = arith.cmpi .., %init : i32 +/// scf.while (%arg0 = %init) : (i32) -> i64 { +/// %val = .., %arg0 : i64 +/// %cond = arith.cmpi .., %arg0 : i32 +/// scf.condition(%cond) %val : i64 +/// } do { +/// ^bb0(%arg1: i64): +/// %next = .., %arg1 : i32 +/// scf.yield %next : i32 +/// } +/// +/// Create `if` op with the condition, rotate and move the loop into the else +/// branch: +/// +/// %pre_val = .., %init : i64 +/// %pre_cond = arith.cmpi .., %init : i32 +/// scf.if %pre_cond -> i64 { +/// %res = scf.while (%arg1 = %pre_val) : (i64) -> i64 { +/// // Original after block +/// %next = .., %arg1 : i32 +/// // Original before block +/// %val = .., %next : i64 +/// %cond = arith.cmpi .., %next : i32 +/// scf.condition(%cond) %val : i64 +/// } do { +/// ^bb0(%arg2: i64): +/// scf.yield %arg2 : i32 +/// } +/// scf.yield %res : i64 +/// } else { +/// scf.yield %pre_val : i64 +/// } +FailureOr<scf::WhileOp> mlir::scf::wrapWhileLoopInZeroTripCheck( scf::WhileOp whileOp, RewriterBase &rewriter, bool forceCreateCheck) { + // If the loop is in do-while form (after block only passes through values), + // there is no need to create a zero-trip-check as the before block is always run. + if (!forceCreateCheck && isa<scf::YieldOp>(whileOp.getAfterBody()->front())) { return whileOp; } + + OpBuilder::InsertionGuard insertion_guard(rewriter); + + IRMapping mapper; + Block *beforeBlock = whileOp.getBeforeBody(); + // Clone before block before the loop for zero-trip-check. + for (auto [arg, init] : + llvm::zip_equal(beforeBlock->getArguments(), whileOp.getInits())) { + mapper.map(arg, init); + } + rewriter.setInsertionPoint(whileOp); + for (auto &op : *beforeBlock) { + if (isa<scf::ConditionOp>(op)) { + break; + } + // Safe to clone everything as in a single block all defs have been cloned + // and added to mapper in order. + rewriter.insert(op.clone(mapper)); + } + + scf::ConditionOp condOp = whileOp.getConditionOp(); + Value clonedCondition = mapper.lookupOrDefault(condOp.getCondition()); + SmallVector<Value> clonedCondArgs = llvm::map_to_vector( condOp.getArgs(), [&](Value arg) { return mapper.lookupOrDefault(arg); }); + + // Create rotated while loop. + auto newLoopOp = rewriter.create<scf::WhileOp>( whileOp.getLoc(), whileOp.getResultTypes(), clonedCondArgs, [&](OpBuilder &builder, Location loc, ValueRange args) { + // Rotate and move the loop body into before block. + auto newBlock = builder.getBlock(); + rewriter.mergeBlocks(whileOp.getAfterBody(), newBlock, args); + auto yieldOp = cast<scf::YieldOp>(newBlock->getTerminator()); + rewriter.mergeBlocks(whileOp.getBeforeBody(), newBlock, + yieldOp.getResults()); + rewriter.eraseOp(yieldOp); + }, + [&](OpBuilder &builder, Location loc, ValueRange args) { + // Pass through values. + builder.create<scf::YieldOp>(loc, args); + }); + + // Create zero-trip-check and move the while loop in. + auto ifOp = rewriter.create<scf::IfOp>( whileOp.getLoc(), clonedCondition, [&](OpBuilder &builder, Location loc) { + // Then runs the while loop.
+        rewriter.moveOpBefore(newLoopOp, builder.getInsertionBlock(), + builder.getInsertionPoint()); + builder.create<scf::YieldOp>(loc, newLoopOp.getResults()); + }, + [&](OpBuilder &builder, Location loc) { + // Else returns the results from precondition. + builder.create<scf::YieldOp>(loc, clonedCondArgs); + }); + + rewriter.replaceOp(whileOp, ifOp); + + return newLoopOp; +} diff --git a/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir b/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir new file mode 100644 index 0000000..8954839 --- /dev/null +++ b/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir @@ -0,0 +1,130 @@ +// RUN: mlir-opt %s -test-wrap-scf-while-loop-in-zero-trip-check -split-input-file | FileCheck %s +// RUN: mlir-opt %s -test-wrap-scf-while-loop-in-zero-trip-check='force-create-check=true' -split-input-file | FileCheck %s --check-prefix FORCE-CREATE-CHECK + +func.func @wrap_while_loop_in_zero_trip_check(%bound : i32) -> i32 { + %cst0 = arith.constant 0 : i32 + %cst5 = arith.constant 5 : i32 + %res:2 = scf.while (%iter = %cst0) : (i32) -> (i32, i32) { + %cond = arith.cmpi slt, %iter, %bound : i32 + %inv = arith.addi %bound, %cst5 : i32 + scf.condition(%cond) %iter, %inv : i32, i32 + } do { + ^bb0(%arg1: i32, %arg2: i32): + %next = arith.addi %arg1, %arg2 : i32 + scf.yield %next : i32 + } + return %res#0 : i32 +} + +// CHECK-LABEL: func.func @wrap_while_loop_in_zero_trip_check( +// CHECK-SAME: %[[BOUND:.*]]: i32) -> i32 { +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32 +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : i32 +// CHECK-DAG: %[[PRE_COND:.*]] = arith.cmpi slt, %[[C0]], %[[BOUND]] : i32 +// CHECK-DAG: %[[PRE_INV:.*]] = arith.addi %[[BOUND]], %[[C5]] : i32 +// CHECK: %[[IF:.*]]:2 = scf.if %[[PRE_COND]] -> (i32, i32) { +// CHECK: %[[WHILE:.*]]:2 = scf.while ( +// CHECK-SAME: %[[ARG1:.*]] = %[[C0]], %[[ARG2:.*]] = %[[PRE_INV]] +// CHECK-SAME: ) : (i32, i32) -> (i32, i32) { +// CHECK: %[[NEXT:.*]] = arith.addi %[[ARG1]], %[[ARG2]] : i32 +// CHECK: %[[COND:.*]] = arith.cmpi slt, %[[NEXT]], %[[BOUND]] : i32 +// CHECK: %[[INV:.*]] = arith.addi %[[BOUND]], %[[C5]] : i32 +// CHECK: scf.condition(%[[COND]]) %[[NEXT]], %[[INV]] : i32, i32 +// CHECK: } do { +// CHECK: ^bb0(%[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32): +// CHECK: scf.yield %[[ARG3]], %[[ARG4]] : i32, i32 +// CHECK: } +// CHECK: scf.yield %[[WHILE]]#0, %[[WHILE]]#1 : i32, i32 +// CHECK: } else { +// CHECK: scf.yield %[[C0]], %[[PRE_INV]] : i32, i32 +// CHECK: } +// CHECK: return %[[IF]]#0 : i32 + +// ----- + +func.func @wrap_while_loop_with_minimal_before_block(%bound : i32) -> i32 { + %cst0 = arith.constant 0 : i32 + %true = arith.constant true + %cst5 = arith.constant 5 : i32 + %res = scf.while (%iter = %cst0, %arg0 = %true) : (i32, i1) -> i32 { + scf.condition(%arg0) %iter : i32 + } do { + ^bb0(%arg1: i32): + %next = arith.addi %arg1, %cst5 : i32 + %cond = arith.cmpi slt, %next, %bound : i32 + scf.yield %next, %cond : i32, i1 + } + return %res : i32 +} + +// CHECK-LABEL: func.func @wrap_while_loop_with_minimal_before_block( +// CHECK-SAME: %[[BOUND:.*]]: i32) -> i32 { +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32 +// CHECK-DAG: %[[TRUE:.*]] = arith.constant true +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : i32 +// CHECK: %[[IF:.*]] = scf.if %[[TRUE]] -> (i32) { +// CHECK: %[[WHILE:.*]] = scf.while (%[[ARG1:.*]] = %[[C0]]) : (i32) -> i32 { +// CHECK: %[[NEXT:.*]] = arith.addi %[[ARG1]], %[[C5]] : i32 +// CHECK: %[[COND:.*]] = arith.cmpi slt, %[[NEXT]], %[[BOUND]] : i32 +// CHECK: scf.condition(%[[COND]])
%[[NEXT]] : i32 +// CHECK: } do { +// CHECK: ^bb0(%[[ARG2:.*]]: i32): +// CHECK: scf.yield %[[ARG2]] : i32 +// CHECK: } +// CHECK: scf.yield %[[WHILE]] : i32 +// CHECK: } else { +// CHECK: scf.yield %[[C0]] : i32 +// CHECK: } +// CHECK: return %[[IF]] : i32 + +// ----- + +func.func @wrap_do_while_loop_in_zero_trip_check(%bound : i32) -> i32 { + %cst0 = arith.constant 0 : i32 + %cst5 = arith.constant 5 : i32 + %res = scf.while (%iter = %cst0) : (i32) -> i32 { + %next = arith.addi %iter, %cst5 : i32 + %cond = arith.cmpi slt, %next, %bound : i32 + scf.condition(%cond) %next : i32 + } do { + ^bb0(%arg1: i32): + scf.yield %arg1 : i32 + } + return %res : i32 +} + +// CHECK-LABEL: func.func @wrap_do_while_loop_in_zero_trip_check( +// CHECK-SAME: %[[BOUND:.*]]: i32) -> i32 { +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32 +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : i32 +// CHECK-NOT: scf.if +// CHECK: %[[WHILE:.*]] = scf.while (%[[ARG1:.*]] = %[[C0]]) : (i32) -> i32 { +// CHECK: %[[NEXT:.*]] = arith.addi %[[ARG1]], %[[C5]] : i32 +// CHECK: %[[COND:.*]] = arith.cmpi slt, %[[NEXT]], %[[BOUND]] : i32 +// CHECK: scf.condition(%[[COND]]) %[[NEXT]] : i32 +// CHECK: } do { +// CHECK: ^bb0(%[[ARG2:.*]]: i32): +// CHECK: scf.yield %[[ARG2]] : i32 +// CHECK: } +// CHECK: return %[[WHILE]] : i32 + +// FORCE-CREATE-CHECK-LABEL: func.func @wrap_do_while_loop_in_zero_trip_check( +// FORCE-CREATE-CHECK-SAME: %[[BOUND:.*]]: i32) -> i32 { +// FORCE-CREATE-CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32 +// FORCE-CREATE-CHECK-DAG: %[[C5:.*]] = arith.constant 5 : i32 +// FORCE-CREATE-CHECK: %[[PRE_NEXT:.*]] = arith.addi %[[C0]], %[[C5]] : i32 +// FORCE-CREATE-CHECK: %[[PRE_COND:.*]] = arith.cmpi slt, %[[PRE_NEXT]], %[[BOUND]] : i32 +// FORCE-CREATE-CHECK: %[[IF:.*]] = scf.if %[[PRE_COND]] -> (i32) { +// FORCE-CREATE-CHECK: %[[WHILE:.*]] = scf.while (%[[ARG1:.*]] = %[[PRE_NEXT]]) : (i32) -> i32 { +// FORCE-CREATE-CHECK: %[[NEXT:.*]] = arith.addi %[[ARG1]], %[[C5]] : i32 +// FORCE-CREATE-CHECK: %[[COND:.*]] = arith.cmpi slt, %[[NEXT]], %[[BOUND]] : i32 +// FORCE-CREATE-CHECK: scf.condition(%[[COND]]) %[[NEXT]] : i32 +// FORCE-CREATE-CHECK: } do { +// FORCE-CREATE-CHECK: ^bb0(%[[ARG2:.*]]: i32): +// FORCE-CREATE-CHECK: scf.yield %[[ARG2]] : i32 +// FORCE-CREATE-CHECK: } +// FORCE-CREATE-CHECK: scf.yield %[[WHILE]] : i32 +// FORCE-CREATE-CHECK: } else { +// FORCE-CREATE-CHECK: scf.yield %[[PRE_NEXT]] : i32 +// FORCE-CREATE-CHECK: } +// FORCE-CREATE-CHECK: return %[[IF]] : i32 diff --git a/mlir/test/lib/Dialect/SCF/CMakeLists.txt b/mlir/test/lib/Dialect/SCF/CMakeLists.txt index 22c2f238..d93bd55 100644 --- a/mlir/test/lib/Dialect/SCF/CMakeLists.txt +++ b/mlir/test/lib/Dialect/SCF/CMakeLists.txt @@ -3,6 +3,7 @@ add_mlir_library(MLIRSCFTestPasses TestLoopParametricTiling.cpp TestLoopUnrolling.cpp TestSCFUtils.cpp + TestSCFWrapInZeroTripCheck.cpp TestWhileOpBuilder.cpp EXCLUDE_FROM_LIBMLIR diff --git a/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp b/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp new file mode 100644 index 0000000..10206dd --- /dev/null +++ b/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp @@ -0,0 +1,72 @@ +//===- TestWrapInZeroTripCheck.cpp -- Passes to test SCF zero-trip-check --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the passes to test wrap-in-zero-trip-check transforms on +// SCF loop ops. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Transforms.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; + +namespace { + +struct TestWrapWhileLoopInZeroTripCheckPass : public PassWrapper<TestWrapWhileLoopInZeroTripCheckPass, OperationPass<func::FuncOp>> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( TestWrapWhileLoopInZeroTripCheckPass) + + StringRef getArgument() const final { return "test-wrap-scf-while-loop-in-zero-trip-check"; } + + StringRef getDescription() const final { return "test scf::wrapWhileLoopInZeroTripCheck"; } + + TestWrapWhileLoopInZeroTripCheckPass() = default; + TestWrapWhileLoopInZeroTripCheckPass( const TestWrapWhileLoopInZeroTripCheckPass &) {} + explicit TestWrapWhileLoopInZeroTripCheckPass(bool forceCreateCheckParam) { forceCreateCheck = forceCreateCheckParam; } + + void runOnOperation() override { func::FuncOp func = getOperation(); MLIRContext *context = &getContext(); IRRewriter rewriter(context); func.walk([&](scf::WhileOp op) { FailureOr<scf::WhileOp> result = scf::wrapWhileLoopInZeroTripCheck(op, rewriter, forceCreateCheck); // Ignore not implemented failure in tests. The expected output should // catch problems (e.g. transformation doesn't happen). (void)result; }); } + + Option<bool> forceCreateCheck{ *this, "force-create-check", llvm::cl::desc("Force to create zero-trip-check."), llvm::cl::init(false)}; }; + +} // namespace + +namespace mlir { +namespace test { +void registerTestSCFWrapInZeroTripCheckPasses() { PassRegistration<TestWrapWhileLoopInZeroTripCheckPass>(); } +} // namespace test +} // namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index 1b3f60b..cec1e52 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -127,6 +127,7 @@ void registerTestPadFusion(); void registerTestRecursiveTypesPass(); void registerTestSCFUtilsPass(); void registerTestSCFWhileOpBuilderPass(); +void registerTestSCFWrapInZeroTripCheckPasses(); void registerTestShapeMappingPass(); void registerTestSliceAnalysisPass(); void registerTestTensorCopyInsertionPass(); @@ -250,6 +251,7 @@ void registerTestPasses() { mlir::test::registerTestRecursiveTypesPass(); mlir::test::registerTestSCFUtilsPass(); mlir::test::registerTestSCFWhileOpBuilderPass(); + mlir::test::registerTestSCFWrapInZeroTripCheckPasses(); mlir::test::registerTestShapeMappingPass(); mlir::test::registerTestSliceAnalysisPass(); mlir::test::registerTestTensorCopyInsertionPass(); -- cgit v1.1 From 8c106a15156857d23ba9e61c55b49b1e2b6c1583 Mon Sep 17 00:00:00 2001 From: Valentin Clement (バレンタイン クレメン) Date: Thu, 8 Feb 2024 15:13:48 -0800 Subject: [flang] Fix attribute printing for fir.global op (#81197) The custom printer for `fir.global` was eliding all the attributes present on the op when printing the attribute dictionary, so any attribute that is not part of the pretty printing was discarded. This patch fixes the printer and also makes use of the getters for the attribute names where they were hardcoded.
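As a rough illustration of the idea behind the fix (a standalone C++ toy, not the actual MLIR/FIR printer API; names like `printAttrDict` are invented for this sketch): a custom printer that renders some attributes with dedicated syntax must elide exactly those attributes from the generic dictionary and nothing else, otherwise user attributes such as `keep_my_attr` are silently dropped.

```cpp
#include <iostream>
#include <map>
#include <set>
#include <string>

// Print an attribute dictionary, skipping only the attributes the caller
// pretty-prints itself (the elided set).
void printAttrDict(const std::map<std::string, std::string> &Attrs,
                   const std::set<std::string> &Elided) {
  bool First = true;
  for (const auto &[Key, Value] : Attrs) {
    if (Elided.count(Key))
      continue; // rendered with dedicated syntax elsewhere
    std::cout << (First ? " {" : ", ") << Key << " = " << Value;
    First = false;
  }
  if (!First)
    std::cout << '}';
}

int main() {
  const std::map<std::string, std::string> Attrs = {
      {"constant", "unit"},
      {"keep_my_attr", "\"data\""},
      {"sym_name", "\"t1\""}};
  // Over-eliding (roughly the old behavior): every attribute vanishes.
  printAttrDict(Attrs, {"constant", "keep_my_attr", "sym_name"});
  std::cout << '\n';
  // Eliding only the pretty-printed subset: keep_my_attr round-trips,
  // which is what the new fir-ops.fir test below checks.
  printAttrDict(Attrs, {"constant", "sym_name"});
  std::cout << '\n';
}
```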
--- flang/lib/Optimizer/Dialect/FIROps.cpp | 19 ++++++++++++------- flang/test/Fir/fir-ops.fir | 7 +++++++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 483f318..a5b31da 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -1348,12 +1348,12 @@ mlir::ParseResult fir::GlobalOp::parse(mlir::OpAsmParser &parser, if (parser.parseOptionalAttrDict(result.attributes)) return mlir::failure(); - if (succeeded(parser.parseOptionalKeyword("constant"))) { + if (succeeded(parser.parseOptionalKeyword(getConstantAttrNameStr()))) { // if "constant" keyword then mark this as a constant, not a variable - result.addAttribute("constant", builder.getUnitAttr()); + result.addAttribute(getConstantAttrNameStr(), builder.getUnitAttr()); } - if (succeeded(parser.parseOptionalKeyword("target"))) + if (succeeded(parser.parseOptionalKeyword(getTargetAttrNameStr()))) result.addAttribute(getTargetAttrNameStr(), builder.getUnitAttr()); mlir::Type globalType; @@ -1382,11 +1382,16 @@ void fir::GlobalOp::print(mlir::OpAsmPrinter &p) { p.printAttributeWithoutType(getSymrefAttr()); if (auto val = getValueOrNull()) p << '(' << val << ')'; - p.printOptionalAttrDict((*this)->getAttrs(), (*this).getAttributeNames()); - if (getOperation()->getAttr(fir::GlobalOp::getConstantAttrNameStr())) - p << " constant"; + // Print all other attributes that are not pretty printed here. + p.printOptionalAttrDict((*this)->getAttrs(), /*elideAttrs=*/{ + getSymNameAttrName(), getSymrefAttrName(), + getTypeAttrName(), getConstantAttrName(), + getTargetAttrName(), getLinkNameAttrName(), + getInitValAttrName()}); + if (getOperation()->getAttr(getConstantAttrName())) + p << " " << getConstantAttrNameStr(); if (getOperation()->getAttr(getTargetAttrName())) - p << " target"; + p << " " << getTargetAttrNameStr(); p << " : "; p.printType(getType()); if (hasInitializationBody()) { diff --git a/flang/test/Fir/fir-ops.fir b/flang/test/Fir/fir-ops.fir index 3c4095b..962621c 100644 --- a/flang/test/Fir/fir-ops.fir +++ b/flang/test/Fir/fir-ops.fir @@ -893,3 +893,10 @@ func.func @test_box_typecode(%a: !fir.class) { // CHECK-LABEL: func.func @test_box_typecode( // CHECK-SAME: %[[A:.*]]: !fir.class) // CHECK: %{{.*}} = fir.box_typecode %[[A]] : (!fir.class) -> i32 + +fir.global @t1 {keep_my_attr = "data"} : i32 { + %1 = arith.constant 0 : i32 + fir.has_value %1 : i32 +} + +// CHECK-LABEL: fir.global @t1 {keep_my_attr = "data"} : i32 -- cgit v1.1 From 9affa177b526459beddafad30474d2e3186376e4 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Thu, 8 Feb 2024 15:14:13 -0800 Subject: [NVPTX] Add support for calling aliases (#81170) The current implementation of aliases tries to remove all the aliases in the module to prevent the generic version of `AsmPrinter` from emitting them incorrectly. Unfortunately, if the aliases are used this will fail. Instead let's override the function to print aliases directly. In addition, the declarations of the alias functions must occur before the uses. To fix this we emit alias declarations as part of `emitDeclarations` and only emit the `.alias` directives at the end (where we can assume the aliasee has also already been declared). 
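To make the ordering constraint concrete, here is a small self-contained C++ sketch of the two-phase emission described above (illustrative only; `emitModule` and its types are invented for this example and are not the actual `NVPTXAsmPrinter` interface):

```cpp
#include <iostream>
#include <string>
#include <vector>

struct Alias {
  std::string Name;
  std::string Aliasee;
};

// Toy PTX-like module emission: declare first, bind aliases last.
void emitModule(const std::vector<std::string> &Funcs,
                const std::vector<Alias> &Aliases) {
  // Phase 1: forward-declare ordinary functions and aliases alike, so a
  // later call through an alias always references an already-declared
  // symbol.
  for (const std::string &F : Funcs)
    std::cout << ".func " << F << ";\n";
  for (const Alias &A : Aliases)
    std::cout << ".func " << A.Name << ";\n";
  // ... function bodies, which may call the aliases, would be emitted here ...
  // Phase 2: emit the .alias directives only at the end, once every aliasee
  // has been declared and defined.
  for (const Alias &A : Aliases)
    std::cout << ".alias " << A.Name << ", " << A.Aliasee << ";\n";
}

int main() {
  emitModule({"a", "foo"}, {{"b", "a"}, {"c", "a"}, {"bar", "foo"}});
  return 0;
}
```

This mirrors the shape of the output checked in the updated `alias.ll` test below: declarations up front, `.alias` bindings at the bottom.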
--- llvm/include/llvm/CodeGen/AsmPrinter.h | 2 +- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 54 ++++++++++++++---------------- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h | 5 ++- llvm/test/CodeGen/NVPTX/alias.ll | 52 +++++++++++++++++++++------- 5 files changed, 70 insertions(+), 45 deletions(-) diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h index fbd198a..a7fbf4a 100644 --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -897,7 +897,7 @@ private: virtual void emitModuleCommandLines(Module &M); GCMetadataPrinter *getOrCreateGCPrinter(GCStrategy &S); - void emitGlobalAlias(Module &M, const GlobalAlias &GA); + virtual void emitGlobalAlias(const Module &M, const GlobalAlias &GA); void emitGlobalIFunc(Module &M, const GlobalIFunc &GI); private: diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index b961fc2..e89a1c26 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2127,7 +2127,7 @@ void AsmPrinter::emitGlobalGOTEquivs() { emitGlobalVariable(GV); } -void AsmPrinter::emitGlobalAlias(Module &M, const GlobalAlias &GA) { +void AsmPrinter::emitGlobalAlias(const Module &M, const GlobalAlias &GA) { MCSymbol *Name = getSymbol(&GA); bool IsFunction = GA.getValueType()->isFunctionTy(); // Treat bitcasts of functions as functions also. This is important at least diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index cdfc288..2219d9f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -57,6 +57,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instruction.h" @@ -605,14 +606,33 @@ void NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr, O << getVirtualRegisterName(vr); } +void NVPTXAsmPrinter::emitAliasDeclaration(const GlobalAlias *GA, + raw_ostream &O) { + const Function *F = dyn_cast_or_null<Function>(GA->getAliaseeObject()); + if (!F || isKernelFunction(*F) || F->isDeclaration()) + report_fatal_error( "NVPTX aliasee must be a non-kernel function definition"); + + if (GA->hasLinkOnceLinkage() || GA->hasWeakLinkage() || + GA->hasAvailableExternallyLinkage() || GA->hasCommonLinkage()) + report_fatal_error("NVPTX aliasee must not be '.weak'"); + + emitDeclarationWithName(F, getSymbol(GA), O); +} + void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) { + emitDeclarationWithName(F, getSymbol(F), O); +} + +void NVPTXAsmPrinter::emitDeclarationWithName(const Function *F, MCSymbol *S, + raw_ostream &O) { emitLinkageDirective(F, O); if (isKernelFunction(*F)) O << ".entry "; else O << ".func "; printReturnValStr(F, O); - getSymbol(F)->print(O, MAI); + S->print(O, MAI); O << "\n"; emitFunctionParamList(F, O); O << "\n"; @@ -759,6 +779,8 @@ void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { } seenMap[&F] = true; } + for (const GlobalAlias &GA : M.aliases()) + emitAliasDeclaration(&GA, O); } static bool isEmptyXXStructor(GlobalVariable *GV) { @@ -853,25 +875,9 @@ void NVPTXAsmPrinter::emitGlobalAlias(const Module &M, const GlobalAlias &GA) { raw_svector_ostream OS(Str); MCSymbol *Name = getSymbol(&GA); - const Function *F = dyn_cast<Function>(GA.getAliasee()); - if (!F ||
isKernelFunction(*F)) - report_fatal_error("NVPTX aliasee must be a non-kernel function"); - - if (GA.hasLinkOnceLinkage() || GA.hasWeakLinkage() || - GA.hasAvailableExternallyLinkage() || GA.hasCommonLinkage()) - report_fatal_error("NVPTX aliasee must not be '.weak'"); - - OS << "\n"; - emitLinkageDirective(F, OS); - OS << ".func "; - printReturnValStr(F, OS); - OS << Name->getName(); - emitFunctionParamList(F, OS); - if (shouldEmitPTXNoReturn(F, TM)) - OS << "\n.noreturn"; - OS << ";\n"; - OS << ".alias " << Name->getName() << ", " << F->getName() << ";\n"; + OS << ".alias " << Name->getName() << ", " << GA.getAliaseeObject()->getName() + << ";\n"; OutStreamer->emitRawText(OS.str()); } @@ -932,16 +938,6 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) { GlobalsEmitted = true; } - // If we have any aliases we emit them at the end. - SmallVector<GlobalAlias *> AliasesToRemove; - for (GlobalAlias &Alias : M.aliases()) { - emitGlobalAlias(M, Alias); - AliasesToRemove.push_back(&Alias); - } - - for (GlobalAlias *A : AliasesToRemove) - A->eraseFromParent(); - // call doFinalization bool ret = AsmPrinter::doFinalization(M); diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h index 7f0f37e..979d185 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -27,6 +27,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Value.h" #include "llvm/MC/MCExpr.h" @@ -174,7 +175,7 @@ private: void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O, bool processDemoted, const NVPTXSubtarget &STI); void emitGlobals(const Module &M); - void emitGlobalAlias(const Module &M, const GlobalAlias &GA); + void emitGlobalAlias(const Module &M, const GlobalAlias &GA) override; void emitHeader(Module &M, raw_ostream &O, const NVPTXSubtarget &STI); void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const; void emitVirtualRegister(unsigned int vr, raw_ostream &); @@ -222,6 +223,8 @@ private: void emitLinkageDirective(const GlobalValue *V, raw_ostream &O); void emitDeclarations(const Module &, raw_ostream &O); void emitDeclaration(const Function *, raw_ostream &O); + void emitAliasDeclaration(const GlobalAlias *, raw_ostream &O); + void emitDeclarationWithName(const Function *, MCSymbol *, raw_ostream &O); void emitDemotedVars(const Function *, raw_ostream &); bool lowerImageHandleOperand(const MachineInstr *MI, unsigned OpNo, diff --git a/llvm/test/CodeGen/NVPTX/alias.ll b/llvm/test/CodeGen/NVPTX/alias.ll index d5dc3a1..cb592dd 100644 --- a/llvm/test/CodeGen/NVPTX/alias.ll +++ b/llvm/test/CodeGen/NVPTX/alias.ll @@ -1,8 +1,10 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx64 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx64 | %ptxas-verify %} define i32 @a() { ret i32 0 } @b = internal alias i32 (), ptr @a @c = internal alias i32 (), ptr @a +@d = internal alias i32 (), ptr @c define void @foo(i32 %0, ptr %1) { ret void } @bar = alias i32 (), ptr @foo @@ -12,8 +14,37 @@ define void @noreturn() #0 { } @noreturn_alias = alias i32 (), ptr @noreturn +define i32 @z() { + %val = call i32 @b() + ret i32 %val +} + + attributes #0 = { noreturn } +; CHECK: .visible .func (.param .b32 func_retval0) b +; CHECK-NEXT: () +; CHECK-NEXT: ; + +; CHECK: .visible .func (.param .b32 func_retval0) c +; CHECK-NEXT: () +; CHECK-NEXT: ; + +; CHECK: .visible .func (.param
.b32 func_retval0) d +; CHECK-NEXT: () +; CHECK-NEXT: ; + +; CHECK: .visible .func bar +; CHECK-NEXT: ( +; CHECK-NEXT: .param .b32 foo_param_0, +; CHECK-NEXT: .param .b64 foo_param_1 +; CHECK-NEXT: ) +; CHECK-NEXT: ; + +; CHECK: .visible .func noreturn_alias +; CHECK-NEXT: () +; CHECK-NEXT: .noreturn; + ; CHECK: .visible .func (.param .b32 func_retval0) a() ; CHECK: .visible .func foo( @@ -24,18 +55,13 @@ attributes #0 = { noreturn } ; CHECK: .visible .func noreturn() ; CHECK-NEXT: .noreturn -; CHECK: .visible .func (.param .b32 func_retval0) b(); -; CHECK-NEXT: .alias b, a; +; CHECK: .visible .func (.param .b32 func_retval0) z() +; CHECK: call.uni (retval0), +; CHECK-NEXT: b, -; CHECK: .visible .func (.param .b32 func_retval0) c(); -; CHECK-NEXT: .alias c, a; -; CHECK: .visible .func bar( -; CHECK-NEXT: .param .b32 foo_param_0, -; CHECK-NEXT: .param .b64 foo_param_1 -; CHECK-NEXT: ); -; CHECK-NEXT: .alias bar, foo; - -; CHECK: .visible .func noreturn_alias() -; CHECK-NEXT: .noreturn; -; CHECK-NEXT: .alias noreturn_alias, noreturn; +; CHECK: .alias b, a; +; CHECK: .alias c, a; +; CHECK: .alias d, a; +; CHECK: .alias bar, foo; +; CHECK: .alias noreturn_alias, noreturn; -- cgit v1.1 From 9211e67da36782db44a46ccb9ac06734ccf2570f Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 8 Feb 2024 17:16:31 -0600 Subject: [NVVMReflect] Force dead branch elimination in NVVMReflect (#81189) Summary: The `__nvvm_reflect` function is used to guard invalid code that varies between architectures. One problem with this feature is that if it is used without optimizations, it will leave invalid code in the module that will then make it to the backend. The `__nvvm_reflect` pass is already mandatory, so it should do some trivial branch removal to ensure that constants are handled correctly. This dead branch elimination only works in the trivial case of a compare on a branch and does not touch any conditionals that were not related to the `__nvvm_reflect` call, in order to preserve `O0` semantics as much as possible. This should allow the following to work on NVPTX targets: ```c int foo() { if (__nvvm_reflect("__CUDA_ARCH") >= 700) asm("valid;\n"); } ``` --- llvm/docs/NVPTXUsage.rst | 5 + llvm/lib/Target/NVPTX/NVVMReflect.cpp | 62 +++++++++++ llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll | 141 ++++++++++++++++++++++ llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll | 1 - 4 files changed, 208 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 22acc6c..b5e3918 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -296,6 +296,11 @@ pipeline, immediately after the link stage. The ``internalize`` pass is also recommended to remove unused math functions from the resulting PTX. For an input IR module ``module.bc``, the following compilation flow is recommended: +The ``NVVMReflect`` pass will attempt to remove dead code even without +optimizations. This allows potentially incompatible instructions to be avoided +at all optimization levels. This currently only works for simple conditionals +like the above example. + 1. Save list of external functions in ``module.bc`` 2. Link ``module.bc`` with ``libdevice.compute_XX.YY.bc`` 3. 
Internalize all functions not in list from (1) diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 7d2678a..5283c2f 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -20,6 +20,7 @@ #include "NVPTX.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -36,6 +37,8 @@ #include "llvm/Support/raw_os_ostream.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include #include #define NVVM_REFLECT_FUNCTION "__nvvm_reflect" @@ -87,6 +90,7 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } SmallVector ToRemove; + SmallVector ToSimplify; // Go through the calls in this function. Each call to __nvvm_reflect or // llvm.nvvm.reflect should be a CallInst with a ConstantArray argument. @@ -171,6 +175,13 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } else if (ReflectArg == "__CUDA_ARCH") { ReflectVal = SmVersion * 10; } + + // If the immediate user is a simple comparison we want to simplify it. + // TODO: This currently does not handle switch instructions. + for (User *U : Call->users()) + if (ICmpInst *I = dyn_cast(U)) + ToSimplify.push_back(I); + Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal)); ToRemove.push_back(Call); } @@ -178,6 +189,57 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { for (Instruction *I : ToRemove) I->eraseFromParent(); + // The code guarded by __nvvm_reflect may be invalid for the target machine. + // We need to do some basic dead code elimination to trim invalid code before + // it reaches the backend at all optimization levels. + SmallVector Simplified; + for (ICmpInst *Cmp : ToSimplify) { + Constant *LHS = dyn_cast(Cmp->getOperand(0)); + Constant *RHS = dyn_cast(Cmp->getOperand(1)); + + if (!LHS || !RHS) + continue; + + // If the comparison is a compile time constant we simply propagate it. + Constant *C = ConstantFoldCompareInstOperands( + Cmp->getPredicate(), LHS, RHS, Cmp->getModule()->getDataLayout()); + + if (!C) + continue; + + for (User *U : Cmp->users()) + if (BranchInst *I = dyn_cast(U)) + Simplified.push_back(I); + + Cmp->replaceAllUsesWith(C); + Cmp->eraseFromParent(); + } + + // Each instruction here is a conditional branch off of a constant true or + // false value. Simply replace it with an unconditional branch to the + // appropriate basic block and delete the rest if it is trivially dead. + DenseSet Removed; + for (BranchInst *Branch : Simplified) { + if (Removed.contains(Branch)) + continue; + + ConstantInt *C = dyn_cast(Branch->getCondition()); + if (!C || (!C->isOne() && !C->isZero())) + continue; + + BasicBlock *TrueBB = + C->isOne() ? Branch->getSuccessor(0) : Branch->getSuccessor(1); + BasicBlock *FalseBB = + C->isOne() ? 
Branch->getSuccessor(1) : Branch->getSuccessor(0); + + ReplaceInstWithInst(Branch, BranchInst::Create(TrueBB)); + if (FalseBB->use_empty() && FalseBB->hasNPredecessors(0) && + FalseBB->getFirstNonPHIOrDbg()) { + Removed.insert(FalseBB->getFirstNonPHIOrDbg()); + changeToUnreachable(FalseBB->getFirstNonPHIOrDbg()); + } + } + return ToRemove.size() > 0; } diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll new file mode 100644 index 0000000..c9586d5 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll @@ -0,0 +1,141 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 -O0 | FileCheck %s --check-prefix=SM_52 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 -O0 | FileCheck %s --check-prefix=SM_70 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx72 -O0 | FileCheck %s --check-prefix=SM_90 + +@.str = private unnamed_addr constant [12 x i8] c"__CUDA_ARCH\00" + +declare i32 @__nvvm_reflect(ptr) + +; SM_52: .visible .func (.param .b32 func_retval0) foo() +; SM_52: mov.b32 %[[REG:.+]], 3; +; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_52-NEXT: ret; +; +; SM_70: .visible .func (.param .b32 func_retval0) foo() +; SM_70: mov.b32 %[[REG:.+]], 2; +; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_70-NEXT: ret; +; +; SM_90: .visible .func (.param .b32 func_retval0) foo() +; SM_90: mov.b32 %[[REG:.+]], 1; +; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_90-NEXT: ret; +define i32 @foo() { +entry: + %call = call i32 @__nvvm_reflect(ptr @.str) + %cmp = icmp uge i32 %call, 900 + br i1 %cmp, label %if.then, label %if.else + +if.then: + br label %return + +if.else: + %call1 = call i32 @__nvvm_reflect(ptr @.str) + %cmp2 = icmp uge i32 %call1, 700 + br i1 %cmp2, label %if.then3, label %if.else4 + +if.then3: + br label %return + +if.else4: + %call5 = call i32 @__nvvm_reflect(ptr @.str) + %cmp6 = icmp uge i32 %call5, 520 + br i1 %cmp6, label %if.then7, label %if.else8 + +if.then7: + br label %return + +if.else8: + br label %return + +return: + %retval.0 = phi i32 [ 1, %if.then ], [ 2, %if.then3 ], [ 3, %if.then7 ], [ 4, %if.else8 ] + ret i32 %retval.0 +} + +; SM_52: .visible .func (.param .b32 func_retval0) bar() +; SM_52: mov.b32 %[[REG:.+]], 2; +; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_52-NEXT: ret; +; +; SM_70: .visible .func (.param .b32 func_retval0) bar() +; SM_70: mov.b32 %[[REG:.+]], 1; +; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_70-NEXT: ret; +; +; SM_90: .visible .func (.param .b32 func_retval0) bar() +; SM_90: mov.b32 %[[REG:.+]], 1; +; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_90-NEXT: ret; +define i32 @bar() { +entry: + %call = call i32 @__nvvm_reflect(ptr @.str) + %cmp = icmp uge i32 %call, 700 + br i1 %cmp, label %if.then, label %if.else + +if.then: + br label %if.end + +if.else: + br label %if.end + +if.end: + %x = phi i32 [ 1, %if.then ], [ 2, %if.else ] + ret i32 %x +} + +; SM_52-NOT: valid; +; SM_70: valid; +; SM_90: valid; +define void @baz() { +entry: + %call = call i32 @__nvvm_reflect(ptr @.str) + %cmp = icmp uge i32 %call, 700 + br i1 %cmp, label %if.then, label %if.end + +if.then: + call void asm sideeffect "valid;\0A", ""() + br label %if.end + +if.end: + ret void +} + +; SM_52: .visible .func (.param .b32 func_retval0) qux() +; SM_52: mov.u32 %[[REG1:.+]], %[[REG2:.+]]; +; SM_52: st.param.b32 [func_retval0+0], %[[REG1:.+]]; +; SM_52: ret; +; SM_70: .visible .func (.param .b32 
func_retval0) qux() +; SM_70: mov.u32 %[[REG1:.+]], %[[REG2:.+]]; +; SM_70: st.param.b32 [func_retval0+0], %[[REG1:.+]]; +; SM_70: ret; +; SM_90: .visible .func (.param .b32 func_retval0) qux() +; SM_90: st.param.b32 [func_retval0+0], %[[REG1:.+]]; +; SM_90: ret; +define i32 @qux() { +entry: + %call = call i32 @__nvvm_reflect(ptr noundef @.str) + %cmp = icmp uge i32 %call, 700 + %conv = zext i1 %cmp to i32 + switch i32 %conv, label %sw.default [ + i32 900, label %sw.bb + i32 700, label %sw.bb1 + i32 520, label %sw.bb2 + ] + +sw.bb: + br label %return + +sw.bb1: + br label %return + +sw.bb2: + br label %return + +sw.default: + br label %return + +return: + %retval = phi i32 [ 4, %sw.default ], [ 3, %sw.bb2 ], [ 2, %sw.bb1 ], [ 1, %sw.bb ] + ret i32 %retval +} diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll index e8c554c..ac5875c 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll @@ -18,4 +18,3 @@ define i32 @foo(float %a, float %b) { ; SM35: ret i32 350 ret i32 %reflect } - -- cgit v1.1 From 42230e213e11a0cf9cdbdcd49225eb0d325ef007 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 8 Feb 2024 17:17:21 -0600 Subject: [LinkerWrapper] Allow 'all' as a generic bundled architecture (#81193) Summary: Currently, the linker wrapper sorts input files into different link jobs according to their architectures. Here we assume each architecture is a unique and incompatible link job unless they are specifically marked compatible. This patch simply adds an `all` target to represent an architecture that should be linked against every single other architecture. This will be useful for modelling generic IR such as the ROCm device libraries or the NVPTX libdevice. 
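To make the new rule concrete, here is a minimal self-contained sketch of the compatibility predicate this patch changes. The pair layout mirrors `OffloadFile::TargetID` (`<triple, architecture>`), but the types and the final exact-match fallback are simplified stand-ins; the real `areTargetsCompatible` in `OffloadBinary.cpp` additionally performs AMDGPU-specific target-ID checks that are omitted here.

```cpp
#include <string>
#include <utility>

// Simplified stand-in for OffloadFile::TargetID: <triple, architecture>.
using TargetID = std::pair<std::string, std::string>;

// Sketch of the rule after this patch: images for different triples never
// link together, but an architecture of "all" (generic IR such as the ROCm
// device libraries or libdevice) joins every link job for its triple.
bool areTargetsCompatibleSketch(const TargetID &LHS, const TargetID &RHS) {
  if (LHS.first != RHS.first)
    return false; // mismatched triples are never compatible
  if (LHS.second == "all" || RHS.second == "all")
    return true; // "all" is compatible with every architecture
  return LHS.second == RHS.second; // simplified exact-match fallback
}
```

Under this rule, the `arch=all` archive member in the new test below is pulled into both the `gfx908` and the `gfx90a` link jobs, as the ARCH-ALL checks expect.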
--- clang/test/Driver/linker-wrapper.c | 16 ++++++++++++++++ llvm/lib/Object/OffloadBinary.cpp | 4 ++++ 2 files changed, 20 insertions(+) diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c index 010001b..647629a 100644 --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -172,6 +172,22 @@ __attribute__((visibility("protected"), used)) int x; // AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+ -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o // AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack- -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o +// RUN: clang-offload-packager -o %t-lib.out \ +// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=all +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out +// RUN: llvm-ar rcs %t.a %t.o +// RUN: clang-offload-packager -o %t1.out \ +// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t1.o -fembed-offload-object=%t1.out +// RUN: clang-offload-packager -o %t2.out \ +// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t2.o -fembed-offload-object=%t2.out +// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ +// RUN: --linker-path=/usr/bin/ld -- %t1.o %t2.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=ARCH-ALL + +// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o +// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o + // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu diff --git a/llvm/lib/Object/OffloadBinary.cpp b/llvm/lib/Object/OffloadBinary.cpp index 22d604b..58b9b39 100644 --- a/llvm/lib/Object/OffloadBinary.cpp +++ b/llvm/lib/Object/OffloadBinary.cpp @@ -355,6 +355,10 @@ bool object::areTargetsCompatible(const OffloadFile::TargetID &LHS, if (LHS.first != RHS.first) return false; + // If the architecture is "all" we assume it is always compatible. + if (LHS.second.equals("all") || RHS.second.equals("all")) + return true; + // Only The AMDGPU target requires additional checks. 
llvm::Triple T(LHS.first); if (!T.isAMDGPU()) -- cgit v1.1 From c429f48b56714f9366eee8490061bd008635cbc0 Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Thu, 8 Feb 2024 15:20:37 -0800 Subject: [Object][WebAssembly] Improve error on invalid relocation (#81203) See https://github.com/emscripten-core/emscripten/issues/21140 --- llvm/lib/Object/WasmObjectFile.cpp | 40 ++++++++++++++----------------- llvm/test/Object/wasm-bad-relocation.yaml | 35 +++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 22 deletions(-) create mode 100644 llvm/test/Object/wasm-bad-relocation.yaml diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index ea17154..1d68687 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ b/llvm/lib/Object/WasmObjectFile.cpp @@ -1034,6 +1034,13 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) { if (Reloc.Offset < PreviousOffset) return make_error("relocations not in offset order", object_error::parse_failed); + + auto badReloc = [&](StringRef msg) { + return make_error( + msg + ": " + Twine(Symbols[Reloc.Index].Info.Name), + object_error::parse_failed); + }; + PreviousOffset = Reloc.Offset; Reloc.Index = readVaruint32(Ctx); switch (type) { @@ -1046,18 +1053,15 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) { case wasm::R_WASM_TABLE_INDEX_REL_SLEB: case wasm::R_WASM_TABLE_INDEX_REL_SLEB64: if (!isValidFunctionSymbol(Reloc.Index)) - return make_error( - "invalid relocation function index", object_error::parse_failed); + return badReloc("invalid function relocation"); break; case wasm::R_WASM_TABLE_NUMBER_LEB: if (!isValidTableSymbol(Reloc.Index)) - return make_error("invalid relocation table index", - object_error::parse_failed); + return badReloc("invalid table relocation"); break; case wasm::R_WASM_TYPE_INDEX_LEB: if (Reloc.Index >= Signatures.size()) - return make_error("invalid relocation type index", - object_error::parse_failed); + return badReloc("invalid relocation type index"); break; case wasm::R_WASM_GLOBAL_INDEX_LEB: // R_WASM_GLOBAL_INDEX_LEB are can be used against function and data @@ -1065,18 +1069,15 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) { if (!isValidGlobalSymbol(Reloc.Index) && !isValidDataSymbol(Reloc.Index) && !isValidFunctionSymbol(Reloc.Index)) - return make_error("invalid relocation global index", - object_error::parse_failed); + return badReloc("invalid global relocation"); break; case wasm::R_WASM_GLOBAL_INDEX_I32: if (!isValidGlobalSymbol(Reloc.Index)) - return make_error("invalid relocation global index", - object_error::parse_failed); + return badReloc("invalid global relocation"); break; case wasm::R_WASM_TAG_INDEX_LEB: if (!isValidTagSymbol(Reloc.Index)) - return make_error("invalid relocation tag index", - object_error::parse_failed); + return badReloc("invalid tag relocation"); break; case wasm::R_WASM_MEMORY_ADDR_LEB: case wasm::R_WASM_MEMORY_ADDR_SLEB: @@ -1085,8 +1086,7 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) { case wasm::R_WASM_MEMORY_ADDR_TLS_SLEB: case wasm::R_WASM_MEMORY_ADDR_LOCREL_I32: if (!isValidDataSymbol(Reloc.Index)) - return make_error("invalid relocation data index", - object_error::parse_failed); + return badReloc("invalid data relocation"); Reloc.Addend = readVarint32(Ctx); break; case wasm::R_WASM_MEMORY_ADDR_LEB64: @@ -1095,26 +1095,22 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) { case wasm::R_WASM_MEMORY_ADDR_REL_SLEB64: case 
wasm::R_WASM_MEMORY_ADDR_TLS_SLEB64: if (!isValidDataSymbol(Reloc.Index)) - return make_error("invalid relocation data index", - object_error::parse_failed); + return badReloc("invalid data relocation"); Reloc.Addend = readVarint64(Ctx); break; case wasm::R_WASM_FUNCTION_OFFSET_I32: if (!isValidFunctionSymbol(Reloc.Index)) - return make_error( - "invalid relocation function index", object_error::parse_failed); + return badReloc("invalid function relocation"); Reloc.Addend = readVarint32(Ctx); break; case wasm::R_WASM_FUNCTION_OFFSET_I64: if (!isValidFunctionSymbol(Reloc.Index)) - return make_error( - "invalid relocation function index", object_error::parse_failed); + return badReloc("invalid function relocation"); Reloc.Addend = readVarint64(Ctx); break; case wasm::R_WASM_SECTION_OFFSET_I32: if (!isValidSectionSymbol(Reloc.Index)) - return make_error( - "invalid relocation section index", object_error::parse_failed); + return badReloc("invalid section relocation"); Reloc.Addend = readVarint32(Ctx); break; default: diff --git a/llvm/test/Object/wasm-bad-relocation.yaml b/llvm/test/Object/wasm-bad-relocation.yaml new file mode 100644 index 0000000..aed405c --- /dev/null +++ b/llvm/test/Object/wasm-bad-relocation.yaml @@ -0,0 +1,35 @@ +# RUN: yaml2obj %s | not llvm-objdump -s - 2>&1 | FileCheck %s + +# Check for invalid relocations. In this case we have a relocations of type +# R_WASM_FUNCTION_INDEX_LEB against a symbol (foo) which is not a function +# symbol but a data symbol. + +# CHECK: invalid function relocation: foo + +--- !WASM +FileHeader: + Version: 0x00000001 +Sections: + - Type: DATA + Segments: + - SectionOffset: 0 + InitFlags: 0 + Offset: + Opcode: I32_CONST + Value: 0 + Content: '6401020304' + Relocations: + - Type: R_WASM_FUNCTION_INDEX_LEB + Index: 0 + Offset: 0x00000000 + - Type: CUSTOM + Name: linking + Version: 2 + SymbolTable: + - Index: 0 + Kind: DATA + Name: foo + Flags: [ ] + Segment: 0 + Offset: 0 + Size: 1 -- cgit v1.1 From 0800a36053943beabe1c3f98fe4ecccbc192a2a7 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 8 Feb 2024 17:31:47 -0600 Subject: Revert "[NVVMReflect] Force dead branch elimination in NVVMReflect (#81189)" This reverts commit 9211e67da36782db44a46ccb9ac06734ccf2570f. Summary: This seemed to crash one one of the CUDA math tests. Revert until it can be fixed. --- llvm/docs/NVPTXUsage.rst | 5 - llvm/lib/Target/NVPTX/NVVMReflect.cpp | 62 ----------- llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll | 141 ------------------------ llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll | 1 + 4 files changed, 1 insertion(+), 208 deletions(-) delete mode 100644 llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index b5e3918..22acc6c 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -296,11 +296,6 @@ pipeline, immediately after the link stage. The ``internalize`` pass is also recommended to remove unused math functions from the resulting PTX. For an input IR module ``module.bc``, the following compilation flow is recommended: -The ``NVVMReflect`` pass will attempt to remove dead code even without -optimizations. This allows potentially incompatible instructions to be avoided -at all optimizations levels. This currently only works for simple conditionals -like the above example. - 1. Save list of external functions in ``module.bc`` 2. Link ``module.bc`` with ``libdevice.compute_XX.YY.bc`` 3. 
Internalize all functions not in list from (1) diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 5283c2f..7d2678a 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -20,7 +20,6 @@ #include "NVPTX.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -37,8 +36,6 @@ #include "llvm/Support/raw_os_ostream.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include #include #define NVVM_REFLECT_FUNCTION "__nvvm_reflect" @@ -90,7 +87,6 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } SmallVector ToRemove; - SmallVector ToSimplify; // Go through the calls in this function. Each call to __nvvm_reflect or // llvm.nvvm.reflect should be a CallInst with a ConstantArray argument. @@ -175,13 +171,6 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } else if (ReflectArg == "__CUDA_ARCH") { ReflectVal = SmVersion * 10; } - - // If the immediate user is a simple comparison we want to simplify it. - // TODO: This currently does not handle switch instructions. - for (User *U : Call->users()) - if (ICmpInst *I = dyn_cast(U)) - ToSimplify.push_back(I); - Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal)); ToRemove.push_back(Call); } @@ -189,57 +178,6 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { for (Instruction *I : ToRemove) I->eraseFromParent(); - // The code guarded by __nvvm_reflect may be invalid for the target machine. - // We need to do some basic dead code elimination to trim invalid code before - // it reaches the backend at all optimization levels. - SmallVector Simplified; - for (ICmpInst *Cmp : ToSimplify) { - Constant *LHS = dyn_cast(Cmp->getOperand(0)); - Constant *RHS = dyn_cast(Cmp->getOperand(1)); - - if (!LHS || !RHS) - continue; - - // If the comparison is a compile time constant we simply propagate it. - Constant *C = ConstantFoldCompareInstOperands( - Cmp->getPredicate(), LHS, RHS, Cmp->getModule()->getDataLayout()); - - if (!C) - continue; - - for (User *U : Cmp->users()) - if (BranchInst *I = dyn_cast(U)) - Simplified.push_back(I); - - Cmp->replaceAllUsesWith(C); - Cmp->eraseFromParent(); - } - - // Each instruction here is a conditional branch off of a constant true or - // false value. Simply replace it with an unconditional branch to the - // appropriate basic block and delete the rest if it is trivially dead. - DenseSet Removed; - for (BranchInst *Branch : Simplified) { - if (Removed.contains(Branch)) - continue; - - ConstantInt *C = dyn_cast(Branch->getCondition()); - if (!C || (!C->isOne() && !C->isZero())) - continue; - - BasicBlock *TrueBB = - C->isOne() ? Branch->getSuccessor(0) : Branch->getSuccessor(1); - BasicBlock *FalseBB = - C->isOne() ? 
Branch->getSuccessor(1) : Branch->getSuccessor(0); - - ReplaceInstWithInst(Branch, BranchInst::Create(TrueBB)); - if (FalseBB->use_empty() && FalseBB->hasNPredecessors(0) && - FalseBB->getFirstNonPHIOrDbg()) { - Removed.insert(FalseBB->getFirstNonPHIOrDbg()); - changeToUnreachable(FalseBB->getFirstNonPHIOrDbg()); - } - } - return ToRemove.size() > 0; } diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll deleted file mode 100644 index c9586d5..0000000 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll +++ /dev/null @@ -1,141 +0,0 @@ -; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 -O0 | FileCheck %s --check-prefix=SM_52 -; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 -O0 | FileCheck %s --check-prefix=SM_70 -; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx72 -O0 | FileCheck %s --check-prefix=SM_90 - -@.str = private unnamed_addr constant [12 x i8] c"__CUDA_ARCH\00" - -declare i32 @__nvvm_reflect(ptr) - -; SM_52: .visible .func (.param .b32 func_retval0) foo() -; SM_52: mov.b32 %[[REG:.+]], 3; -; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; -; SM_52-NEXT: ret; -; -; SM_70: .visible .func (.param .b32 func_retval0) foo() -; SM_70: mov.b32 %[[REG:.+]], 2; -; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; -; SM_70-NEXT: ret; -; -; SM_90: .visible .func (.param .b32 func_retval0) foo() -; SM_90: mov.b32 %[[REG:.+]], 1; -; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; -; SM_90-NEXT: ret; -define i32 @foo() { -entry: - %call = call i32 @__nvvm_reflect(ptr @.str) - %cmp = icmp uge i32 %call, 900 - br i1 %cmp, label %if.then, label %if.else - -if.then: - br label %return - -if.else: - %call1 = call i32 @__nvvm_reflect(ptr @.str) - %cmp2 = icmp uge i32 %call1, 700 - br i1 %cmp2, label %if.then3, label %if.else4 - -if.then3: - br label %return - -if.else4: - %call5 = call i32 @__nvvm_reflect(ptr @.str) - %cmp6 = icmp uge i32 %call5, 520 - br i1 %cmp6, label %if.then7, label %if.else8 - -if.then7: - br label %return - -if.else8: - br label %return - -return: - %retval.0 = phi i32 [ 1, %if.then ], [ 2, %if.then3 ], [ 3, %if.then7 ], [ 4, %if.else8 ] - ret i32 %retval.0 -} - -; SM_52: .visible .func (.param .b32 func_retval0) bar() -; SM_52: mov.b32 %[[REG:.+]], 2; -; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; -; SM_52-NEXT: ret; -; -; SM_70: .visible .func (.param .b32 func_retval0) bar() -; SM_70: mov.b32 %[[REG:.+]], 1; -; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; -; SM_70-NEXT: ret; -; -; SM_90: .visible .func (.param .b32 func_retval0) bar() -; SM_90: mov.b32 %[[REG:.+]], 1; -; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; -; SM_90-NEXT: ret; -define i32 @bar() { -entry: - %call = call i32 @__nvvm_reflect(ptr @.str) - %cmp = icmp uge i32 %call, 700 - br i1 %cmp, label %if.then, label %if.else - -if.then: - br label %if.end - -if.else: - br label %if.end - -if.end: - %x = phi i32 [ 1, %if.then ], [ 2, %if.else ] - ret i32 %x -} - -; SM_52-NOT: valid; -; SM_70: valid; -; SM_90: valid; -define void @baz() { -entry: - %call = call i32 @__nvvm_reflect(ptr @.str) - %cmp = icmp uge i32 %call, 700 - br i1 %cmp, label %if.then, label %if.end - -if.then: - call void asm sideeffect "valid;\0A", ""() - br label %if.end - -if.end: - ret void -} - -; SM_52: .visible .func (.param .b32 func_retval0) qux() -; SM_52: mov.u32 %[[REG1:.+]], %[[REG2:.+]]; -; SM_52: st.param.b32 [func_retval0+0], %[[REG1:.+]]; -; SM_52: ret; -; SM_70: .visible .func (.param .b32 
func_retval0) qux() -; SM_70: mov.u32 %[[REG1:.+]], %[[REG2:.+]]; -; SM_70: st.param.b32 [func_retval0+0], %[[REG1:.+]]; -; SM_70: ret; -; SM_90: .visible .func (.param .b32 func_retval0) qux() -; SM_90: st.param.b32 [func_retval0+0], %[[REG1:.+]]; -; SM_90: ret; -define i32 @qux() { -entry: - %call = call i32 @__nvvm_reflect(ptr noundef @.str) - %cmp = icmp uge i32 %call, 700 - %conv = zext i1 %cmp to i32 - switch i32 %conv, label %sw.default [ - i32 900, label %sw.bb - i32 700, label %sw.bb1 - i32 520, label %sw.bb2 - ] - -sw.bb: - br label %return - -sw.bb1: - br label %return - -sw.bb2: - br label %return - -sw.default: - br label %return - -return: - %retval = phi i32 [ 4, %sw.default ], [ 3, %sw.bb2 ], [ 2, %sw.bb1 ], [ 1, %sw.bb ] - ret i32 %retval -} diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll index ac5875c..e8c554c 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll @@ -18,3 +18,4 @@ define i32 @foo(float %a, float %b) { ; SM35: ret i32 350 ret i32 %reflect } + -- cgit v1.1 From 2572f45c7d6c081ba9b4fa344e928182f8df7773 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Thu, 8 Feb 2024 15:43:18 -0800 Subject: [flang] Fix missing generated header Fix buildbot failing because of missing HLFIRTypes.h.inc --- flang/lib/Optimizer/Dialect/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/flang/lib/Optimizer/Dialect/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CMakeLists.txt index 745439b..58a4276 100644 --- a/flang/lib/Optimizer/Dialect/CMakeLists.txt +++ b/flang/lib/Optimizer/Dialect/CMakeLists.txt @@ -13,6 +13,7 @@ add_flang_library(FIRDialect CanonicalizationPatternsIncGen MLIRIR FIROpsIncGen + HLFIROpsIncGen intrinsics_gen LINK_LIBS -- cgit v1.1 From 93471466be65cf78330782d461a821ffb82e070a Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Fri, 9 Feb 2024 11:41:04 +1100 Subject: Document use of `skip-precommit-approval` label for non-review pull requests (#81053) Derived from this discussion: https://discourse.llvm.org/t/prs-without-approvals-muddy-the-waters/76656 --- llvm/docs/CodeReview.rst | 6 ++++++ llvm/docs/GitHub.rst | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/llvm/docs/CodeReview.rst b/llvm/docs/CodeReview.rst index f1d5b6c..e3a7494 100644 --- a/llvm/docs/CodeReview.rst +++ b/llvm/docs/CodeReview.rst @@ -103,6 +103,12 @@ ready to be committed. Specifically, once a patch is sent out for review, it needs an explicit approval before it is committed. Do not assume silent approval, or solicit objections to a patch with a deadline. +.. note:: + If you are using a Pull Request for purposes other than review + (eg: precommit CI results, convenient web-based reverts, etc) + `skip-precommit-approval` + label to the PR. + Acknowledge All Reviewer Feedback --------------------------------- diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst index c3fbb64..51a7310 100644 --- a/llvm/docs/GitHub.rst +++ b/llvm/docs/GitHub.rst @@ -30,6 +30,12 @@ describes the typical workflow of creating a Pull Request and getting it reviewe and accepted. This is meant as an overview of the GitHub workflow, for complete documentation refer to `GitHub's documentation `_. +.. note:: + If you are using a Pull Request for purposes other than review + (eg: precommit CI results, convenient web-based reverts, etc) + `skip-precommit-approval` + label to the PR. 
+ GitHub Tools ------------ You can interact with GitHub in several ways: via git command line tools, -- cgit v1.1 From 1d4fc381d3da4317cc2cfa59b2d59d53decddf71 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Thu, 8 Feb 2024 16:48:04 -0800 Subject: [DWARFVerifier] Fix verification of empty line tables (#81162) A line table whose sole entry is an end sequence should not have the entry's file index verified, as that value corresponds to the initial value of the state machine, not to a real file index. In DWARF 5, this is particularly problematic as it uses 0-based indexing, and the state machine specifies a starting index of 1; in other words, you'd need to have _two_ files before such index became legal "by default". A previous attempt to fix this problem was done [1], but it was too specific in its condition, and did not capture all possible cases where this issue can happen. [1]: https://github.com/llvm/llvm-project/pull/77004 --- llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp | 13 +++-- .../X86/verify_empty_debug_line_sequence.yaml | 55 ++++++++++++++++++++++ 2 files changed, 61 insertions(+), 7 deletions(-) create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_empty_debug_line_sequence.yaml diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 2124ff8..b523576 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -1025,6 +1025,11 @@ void DWARFVerifier::verifyDebugLineRows() { FileIndex++; } + // Nothing to verify in a line table with a single row containing the end + // sequence. + if (LineTable->Rows.size() == 1 && LineTable->Rows.front().EndSequence) + continue; + // Verify rows. uint64_t PrevAddress = 0; uint32_t RowIndex = 0; @@ -1048,13 +1053,7 @@ void DWARFVerifier::verifyDebugLineRows() { }); } - // If the prologue contains no file names and the line table has only one - // row, do not verify the file index, this is a line table of an empty - // file with an end_sequence, but the DWARF standard sets the file number - // to 1 by default, otherwise verify file index. - if ((LineTable->Prologue.FileNames.size() || - LineTable->Rows.size() != 1) && - !LineTable->hasFileAtIndex(Row.File)) { + if (!LineTable->hasFileAtIndex(Row.File)) { ++NumDebugLineErrors; ErrorCategory.Report("Invalid file index in debug_line", [&]() { error() << ".debug_line[" diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_empty_debug_line_sequence.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_empty_debug_line_sequence.yaml new file mode 100644 index 0000000..1bab2c2 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_empty_debug_line_sequence.yaml @@ -0,0 +1,55 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: llvm-dwarfdump -debug-line -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_line... +# CHECK: No errors + +# In a line table like the one below, with no rows (other than the +# end_sequence), we should never verify the file index because the state +# machine initializes the file index to 1, which is invalid in DWARF 5 due to +# its 0-based indexing. 
+ +# file_names[ 0]: +# name: "/home/umb/tests_2018/106_rnglists2" +# dir_index: 0 +# Address Line Column File ISA Discriminator OpIndex Flags +# ------------------ ------ ------ ------ --- ------------- ------- ------------- +# 0x0000000000000000 1 0 1 0 0 0 is_stmt end_sequence + + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_stmt_list + Form: DW_FORM_sec_offset + debug_info: + - Length: 0xd + Version: 5 + UnitType: DW_UT_compile + AbbrevTableID: 0 + AbbrOffset: 0x0 + AddrSize: 8 + Entries: + - AbbrCode: 0x1 + Values: + - Value: 0x0 +Sections: + - Name: .debug_line + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 300000000500080025000000010101fb0e0d00010101010000000100000101011f010000000002011f020b010000000000000101 + - Name: .debug_line_str + Type: SHT_PROGBITS + Flags: [ SHF_MERGE, SHF_STRINGS ] + AddressAlign: 0x1 + Content: 2F686F6D652F756D622F74657374735F323031382F3130365F726E676C697374733200746573742E63707000 -- cgit v1.1 From 1389260805ec2ffb74a4fb311e7327c64d9b8f54 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Thu, 8 Feb 2024 16:51:28 -0800 Subject: [JITLink][MachO][arm64] Fix error-check order. The error check should be performed after the iterator increment, not before it. Thanks to @dcb314 for catching this! Fixes github.com/apple/swift/issues/81119 --- llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp index 809b2d5..556031b 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp @@ -312,10 +312,10 @@ private: Addend = SignExtend64(RI.r_symbolnum, 24); + ++RelItr; if (RelItr == RelEnd) return make_error("Unpaired Addend reloc at " + formatv("{0:x16}", FixupAddress)); - ++RelItr; RI = getRelocationInfo(RelItr); MachORelocKind = getRelocationKind(RI); -- cgit v1.1 From fbf43b0121006e371fbf50ad8642e4c62405e5cc Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Thu, 8 Feb 2024 20:13:27 -0500 Subject: [libc] Only declare float128 math functions in the generated math.h if float128 type is supported. 
(#81010) --- libc/include/math.h.def | 2 ++ libc/spec/spec.td | 4 ++++ libc/spec/stdc.td | 18 +++++++++--------- libc/utils/HdrGen/PublicAPICommand.cpp | 15 ++++++++++++++- 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/libc/include/math.h.def b/libc/include/math.h.def index 813bb72..927e2d6 100644 --- a/libc/include/math.h.def +++ b/libc/include/math.h.def @@ -11,6 +11,8 @@ #include <__llvm-libc-common.h> #include +#include + %%public_api() diff --git a/libc/spec/spec.td b/libc/spec/spec.td index 0b557c8..aebf495 100644 --- a/libc/spec/spec.td +++ b/libc/spec/spec.td @@ -176,6 +176,10 @@ class FunctionSpec args> { list Args = args; } +class GuardedFunctionSpec args, string guard_macro> : FunctionSpec { + string Guard = guard_macro; +} + class ObjectSpec { string Name = name; string Type = type; diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 97dabbc..e37f95a 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -359,17 +359,17 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"copysign", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"copysignf", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"copysignl", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"copysignf128", RetValSpec, [ArgSpec, ArgSpec]>, + GuardedFunctionSpec<"copysignf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"ceil", RetValSpec, [ArgSpec]>, FunctionSpec<"ceilf", RetValSpec, [ArgSpec]>, FunctionSpec<"ceill", RetValSpec, [ArgSpec]>, - FunctionSpec<"ceilf128", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"ceilf128", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"fabs", RetValSpec, [ArgSpec]>, FunctionSpec<"fabsf", RetValSpec, [ArgSpec]>, FunctionSpec<"fabsl", RetValSpec, [ArgSpec]>, - FunctionSpec<"fabsf128", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"fabsf128", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"fdim", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"fdimf", RetValSpec, [ArgSpec, ArgSpec]>, @@ -378,17 +378,17 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"floor", RetValSpec, [ArgSpec]>, FunctionSpec<"floorf", RetValSpec, [ArgSpec]>, FunctionSpec<"floorl", RetValSpec, [ArgSpec]>, - FunctionSpec<"floorf128", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"floorf128", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"fmin", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"fminf", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"fminl", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fminf128", RetValSpec, [ArgSpec, ArgSpec]>, + GuardedFunctionSpec<"fminf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"fmax", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"fmaxf", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"fmaxl", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fmaxf128", RetValSpec, [ArgSpec, ArgSpec]>, + GuardedFunctionSpec<"fmaxf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"fma", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, FunctionSpec<"fmaf", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, @@ -461,7 +461,7 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"round", RetValSpec, [ArgSpec]>, FunctionSpec<"roundf", RetValSpec, [ArgSpec]>, FunctionSpec<"roundl", RetValSpec, [ArgSpec]>, - FunctionSpec<"roundf128", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"roundf128", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"lround", RetValSpec, [ArgSpec]>, FunctionSpec<"lroundf", RetValSpec, [ArgSpec]>, @@ 
-486,12 +486,12 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"sqrt", RetValSpec, [ArgSpec]>, FunctionSpec<"sqrtf", RetValSpec, [ArgSpec]>, FunctionSpec<"sqrtl", RetValSpec, [ArgSpec]>, - FunctionSpec<"sqrtf128", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"sqrtf128", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"trunc", RetValSpec, [ArgSpec]>, FunctionSpec<"truncf", RetValSpec, [ArgSpec]>, FunctionSpec<"truncl", RetValSpec, [ArgSpec]>, - FunctionSpec<"truncf128", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"truncf128", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"nearbyint", RetValSpec, [ArgSpec]>, FunctionSpec<"nearbyintf", RetValSpec, [ArgSpec]>, diff --git a/libc/utils/HdrGen/PublicAPICommand.cpp b/libc/utils/HdrGen/PublicAPICommand.cpp index b1c7a07..cf6984b 100644 --- a/libc/utils/HdrGen/PublicAPICommand.cpp +++ b/libc/utils/HdrGen/PublicAPICommand.cpp @@ -102,6 +102,14 @@ void writeAPIFromIndex(APIIndexer &G, llvm::Record *RetValSpec = FunctionSpec->getValueAsDef("Return"); llvm::Record *ReturnType = RetValSpec->getValueAsDef("ReturnType"); + // TODO: https://github.com/llvm/llvm-project/issues/81208 + // Ideally, we should group functions based on their guarding macros. + bool Guarded = + (FunctionSpec->getType()->getAsString() == "GuardedFunctionSpec"); + + if (Guarded) + OS << "#ifdef " << FunctionSpec->getValueAsString("Guard") << "\n"; + OS << G.getTypeAsString(ReturnType) << " " << Name << "("; auto ArgsList = FunctionSpec->getValueAsListOfDefs("Args"); @@ -112,7 +120,12 @@ void writeAPIFromIndex(APIIndexer &G, OS << ", "; } - OS << ") __NOEXCEPT;\n\n"; + OS << ") __NOEXCEPT;\n"; + + if (Guarded) + OS << "#endif // " << FunctionSpec->getValueAsString("Guard") << "\n"; + + OS << "\n"; } // Make another pass over entrypoints to emit object declarations. -- cgit v1.1 From 4759890f859277cd798648a9a333573cd088d98a Mon Sep 17 00:00:00 2001 From: Alexey Z Date: Thu, 8 Feb 2024 20:22:27 -0500 Subject: [mlir][tensor] Fix bug in insert_slice canonical. with tensor encoding (#81045) Previously, `InsertSliceOpSourceCastInserter` was incorrectly applied to a case when tensor types have an encoding attribute attached to them. The type `newSrcType` was missing that attribute from the old `srcType`, which made the expression `srcType == newSrcType` false, since `tensor<2x2xf32, "foo">` is not equal to `tensor<2x2xf32>`. That lead to an endless back and forth between `InsertSliceOpSourceCastInserter` that would introduce a cast and `InsertSliceOpCastFolder` that would remove it right after. 
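The failure mode is easy to reproduce with a toy model of the type equality involved. The struct below is an illustrative stand-in for `RankedTensorType`, not MLIR's API; it shows why rebuilding the source type without its encoding made the bail-out comparison permanently false, and why carrying the encoding over restores the fixed point.

```cpp
#include <cassert>
#include <string>
#include <vector>

// Toy stand-in for RankedTensorType: shape + element type + encoding
// (an empty string models "no encoding").
struct ToyTensorType {
  std::vector<long> shape;
  std::string elementType;
  std::string encoding;
  bool operator==(const ToyTensorType &o) const {
    return shape == o.shape && elementType == o.elementType &&
           encoding == o.encoding;
  }
};

int main() {
  ToyTensorType src{{2, 2}, "f32", "foo"}; // models tensor<2x2xf32, "foo">
  // Before the fix: the rebuilt source type dropped the encoding, so the
  // "nothing to do" check (src == rebuilt) never fired and the pattern kept
  // inserting casts that the cast folder immediately removed.
  ToyTensorType rebuiltOld{src.shape, src.elementType, /*encoding=*/""};
  assert(!(src == rebuiltOld));
  // After the fix: the encoding is preserved, equality holds, and the
  // pattern bails out instead of looping.
  ToyTensorType rebuiltNew{src.shape, src.elementType, src.encoding};
  assert(src == rebuiltNew);
  return 0;
}
```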
--- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 4 ++-- mlir/test/Dialect/Tensor/canonicalize.mlir | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index b21e89a..8298cf1 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -2663,8 +2663,8 @@ struct InsertSliceOpSourceCastInserter final if (!hasValidSizesOffsets(newSrcShape)) return failure(); - RankedTensorType newSrcType = - RankedTensorType::get(newSrcShape, srcType.getElementType()); + RankedTensorType newSrcType = RankedTensorType::get( + newSrcShape, srcType.getElementType(), srcType.getEncoding()); if (srcType == newSrcType || !preservesStaticInformation(srcType, newSrcType) || !tensor::CastOp::areCastCompatible(srcType, newSrcType)) diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index 7192a71..90c715b 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -555,6 +555,24 @@ func.func @insert_slice_canonicalize(%arg0 : tensor, %arg1 : index, // ----- +// Do not insert a cast for the following example. The new source type wouldn't be "more static" than the old one. +func.func @insert_slice_canonicalize_encoding(%arg0 : tensor<2x2xf32, "foo">, + %arg1 : tensor<4x4xf32, "foo">) -> tensor<4x4xf32, "foo"> +{ + %0 = tensor.insert_slice %arg0 into %arg1[0, 0] [2, 2] [1, 1] : tensor<2x2xf32, "foo"> into tensor<4x4xf32, "foo"> + return %0 : tensor<4x4xf32, "foo"> +} +// CHECK-LABEL: func @insert_slice_canonicalize_encoding +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<2x2xf32, "foo"> +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: tensor<4x4xf32, "foo"> +// CHECK-NOT: tensor.cast +// CHECK: %[[RESULT:.+]] = tensor.insert_slice %[[ARG0]] into %[[ARG1]] +// CHECK-SAME: [0, 0] [2, 2] [1, 1] +// CHECK-SAME: : tensor<2x2xf32, "foo"> into tensor<4x4xf32, "foo"> +// CHECK: return %[[RESULT]] + +// ----- + func.func @slice_to_insert_slice_canonicalize(%arg0 : tensor, %arg1 : index, %arg2 : index, %arg3 : tensor) -> tensor { -- cgit v1.1 From 6e1f438528b6e1ece2f6ef331905c352ccc2fcfd Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 8 Feb 2024 17:35:45 -0800 Subject: [ELF] Improve --ro-rosegment/--omagic/--nmagic tests Notably, test that --ro-rosegment with a linker script may unnecessarily make a read-only PT_LOAD executable. 
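The "unnecessarily executable" hazard falls out of how segment permissions are computed: a PT_LOAD's flags are effectively the union of the flags of the output sections placed in it. The toy model below illustrates only that effect; it is not lld's actual layout code, which also decides where segments break based on flag transitions and the rosegment setting.

```cpp
#include <cstdint>
#include <vector>

// Illustrative permission bits mirroring ELF's PF_* values.
constexpr uint32_t PF_X = 1, PF_W = 2, PF_R = 4;

// Toy model: a segment's permissions are the union of the permissions of
// every output section assigned to it.
uint32_t segmentFlags(const std::vector<uint32_t> &sectionFlags) {
  uint32_t Flags = 0;
  for (uint32_t F : sectionFlags)
    Flags |= F;
  return Flags;
}

int main() {
  // If a linker script places a read-only section into the same segment as
  // code, the read-only bytes inherit execute permission for free.
  uint32_t Mixed = segmentFlags({PF_R, PF_R | PF_X});
  return Mixed == (PF_R | PF_X) ? 0 : 1; // the segment ends up R|X
}
```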
--- lld/test/ELF/segments.s | 200 ++++++++++++++---------------------------------- 1 file changed, 59 insertions(+), 141 deletions(-) diff --git a/lld/test/ELF/segments.s b/lld/test/ELF/segments.s index d9af9a3..614f6e7 100644 --- a/lld/test/ELF/segments.s +++ b/lld/test/ELF/segments.s @@ -1,157 +1,75 @@ # REQUIRES: x86 -# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o -# RUN: ld.lld %t -o %t1 -# RUN: llvm-readobj --program-headers %t1 | FileCheck --check-prefix=ROSEGMENT %s -# RUN: ld.lld --no-rosegment --rosegment %t -o - | cmp - %t1 -# RUN: ld.lld --omagic --no-omagic %t -o - | cmp - %t1 +# RUN: ld.lld a.o -o a +# RUN: llvm-readelf -l a | FileCheck --check-prefix=ROSEGMENT %s +# RUN: ld.lld --no-rosegment --rosegment a.o -o - | cmp - a +# RUN: ld.lld --omagic --no-omagic a.o -o - | cmp - a -# ROSEGMENT: ProgramHeader { -# ROSEGMENT: Type: PT_LOAD -# ROSEGMENT-NEXT: Offset: 0x0 -# ROSEGMENT-NEXT: VirtualAddress: -# ROSEGMENT-NEXT: PhysicalAddress: -# ROSEGMENT-NEXT: FileSize: -# ROSEGMENT-NEXT: MemSize: -# ROSEGMENT-NEXT: Flags [ -# ROSEGMENT-NEXT: PF_R -# ROSEGMENT-NEXT: ] -# ROSEGMENT-NEXT: Alignment: 4096 -# ROSEGMENT-NEXT: } -# ROSEGMENT-NEXT: ProgramHeader { -# ROSEGMENT-NEXT: Type: PT_LOAD -# ROSEGMENT-NEXT: Offset: 0x15C -# ROSEGMENT-NEXT: VirtualAddress: -# ROSEGMENT-NEXT: PhysicalAddress: -# ROSEGMENT-NEXT: FileSize: -# ROSEGMENT-NEXT: MemSize: -# ROSEGMENT-NEXT: Flags [ -# ROSEGMENT-NEXT: PF_R -# ROSEGMENT-NEXT: PF_X -# ROSEGMENT-NEXT: ] -# ROSEGMENT-NEXT: Alignment: 4096 -# ROSEGMENT-NEXT: } -# ROSEGMENT-NEXT: ProgramHeader { -# ROSEGMENT-NEXT: Type: PT_LOAD -# ROSEGMENT-NEXT: Offset: 0x15E -# ROSEGMENT-NEXT: VirtualAddress: -# ROSEGMENT-NEXT: PhysicalAddress: -# ROSEGMENT-NEXT: FileSize: 1 -# ROSEGMENT-NEXT: MemSize: 1 -# ROSEGMENT-NEXT: Flags [ -# ROSEGMENT-NEXT: PF_R -# ROSEGMENT-NEXT: PF_W -# ROSEGMENT-NEXT: ] -# ROSEGMENT-NEXT: Alignment: 4096 -# ROSEGMENT-NEXT: } +# ROSEGMENT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# ROSEGMENT-NEXT: PHDR 0x000040 0x0000000000200040 0x0000000000200040 0x000118 0x000118 R 0x8 +# ROSEGMENT-NEXT: LOAD 0x000000 0x0000000000200000 0x0000000000200000 0x00015a 0x00015a R 0x1000 +# ROSEGMENT-NEXT: LOAD 0x00015c 0x000000000020115c 0x000000000020115c 0x000003 0x000003 R E 0x1000 +# ROSEGMENT-NEXT: LOAD 0x00015f 0x000000000020215f 0x000000000020215f 0x000002 0x000002 RW 0x1000 +# ROSEGMENT-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 -# RUN: ld.lld --no-rosegment %t -o %t2 -# RUN: llvm-readobj --program-headers %t2 | FileCheck --check-prefix=NOROSEGMENT %s +# RUN: ld.lld --no-rosegment a.o -o noro +# RUN: llvm-readelf -l noro | FileCheck --check-prefix=NOROSEGMENT %s -# NOROSEGMENT: ProgramHeader { -# NOROSEGMENT: Type: PT_LOAD -# NOROSEGMENT-NEXT: Offset: 0x0 -# NOROSEGMENT-NEXT: VirtualAddress: -# NOROSEGMENT-NEXT: PhysicalAddress: -# NOROSEGMENT-NEXT: FileSize: -# NOROSEGMENT-NEXT: MemSize: -# NOROSEGMENT-NEXT: Flags [ -# NOROSEGMENT-NEXT: PF_R -# NOROSEGMENT-NEXT: PF_X -# NOROSEGMENT-NEXT: ] -# NOROSEGMENT-NEXT: Alignment: 4096 -# NOROSEGMENT-NEXT: } -# NOROSEGMENT-NEXT: ProgramHeader { -# NOROSEGMENT-NEXT: Type: PT_LOAD -# NOROSEGMENT-NEXT: Offset: 0x126 -# NOROSEGMENT-NEXT: VirtualAddress: -# NOROSEGMENT-NEXT: PhysicalAddress: -# NOROSEGMENT-NEXT: FileSize: -# NOROSEGMENT-NEXT: MemSize: -# NOROSEGMENT-NEXT: Flags [ -# NOROSEGMENT-NEXT: PF_R -# NOROSEGMENT-NEXT: PF_W 
-# NOROSEGMENT-NEXT: ] -# NOROSEGMENT-NEXT: Alignment: 4096 -# NOROSEGMENT-NEXT: } -# NOROSEGMENT-NEXT: ProgramHeader { -# NOROSEGMENT-NEXT: Type: PT_GNU_STACK +# NOROSEGMENT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# NOROSEGMENT-NEXT: PHDR 0x000040 0x0000000000200040 0x0000000000200040 0x0000e0 0x0000e0 R 0x8 +# NOROSEGMENT-NEXT: LOAD 0x000000 0x0000000000200000 0x0000000000200000 0x000127 0x000127 R E 0x1000 +# NOROSEGMENT-NEXT: LOAD 0x000127 0x0000000000201127 0x0000000000201127 0x000002 0x000002 RW 0x1000 +# NOROSEGMENT-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 -# RUN: ld.lld -N %t -o %t3 -# RUN: llvm-readobj --program-headers %t3 | FileCheck --check-prefix=OMAGIC %s -# RUN: ld.lld --omagic %t -o %t3 -# RUN: llvm-readobj --program-headers %t3 | FileCheck --check-prefix=OMAGIC %s +# RUN: ld.lld --no-rosegment a.o -T a.lds -o noro1 +# RUN: llvm-readelf -l noro1 | FileCheck --check-prefix=NOROSEGMENT1 %s -# OMAGIC: ProgramHeader { -# OMAGIC: Type: PT_LOAD -# OMAGIC-NEXT: Offset: 0xB0 -# OMAGIC-NEXT: VirtualAddress: -# OMAGIC-NEXT: PhysicalAddress: -# OMAGIC-NEXT: FileSize: -# OMAGIC-NEXT: MemSize: -# OMAGIC-NEXT: Flags [ -# OMAGIC-NEXT: PF_R -# OMAGIC-NEXT: PF_W -# OMAGIC-NEXT: PF_X -# OMAGIC-NEXT: ] -# OMAGIC-NEXT: Alignment: 4 -# OMAGIC-NEXT: } -# OMAGIC-NEXT: ProgramHeader { -# OMAGIC-NEXT: Type: PT_GNU_STACK +# NOROSEGMENT1: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# NOROSEGMENT1-NEXT: LOAD 0x001000 0x0000000000000000 0x0000000000000000 0x000007 0x000007 R E 0x1000 +# NOROSEGMENT1-NEXT: LOAD 0x001007 0x0000000000000007 0x0000000000000007 0x000001 0x000001 RW 0x1000 +# NOROSEGMENT1-NEXT: LOAD 0x001008 0x0000000000000008 0x0000000000000008 0x000001 0x000001 R E 0x1000 +# NOROSEGMENT1-NEXT: LOAD 0x001009 0x0000000000000009 0x0000000000000009 0x000001 0x000001 RW 0x1000 +# NOROSEGMENT1-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 -# RUN: ld.lld -n %t -o %t4 -# RUN: llvm-readobj --program-headers %t4 | FileCheck --check-prefix=NMAGIC %s -# RUN: ld.lld --nmagic %t -o %t4 -# RUN: llvm-readobj --program-headers %t4 | FileCheck --check-prefix=NMAGIC %s +# RUN: ld.lld -N a.o -o omagic +# RUN: llvm-readelf -l omagic | FileCheck --check-prefix=OMAGIC %s +# RUN: ld.lld --omagic a.o -o - | cmp - omagic -# NMAGIC: ProgramHeader { -# NMAGIC-NEXT: Type: PT_LOAD -# NMAGIC-NEXT: Offset: 0x120 -# NMAGIC-NEXT: VirtualAddress: -# NMAGIC-NEXT: PhysicalAddress: -# NMAGIC-NEXT: FileSize: 1 -# NMAGIC-NEXT: MemSize: 1 -# NMAGIC-NEXT: Flags [ -# NMAGIC-NEXT: PF_R -# NMAGIC-NEXT: ] -# NMAGIC-NEXT: Alignment: 1 -# NMAGIC-NEXT: } -# NMAGIC-NEXT: ProgramHeader { -# NMAGIC-NEXT: Type: PT_LOAD -# NMAGIC-NEXT: Offset: 0x124 -# NMAGIC-NEXT: VirtualAddress: -# NMAGIC-NEXT: PhysicalAddress: -# NMAGIC-NEXT: FileSize: 2 -# NMAGIC-NEXT: MemSize: 2 -# NMAGIC-NEXT: Flags [ -# NMAGIC-NEXT: PF_R -# NMAGIC-NEXT: PF_X -# NMAGIC-NEXT: ] -# NMAGIC-NEXT: Alignment: 4 -# NMAGIC-NEXT: } -# NMAGIC-NEXT: ProgramHeader { -# NMAGIC-NEXT: Type: PT_LOAD (0x1) -# NMAGIC-NEXT: Offset: 0x126 -# NMAGIC-NEXT: VirtualAddress: -# NMAGIC-NEXT: PhysicalAddress: -# NMAGIC-NEXT: FileSize: 1 -# NMAGIC-NEXT: MemSize: 1 -# NMAGIC-NEXT: Flags [ -# NMAGIC-NEXT: PF_R -# NMAGIC-NEXT: PF_W -# NMAGIC-NEXT: ] -# NMAGIC-NEXT: Alignment: 1 -# NMAGIC-NEXT: } +# OMAGIC: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# OMAGIC-NEXT: LOAD 0x0000b0 0x00000000002000b0 0x00000000002000b0 0x000009 0x000009 RWE 0x4 +# OMAGIC-NEXT: GNU_STACK 0x000000 
0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 +# RUN: ld.lld -n a.o -o nmagic +# RUN: llvm-readelf -l nmagic | FileCheck --check-prefix=NMAGIC %s +# RUN: ld.lld --nmagic a.o -o - | cmp nmagic - + +# NMAGIC: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# NMAGIC-NEXT: LOAD 0x000120 0x0000000000200120 0x0000000000200120 0x000002 0x000002 R 0x1 +# NMAGIC-NEXT: LOAD 0x000124 0x0000000000200124 0x0000000000200124 0x000003 0x000003 R E 0x4 +# NMAGIC-NEXT: LOAD 0x000127 0x0000000000200127 0x0000000000200127 0x000002 0x000002 RW 0x1 +# NMAGIC-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 + +#--- a.s .global _start _start: nop -.section .ro,"a" -nop +.section .ro1,"a"; .byte 1 +.section .rw1,"aw"; .byte 3 +.section .rx1,"ax"; .byte 2 -.section .rw,"aw" -nop +.section .ro2,"a"; .byte 1 +.section .rw2,"aw"; .byte 3 +.section .rx2,"ax"; .byte 2 -.section .rx,"ax" -nop +#--- a.lds +SECTIONS { + .ro1 : {} + .text : {} + .rx : { *(.rx*) } + .rw1 : {} + .ro2 : {} + .rw2 : {} +} -- cgit v1.1 From f60826917aff102450a470dee85208fd578685c4 Mon Sep 17 00:00:00 2001 From: Abdurrahman Akkas Date: Thu, 8 Feb 2024 17:50:41 -0800 Subject: [MLIR] Fix a small formatting issue in AsmPrinter.cpp (#81214) Introduced in 76ce4736721a6e9030210bda6df0ad8a8f478a19 --- mlir/lib/IR/AsmPrinter.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp index 6b8b747..6bed909 100644 --- a/mlir/lib/IR/AsmPrinter.cpp +++ b/mlir/lib/IR/AsmPrinter.cpp @@ -74,11 +74,9 @@ MLIRContext *AsmParser::getContext() const { return getBuilder().getContext(); } /// Parse a type list. /// This is out-of-line to work-around https://github.com/llvm/llvm-project/issues/62918 ParseResult AsmParser::parseTypeList(SmallVectorImpl &result) { - return parseCommaSeparatedList( - [&]() { return parseType(result.emplace_back()); }); - } - - + return parseCommaSeparatedList( + [&]() { return parseType(result.emplace_back()); }); +} //===----------------------------------------------------------------------===// // DialectAsmPrinter -- cgit v1.1 From ffabcbcf8f9fc7ef5fd29e2a711f95aba0ef7808 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 8 Feb 2024 17:16:31 -0600 Subject: [NVVMReflect][Reland] Force dead branch elimination in NVVMReflect (#81189) Summary: The `__nvvm_reflect` function is used to guard invalid code that varies between architectures. One problem with this feature is that if it is used without optimizations, it will leave invalid code in the module that will then make it to the backend. The `__nvvm_reflect` pass is already mandatory, so it should do some trivial branch removal to ensure that constants are handled correctly. This dead branch elimination only works in the trivial case of a compare on a branch and does not touch any conditionals that were not realted to the `__nvvm_reflect` call in order to preserve `O0` semantics as much as possible. This should allow the following to work on NVPTX targets ```c int foo() { if (__nvvm_reflect("__CUDA_ARCH") >= 700) asm("valid;\n"); } ``` Relanding after fixing a bug. 
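For readers skimming the diff below, the core of the cleanup (and of the reland fix) can be summarized with a toy CFG model. The types here are illustrative stand-ins, not `llvm::BasicBlock`/`llvm::BranchInst`; the point is the guard that restricts the rewrite to simple single-predecessor edges, which is what the relanded version adds.

```cpp
#include <string>
#include <vector>

// Toy CFG block: a conditional branch stores {trueSucc, falseSucc}.
struct Block {
  std::string Name;
  std::vector<Block *> Succs;
  unsigned NumPreds = 0;
  bool Unreachable = false;
};

// Fold "br i1 C, %taken, %dead" into "br %taken" once C is a constant.
void foldConstantBranch(Block &B, bool Cond) {
  Block *Taken = Cond ? B.Succs[0] : B.Succs[1];
  Block *Dead = Cond ? B.Succs[1] : B.Succs[0];
  // The reland's fix: only rewrite when this branch is the dead block's
  // sole predecessor; blocks reachable another way must be left alone.
  if (Dead->NumPreds != 1)
    return;
  B.Succs = {Taken};        // unconditional branch to the live side
  Dead->Unreachable = true; // the untaken side is trivially dead
}
```

Compare with the reverted version above, which performed the rewrite without this single-predecessor guard; the new `phi` test in this patch exercises exactly the multi-predecessor case.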
--- llvm/docs/NVPTXUsage.rst | 5 + llvm/lib/Target/NVPTX/NVVMReflect.cpp | 65 +++++++++ llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll | 175 ++++++++++++++++++++++++ llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll | 1 - 4 files changed, 245 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 22acc6c..b5e3918 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -296,6 +296,11 @@ pipeline, immediately after the link stage. The ``internalize`` pass is also recommended to remove unused math functions from the resulting PTX. For an input IR module ``module.bc``, the following compilation flow is recommended: +The ``NVVMReflect`` pass will attempt to remove dead code even without +optimizations. This allows potentially incompatible instructions to be avoided +at all optimizations levels. This currently only works for simple conditionals +like the above example. + 1. Save list of external functions in ``module.bc`` 2. Link ``module.bc`` with ``libdevice.compute_XX.YY.bc`` 3. Internalize all functions not in list from (1) diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 7d2678a..3794ad9b 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -20,6 +20,7 @@ #include "NVPTX.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -36,6 +37,8 @@ #include "llvm/Support/raw_os_ostream.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include #include #define NVVM_REFLECT_FUNCTION "__nvvm_reflect" @@ -87,6 +90,7 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } SmallVector ToRemove; + SmallVector ToSimplify; // Go through the calls in this function. Each call to __nvvm_reflect or // llvm.nvvm.reflect should be a CallInst with a ConstantArray argument. @@ -171,6 +175,13 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } else if (ReflectArg == "__CUDA_ARCH") { ReflectVal = SmVersion * 10; } + + // If the immediate user is a simple comparison we want to simplify it. + // TODO: This currently does not handle switch instructions. + for (User *U : Call->users()) + if (ICmpInst *I = dyn_cast(U)) + ToSimplify.push_back(I); + Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal)); ToRemove.push_back(Call); } @@ -178,6 +189,60 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { for (Instruction *I : ToRemove) I->eraseFromParent(); + // The code guarded by __nvvm_reflect may be invalid for the target machine. + // We need to do some basic dead code elimination to trim invalid code before + // it reaches the backend at all optimization levels. + SmallVector Simplified; + for (ICmpInst *Cmp : ToSimplify) { + Constant *LHS = dyn_cast(Cmp->getOperand(0)); + Constant *RHS = dyn_cast(Cmp->getOperand(1)); + + if (!LHS || !RHS) + continue; + + // If the comparison is a compile time constant we simply propagate it. 
+ Constant *C = ConstantFoldCompareInstOperands( + Cmp->getPredicate(), LHS, RHS, Cmp->getModule()->getDataLayout()); + + if (!C) + continue; + + for (User *U : Cmp->users()) + if (BranchInst *I = dyn_cast(U)) + Simplified.push_back(I); + + Cmp->replaceAllUsesWith(C); + Cmp->eraseFromParent(); + } + + // Each instruction here is a conditional branch off of a constant true or + // false value. Simply replace it with an unconditional branch to the + // appropriate basic block and delete the rest if it is trivially dead. + DenseSet Removed; + for (BranchInst *Branch : Simplified) { + if (Removed.contains(Branch)) + continue; + + ConstantInt *C = dyn_cast(Branch->getCondition()); + if (!C || (!C->isOne() && !C->isZero())) + continue; + + BasicBlock *TrueBB = + C->isOne() ? Branch->getSuccessor(0) : Branch->getSuccessor(1); + BasicBlock *FalseBB = + C->isOne() ? Branch->getSuccessor(1) : Branch->getSuccessor(0); + + // This transformation is only correct on simple edges. + if (!FalseBB->hasNPredecessors(1)) + continue; + + ReplaceInstWithInst(Branch, BranchInst::Create(TrueBB)); + if (FalseBB->use_empty() && !FalseBB->getFirstNonPHIOrDbg()) { + Removed.insert(FalseBB->getFirstNonPHIOrDbg()); + changeToUnreachable(FalseBB->getFirstNonPHIOrDbg()); + } + } + return ToRemove.size() > 0; } diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll new file mode 100644 index 0000000..9dcdf5b --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll @@ -0,0 +1,175 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 -O0 | FileCheck %s --check-prefix=SM_52 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 -O0 | FileCheck %s --check-prefix=SM_70 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx72 -O0 | FileCheck %s --check-prefix=SM_90 + +@.str = private unnamed_addr constant [12 x i8] c"__CUDA_ARCH\00" +@.str1 = constant [11 x i8] c"__CUDA_FTZ\00" + +declare i32 @__nvvm_reflect(ptr) + +; SM_52: .visible .func (.param .b32 func_retval0) foo() +; SM_52: mov.b32 %[[REG:.+]], 3; +; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_52-NEXT: ret; +; +; SM_70: .visible .func (.param .b32 func_retval0) foo() +; SM_70: mov.b32 %[[REG:.+]], 2; +; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_70-NEXT: ret; +; +; SM_90: .visible .func (.param .b32 func_retval0) foo() +; SM_90: mov.b32 %[[REG:.+]], 1; +; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_90-NEXT: ret; +define i32 @foo() { +entry: + %call = call i32 @__nvvm_reflect(ptr @.str) + %cmp = icmp uge i32 %call, 900 + br i1 %cmp, label %if.then, label %if.else + +if.then: + br label %return + +if.else: + %call1 = call i32 @__nvvm_reflect(ptr @.str) + %cmp2 = icmp uge i32 %call1, 700 + br i1 %cmp2, label %if.then3, label %if.else4 + +if.then3: + br label %return + +if.else4: + %call5 = call i32 @__nvvm_reflect(ptr @.str) + %cmp6 = icmp uge i32 %call5, 520 + br i1 %cmp6, label %if.then7, label %if.else8 + +if.then7: + br label %return + +if.else8: + br label %return + +return: + %retval.0 = phi i32 [ 1, %if.then ], [ 2, %if.then3 ], [ 3, %if.then7 ], [ 4, %if.else8 ] + ret i32 %retval.0 +} + +; SM_52: .visible .func (.param .b32 func_retval0) bar() +; SM_52: mov.b32 %[[REG:.+]], 2; +; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_52-NEXT: ret; +; +; SM_70: .visible .func (.param .b32 func_retval0) bar() +; SM_70: mov.b32 %[[REG:.+]], 1; +; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_70-NEXT: ret; +; +; 
SM_90: .visible .func (.param .b32 func_retval0) bar() +; SM_90: mov.b32 %[[REG:.+]], 1; +; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_90-NEXT: ret; +define i32 @bar() { +entry: + %call = call i32 @__nvvm_reflect(ptr @.str) + %cmp = icmp uge i32 %call, 700 + br i1 %cmp, label %if.then, label %if.else + +if.then: + br label %if.end + +if.else: + br label %if.end + +if.end: + %x = phi i32 [ 1, %if.then ], [ 2, %if.else ] + ret i32 %x +} + +; SM_52-NOT: valid; +; SM_70: valid; +; SM_90: valid; +define void @baz() { +entry: + %call = call i32 @__nvvm_reflect(ptr @.str) + %cmp = icmp uge i32 %call, 700 + br i1 %cmp, label %if.then, label %if.end + +if.then: + call void asm sideeffect "valid;\0A", ""() + br label %if.end + +if.end: + ret void +} + +; SM_52: .visible .func (.param .b32 func_retval0) qux() +; SM_52: mov.u32 %[[REG1:.+]], %[[REG2:.+]]; +; SM_52: st.param.b32 [func_retval0+0], %[[REG1:.+]]; +; SM_52: ret; +; SM_70: .visible .func (.param .b32 func_retval0) qux() +; SM_70: mov.u32 %[[REG1:.+]], %[[REG2:.+]]; +; SM_70: st.param.b32 [func_retval0+0], %[[REG1:.+]]; +; SM_70: ret; +; SM_90: .visible .func (.param .b32 func_retval0) qux() +; SM_90: st.param.b32 [func_retval0+0], %[[REG1:.+]]; +; SM_90: ret; +define i32 @qux() { +entry: + %call = call i32 @__nvvm_reflect(ptr noundef @.str) + %cmp = icmp uge i32 %call, 700 + %conv = zext i1 %cmp to i32 + switch i32 %conv, label %sw.default [ + i32 900, label %sw.bb + i32 700, label %sw.bb1 + i32 520, label %sw.bb2 + ] + +sw.bb: + br label %return + +sw.bb1: + br label %return + +sw.bb2: + br label %return + +sw.default: + br label %return + +return: + %retval = phi i32 [ 4, %sw.default ], [ 3, %sw.bb2 ], [ 2, %sw.bb1 ], [ 1, %sw.bb ] + ret i32 %retval +} + +; SM_52: .visible .func (.param .b32 func_retval0) phi() +; SM_52: mov.f32 %[[REG:.+]], 0f00000000; +; SM_52-NEXT: st.param.f32 [func_retval0+0], %[[REG]]; +; SM_52-NEXT: ret; +; SM_70: .visible .func (.param .b32 func_retval0) phi() +; SM_70: mov.f32 %[[REG:.+]], 0f00000000; +; SM_70-NEXT: st.param.f32 [func_retval0+0], %[[REG]]; +; SM_70-NEXT: ret; +; SM_90: .visible .func (.param .b32 func_retval0) phi() +; SM_90: mov.f32 %[[REG:.+]], 0f00000000; +; SM_90-NEXT: st.param.f32 [func_retval0+0], %[[REG]]; +; SM_90-NEXT: ret; +define float @phi() { +entry: + %0 = call i32 @__nvvm_reflect(ptr @.str) + %1 = icmp eq i32 %0, 0 + br i1 %1, label %if.then, label %if.else + +if.then: + br label %if.else + +if.else: + %.08 = phi float [ 0.000000e+00, %if.then ], [ 1.000000e+00, %entry ] + %4 = fcmp ogt float %.08, 0.000000e+00 + br i1 %4, label %exit, label %if.exit + +if.exit: + br label %exit + +exit: + ret float 0.000000e+00 +} diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll index e8c554c..ac5875c 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll @@ -18,4 +18,3 @@ define i32 @foo(float %a, float %b) { ; SM35: ret i32 350 ret i32 %reflect } - -- cgit v1.1 From c560ce464ae486e86e3d2d9684df3f714317f502 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 8 Feb 2024 18:49:21 -0800 Subject: [flang][cuda] Lower attribute for dummy argument (#81212) Lower CUDA attribute for simple dummy argument. This is done in a similar way than `TARGET`, `OPTIONAL` and so on. 
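For reference, the smallest case this enables is a dummy argument carrying one
of the CUDA data attributes (adapted from the tests added below; `device` ends
up as a `fir.cuda_attr` argument attribute and as `cuda_attr` on the
`hlfir.declare` of the dummy):

    subroutine dummy_arg_device(dd)
      real, device :: dd
    end subroutine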
This patch also moves the `Fortran::common::CUDADataAttr` to
`fir::CUDAAttributeAttr` mapping to
`flang/include/flang/Optimizer/Support/Utils.h` so that it can be reused where
needed.
---
 .../flang/Optimizer/Dialect/FIROpsSupport.h   |  3 ++
 flang/include/flang/Optimizer/Support/Utils.h | 30 +++++++++++++++++++
 flang/lib/Lower/CallInterface.cpp             |  5 ++++
 flang/lib/Lower/ConvertVariable.cpp           | 28 ++---------------
 flang/test/Lower/CUDA/cuda-data-attribute.cuf | 35 +++++++++++++++++++++-
 5 files changed, 74 insertions(+), 27 deletions(-)

diff --git a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h
index 977949e..6ac6a31 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h
+++ b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h
@@ -72,6 +72,9 @@ constexpr llvm::StringRef getOptionalAttrName() { return "fir.optional"; }
 /// Attribute to mark Fortran entities with the TARGET attribute.
 static constexpr llvm::StringRef getTargetAttrName() { return "fir.target"; }
 
+/// Attribute to mark Fortran entities with the CUDA attribute.
+static constexpr llvm::StringRef getCUDAAttrName() { return "fir.cuda_attr"; }
+
 /// Attribute to mark that a function argument is a character dummy procedure.
 /// Character dummy procedure have special ABI constraints.
 static constexpr llvm::StringRef getCharacterProcedureDummyAttrName() {

diff --git a/flang/include/flang/Optimizer/Support/Utils.h b/flang/include/flang/Optimizer/Support/Utils.h
index b50f297..586701b 100644
--- a/flang/include/flang/Optimizer/Support/Utils.h
+++ b/flang/include/flang/Optimizer/Support/Utils.h
@@ -273,6 +273,36 @@ inline void genMinMaxlocReductionLoop(
   builder.setInsertionPointAfter(ifMaskTrueOp);
 }
 
+inline fir::CUDAAttributeAttr
+getCUDAAttribute(mlir::MLIRContext *mlirContext,
+                 std::optional<Fortran::common::CUDADataAttr> cudaAttr) {
+  if (cudaAttr) {
+    fir::CUDAAttribute attr;
+    switch (*cudaAttr) {
+    case Fortran::common::CUDADataAttr::Constant:
+      attr = fir::CUDAAttribute::Constant;
+      break;
+    case Fortran::common::CUDADataAttr::Device:
+      attr = fir::CUDAAttribute::Device;
+      break;
+    case Fortran::common::CUDADataAttr::Managed:
+      attr = fir::CUDAAttribute::Managed;
+      break;
+    case Fortran::common::CUDADataAttr::Pinned:
+      attr = fir::CUDAAttribute::Pinned;
+      break;
+    case Fortran::common::CUDADataAttr::Shared:
+      attr = fir::CUDAAttribute::Shared;
+      break;
+    case Fortran::common::CUDADataAttr::Texture:
+      // Obsolete attribute
+      return {};
+    }
+    return fir::CUDAAttributeAttr::get(mlirContext, attr);
+  }
+  return {};
+}
+
 } // namespace fir
 
 #endif // FORTRAN_OPTIMIZER_SUPPORT_UTILS_H

diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index b007c95..4c297ce 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -19,6 +19,7 @@
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/Support/InternalNames.h"
+#include "flang/Optimizer/Support/Utils.h"
 #include "flang/Semantics/symbol.h"
 #include "flang/Semantics/tools.h"
 #include
@@ -993,6 +994,10 @@ private:
       TODO(loc, "VOLATILE in procedure interface");
     if (obj.attrs.test(Attrs::Target))
       addMLIRAttr(fir::getTargetAttrName());
+    if (obj.cudaDataAttr)
+      attrs.emplace_back(
+          mlir::StringAttr::get(&mlirContext, fir::getCUDAAttrName()),
+          fir::getCUDAAttribute(&mlirContext, obj.cudaDataAttr));
 
     // TODO: intents that require special care (e.g finalization)

diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index d57bdd4..f14267f 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -37,6 +37,7 @@
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/Support/FatalError.h"
 #include "flang/Optimizer/Support/InternalNames.h"
+#include "flang/Optimizer/Support/Utils.h"
 #include "flang/Semantics/runtime-type-info.h"
 #include "flang/Semantics/tools.h"
 #include "llvm/Support/Debug.h"
@@ -1583,32 +1584,7 @@ fir::CUDAAttributeAttr Fortran::lower::translateSymbolCUDAAttribute(
     mlir::MLIRContext *mlirContext, const Fortran::semantics::Symbol &sym) {
   std::optional<Fortran::common::CUDADataAttr> cudaAttr =
       Fortran::semantics::GetCUDADataAttr(&sym);
-  if (cudaAttr) {
-    fir::CUDAAttribute attr;
-    switch (*cudaAttr) {
-    case Fortran::common::CUDADataAttr::Constant:
-      attr = fir::CUDAAttribute::Constant;
-      break;
-    case Fortran::common::CUDADataAttr::Device:
-      attr = fir::CUDAAttribute::Device;
-      break;
-    case Fortran::common::CUDADataAttr::Managed:
-      attr = fir::CUDAAttribute::Managed;
-      break;
-    case Fortran::common::CUDADataAttr::Pinned:
-      attr = fir::CUDAAttribute::Pinned;
-      break;
-    case Fortran::common::CUDADataAttr::Shared:
-      attr = fir::CUDAAttribute::Shared;
-      break;
-    case Fortran::common::CUDADataAttr::Texture:
-      // Obsolete attribute
-      return {};
-    }
-
-    return fir::CUDAAttributeAttr::get(mlirContext, attr);
-  }
-  return {};
+  return fir::getCUDAAttribute(mlirContext, cudaAttr);
 }
 
 /// Map a symbol to its FIR address and evaluated specification expressions.

diff --git a/flang/test/Lower/CUDA/cuda-data-attribute.cuf b/flang/test/Lower/CUDA/cuda-data-attribute.cuf
index caa8ac7..b02701b 100644
--- a/flang/test/Lower/CUDA/cuda-data-attribute.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-attribute.cuf
@@ -1,7 +1,7 @@
 ! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
 ! RUN: bbc -emit-hlfir -fcuda %s -o - | fir-opt -convert-hlfir-to-fir | FileCheck %s --check-prefix=FIR
 
-! Test lowering of CUDA attribute on local variables.
+! Test lowering of CUDA attribute on variables.
 
 subroutine local_var_attrs
   real, constant :: rc
@@ -20,3 +20,36 @@ end subroutine
 ! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QFlocal_var_attrsErd"} : (!fir.ref<f32>) -> !fir.ref<f32>
 ! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFlocal_var_attrsErm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<f32>>>
 ! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFlocal_var_attrsErp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<f32>>>
+
+subroutine dummy_arg_constant(dc)
+  real, constant :: dc
+end subroutine
+! CHECK-LABEL: func.func @_QPdummy_arg_constant(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {fir.bindc_name = "dc", fir.cuda_attr = #fir.cuda<constant>}
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda<constant>, uniq_name = "_QFdummy_arg_constantEdc"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+
+subroutine dummy_arg_device(dd)
+  real, device :: dd
+end subroutine
+! CHECK-LABEL: func.func @_QPdummy_arg_device(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {fir.bindc_name = "dd", fir.cuda_attr = #fir.cuda<device>}) {
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda<device>, uniq_name = "_QFdummy_arg_deviceEdd"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+
+subroutine dummy_arg_managed(dm)
+  real, allocatable, managed :: dm
+end subroutine
+! CHECK-LABEL: func.func @_QPdummy_arg_managed(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "dm", fir.cuda_attr = #fir.cuda<managed>}) {
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFdummy_arg_managedEdm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+
+subroutine dummy_arg_pinned(dp)
+  real, allocatable, pinned :: dp
+end subroutine
+! CHECK-LABEL: func.func @_QPdummy_arg_pinned(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "dp", fir.cuda_attr = #fir.cuda<pinned>}) {
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFdummy_arg_pinnedEdp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+
+
+
+
+
-- cgit v1.1

From 8e297c779635d7f22626c1a9dd1cb9dc86ea6540 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Thu, 8 Feb 2024 19:05:55 -0800
Subject: [ELF] Improve --ro-rosegment tests

---
 lld/test/ELF/segments.s | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/lld/test/ELF/segments.s b/lld/test/ELF/segments.s
index 614f6e7..ee17117 100644
--- a/lld/test/ELF/segments.s
+++ b/lld/test/ELF/segments.s
@@ -9,11 +9,24 @@
 # ROSEGMENT:      Type  Offset   VirtAddr           PhysAddr           FileSiz  MemSiz   Flg Align
 # ROSEGMENT-NEXT: PHDR  0x000040 0x0000000000200040 0x0000000000200040 0x000118 0x000118 R   0x8
-# ROSEGMENT-NEXT: LOAD  0x000000 0x0000000000200000 0x0000000000200000 0x00015a 0x00015a R   0x1000
+# ROSEGMENT-NEXT: LOAD  0x000000 0x0000000000200000 0x0000000000200000 0x00015b 0x00015b R   0x1000
 # ROSEGMENT-NEXT: LOAD  0x00015c 0x000000000020115c 0x000000000020115c 0x000003 0x000003 R E 0x1000
 # ROSEGMENT-NEXT: LOAD  0x00015f 0x000000000020215f 0x000000000020215f 0x000002 0x000002 RW  0x1000
 # ROSEGMENT-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0
 
+# RUN: ld.lld --rosegment a.o -T a.lds -o ro1
+# RUN: llvm-readelf -l ro1 | FileCheck --check-prefix=ROSEGMENT1 %s
+
+# ROSEGMENT1:      Type  Offset   VirtAddr           PhysAddr           FileSiz  MemSiz   Flg Align
+# ROSEGMENT1-NEXT: LOAD  0x001000 0x0000000000000000 0x0000000000000000 0x000001 0x000001 R   0x1000
+# ROSEGMENT1-NEXT: LOAD  0x001004 0x0000000000000004 0x0000000000000004 0x000002 0x000002 R E 0x1000
+# ROSEGMENT1-NEXT: LOAD  0x001006 0x0000000000000006 0x0000000000000006 0x000001 0x000001 RW  0x1000
+# ROSEGMENT1-NEXT: LOAD  0x001007 0x0000000000000007 0x0000000000000007 0x000001 0x000001 R E 0x1000
+# ROSEGMENT1-NEXT: LOAD  0x001008 0x0000000000000008 0x0000000000000008 0x000001 0x000001 R   0x1000
+# ROSEGMENT1-NEXT: LOAD  0x001009 0x0000000000000009 0x0000000000000009 0x000001 0x000001 RW  0x1000
+# ROSEGMENT1-NEXT: LOAD  0x00100a 0x000000000000000a 0x000000000000000a 0x000001 0x000001 R   0x1000
+# ROSEGMENT1-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0
+
 # RUN: ld.lld --no-rosegment a.o -o noro
 # RUN: llvm-readelf -l noro | FileCheck --check-prefix=NOROSEGMENT %s
@@ -27,10 +40,11 @@
 # RUN: llvm-readelf -l noro1 | FileCheck --check-prefix=NOROSEGMENT1 %s
 
 # NOROSEGMENT1:      Type  Offset   VirtAddr           PhysAddr           FileSiz  MemSiz   Flg Align
-# NOROSEGMENT1-NEXT: LOAD  0x001000 0x0000000000000000 0x0000000000000000 0x000007 0x000007 R E 0x1000
-# NOROSEGMENT1-NEXT: LOAD  0x001007 0x0000000000000007 0x0000000000000007 0x000001 0x000001 RW  0x1000
-# NOROSEGMENT1-NEXT: LOAD  0x001008 0x0000000000000008 0x0000000000000008 0x000001 0x000001 R E 0x1000
+# NOROSEGMENT1-NEXT: LOAD  0x001000 0x0000000000000000 0x0000000000000000 0x000006 0x000006 R E 0x1000
+# NOROSEGMENT1-NEXT: LOAD  0x001006 0x0000000000000006 0x0000000000000006 0x000001 0x000001 RW  0x1000
+# NOROSEGMENT1-NEXT: LOAD  0x001007
0x0000000000000007 0x000002 0x000002 R E 0x1000 # NOROSEGMENT1-NEXT: LOAD 0x001009 0x0000000000000009 0x0000000000000009 0x000001 0x000001 RW 0x1000 +# NOROSEGMENT1-NEXT: LOAD 0x00100a 0x000000000000000a 0x000000000000000a 0x000001 0x000001 R E 0x1000 # NOROSEGMENT1-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 # RUN: ld.lld -N a.o -o omagic @@ -46,7 +60,7 @@ # RUN: ld.lld --nmagic a.o -o - | cmp nmagic - # NMAGIC: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align -# NMAGIC-NEXT: LOAD 0x000120 0x0000000000200120 0x0000000000200120 0x000002 0x000002 R 0x1 +# NMAGIC-NEXT: LOAD 0x000120 0x0000000000200120 0x0000000000200120 0x000003 0x000003 R 0x1 # NMAGIC-NEXT: LOAD 0x000124 0x0000000000200124 0x0000000000200124 0x000003 0x000003 R E 0x4 # NMAGIC-NEXT: LOAD 0x000127 0x0000000000200127 0x0000000000200127 0x000002 0x000002 RW 0x1 # NMAGIC-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 @@ -64,12 +78,16 @@ _start: .section .rw2,"aw"; .byte 3 .section .rx2,"ax"; .byte 2 +.section .ro3,"a"; .byte 1 + #--- a.lds SECTIONS { .ro1 : {} .text : {} - .rx : { *(.rx*) } + .rx1 : {} .rw1 : {} + .rx2 : {} .ro2 : {} .rw2 : {} + .ro3 : {} } -- cgit v1.1 From ac0577177f053ba7e7016e1b7e44cf5932d00b03 Mon Sep 17 00:00:00 2001 From: Abinaya Saravanan Date: Fri, 9 Feb 2024 09:15:15 +0530 Subject: [HEXAGON] Add basic block limit for RDF optimizations (#81071) Skip RDF optimizations if a function contains a number of basic blocks that is more than a limit --------- Co-authored-by: Yashas Andaluri --- llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp | 10 ++++++++++ llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp | 11 +++++++++++ llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp | 4 ++++ 3 files changed, 25 insertions(+) diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp index aa31762..0e82bf6 100644 --- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -47,6 +47,8 @@ static cl::opt CodeGrowthLimit("hexagon-amode-growth-limit", cl::Hidden, cl::init(0), cl::desc("Code growth limit for address mode " "optimization")); +extern cl::opt RDFFuncBlockLimit; + namespace llvm { FunctionPass *createHexagonOptAddrMode(); @@ -856,6 +858,14 @@ bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; + // Perform RDF optimizations only if number of basic blocks in the + // function is less than the limit + if (MF.size() > RDFFuncBlockLimit) { + LLVM_DEBUG(dbgs() << "Skipping " << getPassName() + << ": too many basic blocks\n"); + return false; + } + bool Changed = false; auto &HST = MF.getSubtarget(); MRI = &MF.getRegInfo(); diff --git a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp index 7eccbd2..4131f2a 100644 --- a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp +++ b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp @@ -50,6 +50,9 @@ static unsigned RDFCount = 0; static cl::opt RDFLimit("hexagon-rdf-limit", cl::init(std::numeric_limits::max())); + +extern cl::opt RDFFuncBlockLimit; + static cl::opt RDFDump("hexagon-rdf-dump", cl::Hidden); static cl::opt RDFTrackReserved("hexagon-rdf-track-reserved", cl::Hidden); @@ -285,6 +288,14 @@ bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; + // Perform RDF optimizations only if number of basic blocks in the + // function is less than the limit + if 
(MF.size() > RDFFuncBlockLimit) { + if (RDFDump) + dbgs() << "Skipping " << getPassName() << ": too many basic blocks\n"; + return false; + } + if (RDFLimit.getPosition()) { if (RDFCount >= RDFLimit) return false; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index e7a692d..7d4b420 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -39,6 +39,10 @@ static cl::opt static cl::opt EnableRDFOpt("rdf-opt", cl::Hidden, cl::init(true), cl::desc("Enable RDF-based optimizations")); +cl::opt RDFFuncBlockLimit( + "rdf-bb-limit", cl::Hidden, cl::init(1000), + cl::desc("Basic block limit for a function for RDF optimizations")); + static cl::opt DisableHardwareLoops("disable-hexagon-hwloops", cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target")); -- cgit v1.1 From db88f3015867ca569ae78a30f20a944c8e1b8afc Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 8 Feb 2024 19:27:27 -0800 Subject: [RISCV] Add test for saving s10 with cm.push. NFC If cm.push saves s10, it must also save s11 due to an encoding limitation. We handle this in the code, but had no test for it. --- llvm/test/CodeGen/RISCV/push-pop-popret.ll | 71 ++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/push-pop-popret.ll b/llvm/test/CodeGen/RISCV/push-pop-popret.ll index 09a91498..e007dcc 100644 --- a/llvm/test/CodeGen/RISCV/push-pop-popret.ll +++ b/llvm/test/CodeGen/RISCV/push-pop-popret.ll @@ -3218,3 +3218,74 @@ entry: call void @bar(ptr %0, ptr %var) ret i32 %x } + +define void @spill_x10() { +; RV32IZCMP-LABEL: spill_x10: +; RV32IZCMP: # %bb.0: # %entry +; RV32IZCMP-NEXT: cm.push {ra, s0-s11}, -64 +; RV32IZCMP-NEXT: .cfi_def_cfa_offset 64 +; RV32IZCMP-NEXT: .cfi_offset s10, -8 +; RV32IZCMP-NEXT: #APP +; RV32IZCMP-NEXT: li s10, 0 +; RV32IZCMP-NEXT: #NO_APP +; RV32IZCMP-NEXT: cm.popret {ra, s0-s11}, 64 +; +; RV64IZCMP-LABEL: spill_x10: +; RV64IZCMP: # %bb.0: # %entry +; RV64IZCMP-NEXT: cm.push {ra, s0-s11}, -112 +; RV64IZCMP-NEXT: .cfi_def_cfa_offset 112 +; RV64IZCMP-NEXT: .cfi_offset s10, -16 +; RV64IZCMP-NEXT: #APP +; RV64IZCMP-NEXT: li s10, 0 +; RV64IZCMP-NEXT: #NO_APP +; RV64IZCMP-NEXT: cm.popret {ra, s0-s11}, 112 +; +; RV32IZCMP-SR-LABEL: spill_x10: +; RV32IZCMP-SR: # %bb.0: # %entry +; RV32IZCMP-SR-NEXT: cm.push {ra, s0-s11}, -64 +; RV32IZCMP-SR-NEXT: .cfi_def_cfa_offset 64 +; RV32IZCMP-SR-NEXT: .cfi_offset s10, -8 +; RV32IZCMP-SR-NEXT: #APP +; RV32IZCMP-SR-NEXT: li s10, 0 +; RV32IZCMP-SR-NEXT: #NO_APP +; RV32IZCMP-SR-NEXT: cm.popret {ra, s0-s11}, 64 +; +; RV64IZCMP-SR-LABEL: spill_x10: +; RV64IZCMP-SR: # %bb.0: # %entry +; RV64IZCMP-SR-NEXT: cm.push {ra, s0-s11}, -112 +; RV64IZCMP-SR-NEXT: .cfi_def_cfa_offset 112 +; RV64IZCMP-SR-NEXT: .cfi_offset s10, -16 +; RV64IZCMP-SR-NEXT: #APP +; RV64IZCMP-SR-NEXT: li s10, 0 +; RV64IZCMP-SR-NEXT: #NO_APP +; RV64IZCMP-SR-NEXT: cm.popret {ra, s0-s11}, 112 +; +; RV32I-LABEL: spill_x10: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw s10, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: .cfi_offset s10, -4 +; RV32I-NEXT: #APP +; RV32I-NEXT: li s10, 0 +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: lw s10, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: spill_x10: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd s10, 8(sp) # 8-byte 
Folded Spill +; RV64I-NEXT: .cfi_offset s10, -8 +; RV64I-NEXT: #APP +; RV64I-NEXT: li s10, 0 +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +entry: + tail call void asm sideeffect "li s10, 0", "~{s10}"() + ret void +} -- cgit v1.1 From 763139afc19ddf2e0f0265dc828ce8e5fbe92530 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Thu, 8 Feb 2024 21:42:29 -0800 Subject: [clang-format] Update FormatToken::isSimpleTypeSpecifier() (#80241) Now with a8279a8bc541, we can make the update. --- clang/include/clang/Format/Format.h | 2 ++ clang/lib/Format/FormatToken.cpp | 35 +---------------------------------- clang/lib/Format/FormatTokenLexer.cpp | 7 ++++--- clang/lib/Format/FormatTokenLexer.h | 1 - 4 files changed, 7 insertions(+), 38 deletions(-) diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index cb14d98..bb63d33 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -5175,6 +5175,8 @@ tooling::Replacements sortUsingDeclarations(const FormatStyle &Style, ArrayRef Ranges, StringRef FileName = ""); +extern LangOptions LangOpts; + /// Returns the ``LangOpts`` that the formatter expects you to set. /// /// \param Style determines specific settings for lexing mode. diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp index b791c5a..69f751d 100644 --- a/clang/lib/Format/FormatToken.cpp +++ b/clang/lib/Format/FormatToken.cpp @@ -34,41 +34,8 @@ const char *getTokenTypeName(TokenType Type) { return nullptr; } -// FIXME: This is copy&pasted from Sema. Put it in a common place and remove -// duplication. bool FormatToken::isSimpleTypeSpecifier() const { - switch (Tok.getKind()) { - case tok::kw_short: - case tok::kw_long: - case tok::kw___int64: - case tok::kw___int128: - case tok::kw_signed: - case tok::kw_unsigned: - case tok::kw_void: - case tok::kw_char: - case tok::kw_int: - case tok::kw_half: - case tok::kw_float: - case tok::kw_double: - case tok::kw___bf16: - case tok::kw__Float16: - case tok::kw___float128: - case tok::kw___ibm128: - case tok::kw_wchar_t: - case tok::kw_bool: -#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait: -#include "clang/Basic/TransformTypeTraits.def" - case tok::annot_typename: - case tok::kw_char8_t: - case tok::kw_char16_t: - case tok::kw_char32_t: - case tok::kw_typeof: - case tok::kw_decltype: - case tok::kw__Atomic: - return true; - default: - return false; - } + return Tok.isSimpleTypeSpecifier(LangOpts); } bool FormatToken::isTypeOrIdentifier() const { diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index a87d0ba..31b2b7e 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -22,18 +22,20 @@ namespace clang { namespace format { +LangOptions LangOpts; + FormatTokenLexer::FormatTokenLexer( const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding, llvm::SpecificBumpPtrAllocator &Allocator, IdentifierTable &IdentTable) : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}), - Column(Column), TrailingWhitespace(0), - LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID), + Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID), Style(Style), IdentTable(IdentTable), Keywords(IdentTable), Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0), FormattingDisabled(false), 
MacroBlockBeginRegex(Style.MacroBlockBegin),
       MacroBlockEndRegex(Style.MacroBlockEnd) {
+  LangOpts = getFormattingLangOpts(Style);
   Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
   Lex->SetKeepWhitespaceMode(true);
@@ -1442,7 +1444,6 @@ void FormatTokenLexer::readRawToken(FormatToken &Tok) {
 
 void FormatTokenLexer::resetLexer(unsigned Offset) {
   StringRef Buffer = SourceMgr.getBufferData(ID);
-  LangOpts = getFormattingLangOpts(Style);
   Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
                       Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
   Lex->SetKeepWhitespaceMode(true);

diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h
index 65dd733..52838f1 100644
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@@ -120,7 +120,6 @@ private:
   unsigned Column;
   unsigned TrailingWhitespace;
   std::unique_ptr<Lexer> Lex;
-  LangOptions LangOpts;
   const SourceManager &SourceMgr;
   FileID ID;
   const FormatStyle &Style;
-- cgit v1.1

From b2cd50dbe78c0f0328fe208ab8c4d6005d9272dd Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 8 Feb 2024 22:42:06 -0800
Subject: [RISCV] Replace XLenVT with i64 in some isel patterns that are only
 used for RV64. NFC

---
 llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index f8938c0..9e32444 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -888,22 +888,22 @@ foreach i = {1,2,3} in {
 }
 
 let Predicates = [HasStdExtZbs, IsRV64] in {
-def : Pat<(i32 (and (not (shiftop<shl> 1, (XLenVT GPR:$rs2))), GPR:$rs1)),
+def : Pat<(i32 (and (not (shiftop<shl> 1, (i64 GPR:$rs2))), GPR:$rs1)),
           (BCLR GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i32 (and (rotl -2, (XLenVT GPR:$rs2)), GPR:$rs1)),
+def : Pat<(i32 (and (rotl -2, (i64 GPR:$rs2)), GPR:$rs1)),
           (BCLR GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i32 (or (shiftop<shl> 1, (XLenVT GPR:$rs2)), GPR:$rs1)),
+def : Pat<(i32 (or (shiftop<shl> 1, (i64 GPR:$rs2)), GPR:$rs1)),
           (BSET GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i32 (xor (shiftop<shl> 1, (XLenVT GPR:$rs2)), GPR:$rs1)),
+def : Pat<(i32 (xor (shiftop<shl> 1, (i64 GPR:$rs2)), GPR:$rs1)),
           (BINV GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i32 (and (shiftop<srl> GPR:$rs1, (XLenVT GPR:$rs2)), 1)),
+def : Pat<(i32 (and (shiftop<srl> GPR:$rs1, (i64 GPR:$rs2)), 1)),
           (BEXT GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i64 (and (anyext (i32 (shiftop<srl> GPR:$rs1, (XLenVT GPR:$rs2)))), 1)),
+def : Pat<(i64 (and (anyext (i32 (shiftop<srl> GPR:$rs1, (i64 GPR:$rs2)))), 1)),
           (BEXT GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i32 (shiftop<shl> 1, (XLenVT GPR:$rs2))),
+def : Pat<(i32 (shiftop<shl> 1, (i64 GPR:$rs2))),
           (BSET (XLenVT X0), GPR:$rs2)>;
-def : Pat<(i32 (not (shiftop<shl> -1, (XLenVT GPR:$rs2)))),
+def : Pat<(i32 (not (shiftop<shl> -1, (i64 GPR:$rs2)))),
           (ADDI (BSET (XLenVT X0), GPR:$rs2), -1)>;
 
 def : Pat<(i32 (and (srl GPR:$rs1, uimm5:$shamt), (i32 1))),
-- cgit v1.1

From 95b14da678f4670283240ef4cf60f3a39bed97b4 Mon Sep 17 00:00:00 2001
From: Quentin Dian
Date: Fri, 9 Feb 2024 15:29:05 +0800
Subject: [RegisterCoalescer] Clear instructions not recorded in `ErasedInstrs`
 but erased (#79820)

Fixes #79718. Fixes #71178.

The same instruction may appear more than once in an iteration's work list,
so we cannot simply delete instructions recorded in `ErasedInstrs` right
away.
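In sketch form, the fix tracks the instructions erased during the current
round in a separate set and consults it alongside `ErasedInstrs` (abbreviated
from the diff below; the bookkeeping around `Again`/`Progress` is simplified):

    // Copies erased during this round. joinCopy() may remove a copy from
    // ErasedInstrs after erasing it, so a second occurrence of the same
    // MachineInstr* in the work list must be caught here.
    SmallPtrSet<MachineInstr *, 4> CurrentErasedInstrs;
    for (MachineInstr *&MI : CurrList) {
      if (!MI)
        continue;
      if (ErasedInstrs.count(MI) || CurrentErasedInstrs.count(MI)) {
        MI = nullptr; // Stale pointer; the instruction is already gone.
        continue;
      }
      bool Again = false;
      Progress |= joinCopy(MI, Again, CurrentErasedInstrs);
    }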
--- llvm/lib/CodeGen/RegisterCoalescer.cpp             |  27 ++-
 .../LoongArch/register-coalescer-crash-pr79718.mir | 213 +++++++++++++++++++++
 .../X86/PR71178-register-coalescer-crash.ll        | 103 ++++++++++
 3 files changed, 338 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir
 create mode 100644 llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll

diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index cbb1a74..7e9c992 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -236,7 +236,8 @@ namespace {
     /// was successfully coalesced away. If it is not currently possible to
     /// coalesce this interval, but it may be possible if other things get
     /// coalesced, then it returns true by reference in 'Again'.
-    bool joinCopy(MachineInstr *CopyMI, bool &Again);
+    bool joinCopy(MachineInstr *CopyMI, bool &Again,
+                  SmallPtrSetImpl<MachineInstr *> &CurrentErasedInstrs);
 
     /// Attempt to join these two intervals.  On failure, this
     /// returns false.  The output "SrcInt" will not have been modified, so we
@@ -1964,7 +1965,9 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI,
   LIS->shrinkToUses(&LI);
 }
 
-bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
+bool RegisterCoalescer::joinCopy(
+    MachineInstr *CopyMI, bool &Again,
+    SmallPtrSetImpl<MachineInstr *> &CurrentErasedInstrs) {
   Again = false;
   LLVM_DEBUG(dbgs() << LIS->getInstructionIndex(*CopyMI) << '\t' << *CopyMI);
@@ -2156,7 +2159,9 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
   // CopyMI has been erased by joinIntervals at this point. Remove it from
   // ErasedInstrs since copyCoalesceWorkList() won't add a successful join back
   // to the work list. This keeps ErasedInstrs from growing needlessly.
-  ErasedInstrs.erase(CopyMI);
+  if (ErasedInstrs.erase(CopyMI))
+    // But we may encounter the instruction again in this iteration.
+    CurrentErasedInstrs.insert(CopyMI);
 
   // Rewrite all SrcReg operands to DstReg.
   // Also update DstReg operands to include DstIdx if it is set.
@@ -3982,21 +3987,33 @@ void RegisterCoalescer::lateLiveIntervalUpdate() {
 bool RegisterCoalescer::
 copyCoalesceWorkList(MutableArrayRef<MachineInstr *> CurrList) {
   bool Progress = false;
+  SmallPtrSet<MachineInstr *, 4> CurrentErasedInstrs;
   for (MachineInstr *&MI : CurrList) {
     if (!MI)
       continue;
     // Skip instruction pointers that have already been erased, for example by
    // dead code elimination.
-    if (ErasedInstrs.count(MI)) {
+    if (ErasedInstrs.count(MI) || CurrentErasedInstrs.count(MI)) {
       MI = nullptr;
       continue;
     }
     bool Again = false;
-    bool Success = joinCopy(MI, Again);
+    bool Success = joinCopy(MI, Again, CurrentErasedInstrs);
     Progress |= Success;
     if (Success || !Again)
       MI = nullptr;
   }
+  // Clear instructions not recorded in `ErasedInstrs` but erased.
+ if (!CurrentErasedInstrs.empty()) { + for (MachineInstr *&MI : CurrList) { + if (MI && CurrentErasedInstrs.count(MI)) + MI = nullptr; + } + for (MachineInstr *&MI : WorkList) { + if (MI && CurrentErasedInstrs.count(MI)) + MI = nullptr; + } + } return Progress; } diff --git a/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir b/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir new file mode 100644 index 0000000..9bbb579 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir @@ -0,0 +1,213 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -o - %s -mtriple=loongarch64 \ +# RUN: -run-pass=register-coalescer -join-liveintervals=1 -join-splitedges=0 | FileCheck %s + +--- +name: foo +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: foo + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $r4, $r5, $r6, $r7, $r8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $r8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $r7 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $r6 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $r5 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr = COPY $r4 + ; CHECK-NEXT: [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY3]], 1 + ; CHECK-NEXT: [[ORI:%[0-9]+]]:gpr = ORI $r0, 1 + ; CHECK-NEXT: [[ANDI1:%[0-9]+]]:gpr = ANDI [[COPY2]], 1 + ; CHECK-NEXT: [[ANDI2:%[0-9]+]]:gpr = ANDI [[COPY1]], 1 + ; CHECK-NEXT: [[ANDI3:%[0-9]+]]:gpr = ANDI [[COPY]], 1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr = COPY [[COPY5]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQZ [[ANDI]], %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.9(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: PseudoBR %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.7(0x7c000000), %bb.6(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead [[LD_D:%[0-9]+]]:gpr = LD_D $r0, 8 + ; CHECK-NEXT: dead [[LD_D1:%[0-9]+]]:gpr = LD_D $r0, 0 + ; CHECK-NEXT: BNEZ [[ANDI1]], %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6: + ; CHECK-NEXT: successors: %bb.11(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: PseudoBR %bb.11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7: + ; CHECK-NEXT: successors: %bb.8(0x7c000000), %bb.10(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQZ [[ANDI2]], %bb.10 + ; CHECK-NEXT: PseudoBR %bb.8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8: + ; CHECK-NEXT: successors: %bb.9(0x04000000), %bb.5(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1 + ; CHECK-NEXT: BEQZ [[ANDI3]], %bb.5 + ; CHECK-NEXT: PseudoBR %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9: + ; CHECK-NEXT: successors: %bb.12(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ST_B $r0, [[COPY4]], 0 + ; CHECK-NEXT: PseudoBR %bb.12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.10: + ; CHECK-NEXT: successors: %bb.11(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1 + ; CHECK-NEXT: 
[[COPY6:%[0-9]+]]:gpr = COPY [[ORI]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.11: + ; CHECK-NEXT: successors: %bb.12(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ST_D $r0, [[COPY4]], 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.12: + ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.1(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQ [[COPY7]], [[ORI]], %bb.2 + ; CHECK-NEXT: PseudoBR %bb.1 + bb.0: + liveins: $r4, $r5, $r6, $r7, $r8 + + %0:gpr = COPY killed $r8 + %1:gpr = COPY killed $r7 + %2:gpr = COPY killed $r6 + %3:gpr = COPY killed $r5 + %4:gpr = COPY killed $r4 + %5:gpr = COPY $r0 + %6:gpr = COPY killed %5 + %7:gpr = ANDI killed %3, 1 + %8:gpr = ORI $r0, 1 + %9:gpr = ANDI killed %2, 1 + %10:gpr = ANDI killed %1, 1 + %11:gpr = ANDI killed %0, 1 + %12:gpr = COPY %6 + %13:gpr = COPY killed %6 + %14:gpr = IMPLICIT_DEF + + bb.1: + %15:gpr = COPY killed %14 + %16:gpr = COPY killed %13 + %17:gpr = COPY killed %12 + %18:gpr = COPY %17 + %19:gpr = COPY %16 + %20:gpr = COPY killed %16 + %21:gpr = COPY killed %15 + + bb.2: + successors: %bb.3, %bb.4 + + %22:gpr = COPY killed %21 + %23:gpr = COPY killed %20 + %24:gpr = COPY killed %19 + %25:gpr = COPY killed %18 + BEQZ %7, %bb.4 + + bb.3: + %26:gpr = COPY killed %24 + %27:gpr = COPY killed %23 + PseudoBR %bb.9 + + bb.4: + %28:gpr = COPY killed %23 + + bb.5: + successors: %bb.7(0x7c000000), %bb.6(0x04000000) + + %29:gpr = COPY killed %28 + dead %30:gpr = LD_D $r0, 8 + dead %31:gpr = LD_D $r0, 0 + BNEZ %9, %bb.7 + + bb.6: + %32:gpr = COPY $r0 + %33:gpr = COPY killed %32 + %34:gpr = COPY killed %33 + %35:gpr = COPY killed %22 + PseudoBR %bb.11 + + bb.7: + successors: %bb.8(0x7c000000), %bb.10(0x04000000) + + BEQZ %10, %bb.10 + PseudoBR %bb.8 + + bb.8: + successors: %bb.9(0x04000000), %bb.5(0x7c000000) + + %36:gpr = ADDI_D killed %29, 1 + %28:gpr = COPY %36 + %26:gpr = COPY %36 + %27:gpr = COPY killed %36 + BEQZ %11, %bb.5 + PseudoBR %bb.9 + + bb.9: + %37:gpr = COPY killed %27 + %38:gpr = COPY killed %26 + %39:gpr = COPY $r0 + ST_B killed %39, %4, 0 + %40:gpr = COPY killed %25 + %41:gpr = COPY killed %38 + %42:gpr = COPY killed %37 + %43:gpr = COPY killed %22 + PseudoBR %bb.12 + + bb.10: + %44:gpr = ADDI_D killed %29, 1 + %34:gpr = COPY %8 + %35:gpr = COPY killed %44 + + bb.11: + %45:gpr = COPY killed %35 + %46:gpr = COPY killed %34 + %47:gpr = COPY $r0 + ST_D killed %47, %4, 0 + %40:gpr = COPY %45 + %41:gpr = COPY %46 + %42:gpr = COPY killed %46 + %43:gpr = COPY killed %45 + + bb.12: + successors: %bb.2(0x7c000000), %bb.1(0x04000000) + + %48:gpr = COPY killed %43 + %49:gpr = COPY killed %42 + %50:gpr = COPY killed %41 + %51:gpr = COPY killed %40 + %12:gpr = COPY %51 + %13:gpr = COPY %50 + %14:gpr = COPY %48 + %18:gpr = COPY killed %51 + %19:gpr = COPY killed %50 + %20:gpr = COPY killed %49 + %21:gpr = COPY killed %48 + BEQ %17, %8, %bb.2 + PseudoBR %bb.1 + +... 
diff --git a/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll new file mode 100644 index 0000000..0ce346f --- /dev/null +++ b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=x86_64 -- | FileCheck %s + +define i32 @h(i1 %arg, i32 %arg1) { +; CHECK-LABEL: h: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: movabsq $9166129423, %rcx # imm = 0x22258090F +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: jmp .LBB0_1 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB0_9: # %bb18 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: jne .LBB0_10 +; CHECK-NEXT: .LBB0_1: # %bb4 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: jne .LBB0_2 +; CHECK-NEXT: # %bb.7: # %bb16 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: jne .LBB0_9 +; CHECK-NEXT: # %bb.8: # %bb17 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: jmp .LBB0_9 +; CHECK-NEXT: .LBB0_2: # %bb9 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB0_4 +; CHECK-NEXT: # %bb.3: # %bb13 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: .LBB0_4: # %bb14 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: cmpl $1, %esi +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: # %bb.5: # %bb14 +; CHECK-NEXT: movl %eax, %r8d +; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: jne .LBB0_6 +; CHECK-NEXT: .LBB0_10: # %bb22 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_6: # %bb22.loopexit1 +; CHECK-NEXT: movl %r8d, %eax +; CHECK-NEXT: retq +bb: + br label %bb2 + +bb2: ; preds = %bb14, %bb + %i = phi i64 [ %i5, %bb14 ], [ 0, %bb ] + %i3 = phi i32 [ %i15, %bb14 ], [ 1, %bb ] + br label %bb4 + +bb4: ; preds = %bb18, %bb2 + %i5 = phi i64 [ %i19, %bb18 ], [ %i, %bb2 ] + %i6 = phi i64 [ %i20, %bb18 ], [ %i, %bb2 ] + %i7 = phi i32 [ 0, %bb18 ], [ %i3, %bb2 ] + %i8 = icmp eq i64 %i6, 0 + br i1 %i8, label %bb16, label %bb9 + +bb9: ; preds = %bb4 + br i1 %arg, label %bb12, label %bb10 + +bb10: ; preds = %bb9 + %i11 = sdiv i64 0, 0 + br label %bb12 + +bb12: ; preds = %bb10, %bb9 + br i1 %arg, label %bb13, label %bb14 + +bb13: ; preds = %bb12 + br label %bb14 + +bb14: ; preds = %bb13, %bb12 + %i15 = phi i32 [ 0, %bb13 ], [ %i7, %bb12 ] + switch i32 %arg1, label %bb22 [ + i32 0, label %bb21 + i32 1, label %bb2 + ] + +bb16: ; preds = %bb4 + br i1 %arg, label %bb18, label %bb17 + +bb17: ; preds = %bb16 + br label %bb18 + +bb18: ; preds = %bb17, %bb16 + %i19 = phi i64 [ 9166129423, %bb17 ], [ %i5, %bb16 ] + %i20 = phi i64 [ 9166129423, %bb17 ], [ %i6, %bb16 ] + br i1 %arg, label %bb22, label %bb4 + +bb21: ; preds = %bb14 + br label %bb22 + +bb22: ; preds = %bb21, %bb18, %bb14 + %i23 = phi i32 [ %arg1, %bb21 ], [ %i15, %bb14 ], [ 0, %bb18 ] + ret i32 %i23 +} -- cgit v1.1 From abc39f9aa750634973fe8ba5519d6bbdd70567c4 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 8 Feb 2024 23:36:37 -0800 Subject: [RISCV] Add casts to isel patterns that produce more than 1 instruction. We need explicitly cast to XLenVT to avoid tablegen picking i32. 
If the SelectionDAG scheduler is used it can't find a register class for i32 if i32 isn't a legal type. Fixes #81192, but I might have missed some patterns. --- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 24 +++---- llvm/lib/Target/RISCV/RISCVInstrInfoD.td | 24 +++---- llvm/lib/Target/RISCV/RISCVInstrInfoF.td | 16 ++--- llvm/lib/Target/RISCV/RISCVInstrInfoM.td | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td | 59 ++++++++-------- llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 99 ++++++++++++++------------- llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td | 16 ++--- 7 files changed, 122 insertions(+), 118 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 5189824..7fe9b62 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1260,14 +1260,14 @@ def : PatGprSimm12; // negate of low bit can be done via two (compressible) shifts. The negate // is never compressible since rs1 and rd can't be the same register. def : Pat<(XLenVT (sub 0, (and_oneuse GPR:$rs, 1))), - (SRAI (SLLI $rs, (ImmSubFromXLen (XLenVT 1))), + (SRAI (XLenVT (SLLI $rs, (ImmSubFromXLen (XLenVT 1)))), (ImmSubFromXLen (XLenVT 1)))>; // AND with leading/trailing ones mask exceeding simm32/simm12. def : Pat<(i64 (and GPR:$rs, LeadingOnesMask:$mask)), - (SLLI (SRLI $rs, LeadingOnesMask:$mask), LeadingOnesMask:$mask)>; + (SLLI (i64 (SRLI $rs, LeadingOnesMask:$mask)), LeadingOnesMask:$mask)>; def : Pat<(XLenVT (and GPR:$rs, TrailingOnesMask:$mask)), - (SRLI (SLLI $rs, TrailingOnesMask:$mask), TrailingOnesMask:$mask)>; + (SRLI (XLenVT (SLLI $rs, TrailingOnesMask:$mask)), TrailingOnesMask:$mask)>; // Match both a plain shift and one where the shift amount is masked (this is // typically introduced when the legalizer promotes the shift amount and @@ -1380,7 +1380,7 @@ defm Select_GPR : SelectCC_GPR_rrirr; class SelectCompressOpt : Pat<(riscv_selectcc_frag:$select (XLenVT GPR:$lhs), simm12_no6:$Constant, Cond, (XLenVT GPR:$truev), GPR:$falsev), - (Select_GPR_Using_CC_GPR (ADDI GPR:$lhs, (NegImm simm12:$Constant)), (XLenVT X0), + (Select_GPR_Using_CC_GPR (XLenVT (ADDI GPR:$lhs, (NegImm simm12:$Constant))), (XLenVT X0), (IntCCtoRISCVCC $select), GPR:$truev, GPR:$falsev)>; def OptForMinSize : Predicate<"MF ? MF->getFunction().hasMinSize() : false">; @@ -1728,12 +1728,12 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), /// RV64 patterns let Predicates = [IsRV64, NotHasStdExtZba] in { -def : Pat<(i64 (and GPR:$rs1, 0xffffffff)), (SRLI (SLLI GPR:$rs1, 32), 32)>; +def : Pat<(i64 (and GPR:$rs1, 0xffffffff)), (SRLI (i64 (SLLI GPR:$rs1, 32)), 32)>; // If we're shifting a 32-bit zero extended value left by 0-31 bits, use 2 // shifts instead of 3. This can occur when unsigned is used to index an array. def : Pat<(i64 (shl (and GPR:$rs1, 0xffffffff), uimm5:$shamt)), - (SRLI (SLLI GPR:$rs1, 32), (ImmSubFrom32 uimm5:$shamt))>; + (SRLI (i64 (SLLI GPR:$rs1, 32)), (ImmSubFrom32 uimm5:$shamt))>; } class binop_allhusers @@ -1768,7 +1768,7 @@ def u32simm12 : ImmLeaf; + (SLLI (i64 (SRLIW $rs, LeadingOnesWMask:$mask)), LeadingOnesWMask:$mask)>; /// sext and zext @@ -1864,13 +1864,13 @@ def KCFI_CHECK /// Simple optimization def : Pat<(XLenVT (add GPR:$rs1, (AddiPair:$rs2))), - (ADDI (ADDI GPR:$rs1, (AddiPairImmLarge AddiPair:$rs2)), + (ADDI (XLenVT (ADDI GPR:$rs1, (AddiPairImmLarge AddiPair:$rs2))), (AddiPairImmSmall GPR:$rs2))>; let Predicates = [IsRV64] in { // Select W instructions if only the lower 32-bits of the result are used. 
def : Pat<(binop_allwusers GPR:$rs1, (AddiPair:$rs2)), - (ADDIW (ADDIW GPR:$rs1, (AddiPairImmLarge AddiPair:$rs2)), + (ADDIW (i64 (ADDIW GPR:$rs1, (AddiPairImmLarge AddiPair:$rs2))), (AddiPairImmSmall AddiPair:$rs2))>; } @@ -1929,7 +1929,7 @@ def : PatGprImm; def : PatGprImm; def : Pat<(i32 (and GPR:$rs, TrailingOnesMask:$mask)), - (SRLI (SLLI $rs, (i64 (XLenSubTrailingOnes $mask))), + (SRLI (i32 (SLLI $rs, (i64 (XLenSubTrailingOnes $mask)))), (i64 (XLenSubTrailingOnes $mask)))>; // Use sext if the sign bit of the input is 0. @@ -1937,12 +1937,12 @@ def : Pat<(zext_is_sext GPR:$src), (ADDIW GPR:$src, 0)>; } let Predicates = [IsRV64, NotHasStdExtZba] in { -def : Pat<(zext GPR:$src), (SRLI (SLLI GPR:$src, 32), 32)>; +def : Pat<(zext GPR:$src), (SRLI (i64 (SLLI GPR:$src, 32)), 32)>; // If we're shifting a 32-bit zero extended value left by 0-31 bits, use 2 // shifts instead of 3. This can occur when unsigned is used to index an array. def : Pat<(shl (zext GPR:$rs), uimm5:$shamt), - (SRLI (SLLI GPR:$rs, 32), (ImmSubFrom32 uimm5:$shamt))>; + (SRLI (i64 (SLLI GPR:$rs, 32)), (ImmSubFrom32 uimm5:$shamt))>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index fec43d8..9b4f93d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -410,11 +410,11 @@ foreach Ext = DExts in { let Predicates = [HasStdExtD] in { // Match signaling FEQ_D def : Pat<(XLenVT (strict_fsetccs FPR64:$rs1, FPR64:$rs2, SETEQ)), - (AND (FLE_D $rs1, $rs2), - (FLE_D $rs2, $rs1))>; + (AND (XLenVT (FLE_D $rs1, $rs2)), + (XLenVT (FLE_D $rs2, $rs1)))>; def : Pat<(XLenVT (strict_fsetccs FPR64:$rs1, FPR64:$rs2, SETOEQ)), - (AND (FLE_D $rs1, $rs2), - (FLE_D $rs2, $rs1))>; + (AND (XLenVT (FLE_D $rs1, $rs2)), + (XLenVT (FLE_D $rs2, $rs1)))>; // If both operands are the same, use a single FLE. def : Pat<(XLenVT (strict_fsetccs FPR64:$rs1, FPR64:$rs1, SETEQ)), (FLE_D $rs1, $rs1)>; @@ -430,11 +430,11 @@ def : PatSetCC; let Predicates = [HasStdExtZdinx, IsRV64] in { // Match signaling FEQ_D def : Pat<(XLenVT (strict_fsetccs (f64 FPR64INX:$rs1), FPR64INX:$rs2, SETEQ)), - (AND (FLE_D_INX $rs1, $rs2), - (FLE_D_INX $rs2, $rs1))>; + (AND (XLenVT (FLE_D_INX $rs1, $rs2)), + (XLenVT (FLE_D_INX $rs2, $rs1)))>; def : Pat<(XLenVT (strict_fsetccs (f64 FPR64INX:$rs1), FPR64INX:$rs2, SETOEQ)), - (AND (FLE_D_INX $rs1, $rs2), - (FLE_D_INX $rs2, $rs1))>; + (AND (XLenVT (FLE_D_INX $rs1, $rs2)), + (XLenVT (FLE_D_INX $rs2, $rs1)))>; // If both operands are the same, use a single FLE. def : Pat<(XLenVT (strict_fsetccs (f64 FPR64INX:$rs1), FPR64INX:$rs1, SETEQ)), (FLE_D_INX $rs1, $rs1)>; @@ -450,11 +450,11 @@ def : PatSetCC; let Predicates = [HasStdExtZdinx, IsRV32] in { // Match signaling FEQ_D def : Pat<(XLenVT (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs2, SETEQ)), - (AND (FLE_D_IN32X $rs1, $rs2), - (FLE_D_IN32X $rs2, $rs1))>; + (AND (XLenVT (FLE_D_IN32X $rs1, $rs2)), + (XLenVT (FLE_D_IN32X $rs2, $rs1)))>; def : Pat<(XLenVT (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs2, SETOEQ)), - (AND (FLE_D_IN32X $rs1, $rs2), - (FLE_D_IN32X $rs2, $rs1))>; + (AND (XLenVT (FLE_D_IN32X $rs1, $rs2)), + (XLenVT (FLE_D_IN32X $rs2, $rs1)))>; // If both operands are the same, use a single FLE. 
def : Pat<(XLenVT (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs1, SETEQ)), (FLE_D_IN32X $rs1, $rs1)>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index 52eadbd..7d89608 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -617,11 +617,11 @@ foreach Ext = FExts in { let Predicates = [HasStdExtF] in { // Match signaling FEQ_S def : Pat<(XLenVT (strict_fsetccs FPR32:$rs1, FPR32:$rs2, SETEQ)), - (AND (FLE_S $rs1, $rs2), - (FLE_S $rs2, $rs1))>; + (AND (XLenVT (FLE_S $rs1, $rs2)), + (XLenVT (FLE_S $rs2, $rs1)))>; def : Pat<(XLenVT (strict_fsetccs FPR32:$rs1, FPR32:$rs2, SETOEQ)), - (AND (FLE_S $rs1, $rs2), - (FLE_S $rs2, $rs1))>; + (AND (XLenVT (FLE_S $rs1, $rs2)), + (XLenVT (FLE_S $rs2, $rs1)))>; // If both operands are the same, use a single FLE. def : Pat<(XLenVT (strict_fsetccs FPR32:$rs1, FPR32:$rs1, SETEQ)), (FLE_S $rs1, $rs1)>; @@ -632,11 +632,11 @@ def : Pat<(XLenVT (strict_fsetccs FPR32:$rs1, FPR32:$rs1, SETOEQ)), let Predicates = [HasStdExtZfinx] in { // Match signaling FEQ_S def : Pat<(XLenVT (strict_fsetccs FPR32INX:$rs1, FPR32INX:$rs2, SETEQ)), - (AND (FLE_S_INX $rs1, $rs2), - (FLE_S_INX $rs2, $rs1))>; + (AND (XLenVT (FLE_S_INX $rs1, $rs2)), + (XLenVT (FLE_S_INX $rs2, $rs1)))>; def : Pat<(XLenVT (strict_fsetccs FPR32INX:$rs1, FPR32INX:$rs2, SETOEQ)), - (AND (FLE_S_INX $rs1, $rs2), - (FLE_S_INX $rs2, $rs1))>; + (AND (XLenVT (FLE_S_INX $rs1, $rs2)), + (XLenVT (FLE_S_INX $rs2, $rs1)))>; // If both operands are the same, use a single FLE. def : Pat<(XLenVT (strict_fsetccs FPR32INX:$rs1, FPR32INX:$rs1, SETEQ)), (FLE_S_INX $rs1, $rs1)>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td index f9890ca..6b43d43 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td @@ -112,7 +112,7 @@ let Predicates = [HasStdExtMOrZmmul, IsRV64, NotHasStdExtZba] in { // inputs left by 32 and use a MULHU. This saves two SRLIs needed to finish // zeroing the upper 32 bits. 
def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))), - (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>; + (MULHU (i64 (SLLI GPR:$rs1, 32)), (i64 (SLLI GPR:$rs2, 32)))>; } // Predicates = [HasStdExtMOrZmmul, IsRV64, NotHasStdExtZba] //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td index ff474e4..79ced38 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td @@ -548,65 +548,66 @@ def : Pat<(add_non_imm12 sh3add_op:$rs1, (XLenVT GPR:$rs2)), (TH_ADDSL GPR:$rs2, sh3add_op:$rs1, 3)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 6)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 1), 1)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 1)), 1)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 10)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 2), 1)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 2)), 1)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 18)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 3), 1)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 3)), 1)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 12)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 1), 2)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 1)), 2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 20)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 2), 2)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 2)), 2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 36)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 3), 2)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 3)), 2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 24)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 1), 3)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 1)), 3)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 40)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 2), 3)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 2)), 3)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 72)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 3), 3)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 3)), 3)>; def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy4:$i), - (TH_ADDSL GPR:$r, (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i)), 2)>; + (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i))), 2)>; def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy8:$i), - (TH_ADDSL GPR:$r, (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i)), 3)>; + (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i))), 3)>; def : Pat<(mul (XLenVT GPR:$r), C3LeftShift:$i), - (SLLI (TH_ADDSL GPR:$r, GPR:$r, 1), + (SLLI (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 1)), (TrailingZeros C3LeftShift:$i))>; def : Pat<(mul (XLenVT GPR:$r), C5LeftShift:$i), - (SLLI (TH_ADDSL GPR:$r, GPR:$r, 2), + (SLLI (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), (TrailingZeros C5LeftShift:$i))>; def : Pat<(mul (XLenVT GPR:$r), C9LeftShift:$i), - (SLLI (TH_ADDSL GPR:$r, GPR:$r, 3), + (SLLI (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), (TrailingZeros C9LeftShift:$i))>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 11)), - (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 2), 1)>; + (TH_ADDSL GPR:$r, (XLenVT 
(TH_ADDSL GPR:$r, GPR:$r, 2)), 1)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 19)), - (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 3), 1)>; + (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 1)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 13)), - (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 1), 2)>; + (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 1)), 2)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 21)), - (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 2), 2)>; + (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), 2)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 37)), - (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 3), 2)>; + (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 2)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 25)), - (TH_ADDSL (TH_ADDSL GPR:$r, GPR:$r, 2), (TH_ADDSL GPR:$r, GPR:$r, 2), 2)>; + (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), + (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), 2)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 41)), - (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 2), 3)>; + (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), 3)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 73)), - (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 3), 3)>; + (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 3)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 27)), - (TH_ADDSL (TH_ADDSL GPR:$r, GPR:$r, 3), (TH_ADDSL GPR:$r, GPR:$r, 3), 1)>; + (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 1)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 45)), - (TH_ADDSL (TH_ADDSL GPR:$r, GPR:$r, 3), (TH_ADDSL GPR:$r, GPR:$r, 3), 2)>; + (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 2)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 81)), - (TH_ADDSL (TH_ADDSL GPR:$r, GPR:$r, 3), (TH_ADDSL GPR:$r, GPR:$r, 3), 3)>; + (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 3)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 200)), - (SLLI (TH_ADDSL (TH_ADDSL GPR:$r, GPR:$r, 2), - (TH_ADDSL GPR:$r, GPR:$r, 2), 2), 3)>; + (SLLI (XLenVT (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), + (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), 2)), 3)>; } // Predicates = [HasVendorXTHeadBa] let Predicates = [HasVendorXTHeadBb] in { @@ -633,14 +634,14 @@ def : Pat<(sra (bswap i64:$rs1), (i64 32)), def : Pat<(binop_allwusers (bswap i64:$rs1), (i64 32)), (TH_REVW i64:$rs1)>; def : Pat<(riscv_clzw i64:$rs1), - (TH_FF0 (SLLI (XORI i64:$rs1, -1), 32))>; + (TH_FF0 (i64 (SLLI (i64 (XORI i64:$rs1, -1)), 32)))>; } // Predicates = [HasVendorXTHeadBb, IsRV64] let Predicates = [HasVendorXTHeadBs] in { def : Pat<(and (srl (XLenVT GPR:$rs1), uimmlog2xlen:$shamt), 1), (TH_TST GPR:$rs1, uimmlog2xlen:$shamt)>; def : Pat<(XLenVT (seteq (and (XLenVT GPR:$rs1), SingleBitSetMask:$mask), 0)), - (TH_TST (XORI GPR:$rs1, -1), SingleBitSetMask:$mask)>; + (TH_TST (XLenVT (XORI GPR:$rs1, -1)), SingleBitSetMask:$mask)>; } // Predicates = [HasVendorXTHeadBs] let Predicates = [HasVendorXTHeadCondMov] in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 9e32444..f0f8494 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -555,7 +555,7 @@ def : Pat<(XLenVT (and (shiftop GPR:$rs1, (XLenVT GPR:$rs2)), 1)), def : Pat<(XLenVT (shiftop 1, (XLenVT GPR:$rs2))), (BSET (XLenVT X0), GPR:$rs2)>; def : Pat<(XLenVT (not (shiftop -1, (XLenVT GPR:$rs2)))), - (ADDI (BSET (XLenVT X0), GPR:$rs2), -1)>; + (ADDI (XLenVT (BSET 
(XLenVT X0), GPR:$rs2)), -1)>; def : Pat<(XLenVT (and GPR:$rs1, BCLRMask:$mask)), (BCLRI GPR:$rs1, BCLRMask:$mask)>; @@ -568,25 +568,25 @@ def : Pat<(XLenVT (and (srl GPR:$rs1, uimmlog2xlen:$shamt), (XLenVT 1))), (BEXTI GPR:$rs1, uimmlog2xlen:$shamt)>; def : Pat<(XLenVT (seteq (XLenVT (and GPR:$rs1, SingleBitSetMask:$mask)), 0)), - (BEXTI (XORI GPR:$rs1, -1), SingleBitSetMask:$mask)>; + (BEXTI (XLenVT (XORI GPR:$rs1, -1)), SingleBitSetMask:$mask)>; def : Pat<(XLenVT (or GPR:$r, BSETINVTwoBitsMask:$i)), - (BSETI (BSETI GPR:$r, (TrailingZeros BSETINVTwoBitsMask:$i)), + (BSETI (XLenVT (BSETI GPR:$r, (TrailingZeros BSETINVTwoBitsMask:$i))), (BSETINVTwoBitsMaskHigh BSETINVTwoBitsMask:$i))>; def : Pat<(XLenVT (xor GPR:$r, BSETINVTwoBitsMask:$i)), - (BINVI (BINVI GPR:$r, (TrailingZeros BSETINVTwoBitsMask:$i)), + (BINVI (XLenVT (BINVI GPR:$r, (TrailingZeros BSETINVTwoBitsMask:$i))), (BSETINVTwoBitsMaskHigh BSETINVTwoBitsMask:$i))>; def : Pat<(XLenVT (or GPR:$r, BSETINVORIMask:$i)), - (BSETI (ORI GPR:$r, (BSETINVORIMaskLow BSETINVORIMask:$i)), + (BSETI (XLenVT (ORI GPR:$r, (BSETINVORIMaskLow BSETINVORIMask:$i))), (BSETINVTwoBitsMaskHigh BSETINVORIMask:$i))>; def : Pat<(XLenVT (xor GPR:$r, BSETINVORIMask:$i)), - (BINVI (XORI GPR:$r, (BSETINVORIMaskLow BSETINVORIMask:$i)), + (BINVI (XLenVT (XORI GPR:$r, (BSETINVORIMaskLow BSETINVORIMask:$i))), (BSETINVTwoBitsMaskHigh BSETINVORIMask:$i))>; def : Pat<(XLenVT (and GPR:$r, BCLRITwoBitsMask:$i)), - (BCLRI (BCLRI GPR:$r, (BCLRITwoBitsMaskLow BCLRITwoBitsMask:$i)), + (BCLRI (XLenVT (BCLRI GPR:$r, (BCLRITwoBitsMaskLow BCLRITwoBitsMask:$i))), (BCLRITwoBitsMaskHigh BCLRITwoBitsMask:$i))>; def : Pat<(XLenVT (and GPR:$r, BCLRIANDIMask:$i)), - (BCLRI (ANDI GPR:$r, (BCLRIANDIMaskLow BCLRIANDIMask:$i)), + (BCLRI (XLenVT (ANDI GPR:$r, (BCLRIANDIMaskLow BCLRIANDIMask:$i))), (BCLRITwoBitsMaskHigh BCLRIANDIMask:$i))>; } // Predicates = [HasStdExtZbs] @@ -614,7 +614,7 @@ def : PatGpr; def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>; def : Pat<(i64 (riscv_absw GPR:$rs1)), - (MAX GPR:$rs1, (SUBW (XLenVT X0), GPR:$rs1))>; + (MAX GPR:$rs1, (XLenVT (SUBW (XLenVT X0), GPR:$rs1)))>; } // Predicates = [HasStdExtZbb, IsRV64] let Predicates = [HasStdExtZbb] in { @@ -686,63 +686,66 @@ foreach i = {1,2,3} in { } def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 6)), GPR:$rs2), - (SH1ADD (SH1ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH1ADD (XLenVT (SH1ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 10)), GPR:$rs2), - (SH1ADD (SH2ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH1ADD (XLenVT (SH2ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 18)), GPR:$rs2), - (SH1ADD (SH3ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH1ADD (XLenVT (SH3ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 12)), GPR:$rs2), - (SH2ADD (SH1ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH2ADD (XLenVT (SH1ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 20)), GPR:$rs2), - (SH2ADD (SH2ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH2ADD (XLenVT (SH2ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 36)), GPR:$rs2), - (SH2ADD (SH3ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH2ADD (XLenVT (SH3ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 24)), GPR:$rs2), - (SH3ADD (SH1ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH3ADD (XLenVT (SH1ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 40)), 
GPR:$rs2), - (SH3ADD (SH2ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH3ADD (XLenVT (SH2ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 72)), GPR:$rs2), - (SH3ADD (SH3ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH3ADD (XLenVT (SH3ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy4:$i), - (SH2ADD (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i)), + (SH2ADD (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i))), GPR:$r)>; def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy8:$i), - (SH3ADD (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i)), + (SH3ADD (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i))), GPR:$r)>; def : Pat<(mul (XLenVT GPR:$r), C3LeftShift:$i), - (SLLI (SH1ADD GPR:$r, GPR:$r), + (SLLI (XLenVT (SH1ADD GPR:$r, GPR:$r)), (TrailingZeros C3LeftShift:$i))>; def : Pat<(mul (XLenVT GPR:$r), C5LeftShift:$i), - (SLLI (SH2ADD GPR:$r, GPR:$r), + (SLLI (XLenVT (SH2ADD GPR:$r, GPR:$r)), (TrailingZeros C5LeftShift:$i))>; def : Pat<(mul (XLenVT GPR:$r), C9LeftShift:$i), - (SLLI (SH3ADD GPR:$r, GPR:$r), + (SLLI (XLenVT (SH3ADD GPR:$r, GPR:$r)), (TrailingZeros C9LeftShift:$i))>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 11)), - (SH1ADD (SH2ADD GPR:$r, GPR:$r), GPR:$r)>; + (SH1ADD (XLenVT (SH2ADD GPR:$r, GPR:$r)), GPR:$r)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 19)), - (SH1ADD (SH3ADD GPR:$r, GPR:$r), GPR:$r)>; + (SH1ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)), GPR:$r)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 13)), - (SH2ADD (SH1ADD GPR:$r, GPR:$r), GPR:$r)>; + (SH2ADD (XLenVT (SH1ADD GPR:$r, GPR:$r)), GPR:$r)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 21)), - (SH2ADD (SH2ADD GPR:$r, GPR:$r), GPR:$r)>; + (SH2ADD (XLenVT (SH2ADD GPR:$r, GPR:$r)), GPR:$r)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 37)), - (SH2ADD (SH3ADD GPR:$r, GPR:$r), GPR:$r)>; + (SH2ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)), GPR:$r)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 25)), - (SH3ADD (SH1ADD GPR:$r, GPR:$r), GPR:$r)>; + (SH3ADD (XLenVT (SH1ADD GPR:$r, GPR:$r)), GPR:$r)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 41)), - (SH3ADD (SH2ADD GPR:$r, GPR:$r), GPR:$r)>; + (SH3ADD (XLenVT (SH2ADD GPR:$r, GPR:$r)), GPR:$r)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 73)), - (SH3ADD (SH3ADD GPR:$r, GPR:$r), GPR:$r)>; + (SH3ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)), GPR:$r)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 27)), - (SH1ADD (SH3ADD GPR:$r, GPR:$r), (SH3ADD GPR:$r, GPR:$r))>; + (SH1ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)), + (XLenVT (SH3ADD GPR:$r, GPR:$r)))>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 45)), - (SH2ADD (SH3ADD GPR:$r, GPR:$r), (SH3ADD GPR:$r, GPR:$r))>; + (SH2ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)), + (XLenVT (SH3ADD GPR:$r, GPR:$r)))>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 81)), - (SH3ADD (SH3ADD GPR:$r, GPR:$r), (SH3ADD GPR:$r, GPR:$r))>; + (SH3ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)), + (XLenVT (SH3ADD GPR:$r, GPR:$r)))>; } // Predicates = [HasStdExtZba] let Predicates = [HasStdExtZba, IsRV64] in { @@ -751,7 +754,7 @@ def : Pat<(i64 (shl (and GPR:$rs1, 0xFFFFFFFF), uimm5:$shamt)), // Match a shifted 0xffffffff mask. Use SRLI to clear the LSBs and SLLI_UW to // mask and shift. 
def : Pat<(i64 (and GPR:$rs1, Shifted32OnesMask:$mask)), - (SLLI_UW (SRLI GPR:$rs1, Shifted32OnesMask:$mask), + (SLLI_UW (XLenVT (SRLI GPR:$rs1, Shifted32OnesMask:$mask)), Shifted32OnesMask:$mask)>; def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0xFFFFFFFF), GPR:$rs2)), (ADD_UW GPR:$rs1, GPR:$rs2)>; @@ -781,29 +784,29 @@ foreach i = {1,2,3} in { } def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0xFFFFFFFE), (XLenVT GPR:$rs2))), - (SH1ADD (SRLIW GPR:$rs1, 1), GPR:$rs2)>; + (SH1ADD (XLenVT (SRLIW GPR:$rs1, 1)), GPR:$rs2)>; def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0xFFFFFFFC), (XLenVT GPR:$rs2))), - (SH2ADD (SRLIW GPR:$rs1, 2), GPR:$rs2)>; + (SH2ADD (XLenVT (SRLIW GPR:$rs1, 2)), GPR:$rs2)>; def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0xFFFFFFF8), (XLenVT GPR:$rs2))), - (SH3ADD (SRLIW GPR:$rs1, 3), GPR:$rs2)>; + (SH3ADD (XLenVT (SRLIW GPR:$rs1, 3)), GPR:$rs2)>; // Use SRLI to clear the LSBs and SHXADD_UW to mask and shift. def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0x1FFFFFFFE), (XLenVT GPR:$rs2))), - (SH1ADD_UW (SRLI GPR:$rs1, 1), GPR:$rs2)>; + (SH1ADD_UW (XLenVT (SRLI GPR:$rs1, 1)), GPR:$rs2)>; def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0x3FFFFFFFC), (XLenVT GPR:$rs2))), - (SH2ADD_UW (SRLI GPR:$rs1, 2), GPR:$rs2)>; + (SH2ADD_UW (XLenVT (SRLI GPR:$rs1, 2)), GPR:$rs2)>; def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0x7FFFFFFF8), (XLenVT GPR:$rs2))), - (SH3ADD_UW (SRLI GPR:$rs1, 3), GPR:$rs2)>; + (SH3ADD_UW (XLenVT (SRLI GPR:$rs1, 3)), GPR:$rs2)>; def : Pat<(i64 (mul (and_oneuse GPR:$r, 0xFFFFFFFF), C3LeftShiftUW:$i)), - (SH1ADD (SLLI_UW GPR:$r, (TrailingZeros C3LeftShiftUW:$i)), - (SLLI_UW GPR:$r, (TrailingZeros C3LeftShiftUW:$i)))>; + (SH1ADD (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C3LeftShiftUW:$i))), + (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C3LeftShiftUW:$i))))>; def : Pat<(i64 (mul (and_oneuse GPR:$r, 0xFFFFFFFF), C5LeftShiftUW:$i)), - (SH2ADD (SLLI_UW GPR:$r, (TrailingZeros C5LeftShiftUW:$i)), - (SLLI_UW GPR:$r, (TrailingZeros C5LeftShiftUW:$i)))>; + (SH2ADD (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C5LeftShiftUW:$i))), + (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C5LeftShiftUW:$i))))>; def : Pat<(i64 (mul (and_oneuse GPR:$r, 0xFFFFFFFF), C9LeftShiftUW:$i)), - (SH3ADD (SLLI_UW GPR:$r, (TrailingZeros C9LeftShiftUW:$i)), - (SLLI_UW GPR:$r, (TrailingZeros C9LeftShiftUW:$i)))>; + (SH3ADD (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C9LeftShiftUW:$i))), + (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C9LeftShiftUW:$i))))>; } // Predicates = [HasStdExtZba, IsRV64] let Predicates = [HasStdExtZbcOrZbkc] in { @@ -904,7 +907,7 @@ def : Pat<(i64 (and (anyext (i32 (shiftop GPR:$rs1, (i64 GPR:$rs2)))), 1)), def : Pat<(i32 (shiftop 1, (i64 GPR:$rs2))), (BSET (XLenVT X0), GPR:$rs2)>; def : Pat<(i32 (not (shiftop -1, (i64 GPR:$rs2)))), - (ADDI (BSET (XLenVT X0), GPR:$rs2), -1)>; + (ADDI (i32 (BSET (XLenVT X0), GPR:$rs2)), -1)>; def : Pat<(i32 (and (srl GPR:$rs1, uimm5:$shamt), (i32 1))), (BEXTI GPR:$rs1, uimm5:$shamt)>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index 2e0f754..e0f1c71 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -366,11 +366,11 @@ foreach Ext = ZfhExts in { let Predicates = [HasStdExtZfh] in { // Match signaling FEQ_H def : Pat<(XLenVT (strict_fsetccs (f16 FPR16:$rs1), FPR16:$rs2, SETEQ)), - (AND (FLE_H $rs1, $rs2), - (FLE_H $rs2, $rs1))>; + (AND (XLenVT (FLE_H $rs1, $rs2)), + (XLenVT (FLE_H $rs2, $rs1)))>; def : Pat<(XLenVT (strict_fsetccs (f16 FPR16:$rs1), FPR16:$rs2, SETOEQ)), 
- (AND (FLE_H $rs1, $rs2), - (FLE_H $rs2, $rs1))>; + (AND (XLenVT (FLE_H $rs1, $rs2)), + (XLenVT (FLE_H $rs2, $rs1)))>; // If both operands are the same, use a single FLE. def : Pat<(XLenVT (strict_fsetccs (f16 FPR16:$rs1), (f16 FPR16:$rs1), SETEQ)), (FLE_H $rs1, $rs1)>; @@ -381,11 +381,11 @@ def : Pat<(XLenVT (strict_fsetccs (f16 FPR16:$rs1), (f16 FPR16:$rs1), SETOEQ)), let Predicates = [HasStdExtZhinx] in { // Match signaling FEQ_H def : Pat<(XLenVT (strict_fsetccs FPR16INX:$rs1, FPR16INX:$rs2, SETEQ)), - (AND (FLE_H_INX $rs1, $rs2), - (FLE_H_INX $rs2, $rs1))>; + (AND (XLenVT (FLE_H_INX $rs1, $rs2)), + (XLenVT (FLE_H_INX $rs2, $rs1)))>; def : Pat<(XLenVT (strict_fsetccs FPR16INX:$rs1, FPR16INX:$rs2, SETOEQ)), - (AND (FLE_H_INX $rs1, $rs2), - (FLE_H_INX $rs2, $rs1))>; + (AND (XLenVT (FLE_H_INX $rs1, $rs2)), + (XLenVT (FLE_H_INX $rs2, $rs1)))>; // If both operands are the same, use a single FLE. def : Pat<(XLenVT (strict_fsetccs FPR16INX:$rs1, FPR16INX:$rs1, SETEQ)), (FLE_H_INX $rs1, $rs1)>; -- cgit v1.1 From 0079136f7d2454ef2889061bb214741163ba232d Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 9 Feb 2024 07:48:43 +0000 Subject: [BasicAA] Fix Scale check in vscale aliasing. (#81174) This is a fix for #80818, as pointed out in #81144 it should be checking the abs of Scale. The added test changes from NoAlias to MayAlias. --- llvm/lib/Analysis/BasicAliasAnalysis.cpp | 2 +- llvm/test/Analysis/BasicAA/vscale.ll | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index ae31814..682b0a2 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -1187,7 +1187,7 @@ AliasResult BasicAAResult::aliasGEP( // so noalias still holds so long as the dependency distance is at least as // big as the typesize. if (VLeftSize.hasValue() && - Scale.uge(VLeftSize.getValue().getKnownMinValue())) + Scale.abs().uge(VLeftSize.getValue().getKnownMinValue())) return AliasResult::NoAlias; } diff --git a/llvm/test/Analysis/BasicAA/vscale.ll b/llvm/test/Analysis/BasicAA/vscale.ll index ce0c6f1..b2f5c66 100644 --- a/llvm/test/Analysis/BasicAA/vscale.ll +++ b/llvm/test/Analysis/BasicAA/vscale.ll @@ -458,6 +458,17 @@ define void @vscale_v1v2types(ptr %p) { ret void } +; CHECK-LABEL: vscale_negativescale +; CHECK-DAG: MayAlias: * %p, * %vm16 +define void @vscale_negativescale(ptr %p) vscale_range(1,16) { + %v = call i64 @llvm.vscale.i64() + %vm = mul nsw i64 %v, -15 + %vm16 = getelementptr i8, ptr %p, i64 %vm + load , ptr %vm16 + load , ptr %p + ret void +} + ; CHECK-LABEL: twovscales ; CHECK-DAG: MayAlias: * %vp161, * %vp162 ; CHECK-DAG: MayAlias: * %vp161, * %vp161b -- cgit v1.1 From 8316bf34ac21117f35bc8e6fafa2b3e7da75e1d5 Mon Sep 17 00:00:00 2001 From: DianQK Date: Fri, 9 Feb 2024 15:54:54 +0800 Subject: Revert "[RegisterCoalescer] Clear instructions not recorded in `ErasedInstrs` but erased (#79820)" This reverts commit 95b14da678f4670283240ef4cf60f3a39bed97b4. 
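For reference on the BasicAA fix earlier in this series: Scale is a signed APInt, so the old unsigned comparison Scale.uge(...) treated a negative scale such as the new test's -15 as a huge value and wrongly proved NoAlias. A minimal sketch of the arithmetic, assuming an LLVM tree to compile and link against; the 16-byte access size is an assumed example value, since only the -15 scale comes from the test:

// Not the BasicAA code itself; mirrors the fixed predicate with llvm::APInt.
#include "llvm/ADT/APInt.h"
#include <cstdint>
#include <cstdio>

int main() {
  llvm::APInt Scale(64, -15, /*isSigned=*/true); // %vm = mul nsw i64 %v, -15
  uint64_t KnownMinSize = 16; // assumed known-minimum access size in bytes

  // Old check: -15 viewed as unsigned is 0xFFFF...FFF1, so uge() holds and
  // NoAlias was concluded even though the accesses can overlap.
  bool OldNoAlias = Scale.uge(KnownMinSize);
  // Fixed check: |Scale| = 15 < 16, so NoAlias is no longer claimed and the
  // query falls through to MayAlias, as the updated test expects.
  bool NewNoAlias = Scale.abs().uge(KnownMinSize);

  std::printf("old=%d new=%d\n", OldNoAlias, NewNoAlias); // old=1 new=0
  return 0;
}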
--- llvm/lib/CodeGen/RegisterCoalescer.cpp | 27 +-- .../LoongArch/register-coalescer-crash-pr79718.mir | 213 --------------------- .../X86/PR71178-register-coalescer-crash.ll | 103 ---------- 3 files changed, 5 insertions(+), 338 deletions(-) delete mode 100644 llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir delete mode 100644 llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 7e9c992..cbb1a74 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -236,8 +236,7 @@ namespace { /// was successfully coalesced away. If it is not currently possible to /// coalesce this interval, but it may be possible if other things get /// coalesced, then it returns true by reference in 'Again'. - bool joinCopy(MachineInstr *CopyMI, bool &Again, - SmallPtrSetImpl &CurrentErasedInstrs); + bool joinCopy(MachineInstr *CopyMI, bool &Again); /// Attempt to join these two intervals. On failure, this /// returns false. The output "SrcInt" will not have been modified, so we @@ -1965,9 +1964,7 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI, LIS->shrinkToUses(&LI); } -bool RegisterCoalescer::joinCopy( - MachineInstr *CopyMI, bool &Again, - SmallPtrSetImpl &CurrentErasedInstrs) { +bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { Again = false; LLVM_DEBUG(dbgs() << LIS->getInstructionIndex(*CopyMI) << '\t' << *CopyMI); @@ -2159,9 +2156,7 @@ bool RegisterCoalescer::joinCopy( // CopyMI has been erased by joinIntervals at this point. Remove it from // ErasedInstrs since copyCoalesceWorkList() won't add a successful join back // to the work list. This keeps ErasedInstrs from growing needlessly. - if (ErasedInstrs.erase(CopyMI)) - // But we may encounter the instruction again in this iteration. - CurrentErasedInstrs.insert(CopyMI); + ErasedInstrs.erase(CopyMI); // Rewrite all SrcReg operands to DstReg. // Also update DstReg operands to include DstIdx if it is set. @@ -3987,33 +3982,21 @@ void RegisterCoalescer::lateLiveIntervalUpdate() { bool RegisterCoalescer:: copyCoalesceWorkList(MutableArrayRef CurrList) { bool Progress = false; - SmallPtrSet CurrentErasedInstrs; for (MachineInstr *&MI : CurrList) { if (!MI) continue; // Skip instruction pointers that have already been erased, for example by // dead code elimination. - if (ErasedInstrs.count(MI) || CurrentErasedInstrs.count(MI)) { + if (ErasedInstrs.count(MI)) { MI = nullptr; continue; } bool Again = false; - bool Success = joinCopy(MI, Again, CurrentErasedInstrs); + bool Success = joinCopy(MI, Again); Progress |= Success; if (Success || !Again) MI = nullptr; } - // Clear instructions not recorded in `ErasedInstrs` but erased. 
- if (!CurrentErasedInstrs.empty()) { - for (MachineInstr *&MI : CurrList) { - if (MI && CurrentErasedInstrs.count(MI)) - MI = nullptr; - } - for (MachineInstr *&MI : WorkList) { - if (MI && CurrentErasedInstrs.count(MI)) - MI = nullptr; - } - } return Progress; } diff --git a/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir b/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir deleted file mode 100644 index 9bbb579..0000000 --- a/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir +++ /dev/null @@ -1,213 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -# RUN: llc -o - %s -mtriple=loongarch64 \ -# RUN: -run-pass=register-coalescer -join-liveintervals=1 -join-splitedges=0 | FileCheck %s - ---- -name: foo -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: foo - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $r4, $r5, $r6, $r7, $r8 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $r8 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $r7 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $r6 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $r5 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr = COPY $r4 - ; CHECK-NEXT: [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY3]], 1 - ; CHECK-NEXT: [[ORI:%[0-9]+]]:gpr = ORI $r0, 1 - ; CHECK-NEXT: [[ANDI1:%[0-9]+]]:gpr = ANDI [[COPY2]], 1 - ; CHECK-NEXT: [[ANDI2:%[0-9]+]]:gpr = ANDI [[COPY1]], 1 - ; CHECK-NEXT: [[ANDI3:%[0-9]+]]:gpr = ANDI [[COPY]], 1 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = COPY $r0 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr = COPY [[COPY5]] - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BEQZ [[ANDI]], %bb.4 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.9(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: PseudoBR %bb.9 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: - ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: successors: %bb.7(0x7c000000), %bb.6(0x04000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead [[LD_D:%[0-9]+]]:gpr = LD_D $r0, 8 - ; CHECK-NEXT: dead [[LD_D1:%[0-9]+]]:gpr = LD_D $r0, 0 - ; CHECK-NEXT: BNEZ [[ANDI1]], %bb.7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.6: - ; CHECK-NEXT: successors: %bb.11(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0 - ; CHECK-NEXT: PseudoBR %bb.11 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.7: - ; CHECK-NEXT: successors: %bb.8(0x7c000000), %bb.10(0x04000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BEQZ [[ANDI2]], %bb.10 - ; CHECK-NEXT: PseudoBR %bb.8 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.8: - ; CHECK-NEXT: successors: %bb.9(0x04000000), %bb.5(0x7c000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1 - ; CHECK-NEXT: BEQZ [[ANDI3]], %bb.5 - ; CHECK-NEXT: PseudoBR %bb.9 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.9: - ; CHECK-NEXT: successors: %bb.12(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: ST_B $r0, [[COPY4]], 0 - ; CHECK-NEXT: PseudoBR %bb.12 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.10: - ; CHECK-NEXT: successors: %bb.11(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1 - ; CHECK-NEXT: 
[[COPY6:%[0-9]+]]:gpr = COPY [[ORI]] - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.11: - ; CHECK-NEXT: successors: %bb.12(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: ST_D $r0, [[COPY4]], 0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.12: - ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.1(0x04000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BEQ [[COPY7]], [[ORI]], %bb.2 - ; CHECK-NEXT: PseudoBR %bb.1 - bb.0: - liveins: $r4, $r5, $r6, $r7, $r8 - - %0:gpr = COPY killed $r8 - %1:gpr = COPY killed $r7 - %2:gpr = COPY killed $r6 - %3:gpr = COPY killed $r5 - %4:gpr = COPY killed $r4 - %5:gpr = COPY $r0 - %6:gpr = COPY killed %5 - %7:gpr = ANDI killed %3, 1 - %8:gpr = ORI $r0, 1 - %9:gpr = ANDI killed %2, 1 - %10:gpr = ANDI killed %1, 1 - %11:gpr = ANDI killed %0, 1 - %12:gpr = COPY %6 - %13:gpr = COPY killed %6 - %14:gpr = IMPLICIT_DEF - - bb.1: - %15:gpr = COPY killed %14 - %16:gpr = COPY killed %13 - %17:gpr = COPY killed %12 - %18:gpr = COPY %17 - %19:gpr = COPY %16 - %20:gpr = COPY killed %16 - %21:gpr = COPY killed %15 - - bb.2: - successors: %bb.3, %bb.4 - - %22:gpr = COPY killed %21 - %23:gpr = COPY killed %20 - %24:gpr = COPY killed %19 - %25:gpr = COPY killed %18 - BEQZ %7, %bb.4 - - bb.3: - %26:gpr = COPY killed %24 - %27:gpr = COPY killed %23 - PseudoBR %bb.9 - - bb.4: - %28:gpr = COPY killed %23 - - bb.5: - successors: %bb.7(0x7c000000), %bb.6(0x04000000) - - %29:gpr = COPY killed %28 - dead %30:gpr = LD_D $r0, 8 - dead %31:gpr = LD_D $r0, 0 - BNEZ %9, %bb.7 - - bb.6: - %32:gpr = COPY $r0 - %33:gpr = COPY killed %32 - %34:gpr = COPY killed %33 - %35:gpr = COPY killed %22 - PseudoBR %bb.11 - - bb.7: - successors: %bb.8(0x7c000000), %bb.10(0x04000000) - - BEQZ %10, %bb.10 - PseudoBR %bb.8 - - bb.8: - successors: %bb.9(0x04000000), %bb.5(0x7c000000) - - %36:gpr = ADDI_D killed %29, 1 - %28:gpr = COPY %36 - %26:gpr = COPY %36 - %27:gpr = COPY killed %36 - BEQZ %11, %bb.5 - PseudoBR %bb.9 - - bb.9: - %37:gpr = COPY killed %27 - %38:gpr = COPY killed %26 - %39:gpr = COPY $r0 - ST_B killed %39, %4, 0 - %40:gpr = COPY killed %25 - %41:gpr = COPY killed %38 - %42:gpr = COPY killed %37 - %43:gpr = COPY killed %22 - PseudoBR %bb.12 - - bb.10: - %44:gpr = ADDI_D killed %29, 1 - %34:gpr = COPY %8 - %35:gpr = COPY killed %44 - - bb.11: - %45:gpr = COPY killed %35 - %46:gpr = COPY killed %34 - %47:gpr = COPY $r0 - ST_D killed %47, %4, 0 - %40:gpr = COPY %45 - %41:gpr = COPY %46 - %42:gpr = COPY killed %46 - %43:gpr = COPY killed %45 - - bb.12: - successors: %bb.2(0x7c000000), %bb.1(0x04000000) - - %48:gpr = COPY killed %43 - %49:gpr = COPY killed %42 - %50:gpr = COPY killed %41 - %51:gpr = COPY killed %40 - %12:gpr = COPY %51 - %13:gpr = COPY %50 - %14:gpr = COPY %48 - %18:gpr = COPY killed %51 - %19:gpr = COPY killed %50 - %20:gpr = COPY killed %49 - %21:gpr = COPY killed %48 - BEQ %17, %8, %bb.2 - PseudoBR %bb.1 - -... 
diff --git a/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll deleted file mode 100644 index 0ce346f..0000000 --- a/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll +++ /dev/null @@ -1,103 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc < %s -mtriple=x86_64 -- | FileCheck %s - -define i32 @h(i1 %arg, i32 %arg1) { -; CHECK-LABEL: h: -; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: movabsq $9166129423, %rcx # imm = 0x22258090F -; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: jmp .LBB0_1 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB0_9: # %bb18 -; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: jne .LBB0_10 -; CHECK-NEXT: .LBB0_1: # %bb4 -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testq %rdx, %rdx -; CHECK-NEXT: jne .LBB0_2 -; CHECK-NEXT: # %bb.7: # %bb16 -; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: jne .LBB0_9 -; CHECK-NEXT: # %bb.8: # %bb17 -; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: jmp .LBB0_9 -; CHECK-NEXT: .LBB0_2: # %bb9 -; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB0_4 -; CHECK-NEXT: # %bb.3: # %bb13 -; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: .LBB0_4: # %bb14 -; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: cmpl $1, %esi -; CHECK-NEXT: je .LBB0_1 -; CHECK-NEXT: # %bb.5: # %bb14 -; CHECK-NEXT: movl %eax, %r8d -; CHECK-NEXT: testl %esi, %esi -; CHECK-NEXT: movl %esi, %eax -; CHECK-NEXT: jne .LBB0_6 -; CHECK-NEXT: .LBB0_10: # %bb22 -; CHECK-NEXT: retq -; CHECK-NEXT: .LBB0_6: # %bb22.loopexit1 -; CHECK-NEXT: movl %r8d, %eax -; CHECK-NEXT: retq -bb: - br label %bb2 - -bb2: ; preds = %bb14, %bb - %i = phi i64 [ %i5, %bb14 ], [ 0, %bb ] - %i3 = phi i32 [ %i15, %bb14 ], [ 1, %bb ] - br label %bb4 - -bb4: ; preds = %bb18, %bb2 - %i5 = phi i64 [ %i19, %bb18 ], [ %i, %bb2 ] - %i6 = phi i64 [ %i20, %bb18 ], [ %i, %bb2 ] - %i7 = phi i32 [ 0, %bb18 ], [ %i3, %bb2 ] - %i8 = icmp eq i64 %i6, 0 - br i1 %i8, label %bb16, label %bb9 - -bb9: ; preds = %bb4 - br i1 %arg, label %bb12, label %bb10 - -bb10: ; preds = %bb9 - %i11 = sdiv i64 0, 0 - br label %bb12 - -bb12: ; preds = %bb10, %bb9 - br i1 %arg, label %bb13, label %bb14 - -bb13: ; preds = %bb12 - br label %bb14 - -bb14: ; preds = %bb13, %bb12 - %i15 = phi i32 [ 0, %bb13 ], [ %i7, %bb12 ] - switch i32 %arg1, label %bb22 [ - i32 0, label %bb21 - i32 1, label %bb2 - ] - -bb16: ; preds = %bb4 - br i1 %arg, label %bb18, label %bb17 - -bb17: ; preds = %bb16 - br label %bb18 - -bb18: ; preds = %bb17, %bb16 - %i19 = phi i64 [ 9166129423, %bb17 ], [ %i5, %bb16 ] - %i20 = phi i64 [ 9166129423, %bb17 ], [ %i6, %bb16 ] - br i1 %arg, label %bb22, label %bb4 - -bb21: ; preds = %bb14 - br label %bb22 - -bb22: ; preds = %bb21, %bb18, %bb14 - %i23 = phi i32 [ %arg1, %bb21 ], [ %i15, %bb14 ], [ 0, %bb18 ] - ret i32 %i23 -} -- cgit v1.1 From ccb46e8365787c446236df20c068d101c637346a Mon Sep 17 00:00:00 2001 From: DianQK Date: Fri, 9 Feb 2024 15:58:48 +0800 Subject: Reapply "[RegisterCoalescer] Clear instructions not recorded in `ErasedInstrs` but erased (#79820)" This reverts commit 8316bf34ac21117f35bc8e6fafa2b3e7da75e1d5. 
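The reapplied patch records instructions erased while walking the current copy list in a local set, then nulls out any list entries that still point at them, so a freed MachineInstr is never visited twice. A standalone sketch of that pattern, using hypothetical stand-in types rather than the coalescer's own:

#include <cstdio>
#include <unordered_set>
#include <vector>

struct Instr { int Id; };

// Stand-in for joinCopy(): joining a copy may erase the instruction.
static bool joinCopy(Instr *MI, std::unordered_set<Instr *> &ErasedNow) {
  if (MI->Id == 1) { // pretend coalescing erased instruction 1
    ErasedNow.insert(MI);
    return true;
  }
  return false;
}

int main() {
  Instr Storage[] = {{0}, {1}, {2}};
  // The same instruction can be reached through more than one list entry.
  std::vector<Instr *> WorkList = {&Storage[0], &Storage[1], &Storage[2],
                                   &Storage[1]};
  std::unordered_set<Instr *> CurrentErased;

  for (Instr *&MI : WorkList) {
    // Skip entries erased earlier in this same pass.
    if (!MI || CurrentErased.count(MI)) {
      MI = nullptr;
      continue;
    }
    std::printf("visiting %d\n", MI->Id); // Id 1 is printed exactly once
    if (joinCopy(MI, CurrentErased))
      MI = nullptr;
  }
  // As in the patch, clear every list that may still hold pointers recorded
  // in CurrentErased before the next pass runs.
  for (Instr *&MI : WorkList)
    if (MI && CurrentErased.count(MI))
      MI = nullptr;
  return 0;
}

Keeping the set local to one pass preserves the earlier cleanup's goal of not letting ErasedInstrs grow needlessly, while still protecting later iterations from stale pointers.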
--- llvm/lib/CodeGen/RegisterCoalescer.cpp | 27 ++- .../LoongArch/register-coalescer-crash-pr79718.mir | 212 +++++++++++++++++++++ .../X86/PR71178-register-coalescer-crash.ll | 103 ++++++++++ 3 files changed, 337 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir create mode 100644 llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index cbb1a74..7e9c992 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -236,7 +236,8 @@ namespace { /// was successfully coalesced away. If it is not currently possible to /// coalesce this interval, but it may be possible if other things get /// coalesced, then it returns true by reference in 'Again'. - bool joinCopy(MachineInstr *CopyMI, bool &Again); + bool joinCopy(MachineInstr *CopyMI, bool &Again, + SmallPtrSetImpl &CurrentErasedInstrs); /// Attempt to join these two intervals. On failure, this /// returns false. The output "SrcInt" will not have been modified, so we @@ -1964,7 +1965,9 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI, LIS->shrinkToUses(&LI); } -bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { +bool RegisterCoalescer::joinCopy( + MachineInstr *CopyMI, bool &Again, + SmallPtrSetImpl &CurrentErasedInstrs) { Again = false; LLVM_DEBUG(dbgs() << LIS->getInstructionIndex(*CopyMI) << '\t' << *CopyMI); @@ -2156,7 +2159,9 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { // CopyMI has been erased by joinIntervals at this point. Remove it from // ErasedInstrs since copyCoalesceWorkList() won't add a successful join back // to the work list. This keeps ErasedInstrs from growing needlessly. - ErasedInstrs.erase(CopyMI); + if (ErasedInstrs.erase(CopyMI)) + // But we may encounter the instruction again in this iteration. + CurrentErasedInstrs.insert(CopyMI); // Rewrite all SrcReg operands to DstReg. // Also update DstReg operands to include DstIdx if it is set. @@ -3982,21 +3987,33 @@ void RegisterCoalescer::lateLiveIntervalUpdate() { bool RegisterCoalescer:: copyCoalesceWorkList(MutableArrayRef CurrList) { bool Progress = false; + SmallPtrSet CurrentErasedInstrs; for (MachineInstr *&MI : CurrList) { if (!MI) continue; // Skip instruction pointers that have already been erased, for example by // dead code elimination. - if (ErasedInstrs.count(MI)) { + if (ErasedInstrs.count(MI) || CurrentErasedInstrs.count(MI)) { MI = nullptr; continue; } bool Again = false; - bool Success = joinCopy(MI, Again); + bool Success = joinCopy(MI, Again, CurrentErasedInstrs); Progress |= Success; if (Success || !Again) MI = nullptr; } + // Clear instructions not recorded in `ErasedInstrs` but erased. 
+ if (!CurrentErasedInstrs.empty()) { + for (MachineInstr *&MI : CurrList) { + if (MI && CurrentErasedInstrs.count(MI)) + MI = nullptr; + } + for (MachineInstr *&MI : WorkList) { + if (MI && CurrentErasedInstrs.count(MI)) + MI = nullptr; + } + } return Progress; } diff --git a/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir b/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir new file mode 100644 index 0000000..b3c44af --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir @@ -0,0 +1,212 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -o - %s -mtriple=loongarch64 \ +# RUN: -run-pass=register-coalescer -join-liveintervals=1 -join-splitedges=0 | FileCheck %s + +--- +name: foo +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: foo + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $r4, $r5, $r6, $r7, $r8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $r8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $r7 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $r6 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $r5 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr = COPY $r4 + ; CHECK-NEXT: [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY3]], 1 + ; CHECK-NEXT: [[ORI:%[0-9]+]]:gpr = ORI $r0, 1 + ; CHECK-NEXT: [[ANDI1:%[0-9]+]]:gpr = ANDI [[COPY2]], 1 + ; CHECK-NEXT: [[ANDI2:%[0-9]+]]:gpr = ANDI [[COPY1]], 1 + ; CHECK-NEXT: [[ANDI3:%[0-9]+]]:gpr = ANDI [[COPY]], 1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr = COPY [[COPY5]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQZ [[ANDI]], %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.9(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: PseudoBR %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.7(0x7c000000), %bb.6(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead [[LD_D:%[0-9]+]]:gpr = LD_D $r0, 8 + ; CHECK-NEXT: dead [[LD_D1:%[0-9]+]]:gpr = LD_D $r0, 0 + ; CHECK-NEXT: BNEZ [[ANDI1]], %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6: + ; CHECK-NEXT: successors: %bb.11(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: PseudoBR %bb.11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7: + ; CHECK-NEXT: successors: %bb.8(0x7c000000), %bb.10(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQZ [[ANDI2]], %bb.10 + ; CHECK-NEXT: PseudoBR %bb.8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8: + ; CHECK-NEXT: successors: %bb.9(0x04000000), %bb.5(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1 + ; CHECK-NEXT: BEQZ [[ANDI3]], %bb.5 + ; CHECK-NEXT: PseudoBR %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9: + ; CHECK-NEXT: successors: %bb.12(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ST_B $r0, [[COPY4]], 0 + ; CHECK-NEXT: PseudoBR %bb.12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.10: + ; CHECK-NEXT: successors: %bb.11(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = 
COPY [[ORI]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.11: + ; CHECK-NEXT: successors: %bb.12(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ST_D $r0, [[COPY4]], 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.12: + ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.1(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQ [[COPY7]], [[ORI]], %bb.2 + ; CHECK-NEXT: PseudoBR %bb.1 + bb.0: + liveins: $r4, $r5, $r6, $r7, $r8 + + %0:gpr = COPY killed $r8 + %1:gpr = COPY killed $r7 + %2:gpr = COPY killed $r6 + %3:gpr = COPY killed $r5 + %4:gpr = COPY killed $r4 + %5:gpr = COPY $r0 + %6:gpr = COPY killed %5 + %7:gpr = ANDI killed %3, 1 + %8:gpr = ORI $r0, 1 + %9:gpr = ANDI killed %2, 1 + %10:gpr = ANDI killed %1, 1 + %11:gpr = ANDI killed %0, 1 + %12:gpr = COPY %6 + %13:gpr = COPY killed %6 + %14:gpr = IMPLICIT_DEF + + bb.1: + %15:gpr = COPY killed %14 + %16:gpr = COPY killed %13 + %17:gpr = COPY killed %12 + %18:gpr = COPY %17 + %19:gpr = COPY %16 + %20:gpr = COPY killed %16 + %21:gpr = COPY killed %15 + + bb.2: + successors: %bb.3, %bb.4 + + %22:gpr = COPY killed %21 + %23:gpr = COPY killed %20 + %24:gpr = COPY killed %19 + %25:gpr = COPY killed %18 + BEQZ %7, %bb.4 + + bb.3: + %26:gpr = COPY killed %24 + %27:gpr = COPY killed %23 + PseudoBR %bb.9 + + bb.4: + %28:gpr = COPY killed %23 + + bb.5: + successors: %bb.7(0x7c000000), %bb.6(0x04000000) + + %29:gpr = COPY killed %28 + dead %30:gpr = LD_D $r0, 8 + dead %31:gpr = LD_D $r0, 0 + BNEZ %9, %bb.7 + + bb.6: + %32:gpr = COPY $r0 + %33:gpr = COPY killed %32 + %34:gpr = COPY killed %33 + %35:gpr = COPY killed %22 + PseudoBR %bb.11 + + bb.7: + successors: %bb.8(0x7c000000), %bb.10(0x04000000) + + BEQZ %10, %bb.10 + PseudoBR %bb.8 + + bb.8: + successors: %bb.9(0x04000000), %bb.5(0x7c000000) + + %36:gpr = ADDI_D killed %29, 1 + %28:gpr = COPY %36 + %26:gpr = COPY %36 + %27:gpr = COPY killed %36 + BEQZ %11, %bb.5 + PseudoBR %bb.9 + + bb.9: + %37:gpr = COPY killed %27 + %38:gpr = COPY killed %26 + %39:gpr = COPY $r0 + ST_B killed %39, %4, 0 + %40:gpr = COPY killed %25 + %41:gpr = COPY killed %38 + %42:gpr = COPY killed %37 + %43:gpr = COPY killed %22 + PseudoBR %bb.12 + + bb.10: + %44:gpr = ADDI_D killed %29, 1 + %34:gpr = COPY %8 + %35:gpr = COPY killed %44 + + bb.11: + %45:gpr = COPY killed %35 + %46:gpr = COPY killed %34 + %47:gpr = COPY $r0 + ST_D killed %47, %4, 0 + %40:gpr = COPY %45 + %41:gpr = COPY %46 + %42:gpr = COPY killed %46 + %43:gpr = COPY killed %45 + + bb.12: + successors: %bb.2(0x7c000000), %bb.1(0x04000000) + + %48:gpr = COPY killed %43 + %49:gpr = COPY killed %42 + %50:gpr = COPY killed %41 + %51:gpr = COPY killed %40 + %12:gpr = COPY %51 + %13:gpr = COPY %50 + %14:gpr = COPY %48 + %18:gpr = COPY killed %51 + %19:gpr = COPY killed %50 + %20:gpr = COPY killed %49 + %21:gpr = COPY killed %48 + BEQ %17, %8, %bb.2 + PseudoBR %bb.1 + +... 
diff --git a/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll new file mode 100644 index 0000000..0ce346f --- /dev/null +++ b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=x86_64 -- | FileCheck %s + +define i32 @h(i1 %arg, i32 %arg1) { +; CHECK-LABEL: h: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: movabsq $9166129423, %rcx # imm = 0x22258090F +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: jmp .LBB0_1 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB0_9: # %bb18 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: jne .LBB0_10 +; CHECK-NEXT: .LBB0_1: # %bb4 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: jne .LBB0_2 +; CHECK-NEXT: # %bb.7: # %bb16 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: jne .LBB0_9 +; CHECK-NEXT: # %bb.8: # %bb17 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: jmp .LBB0_9 +; CHECK-NEXT: .LBB0_2: # %bb9 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB0_4 +; CHECK-NEXT: # %bb.3: # %bb13 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: .LBB0_4: # %bb14 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: cmpl $1, %esi +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: # %bb.5: # %bb14 +; CHECK-NEXT: movl %eax, %r8d +; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: jne .LBB0_6 +; CHECK-NEXT: .LBB0_10: # %bb22 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_6: # %bb22.loopexit1 +; CHECK-NEXT: movl %r8d, %eax +; CHECK-NEXT: retq +bb: + br label %bb2 + +bb2: ; preds = %bb14, %bb + %i = phi i64 [ %i5, %bb14 ], [ 0, %bb ] + %i3 = phi i32 [ %i15, %bb14 ], [ 1, %bb ] + br label %bb4 + +bb4: ; preds = %bb18, %bb2 + %i5 = phi i64 [ %i19, %bb18 ], [ %i, %bb2 ] + %i6 = phi i64 [ %i20, %bb18 ], [ %i, %bb2 ] + %i7 = phi i32 [ 0, %bb18 ], [ %i3, %bb2 ] + %i8 = icmp eq i64 %i6, 0 + br i1 %i8, label %bb16, label %bb9 + +bb9: ; preds = %bb4 + br i1 %arg, label %bb12, label %bb10 + +bb10: ; preds = %bb9 + %i11 = sdiv i64 0, 0 + br label %bb12 + +bb12: ; preds = %bb10, %bb9 + br i1 %arg, label %bb13, label %bb14 + +bb13: ; preds = %bb12 + br label %bb14 + +bb14: ; preds = %bb13, %bb12 + %i15 = phi i32 [ 0, %bb13 ], [ %i7, %bb12 ] + switch i32 %arg1, label %bb22 [ + i32 0, label %bb21 + i32 1, label %bb2 + ] + +bb16: ; preds = %bb4 + br i1 %arg, label %bb18, label %bb17 + +bb17: ; preds = %bb16 + br label %bb18 + +bb18: ; preds = %bb17, %bb16 + %i19 = phi i64 [ 9166129423, %bb17 ], [ %i5, %bb16 ] + %i20 = phi i64 [ 9166129423, %bb17 ], [ %i6, %bb16 ] + br i1 %arg, label %bb22, label %bb4 + +bb21: ; preds = %bb14 + br label %bb22 + +bb22: ; preds = %bb21, %bb18, %bb14 + %i23 = phi i32 [ %arg1, %bb21 ], [ %i15, %bb14 ], [ 0, %bb18 ] + ret i32 %i23 +} -- cgit v1.1 From b477d39bf6811ac12a1e7e98f308cf4c9a8de26f Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Fri, 9 Feb 2024 09:10:49 +0100 Subject: [flang] Align runtime info and lowering regarding passing ABIs (#81166) Runtime derived type info contains information to tell the runtime if some argument in a user defined assignment must be passed with a 
descriptor or not. This information was not properly built: it would tell the
runtime that a TARGET argument must be passed via descriptor, which is
incorrect. Share the logic between lowering and runtime info generation to
determine if an argument must be passed by descriptor or not. ---
 flang/include/flang/Evaluate/characteristics.h | 1 +
 flang/lib/Evaluate/characteristics.cpp | 24 +++++++++++++++++++++++
 flang/lib/Lower/CallInterface.cpp | 27 +-------------------------
 flang/lib/Semantics/runtime-type-info.cpp | 14 ++++++++-----
 flang/test/Semantics/typeinfo09.f90 | 20 +++++++++++++++++++
 5 files changed, 55 insertions(+), 31 deletions(-)
 create mode 100644 flang/test/Semantics/typeinfo09.f90

diff --git a/flang/include/flang/Evaluate/characteristics.h b/flang/include/flang/Evaluate/characteristics.h
index fd4af15..04a0d71 100644
--- a/flang/include/flang/Evaluate/characteristics.h
+++ b/flang/include/flang/Evaluate/characteristics.h
@@ -229,6 +229,7 @@ struct DummyDataObject {
 static std::optional<DummyDataObject> Characterize(
 const semantics::Symbol &, FoldingContext &);
 bool CanBePassedViaImplicitInterface(std::string *whyNot = nullptr) const;
+ bool IsPassedByDescriptor(bool isBindC) const;
 llvm::raw_ostream &Dump(llvm::raw_ostream &) const;
 TypeAndShape type;
diff --git a/flang/lib/Evaluate/characteristics.cpp b/flang/lib/Evaluate/characteristics.cpp
index d480050..c14a422 100644
--- a/flang/lib/Evaluate/characteristics.cpp
+++ b/flang/lib/Evaluate/characteristics.cpp
@@ -461,6 +461,30 @@ bool DummyDataObject::CanBePassedViaImplicitInterface(
 }
 }
+bool DummyDataObject::IsPassedByDescriptor(bool isBindC) const {
+ constexpr TypeAndShape::Attrs shapeRequiringBox = {
+ TypeAndShape::Attr::AssumedShape, TypeAndShape::Attr::DeferredShape,
+ TypeAndShape::Attr::AssumedRank, TypeAndShape::Attr::Coarray};
+ if ((attrs & Attrs{Attr::Allocatable, Attr::Pointer}).any()) {
+ return true;
+ } else if ((type.attrs() & shapeRequiringBox).any()) {
+ // Need to pass shape/coshape info in a descriptor.
+ return true;
+ } else if (type.type().IsPolymorphic() && !type.type().IsAssumedType()) {
+ // Need to pass dynamic type info in a descriptor.
+ return true;
+ } else if (const auto *derived{GetDerivedTypeSpec(type.type())}) {
+ if (const semantics::Scope *scope = derived->scope()) {
+ // Need to pass length type parameters in a descriptor if any.
+ return scope->IsDerivedTypeWithLengthParameter();
+ }
+ } else if (isBindC && type.type().IsAssumedLengthCharacter()) {
+ // Fortran 2018 18.3.6 point 2 (5)
+ return true;
+ }
+ return false;
+}
+
 llvm::raw_ostream &DummyDataObject::Dump(llvm::raw_ostream &o) const {
 attrs.Dump(o, EnumToString);
 if (intent != common::Intent::Default) {
diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index 4c297ce..f67ee88 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -916,31 +916,6 @@ private:
 }
 }
- // Define when an explicit argument must be passed in a fir.box.
- bool dummyRequiresBox(
- const Fortran::evaluate::characteristics::DummyDataObject &obj,
- bool isBindC) {
- using ShapeAttr = Fortran::evaluate::characteristics::TypeAndShape::Attr;
- using ShapeAttrs = Fortran::evaluate::characteristics::TypeAndShape::Attrs;
- constexpr ShapeAttrs shapeRequiringBox = {
- ShapeAttr::AssumedShape, ShapeAttr::DeferredShape,
- ShapeAttr::AssumedRank, ShapeAttr::Coarray};
- if ((obj.type.attrs() & shapeRequiringBox).any())
- // Need to pass shape/coshape info in fir.box.
- return true; - if (obj.type.type().IsPolymorphic() && !obj.type.type().IsAssumedType()) - // Need to pass dynamic type info in fir.box. - return true; - if (const Fortran::semantics::DerivedTypeSpec *derived = - Fortran::evaluate::GetDerivedTypeSpec(obj.type.type())) - if (const Fortran::semantics::Scope *scope = derived->scope()) - // Need to pass length type parameters in fir.box if any. - return scope->IsDerivedTypeWithLengthParameter(); - if (isBindC && obj.type.type().IsAssumedLengthCharacter()) - return true; // Fortran 2018 18.3.6 point 2 (5) - return false; - } - mlir::Type translateDynamicType(const Fortran::evaluate::DynamicType &dynamicType) { Fortran::common::TypeCategory cat = dynamicType.category(); @@ -1027,7 +1002,7 @@ private: addFirOperand(boxRefType, nextPassedArgPosition(), Property::MutableBox, attrs); addPassedArg(PassEntityBy::MutableBox, entity, characteristics); - } else if (dummyRequiresBox(obj, isBindC)) { + } else if (obj.IsPassedByDescriptor(isBindC)) { // Pass as fir.box or fir.class if (isValueAttr && !getConverter().getLoweringOptions().getLowerToHighLevelFIR()) diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp index de71083..66c4216 100644 --- a/flang/lib/Semantics/runtime-type-info.cpp +++ b/flang/lib/Semantics/runtime-type-info.cpp @@ -1144,7 +1144,7 @@ void RuntimeTableBuilder::DescribeSpecialProc( which = scalarFinalEnum_; if (int rank{evaluate::GetRank(typeAndShape.shape())}; rank > 0) { which = IntExpr<1>(ToInt64(which).value() + rank); - if (!proc->dummyArguments[0].CanBePassedViaImplicitInterface()) { + if (dummyData.IsPassedByDescriptor(proc->IsBindC())) { argThatMightBeDescriptor = 1; } if (!typeAndShape.attrs().test(evaluate::characteristics:: @@ -1187,10 +1187,14 @@ void RuntimeTableBuilder::DescribeSpecialProc( break; } } - if (argThatMightBeDescriptor != 0 && - !proc->dummyArguments.at(argThatMightBeDescriptor - 1) - .CanBePassedViaImplicitInterface()) { - isArgDescriptorSet |= 1 << (argThatMightBeDescriptor - 1); + if (argThatMightBeDescriptor != 0) { + if (const auto *dummyData{ + std::get_if( + &proc->dummyArguments.at(argThatMightBeDescriptor - 1).u)}) { + if (dummyData->IsPassedByDescriptor(proc->IsBindC())) { + isArgDescriptorSet |= 1 << (argThatMightBeDescriptor - 1); + } + } } evaluate::StructureConstructorValues values; auto index{evaluate::ToInt64(which)}; diff --git a/flang/test/Semantics/typeinfo09.f90 b/flang/test/Semantics/typeinfo09.f90 new file mode 100644 index 0000000..3527ee6 --- /dev/null +++ b/flang/test/Semantics/typeinfo09.f90 @@ -0,0 +1,20 @@ +!RUN: bbc --dump-symbols %s | FileCheck %s +!RUN: %flang_fc1 -fdebug-dump-symbols %s | FileCheck %s +! test setting of isargdescriptorset in the runtime type info. 
+ +module m + type :: sometype + contains + procedure :: copy => copy_impl + generic :: assignment(=) => copy + end type +interface + subroutine copy_impl(this, x) + import + class(sometype), intent(out) :: this + type(sometype), target, intent(in) :: x + end subroutine +end interface +end module + +!CHECK: .s.sometype, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=copy_impl)] -- cgit v1.1 From bc6955f18ced3ca89d49bc28eeb58cd6d367e136 Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Fri, 9 Feb 2024 09:20:25 +0100 Subject: [AMDGPU] Don't fix the scavenge slot at offset 0 (#79136) At the moment, the emergency spill slot is a fixed object for entry functions and chain functions, and a regular stack object otherwise. This patch adopts the latter behaviour for entry/chain functions too. It seems this was always the intention [1] and it will also save us a bit of stack space in cases where the first stack object has a large alignment. [1] https://github.com/llvm/llvm-project/commit/34c8b835b16fb3879f1b9770e91df21883356bb6 --- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 12 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 3 - .../AMDGPU/GlobalISel/call-outgoing-stack-args.ll | 54 +- .../AMDGPU/GlobalISel/crash-stack-address-O0.ll | 4 +- .../AMDGPU/GlobalISel/flat-scratch-init.gfx.ll | 4 +- .../test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 130 +-- .../AMDGPU/GlobalISel/insertelement-stack-lower.ll | 258 ++--- llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll | 24 +- .../CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll | 104 +- llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll | 10 +- llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll | 2 +- llvm/test/CodeGen/AMDGPU/call-argument-types.ll | 106 +- .../AMDGPU/callee-special-input-vgprs-packed.ll | 4 +- .../CodeGen/AMDGPU/callee-special-input-vgprs.ll | 6 +- llvm/test/CodeGen/AMDGPU/captured-frame-index.ll | 50 +- llvm/test/CodeGen/AMDGPU/cc-update.ll | 32 +- llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll | 42 +- llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll | 13 +- llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 54 +- llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 132 +-- llvm/test/CodeGen/AMDGPU/commute-compares.ll | 2 +- .../CodeGen/AMDGPU/control-flow-fastregalloc.ll | 16 +- llvm/test/CodeGen/AMDGPU/extload-private.ll | 8 +- llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll | 2 + llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll | 16 +- llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll | 108 +- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 570 +++++----- .../frame-index-elimination-tied-operand.mir | 2 +- .../test/CodeGen/AMDGPU/frame-index-elimination.ll | 2 +- .../CodeGen/AMDGPU/global_atomics_scan_fadd.ll | 72 +- .../CodeGen/AMDGPU/global_atomics_scan_fsub.ll | 72 +- llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll | 20 +- llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll | 18 +- .../test/CodeGen/AMDGPU/kernarg-stack-alignment.ll | 10 +- .../test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll | 4 +- .../AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll | 4 +- .../CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll | 2 +- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 144 +-- llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 32 +- llvm/test/CodeGen/AMDGPU/memory_clause.ll | 24 +- llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll | 4 +- ...partial-regcopy-and-spill-missed-at-regalloc.ll | 8 +- 
.../CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll | 62 +- .../AMDGPU/pei-amdgpu-cs-chain-preserve.mir | 20 +- llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir | 16 +- .../regalloc-introduces-copy-sgpr-to-agpr.mir | 136 +-- llvm/test/CodeGen/AMDGPU/scratch-simple.ll | 8 +- llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll | 16 +- llvm/test/CodeGen/AMDGPU/sgpr-spill.mir | 240 ++-- llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll | 2 +- llvm/test/CodeGen/AMDGPU/spill-agpr.ll | 32 +- llvm/test/CodeGen/AMDGPU/spill-m0.ll | 4 +- .../CodeGen/AMDGPU/spill-offset-calculation.ll | 36 +- llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll | 16 +- llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir | 54 +- llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll | 28 +- llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll | 8 +- llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll | 18 +- .../CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll | 68 +- .../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 2 +- .../AMDGPU/vgpr-spill-placement-issue61083.ll | 14 +- llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll | 1164 ++++++++++---------- llvm/test/CodeGen/AMDGPU/wqm.ll | 8 +- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 88 +- .../MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll | 4 +- .../MIR/AMDGPU/machine-function-info-after-pei.ll | 2 +- llvm/test/DebugInfo/AMDGPU/variable-locations.ll | 6 +- 67 files changed, 2127 insertions(+), 2109 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index b94d143..52d6fe6 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -552,14 +552,10 @@ int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI, const SIRegisterInfo &TRI) { if (ScavengeFI) return *ScavengeFI; - if (isBottomOfStack()) { - ScavengeFI = MFI.CreateFixedObject( - TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false); - } else { - ScavengeFI = MFI.CreateStackObject( - TRI.getSpillSize(AMDGPU::SGPR_32RegClass), - TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false); - } + + ScavengeFI = + MFI.CreateStackObject(TRI.getSpillSize(AMDGPU::SGPR_32RegClass), + TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false); return *ScavengeFI; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 1a22b77..3664535 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2287,9 +2287,6 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (FrameReg) FIOp.ChangeToRegister(FrameReg, false); - if (!Offset) - return false; - MachineOperand *OffsetOp = TII->getNamedOperand(*MI, AMDGPU::OpName::offset); int64_t NewOffset = Offset + OffsetOp->getImm(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index 6e49a5a..61bc28b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -67,6 +67,8 @@ define amdgpu_kernel void @kernel_caller_byval() { ; MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 ; MUBUF-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 offset:16 @@ -97,25 +99,23 @@ define amdgpu_kernel void @kernel_caller_byval() { ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:116 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:120 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:124 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:132 -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; MUBUF-NEXT: s_nop 0 -; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12 -; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:16 -; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:20 -; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:24 -; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:28 -; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:32 -; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:36 -; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:40 -; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:44 -; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:48 -; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:52 -; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:56 -; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:60 -; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:64 -; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:68 +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:8 +; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:12 +; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:16 +; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:20 +; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:24 +; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:28 +; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:32 +; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:36 +; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:40 +; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:44 +; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:48 +; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:52 +; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:56 +; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:60 ; MUBUF-NEXT: s_movk_i32 s32, 0x1400 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4 @@ -162,6 +162,7 @@ define amdgpu_kernel void @kernel_caller_byval() { ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:8 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:24 @@ -177,16 +178,15 @@ define amdgpu_kernel void @kernel_caller_byval() { ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:104 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:112 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:120 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:128 -; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8 +; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; FLATSCR-NEXT: s_nop 0 -; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:16 
-; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:24 -; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:32 -; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:40 -; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:48 -; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:56 -; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:64 +; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:8 +; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:16 +; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:24 +; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:32 +; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:40 +; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:48 +; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:56 ; FLATSCR-NEXT: s_movk_i32 s32, 0x50 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll index 9580326..0d79365 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll @@ -12,10 +12,10 @@ define amdgpu_kernel void @stack_write_fi() { ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, s5 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll index dcad707..b4b95fd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll @@ -12,7 +12,7 @@ define amdgpu_ps void @amdgpu_ps() { ; MESA-NEXT: s_add_u32 flat_scratch_lo, s2, s4 ; MESA-NEXT: s_mov_b64 s[0:1], src_private_base ; MESA-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; MESA-NEXT: v_mov_b32_e32 v0, 4 +; MESA-NEXT: v_mov_b32_e32 v0, 0 ; MESA-NEXT: v_mov_b32_e32 v1, s1 ; MESA-NEXT: v_mov_b32_e32 v2, 0 ; MESA-NEXT: flat_store_dword v[0:1], v2 @@ -24,7 +24,7 @@ define amdgpu_ps void @amdgpu_ps() { ; PAL-NEXT: s_getpc_b64 s[2:3] ; PAL-NEXT: s_mov_b32 s2, s0 ; PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; PAL-NEXT: v_mov_b32_e32 v0, 4 +; PAL-NEXT: v_mov_b32_e32 v0, 0 ; PAL-NEXT: v_mov_b32_e32 v2, 0 ; PAL-NEXT: s_waitcnt lgkmcnt(0) ; PAL-NEXT: s_and_b32 s3, s3, 0xffff diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 75065f6..921bdb5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -15,11 +15,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 -; GFX9-NEXT: s_add_i32 s1, s1, 4 +; GFX9-NEXT: s_add_i32 s1, s1, 0 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, 4 +; GFX9-NEXT: s_add_i32 s0, s0, 0 ; GFX9-NEXT: scratch_load_dword 
v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -36,8 +36,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-NEXT: s_add_i32 s0, s0, 4 -; GFX10-NEXT: s_add_i32 s1, s1, 4 +; GFX10-NEXT: s_add_i32 s0, s0, 0 +; GFX10-NEXT: s_add_i32 s1, s1, 0 ; GFX10-NEXT: scratch_store_dword off, v0, s0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -51,12 +51,12 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_add_i32 s1, s1, 4 +; GFX940-NEXT: s_add_i32 s1, s1, 0 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; @@ -69,10 +69,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_add_i32 s0, s0, 4 +; GFX11-NEXT: s_add_i32 s0, s0, 0 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:4 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; @@ -87,9 +87,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v2, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: @@ -109,12 +109,12 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-NEXT: v_add_u32_e32 v1, 4, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 0, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -129,8 +129,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0, v1 ; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc @@ -143,9 +143,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX940-NEXT: v_add_u32_e32 v0, 0, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -156,9 +156,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:4 dlc +; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 4, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0, v1 ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm @@ -169,9 +169,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:128 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: @@ -324,16 +324,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 -; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc +; GFX9-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: s_addk_i32 s1, 0x104 +; GFX9-NEXT: s_addk_i32 s1, 0x100 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_addk_i32 s0, 0x104 +; GFX9-NEXT: s_addk_i32 s0, 0x100 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -345,15 +345,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-NEXT: s_addk_i32 s0, 0x104 -; GFX10-NEXT: s_addk_i32 s1, 0x104 +; GFX10-NEXT: s_addk_i32 s0, 0x100 +; GFX10-NEXT: s_addk_i32 s1, 0x100 ; GFX10-NEXT: scratch_store_dword off, v0, s0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -363,42 +363,42 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX940-LABEL: store_load_sindex_small_offset_kernel: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 +; 
GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_addk_i32 s1, 0x104 +; GFX940-NEXT: s_addk_i32 s1, 0x100 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:260 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, v0, off offset:256 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc +; GFX11-NEXT: scratch_load_b32 v2, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_addk_i32 s0, 0x104 +; GFX11-NEXT: s_addk_i32 s0, 0x100 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:260 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:256 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_load_sindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -408,9 +408,9 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:260 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:260 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:256 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: @@ -432,16 +432,16 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc +; GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, 0x104, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x100, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -455,11 +455,11 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-NEXT: scratch_load_dword v3, 
off, off offset:4 glc dlc +; GFX10-NEXT: scratch_load_dword v3, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x104, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 ; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc @@ -468,15 +468,15 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX940-LABEL: store_load_vindex_small_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v1, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, off offset:260 sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, off offset:256 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 0x104, v0 +; GFX940-NEXT: v_add_u32_e32 v0, 0x100, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -485,12 +485,12 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc +; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:260 dlc +; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:256 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x104, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm @@ -500,12 +500,12 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 15 -; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:260 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:256 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:384 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:380 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: @@ -708,7 +708,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX12-LABEL: store_load_sindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -718,9 +718,9 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16388 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:16388 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:16384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: @@ -812,12 +812,12 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 15 -; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16388 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16512 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16508 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: @@ -1003,11 +1003,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX12-LABEL: store_load_large_imm_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; GFX12-NEXT: scratch_store_b32 off, v0, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 off, v0, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16004 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16000 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16004 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16000 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: @@ -1116,7 +1116,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0, v0 ; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0, v0 ; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc @@ -1146,7 +1146,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX940-NEXT: v_mov_b32_e32 v1, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX940-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX940-NEXT: v_add_u32_e32 v0, 0, v0 ; GFX940-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1 @@ -1160,7 +1160,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1024 glc dlc @@ -1173,9 +1173,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1028 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1028 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index faab70c..a1c99f5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x10 ; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v16, 0x100 +; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0 ; GCN-NEXT: s_load_dwordx16 s[52:67], s[22:23], 0x40 @@ -35,189 +35,189 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: v_mov_b32_e32 v14, s50 ; GCN-NEXT: v_mov_b32_e32 v15, s51 ; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:256 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:260 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:264 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:268 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:272 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:276 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:280 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:288 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:292 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:296 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:300 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:304 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:308 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:312 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:316 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:12 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:20 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:24 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:28 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:36 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:40 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:44 +; GCN-NEXT: 
buffer_store_dword v12, off, s[0:3], 0 offset:48 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:52 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:56 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:60 ; GCN-NEXT: v_mov_b32_e32 v0, s52 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:320 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64 ; GCN-NEXT: v_mov_b32_e32 v0, s53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:324 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68 ; GCN-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:328 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:72 ; GCN-NEXT: v_mov_b32_e32 v0, s55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:332 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:76 ; GCN-NEXT: v_mov_b32_e32 v0, s56 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:336 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:80 ; GCN-NEXT: v_mov_b32_e32 v0, s57 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:340 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:84 ; GCN-NEXT: v_mov_b32_e32 v0, s58 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:344 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:88 ; GCN-NEXT: v_mov_b32_e32 v0, s59 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:348 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:92 ; GCN-NEXT: v_mov_b32_e32 v0, s60 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:352 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:96 ; GCN-NEXT: v_mov_b32_e32 v0, s61 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:356 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:100 ; GCN-NEXT: v_mov_b32_e32 v0, s62 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:360 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:104 ; GCN-NEXT: v_mov_b32_e32 v0, s63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:364 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:108 ; GCN-NEXT: v_mov_b32_e32 v0, s64 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:368 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:112 ; GCN-NEXT: v_mov_b32_e32 v0, s65 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:372 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:116 ; GCN-NEXT: v_mov_b32_e32 v0, s66 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:376 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:120 ; GCN-NEXT: v_mov_b32_e32 v0, s67 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:380 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:124 ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:384 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 ; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:388 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:132 ; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:392 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:136 ; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:396 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:140 ; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:400 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:144 ; GCN-NEXT: 
v_mov_b32_e32 v0, s9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:404 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:148 ; GCN-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:408 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:152 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:412 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:156 ; GCN-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:416 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:160 ; GCN-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:420 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:164 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:424 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:168 ; GCN-NEXT: v_mov_b32_e32 v0, s15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:428 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:172 ; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:432 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:176 ; GCN-NEXT: v_mov_b32_e32 v0, s17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:436 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:180 ; GCN-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:440 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:184 ; GCN-NEXT: v_mov_b32_e32 v0, s19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:444 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:188 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:448 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:192 ; GCN-NEXT: v_mov_b32_e32 v0, s37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:452 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:196 ; GCN-NEXT: v_mov_b32_e32 v0, s38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:456 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:200 ; GCN-NEXT: v_mov_b32_e32 v0, s39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:460 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:204 ; GCN-NEXT: v_mov_b32_e32 v0, s40 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:464 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:208 ; GCN-NEXT: v_mov_b32_e32 v0, s41 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:468 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:212 ; GCN-NEXT: v_mov_b32_e32 v0, s42 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:472 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:216 ; GCN-NEXT: v_mov_b32_e32 v0, s43 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:476 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:220 ; GCN-NEXT: v_mov_b32_e32 v0, s44 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:480 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:224 ; GCN-NEXT: v_mov_b32_e32 v0, s45 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:484 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:228 ; GCN-NEXT: v_mov_b32_e32 v0, s46 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:488 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:232 ; GCN-NEXT: v_mov_b32_e32 v0, s47 -; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 offset:492 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:236 ; GCN-NEXT: v_mov_b32_e32 v0, s48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:496 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:240 ; GCN-NEXT: v_mov_b32_e32 v0, s49 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:500 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:244 ; GCN-NEXT: v_mov_b32_e32 v0, s50 ; GCN-NEXT: s_and_b32 s4, s25, 63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:504 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:248 ; GCN-NEXT: v_mov_b32_e32 v0, s51 ; GCN-NEXT: s_lshl_b32 s4, s4, 2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:508 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:252 ; GCN-NEXT: v_add_u32_e32 v0, s4, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s24 ; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:256 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:260 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:264 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:268 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:272 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:276 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:280 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:284 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:288 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:292 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:296 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:300 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:304 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:308 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:312 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:316 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], 0 offset:320 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], 0 offset:324 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], 0 offset:328 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], 0 offset:332 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], 0 offset:336 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], 0 offset:340 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], 0 offset:344 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], 0 offset:348 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], 0 offset:352 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], 0 offset:356 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], 0 offset:360 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], 0 offset:364 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], 0 offset:368 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], 0 offset:372 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], 0 offset:376 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], 0 offset:380 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], 0 offset:384 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], 0 offset:388 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], 0 offset:392 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], 0 offset:396 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], 0 offset:400 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], 0 offset:404 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], 0 offset:408 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], 0 offset:412 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], 0 offset:416 -; GCN-NEXT: 
buffer_load_dword v41, off, s[0:3], 0 offset:420 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], 0 offset:424 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], 0 offset:428 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], 0 offset:432 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], 0 offset:436 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], 0 offset:440 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], 0 offset:444 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], 0 offset:448 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], 0 offset:452 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], 0 offset:456 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], 0 offset:460 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], 0 offset:464 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], 0 offset:468 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], 0 offset:472 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], 0 offset:476 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], 0 offset:480 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], 0 offset:484 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], 0 offset:488 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], 0 offset:492 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], 0 offset:496 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], 0 offset:500 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], 0 offset:504 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], 0 offset:508 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:8 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:12 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:16 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:20 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:24 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:28 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:32 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:36 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:40 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:44 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:48 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:52 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:56 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:60 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], 0 offset:64 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], 0 offset:68 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], 0 offset:72 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], 0 offset:76 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], 0 offset:80 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], 0 offset:84 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], 0 offset:88 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], 0 offset:92 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], 0 offset:96 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], 0 offset:100 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], 0 offset:104 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], 0 offset:108 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], 0 offset:112 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], 0 offset:116 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], 0 offset:120 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], 0 offset:124 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], 0 offset:128 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], 0 offset:132 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], 0 offset:136 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], 0 offset:140 +; 
GCN-NEXT: buffer_load_dword v36, off, s[0:3], 0 offset:144 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], 0 offset:148 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], 0 offset:152 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], 0 offset:156 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], 0 offset:160 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], 0 offset:164 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], 0 offset:168 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], 0 offset:172 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], 0 offset:176 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], 0 offset:180 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], 0 offset:184 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], 0 offset:188 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], 0 offset:192 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], 0 offset:196 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], 0 offset:200 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], 0 offset:204 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], 0 offset:208 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], 0 offset:212 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], 0 offset:216 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], 0 offset:220 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], 0 offset:224 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], 0 offset:228 +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], 0 offset:232 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], 0 offset:236 +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], 0 offset:240 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], 0 offset:244 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], 0 offset:248 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], 0 offset:252 ; GCN-NEXT: s_waitcnt vmcnt(60) ; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[20:21] ; GCN-NEXT: s_waitcnt vmcnt(57) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll index 3e572f9..c92b78c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll @@ -365,8 +365,8 @@ define amdgpu_cs_chain void @alloca_and_call() { ; GISEL-GFX11-NEXT: s_mov_b32 s0, use@abs32@lo ; GISEL-GFX11-NEXT: s_mov_b32 s1, use@abs32@hi ; GISEL-GFX11-NEXT: s_mov_b32 s32, 16 -; GISEL-GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 -; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v0, off +; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GISEL-GFX11-NEXT: s_endpgm ; @@ -378,8 +378,8 @@ define amdgpu_cs_chain void @alloca_and_call() { ; GISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi ; GISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] -; GISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_movk_i32 s32, 0x200 ; GISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GISEL-GFX10-NEXT: s_endpgm @@ -391,8 +391,8 @@ define amdgpu_cs_chain void @alloca_and_call() { ; DAGISEL-GFX11-NEXT: s_mov_b32 s1, use@abs32@hi ; DAGISEL-GFX11-NEXT: s_mov_b32 s0, use@abs32@lo ; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 16 -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 -; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v0, 4 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v0, off +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; DAGISEL-GFX11-NEXT: 
s_swappc_b64 s[30:31], s[0:1] ; DAGISEL-GFX11-NEXT: s_endpgm ; @@ -404,8 +404,8 @@ define amdgpu_cs_chain void @alloca_and_call() { ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo ; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] -; DAGISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:4 -; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4 +; DAGISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; DAGISEL-GFX10-NEXT: s_movk_i32 s32, 0x200 ; DAGISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; DAGISEL-GFX10-NEXT: s_endpgm @@ -867,7 +867,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) { ; GISEL-GFX11-NEXT: s_mov_b32 s0, 1 ; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v8 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, 32, v0 +; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, 0, v0 ; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 ; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc @@ -882,7 +882,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) { ; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, 2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, 3 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v4, 4 -; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, 32, v0 +; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, 0, v0 ; GISEL-GFX10-NEXT: buffer_store_dword v1, v0, s[48:51], 0 offen ; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GISEL-GFX10-NEXT: buffer_store_dword v2, v0, s[48:51], 0 offen offset:4 @@ -898,7 +898,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) { ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 -; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, 32 +; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, 0 ; DAGISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc ; DAGISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; DAGISEL-GFX11-NEXT: s_endpgm @@ -907,7 +907,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) { ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4 -; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, 32 +; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, 0 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, 3 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, 2 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v4, 1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll index 4190d07..8d9ed9b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll @@ -181,13 +181,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> ; GISEL-GFX11-LABEL: chain_preserve_to_chain_preserve: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop ; GISEL-GFX11-NEXT: ;;#ASMEND -; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte 
Folded Reload +; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload ; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo ; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 @@ -198,13 +198,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> ; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND -; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload +; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload ; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 @@ -215,13 +215,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND -; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload +; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload ; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 ; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo @@ -232,13 +232,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND -; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload +; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo @@ -254,13 +254,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a ; GISEL-GFX11-LABEL: chain_preserve_to_chain: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; 
GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop ; GISEL-GFX11-NEXT: ;;#ASMEND -; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload +; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload ; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 @@ -271,13 +271,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a ; GISEL-GFX10-LABEL: chain_preserve_to_chain: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND -; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload +; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload ; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 @@ -288,13 +288,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND -; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload +; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload ; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 ; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo @@ -305,13 +305,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND -; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload +; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo @@ -327,7 +327,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; GISEL-GFX11-LABEL: chain_preserve_to_chain_wwm: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: 
s_mov_b32 s3, s0 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 ; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo @@ -336,7 +336,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop ; GISEL-GFX11-NEXT: ;;#ASMEND -; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload +; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload ; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi @@ -349,7 +349,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; GISEL-GFX10-LABEL: chain_preserve_to_chain_wwm: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 ; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo @@ -358,7 +358,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND -; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload +; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload ; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi @@ -370,7 +370,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_wwm: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 ; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo @@ -379,7 +379,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND -; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload +; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 ; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo @@ -392,7 +392,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_wwm: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 ; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo @@ -401,7 +401,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND -; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload 
+; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1
 ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
 ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
@@ -422,8 +422,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; GISEL-GFX11: ; %bb.0:
 ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v11, off offset:8
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v11, off offset:4
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8
 ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0
 ; GISEL-GFX11-NEXT: ;;#ASMSTART
@@ -433,8 +433,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v11
 ; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4
-; GISEL-GFX11-NEXT: scratch_load_b32 v11, off, off offset:8
+; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off
+; GISEL-GFX11-NEXT: scratch_load_b32 v11, off, off offset:4
 ; GISEL-GFX11-NEXT: s_mov_b32 s0, s3
 ; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5]
@@ -442,8 +442,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7:
 ; GISEL-GFX10: ; %bb.0:
 ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], 0 offset:8 ; 4-byte Folded Spill
-; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; GISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8
 ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0
 ; GISEL-GFX10-NEXT: ;;#ASMSTART
@@ -453,8 +453,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v11
 ; GISEL-GFX10-NEXT: s_clause 0x1
-; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4
-; GISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], 0 offset:8
+; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0
+; GISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], 0 offset:4
 ; GISEL-GFX10-NEXT: s_mov_b32 s0, s3
 ; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5]
@@ -463,8 +463,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; DAGISEL-GFX11: ; %bb.0:
 ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT: s_clause 0x1
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v11, off offset:8
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v11, off offset:4
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off
 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8
 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART
@@ -474,8 +474,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v11
 ; DAGISEL-GFX11-NEXT: s_clause 0x1
-; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4
-; DAGISEL-GFX11-NEXT: scratch_load_b32 v11, off, off offset:8
+; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off
+; DAGISEL-GFX11-NEXT: scratch_load_b32 v11, off, off offset:4
 ; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5]
@@ -483,8 +483,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7:
 ; DAGISEL-GFX10: ; %bb.0:
 ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], 0 offset:8 ; 4-byte Folded Spill
-; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8
 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART
@@ -494,8 +494,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v11
 ; DAGISEL-GFX10-NEXT: s_clause 0x1
-; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4
-; DAGISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], 0 offset:8
+; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0
+; DAGISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], 0 offset:4
 ; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5]
@@ -508,13 +508,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ; GISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args:
 ; GISEL-GFX11: ; %bb.0:
 ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill
 ; GISEL-GFX11-NEXT: s_mov_b32 s2, s0
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8
 ; GISEL-GFX11-NEXT: ;;#ASMSTART
 ; GISEL-GFX11-NEXT: s_nop
 ; GISEL-GFX11-NEXT: ;;#ASMEND
-; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload
+; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload
 ; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo
 ; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1
@@ -525,13 +525,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args:
 ; GISEL-GFX10: ; %bb.0:
 ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
 ; GISEL-GFX10-NEXT: s_mov_b32 s2, s0
 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8
 ; GISEL-GFX10-NEXT: ;;#ASMSTART
 ; GISEL-GFX10-NEXT: s_nop
 ; GISEL-GFX10-NEXT: ;;#ASMEND
-; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload
+; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload
 ; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo
 ; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi
 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1
@@ -542,13 +542,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args:
 ; DAGISEL-GFX11: ; %bb.0:
 ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill
 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX11-NEXT: s_mov_b32 s2, s0
 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART
 ; DAGISEL-GFX11-NEXT: s_nop
 ; DAGISEL-GFX11-NEXT: ;;#ASMEND
-; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload
+; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload
 ; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi
 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo
@@ -559,13 +559,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args:
 ; DAGISEL-GFX10: ; %bb.0:
 ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX10-NEXT: s_mov_b32 s2, s0
 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT: s_nop
 ; DAGISEL-GFX10-NEXT: ;;#ASMEND
-; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload
+; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload
 ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo
@@ -592,7 +592,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac
 ; GISEL-GFX11-NEXT: s_mov_b32 s0, 1
 ; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v8
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, 32, v0
+; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, 0, v0
 ; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
 ; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
 ; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
@@ -607,7 +607,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac
 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, 2
 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, 3
 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v4, 4
-; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, 32, v0
+; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, 0, v0
 ; GISEL-GFX10-NEXT: buffer_store_dword v1, v0, s[48:51], 0 offen
 ; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GISEL-GFX10-NEXT: buffer_store_dword v2, v0, s[48:51], 0 offen offset:4
@@ -623,7 +623,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac
 ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
 ; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, 32
+; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, 0
 ; DAGISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
 ; DAGISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; DAGISEL-GFX11-NEXT: s_endpgm
@@ -632,7 +632,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac
 ; DAGISEL-GFX10: ; %bb.0:
 ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4
-; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, 32
+; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, 0
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, 3
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, 2
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v4, 1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index ff2f2c6..93c18de 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -209,8 +209,8 @@ for.end:
 ; R600-VECT: MOVA_INT
-; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:6 ; encoding: [0x06,0x00,0x68,0xe0
-; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x68,0xe0
+; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:2 ; encoding: [0x02,0x00,0x68,0xe0
+; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ; encoding: [0x00,0x00,0x68,0xe0
 ; Loaded value is 0 or 1, so sext will become zext, so we get buffer_load_ushort instead of buffer_load_sshort.
 ; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0
@@ -238,8 +238,8 @@ entry:
 ; SI-PROMOTE-VECT-DAG: s_lshl_b32
 ; SI-PROMOTE-VECT-DAG: v_lshrrev
-; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x60,0xe0
-; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:5 ; encoding: [0x05,0x00,0x60,0xe0
+; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ; encoding: [0x00,0x00,0x60,0xe0
+; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:1 ; encoding: [0x01,0x00,0x60,0xe0
 define amdgpu_kernel void @char_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
 %0 = alloca [2 x i8], addrspace(5)
@@ -258,7 +258,7 @@ entry:
 ; FUNC-LABEL: {{^}}no_overlap:
 ;
 ; A total of 5 bytes should be allocated and used.
-; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ;
+; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ;
 define amdgpu_kernel void @no_overlap(ptr addrspace(1) %out, i32 %in) #0 {
 entry:
 %0 = alloca [3 x i8], align 1, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index 954994a..d33196b 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -12,7 +12,7 @@ declare void @llvm.amdgcn.s.barrier() #2
 ; SI-LABEL: {{^}}test_private_array_ptr_calc:
-; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 16, v{{[0-9]+}}
+; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 0, v{{[0-9]+}}
 ; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
 ; SI-ALLOCA: s_barrier
 ; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 381fb98..f72d22b 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -4655,11 +4655,11 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; VI-NEXT: s_add_u32 s36, s36, s1
 ; VI-NEXT: s_addc_u32 s37, s37, 0
 ; VI-NEXT: v_mov_b32_e32 v0, 3
-; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8
+; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0
 ; VI-NEXT: v_mov_b32_e32 v0, 8
-; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12
-; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12
-; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8
+; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
+; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0
 ; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT: s_movk_i32 s32, 0x400
 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
@@ -4682,11 +4682,11 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; CI-NEXT: s_add_u32 s36, s36, s1
 ; CI-NEXT: s_addc_u32 s37, s37, 0
 ; CI-NEXT: v_mov_b32_e32 v0, 3
-; CI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8
+; CI-NEXT: buffer_store_byte v0, off, s[36:39], 0
 ; CI-NEXT: v_mov_b32_e32 v0, 8
-; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12
-; CI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12
-; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8
+; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; CI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
+; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0
 ; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT: s_movk_i32 s32, 0x400
 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
@@ -4709,12 +4709,12 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; GFX9-NEXT: s_add_u32 s36, s36, s1
 ; GFX9-NEXT: s_addc_u32 s37, s37, 0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 3
-; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8
+; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12
-; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
 ; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8
+; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0
 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT: s_movk_i32 s32, 0x400
 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
@@ -4736,9 +4736,9 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4
 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12
 ; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: scratch_store_b8 off, v0, off offset:8
-; GFX11-NEXT: scratch_store_b32 off, v1, off offset:12
-; GFX11-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX11-NEXT: scratch_store_b8 off, v0, off
+; GFX11-NEXT: scratch_store_b32 off, v1, off offset:4
+; GFX11-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -4753,11 +4753,11 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; HSA-NEXT: s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT: v_mov_b32_e32 v0, 3
-; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8
+; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0
 ; HSA-NEXT: v_mov_b32_e32 v0, 8
-; HSA-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
-; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12
-; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
+; HSA-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
+; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0
 ; HSA-NEXT: s_movk_i32 s32, 0x400
 ; HSA-NEXT: s_getpc_b64 s[4:5]
 ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
@@ -4787,11 +4787,11 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; VI-NEXT: s_add_u32 s36, s36, s3
 ; VI-NEXT: s_addc_u32 s37, s37, 0
 ; VI-NEXT: v_mov_b32_e32 v0, 3
-; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8
+; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0
 ; VI-NEXT: v_mov_b32_e32 v0, 8
-; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12
-; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12
-; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8
+; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
+; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0
 ; VI-NEXT: s_movk_i32 s32, 0x800
 ; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
@@ -4802,10 +4802,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; VI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4
 ; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: buffer_store_dword v1, off, s[36:39], s32
-; VI-NEXT: v_mov_b32_e32 v0, 16
+; VI-NEXT: v_mov_b32_e32 v0, 8
 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:16
-; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:20
+; VI-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:8
+; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:12
 ; VI-NEXT: s_mov_b32 s3, 0xf000
 ; VI-NEXT: s_mov_b32 s2, -1
 ; VI-NEXT: s_waitcnt vmcnt(1)
@@ -4824,11 +4824,11 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; CI-NEXT: s_add_u32 s36, s36, s3
 ; CI-NEXT: s_addc_u32 s37, s37, 0
 ; CI-NEXT: v_mov_b32_e32 v0, 3
-; CI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8
+; CI-NEXT: buffer_store_byte v0, off, s[36:39], 0
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
 ; GCN-DAG: s_movk_i32 s32, 0x400
-; GCN: buffer_store_dword [[K]], off, s[0:3], 0 offset:4
+; GCN: buffer_store_dword [[K]], off, s[0:3], 0
 ; Pass %arg31 on stack
 ; GCN: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
 ; GCN: buffer_store_dword [[K1:v[0-9]+]], off, s[0:3], s32{{$}}
-; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4
+; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0
 ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
 ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
 ; GCN: s_swappc_b64
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index 49bf48a..0705d49 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -529,16 +529,16 @@ define void @too_many_args_use_workitem_id_x_byval(
 ; FIXEDABI: v_mov_b32_e32 v31, v0
 ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
 ; FIXEDABI: s_movk_i32 s32, 0x400{{$}}
-; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0{{$}}
 ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
 ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
 ; FIXME: Why this reload?
-; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0 offset:4{{$}}
+; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0{{$}}
 ; FIXEDABI-NOT: s32
-; FIXEDABI: buffer_store_dword [[RELOAD]], off, s[0:3], s32 offset:4
+; FIXEDABI: buffer_store_dword [[RELOAD]], off, s[0:3], s32
 ; FIXEDABI: s_swappc_b64
 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
 %alloca = alloca i32, align 4, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
index f9b44f4..927e45f 100644
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; GCN-LABEL: {{^}}store_fi_lifetime:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[FI]]
 define amdgpu_kernel void @store_fi_lifetime(ptr addrspace(1) %out, i32 %in) #0 {
 entry:
@@ -14,7 +14,7 @@ entry:
 ; GCN-LABEL: {{^}}stored_fi_to_lds:
 ; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}}
+; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off,
 ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
 ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
@@ -27,16 +27,16 @@ define amdgpu_kernel void @stored_fi_to_lds(ptr addrspace(3) %ptr) #0 {
 ; Offset is applied
 ; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects:
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
 ; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]]
 ; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
 ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO]]
-; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
 ; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]]
 define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(ptr addrspace(3) %ptr) #0 {
 %tmp0 = alloca float, addrspace(5)
@@ -51,9 +51,9 @@ define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(ptr addrspace(3) %pt
 ; Same frame index is used multiple times in the store
 ; GCN-LABEL: {{^}}stored_fi_to_self:
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}}
-; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
-; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
+; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define amdgpu_kernel void @stored_fi_to_self() #0 {
 %tmp = alloca ptr addrspace(5), addrspace(5)
@@ -65,13 +65,13 @@ define amdgpu_kernel void @stored_fi_to_self() #0 {
 ; GCN-LABEL: {{^}}stored_fi_to_self_offset:
 ; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}}
-; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
+; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}}
-; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2052{{$}}
+; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2048{{$}}
-; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}}
-; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2052{{$}}
+; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x800{{$}}
+; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2048{{$}}
 define amdgpu_kernel void @stored_fi_to_self_offset() #0 {
 %tmp0 = alloca [512 x i32], addrspace(5)
 %tmp1 = alloca ptr addrspace(5), addrspace(5)
@@ -86,15 +86,15 @@ define amdgpu_kernel void @stored_fi_to_self_offset() #0 {
 }
 ; GCN-LABEL: {{^}}stored_fi_to_fi:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}}
-; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
-; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}}
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
-; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
-; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
+; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
+; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
 define amdgpu_kernel void @stored_fi_to_fi() #0 {
 %tmp0 = alloca ptr addrspace(5), addrspace(5)
 %tmp1 = alloca ptr addrspace(5), addrspace(5)
@@ -104,14 +104,14 @@ define amdgpu_kernel void @stored_fi_to_fi() #0 {
 store volatile ptr addrspace(5) inttoptr (i32 9999 to ptr addrspace(5)), ptr addrspace(5) %tmp2
- store volatile ptr addrspace(5) %tmp1, ptr addrspace(5) %tmp2 ; store offset 4 at offset 8
- store volatile ptr addrspace(5) %tmp2, ptr addrspace(5) %tmp1 ; store offset 8 at offset 4
+ store volatile ptr addrspace(5) %tmp1, ptr addrspace(5) %tmp2 ; store offset 0 at offset 4
+ store volatile ptr addrspace(5) %tmp2, ptr addrspace(5) %tmp1 ; store offset 4 at offset 0
 ret void
 }
 ; GCN-LABEL: {{^}}stored_fi_to_global:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[FI]]
 define amdgpu_kernel void @stored_fi_to_global(ptr addrspace(1) %ptr) #0 {
 %tmp = alloca float, addrspace(5)
@@ -122,14 +122,14 @@ define amdgpu_kernel void @stored_fi_to_global(ptr addrspace(1) %ptr) #0 {
 ; Offset is applied
 ; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}}
-; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
+; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
 ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define amdgpu_kernel void @stored_fi_to_global_2_small_objects(ptr addrspace(1) %ptr) #0 {
 %tmp0 = alloca float, addrspace(5)
@@ -178,7 +178,7 @@ define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(ptr addrspace(1
 ; GCN: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]]
 ; GCN: s_add_u32 s{{[0-9]+}}, s[[PC_LO]], g1@gotpcrel32@lo+4
 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC_HI]], g1@gotpcrel32@hi+12
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[FI]]
 define amdgpu_kernel void @cannot_select_assertzext_valuetype(ptr addrspace(1) %out, i32 %idx) #0 {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index 42beb1c..7188883 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -30,7 +30,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
 ; GFX803-NEXT: s_add_u32 s0, s0, s7
 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
 ; GFX803-NEXT: v_mov_b32_e32 v0, 0
-; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; GFX803-NEXT: s_waitcnt vmcnt(0)
 ; GFX803-NEXT: s_endpgm
 ;
@@ -39,7 +39,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
 ; GFX900-NEXT: s_add_u32 s0, s0, s7
 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
 ; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_endpgm
 ;
@@ -48,14 +48,14 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0
 ; GFX1010-NEXT: s_add_u32 s0, s0, s7
 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
-; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1010-NEXT: s_endpgm
 ;
 ; GFX1100-LABEL: test_kern_stack:
 ; GFX1100: ; %bb.0: ; %entry
 ; GFX1100-NEXT: v_mov_b32_e32 v0, 0
-; GFX1100-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
+; GFX1100-NEXT: scratch_store_b32 off, v0, off dlc
 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1100-NEXT: s_endpgm
 entry:
@@ -164,7 +164,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
 ; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX803-NEXT: s_movk_i32 s32, 0x400
-; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
+; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0
 ; GFX803-NEXT: s_waitcnt vmcnt(0)
 ; GFX803-NEXT: s_getpc_b64 s[16:17]
 ; GFX803-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -186,7 +186,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
 ; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: s_movk_i32 s32, 0x400
-; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
+; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_getpc_b64 s[16:17]
 ; GFX900-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -210,7 +210,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
 ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
+; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0
 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1010-NEXT: s_getpc_b64 s[16:17]
 ; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -229,7 +229,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX1100-NEXT: s_mov_b32 s13, s14
 ; GFX1100-NEXT: s_mov_b32 s14, s15
 ; GFX1100-NEXT: s_mov_b32 s32, 16
-; GFX1100-NEXT: scratch_store_b32 off, v1, off offset:4 dlc
+; GFX1100-NEXT: scratch_store_b32 off, v1, off dlc
 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1100-NEXT: s_getpc_b64 s[6:7]
 ; GFX1100-NEXT: s_add_u32 s6, s6, ex@rel32@lo+4
@@ -276,7 +276,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
 ; GFX803-NEXT: s_mov_b32 s33, 0
 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
 ; GFX803-NEXT: v_mov_b32_e32 v0, 0
-; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
+; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33
 ; GFX803-NEXT: s_waitcnt vmcnt(0)
 ; GFX803-NEXT: s_endpgm
 ;
@@ -286,7 +286,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
 ; GFX900-NEXT: s_add_u32 s0, s0, s7
 ; GFX900-NEXT: s_mov_b32 s33, 0
 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
 ; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
+; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_endpgm
 ;
@@ -296,7 +296,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0
 ; GFX1010-NEXT: s_add_u32 s0, s0, s7
 ; GFX1010-NEXT: s_mov_b32 s33, 0
 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
-; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
+; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33
 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1010-NEXT: s_endpgm
 ;
@@ -304,7 +304,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
 ; GFX1100: ; %bb.0: ; %entry
 ; GFX1100-NEXT: v_mov_b32_e32 v0, 0
 ; GFX1100-NEXT: s_mov_b32 s33, 0
-; GFX1100-NEXT: scratch_store_b32 off, v0, s33 offset:4 dlc
+; GFX1100-NEXT: scratch_store_b32 off, v0, s33 dlc
 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1100-NEXT: s_endpgm
 entry:
@@ -436,7 +436,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
 ; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX803-NEXT: s_movk_i32 s32, 0x400
-; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
+; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33
 ; GFX803-NEXT: s_waitcnt vmcnt(0)
 ; GFX803-NEXT: s_getpc_b64 s[16:17]
 ; GFX803-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -459,7 +459,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
 ; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: s_movk_i32 s32, 0x400
-; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
+; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_getpc_b64 s[16:17]
 ; GFX900-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -484,7 +484,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
 ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
+; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33
 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1010-NEXT: s_getpc_b64 s[16:17]
 ; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -504,7 +504,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX1100-NEXT: s_mov_b32 s13, s14
 ; GFX1100-NEXT: s_mov_b32 s14, s15
 ; GFX1100-NEXT: s_mov_b32 s32, 16
-; GFX1100-NEXT: scratch_store_b32 off, v1, s33 offset:4 dlc
+; GFX1100-NEXT: scratch_store_b32 off, v1, s33 dlc
 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1100-NEXT: s_getpc_b64 s[6:7]
 ; GFX1100-NEXT: s_add_u32 s6, s6, ex@rel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
index cd36f6a..5615919 100644
--- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -48,13 +48,13 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
 ; GCN_DBG-NEXT: s_mov_b64 s[4:5], exec
 ; GCN_DBG-NEXT: s_mov_b64 exec, -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2
 ; GCN_DBG-NEXT: ; %bb.1: ; %for.exit
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
 ; GCN_DBG-NEXT: s_endpgm
@@ -62,7 +62,7 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
@@ -87,13 +87,13 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2
 ; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
 ; GCN_DBG-NEXT: s_endpgm
@@ -151,13 +151,13 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
 ; GCN_DBG-NEXT: s_mov_b32 s0, 0
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_branch .LBB1_2
 ; GCN_DBG-NEXT: .LBB1_1: ; %for.exit
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
 ; GCN_DBG-NEXT: s_endpgm
@@ -165,7 +165,7 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
 ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
@@ -190,7 +190,7 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1
 ; GCN_DBG-NEXT: s_branch .LBB1_2
@@ -239,13 +239,13 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT: s_mov_b32 s0, 0
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_branch .LBB2_2
 ; GCN_DBG-NEXT: .LBB2_1: ; %for.exit
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
 ; GCN_DBG-NEXT: s_endpgm
@@ -253,7 +253,7 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
@@ -278,7 +278,7 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1
 ; GCN_DBG-NEXT: s_branch .LBB2_2
@@ -328,13 +328,13 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT: s_mov_b32 s0, 0
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_branch .LBB3_2
 ; GCN_DBG-NEXT: .LBB3_1: ; %for.exit
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
 ; GCN_DBG-NEXT: s_endpgm
@@ -342,7 +342,7 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
@@ -365,7 +365,7 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1
 ; GCN_DBG-NEXT: s_branch .LBB3_2
@@ -441,13 +441,13 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT: s_mov_b32 s0, 0
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT: s_branch .LBB4_2
 ; GCN_DBG-NEXT: .LBB4_1: ; %for.exit
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
 ; GCN_DBG-NEXT: s_endpgm
@@ -455,7 +455,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 3
@@ -481,7 +481,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB4_1
 ; GCN_DBG-NEXT: s_branch .LBB4_2
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index afb7357..49f9f69 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -136,8 +136,8 @@ done:
 ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
 ; GCN: s_and_saveexec_b64
-; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
-; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092 glc{{$}}
+; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4088{{$}}
+; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4088 glc{{$}}
 ; GCN: {{^}}.LBB4_2:
 define amdgpu_kernel void @test_sink_scratch_small_offset_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
 entry:
@@ -166,7 +166,8 @@ done:
 ret void
 }
-; This ends up not fitting due to the reserved 4 bytes at offset 0
+; This used to be a special case when the scavenge slot was
+; fixed at offset 0.
 ; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved(
 ; OPT-NOT: getelementptr [512 x i32]
 ; OPT: br i1
@@ -174,10 +175,8 @@ done:
 ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
 ; GCN: s_and_saveexec_b64
-; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4
-; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}}
-; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4
-; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092 glc{{$}}
+; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
+; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092 glc{{$}}
 ; GCN: {{^.LBB[0-9]+}}_2:
 define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index dfc8361..397efb1 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -432,22 +432,22 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
+; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:2
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6
+; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:2
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4
 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8
+; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6
-; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4
+; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
+; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0
 ; GFX900-NEXT: s_waitcnt vmcnt(1)
 ; GFX900-NEXT: v_mov_b32_e32 v1, v0
-; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8
+; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4
 ; GFX900-NEXT: s_waitcnt vmcnt(1)
 ; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -464,19 +464,19 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
 ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1]
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_store_short off, v0, s4 offset:4
+; FLATSCR-NEXT: scratch_store_short off, v0, s4
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:2
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_store_short off, v0, s4 offset:6
+; FLATSCR-NEXT: scratch_store_short off, v0, s4 offset:2
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:4
 ; FLATSCR-NEXT: s_mov_b32 s0, 0
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_store_short off, v0, s0 offset:8
+; FLATSCR-NEXT: scratch_store_short off, v0, s0 offset:4
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4
-; FLATSCR-NEXT: scratch_load_dword v1, off, s0 offset:6
+; FLATSCR-NEXT: scratch_load_dword v0, off, s0
+; FLATSCR-NEXT: scratch_load_dword v1, off, s0 offset:2
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
 ; FLATSCR-NEXT: s_endpgm
@@ -490,24 +490,24 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; GFX10_DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5]
 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
-; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
+; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0
 ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] offset:2
 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
-; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6
+; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:2
 ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] offset:4
 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
-; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8
+; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
 ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10_DEFAULT-NEXT: s_clause 0x1
-; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6
-; GFX10_DEFAULT-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4
+; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
+; GFX10_DEFAULT-NEXT: buffer_load_ushort v3, off, s[0:3], 0
 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(1)
 ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v1, v0
 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_DEFAULT-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
-; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8
+; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4
 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX10_DEFAULT-NEXT: s_endpgm
@@ -524,21 +524,21 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; FLATSCR_GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1]
 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4 offset:4
+; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4
 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:2
 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4 offset:6
+; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4 offset:2
 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:4
 ; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 0
 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s0 offset:8
+; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s0 offset:4
 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; FLATSCR_GFX10-NEXT: s_clause 0x1
-; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, s0 offset:4
-; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, s0 offset:6
+; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, s0
+; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, s0 offset:2
 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
 ; FLATSCR_GFX10-NEXT: s_endpgm
@@ -550,19 +550,19 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
+; GFX11-NEXT: scratch_store_b16 off, v0, off dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b16 off, v0, off offset:6 dlc
+; GFX11-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b16 off, v0, off offset:8 dlc
+; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX11-NEXT: scratch_load_b32 v1, off, off offset:6
+; GFX11-NEXT: scratch_load_b32 v0, off, off
+; GFX11-NEXT: scratch_load_b32 v1, off, off offset:2
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
 ; GFX11-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 8bd60aa..6422bee 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -51,21 +51,21 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
 ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0
 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1
 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
-; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-O0-NEXT: s_mov_b32 s0, 1
 ; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0
 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
 ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2
 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -73,12 +73,12 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0
 ; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT: s_mov_b32 s0, 0
 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
@@ -98,7 +98,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4
 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -106,12 +106,12 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
 ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
@@ -130,7 +130,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT: .LBB0_3: ; %Flow
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4
@@ -139,7 +139,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT: .LBB0_4: ; %bb.outer.end
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2
@@ -225,21 +225,21 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
 ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0
 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1
 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
-; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-O0-NEXT: s_mov_b32 s0, 1
 ; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0
 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
 ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2
 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -247,12 +247,12 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0
 ; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT: s_mov_b32 s0, 0
 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
@@ -272,7 +272,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4
 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -280,12 +280,12 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
 ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
@@ -305,7 +305,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT: .LBB1_3: ; %Flow
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2
@@ -315,7 +315,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_readlane_b32 s2, v0, 4
@@ -323,7 +323,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3]
 ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
 ; GCN-O0-NEXT: v_readlane_b32 s1,
v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 @@ -341,7 +341,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: s_branch .LBB1_3 ; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 ; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 @@ -436,7 +436,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) @@ -445,7 +445,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 @@ -468,7 +468,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] @@ -476,9 +476,9 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0 @@ -488,7 +488,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_writelane_b32 v0, s2, 4 ; GCN-O0-NEXT: v_writelane_b32 v0, s3, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: 
s_cbranch_execz .LBB2_2 @@ -496,7 +496,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB2_2: ; %Flow ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 @@ -506,19 +506,19 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB2_5 ; GCN-O0-NEXT: ; %bb.3: ; %bb.then ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 @@ -538,12 +538,12 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB2_4: ; %bb.else ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 @@ -562,7 +562,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB2_5: ; %Flow1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 @@ -571,7 +571,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB2_6: ; %bb.outer.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 
2 @@ -684,11 +684,11 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 2 ; GCN-O0-NEXT: v_lshlrev_b32_e64 v3, s0, v1 ; GCN-O0-NEXT: s_mov_b32 s1, 0 @@ -707,9 +707,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_addc_u32_e64 v2, s[2:3], v2, v6, s[2:3] ; GCN-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v6, v2 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s1, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 @@ -725,7 +725,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_1 @@ -733,7 +733,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB3_1: ; %Flow2 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 @@ -743,18 +743,18 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_8 ; GCN-O0-NEXT: ; %bb.2: ; %bb.outer.then ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: 
buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: s_mov_b32 s4, s2 @@ -772,15 +772,15 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_7 ; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then ; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: s_mov_b32 s4, s2 @@ -797,11 +797,11 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s1, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s0, 0 ; GCN-O0-NEXT: s_mov_b32 s2, s0 @@ -818,15 +818,15 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_6 ; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2 ; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; 
GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: s_mov_b32 s4, s2 @@ -842,7 +842,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB3_6: ; %Flow ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 @@ -852,7 +852,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB3_7: ; %Flow1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 @@ -861,7 +861,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB3_8: ; %bb.outer.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 @@ -938,34 +938,34 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 ; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB4_2 ; GCN-O0-NEXT: ; %bb.1: ; %bb.then ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 ; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 
offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 @@ -983,7 +983,7 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: .LBB4_2: ; %bb.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll index 1d2e211..d94e75c8 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll @@ -699,7 +699,7 @@ define amdgpu_kernel void @commute_uno_2.0_f64(ptr addrspace(1) %out, ptr addrsp ; GCN-LABEL: {{^}}commute_frameindex: ; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}} -; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}} ; GCN: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, [[FI]] define amdgpu_kernel void @commute_frameindex(ptr addrspace(1) nocapture %out) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index 6ef14d3..2b5a8d9 100644 --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -26,7 +26,7 @@ ; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0 ; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1 -; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 ; 4-byte Folded Spill ; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]], [[CMP0]] ; GCN: s_mov_b64 exec, s[[[ANDEXEC_LO]]:[[ANDEXEC_HI]]] @@ -50,7 +50,7 @@ ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET]] ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0 ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 @@ -99,7 +99,7 @@ endif: ; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0 ; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1 -; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 ; 4-byte Folded Spill ; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]] @@ -123,7 +123,7 @@ endif: ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword 
v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET]] ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0 ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 @@ -176,11 +176,11 @@ end: ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] -; VGPR: buffer_store_dword [[SPILL_VGPR]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VGPR: buffer_store_dword [[SPILL_VGPR]], off, s[0:3], 0 ; 4-byte Folded Spill ; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0 ; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1 -; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 ; 4-byte Folded Spill ; GCN: s_mov_b64 exec, [[CMP0]] @@ -189,11 +189,11 @@ end: ; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]] ; GCN: [[FLOW]]: ; %Flow -; VGPR: buffer_load_dword [[SPILL_VGPR:v[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET]] ; 4-byte Folded Reload +; VGPR: buffer_load_dword [[SPILL_VGPR:v[0-9]+]], off, s[0:3], 0 ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET]] +; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 0 ; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 1 diff --git a/llvm/test/CodeGen/AMDGPU/extload-private.ll b/llvm/test/CodeGen/AMDGPU/extload-private.ll index 5fda337..3802dc5 100644 --- a/llvm/test/CodeGen/AMDGPU/extload-private.ll +++ b/llvm/test/CodeGen/AMDGPU/extload-private.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}load_i8_sext_private: -; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}} +; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0{{$}} define amdgpu_kernel void @load_i8_sext_private(ptr addrspace(1) %out) { entry: %tmp0 = alloca i8, addrspace(5) @@ -13,7 +13,7 @@ entry: } ; FUNC-LABEL: {{^}}load_i8_zext_private: -; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}} +; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0{{$}} define amdgpu_kernel void @load_i8_zext_private(ptr addrspace(1) %out) { entry: %tmp0 = alloca i8, addrspace(5) @@ -24,7 +24,7 @@ entry: } ; FUNC-LABEL: {{^}}load_i16_sext_private: -; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}} +; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0{{$}} define amdgpu_kernel void @load_i16_sext_private(ptr addrspace(1) %out) { entry: %tmp0 = alloca i16, addrspace(5) @@ -35,7 +35,7 @@ entry: } ; FUNC-LABEL: {{^}}load_i16_zext_private: -; SI: buffer_load_ushort v{{[0-9]+}}, off, 
s[{{[0-9]+:[0-9]+}}], 0 offset:4 glc{{$}} +; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 glc{{$}} define amdgpu_kernel void @load_i16_zext_private(ptr addrspace(1) %out) { entry: %tmp0 = alloca i16, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index 51a4db1..1ba7e70 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -410,6 +410,8 @@ entry: } ; GCN-LABEL: {{^}}bit4_extelt: +; FIXME: One v_mov_b32_e32 vN, 0 should suffice +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN-DAG: buffer_store_byte [[ZERO]], diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll index bcf6bda..57991d6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; FLAT_SCR_OPT-NEXT: s_mov_b64 s[0:1], src_private_base -; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 4 +; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 0 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, s1 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v2, 0 ; FLAT_SCR_OPT-NEXT: flat_store_dword v[0:1], v2 @@ -22,7 +22,7 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { ; FLAT_SCR_ARCH-LABEL: stack_object_addrspacecast_in_kernel_no_calls: ; FLAT_SCR_ARCH: ; %bb.0: ; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[0:1], src_private_base -; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v0, 4 +; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v0, 0 ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v1, s1 ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v2, 0 ; FLAT_SCR_ARCH-NEXT: flat_store_dword v[0:1], v2 @@ -43,7 +43,7 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() { ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 0 ; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 0 -; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s0 offset:4 +; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s0 ; FLAT_SCR_OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; FLAT_SCR_OPT-NEXT: s_endpgm ; @@ -51,7 +51,7 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() { ; FLAT_SCR_ARCH: ; %bb.0: ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v0, 0 ; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 0 -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s0 offset:4 +; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s0 ; FLAT_SCR_ARCH-NEXT: s_waitcnt_vscnt null, 0x0 ; FLAT_SCR_ARCH-NEXT: s_endpgm %alloca = alloca i32, addrspace(5) @@ -120,7 +120,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s2, 0 ; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s3, 1 ; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, 4 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, 0 ; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill ; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 ; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 @@ -221,7 +221,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART ; FLAT_SCR_OPT-NEXT: ;;#ASMEND ; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 4 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 0 
; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload ; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 ; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 @@ -243,7 +243,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s2, 0 ; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s3, 1 ; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, 4 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, 0 ; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill ; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 ; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 @@ -344,7 +344,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND ; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 4 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 0 ; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload ; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 ; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 8284a77..0af57c6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -15,7 +15,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 @@ -36,7 +36,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX940-GISEL-LABEL: soff1_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 @@ -58,7 +58,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 @@ -76,7 +76,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 @@ -95,7 +95,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 
scope:SCOPE_SYS @@ -110,7 +110,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS @@ -138,7 +138,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 @@ -159,7 +159,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX940-GISEL-LABEL: soff1_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -184,7 +184,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 @@ -203,7 +203,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 @@ -221,7 +221,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -237,7 +237,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS @@ -265,7 +265,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; 
GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 @@ -286,7 +286,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX940-GISEL-LABEL: soff1_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -311,7 +311,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 @@ -330,7 +330,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 @@ -348,7 +348,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -364,7 +364,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS @@ -392,7 +392,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 @@ -414,7 +414,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX940-GISEL-LABEL: soff2_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 1 @@ -439,7 +439,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-SDAG-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 @@ -458,7 +458,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 @@ -478,7 +478,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -495,7 +495,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS @@ -523,7 +523,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 @@ -544,7 +544,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX940-GISEL-LABEL: soff2_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -571,7 +571,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 @@ -591,7 +591,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 @@ -612,7 +612,7 @@ define amdgpu_kernel void 
@soff2_voff2(i32 %soff) { ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -629,7 +629,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS @@ -657,7 +657,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 @@ -678,7 +678,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX940-GISEL-LABEL: soff2_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -705,7 +705,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 @@ -725,7 +725,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 @@ -746,7 +746,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -763,7 +763,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS @@ -791,7 +791,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 @@ -813,7 +813,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX940-GISEL-LABEL: soff4_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 2 @@ -838,7 +838,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 @@ -857,7 +857,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 @@ -877,7 +877,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -894,7 +894,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS @@ -922,7 +922,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 @@ -943,7 +943,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX940-GISEL-LABEL: soff4_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: 
v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -970,7 +970,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 @@ -990,7 +990,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 @@ -1011,7 +1011,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -1028,7 +1028,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS @@ -1056,7 +1056,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, 2 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1076,7 +1076,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX940-GISEL-LABEL: soff4_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1103,7 +1103,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v3, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; 
GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 @@ -1143,7 +1143,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -1160,7 +1160,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 84e3879..687d845 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -23,10 +23,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:52 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:4 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zero_init_kernel: @@ -43,10 +43,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:52 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: zero_init_kernel: @@ -59,10 +59,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:52 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:36 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:20 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:4 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:48 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:16 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off ; GFX11-NEXT: s_endpgm ; ; 
GFX12-LABEL: zero_init_kernel: @@ -75,10 +75,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:52 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:36 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:20 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:4 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:48 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off ; GFX12-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: zero_init_kernel: @@ -98,10 +98,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:52 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:4 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: zero_init_kernel: @@ -112,10 +112,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX940-NEXT: s_mov_b32 s3, s0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:52 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 sc0 sc1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 sc0 sc1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: zero_init_kernel: @@ -137,10 +137,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:52 -; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36 -; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20 -; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:4 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: zero_init_kernel: @@ -162,10 +162,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:52 -; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 -; GFX1030-PAL-NEXT: scratch_store_dwordx4 
off, v[0:3], off offset:20 -; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: zero_init_kernel: @@ -178,10 +178,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_clause 0x3 -; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:52 -; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:36 -; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:20 -; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:4 +; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:48 +; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16 +; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off ; GFX11-PAL-NEXT: s_endpgm ; ; GFX12-PAL-LABEL: zero_init_kernel: @@ -194,10 +194,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-PAL-NEXT: s_clause 0x3 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:52 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:36 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:20 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:4 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:48 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off ; GFX12-PAL-NEXT: s_endpgm %alloca = alloca [32 x i16], align 2, addrspace(5) call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false) @@ -381,11 +381,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 -; GFX9-NEXT: s_add_i32 s1, s1, 4 +; GFX9-NEXT: s_add_i32 s1, s1, 0 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, 4 +; GFX9-NEXT: s_add_i32 s0, s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -402,8 +402,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-NEXT: s_add_i32 s0, s0, 4 -; GFX10-NEXT: s_add_i32 s1, s1, 4 +; GFX10-NEXT: s_add_i32 s0, s0, 0 +; GFX10-NEXT: s_add_i32 s1, s1, 0 ; GFX10-NEXT: scratch_store_dword off, v0, s0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -418,8 +418,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-NEXT: s_add_i32 s0, s0, 4 -; GFX11-NEXT: s_add_i32 s1, s1, 4 +; GFX11-NEXT: s_add_i32 s0, s0, 0 +; GFX11-NEXT: s_add_i32 s1, s1, 0 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -434,8 +434,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-NEXT: s_add_co_i32 s0, s0, 4 -; GFX12-NEXT: s_add_co_i32 s1, s1, 4 +; GFX12-NEXT: s_add_co_i32 s0, s0, 0 +; GFX12-NEXT: s_add_co_i32 s1, s1, 0 ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -455,11 +455,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 -; GFX9-PAL-NEXT: s_add_i32 s1, s1, 4 +; GFX9-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 +; GFX9-PAL-NEXT: s_add_i32 s0, s0, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm @@ -471,11 +471,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_add_i32 s1, s1, 4 +; GFX940-NEXT: s_add_i32 s1, s1, 0 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_add_i32 s0, s0, 4 +; GFX940-NEXT: s_add_i32 s0, s0, 0 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -497,8 +497,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-PAL-NEXT: s_add_i32 s0, s0, 4 -; GFX10-PAL-NEXT: s_add_i32 s1, s1, 4 +; GFX10-PAL-NEXT: s_add_i32 s0, s0, 0 +; GFX10-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -513,8 +513,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-PAL-NEXT: s_add_i32 s0, s0, 4 -; GFX11-PAL-NEXT: s_add_i32 s1, s1, 4 +; GFX11-PAL-NEXT: s_add_i32 s0, s0, 0 +; GFX11-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -529,8 +529,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 4 -; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 4 +; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 0 +; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 0 ; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -552,13 +552,13 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_lshl_b32 s0, s2, 2 -; GFX9-NEXT: s_add_i32 s0, s0, 
4 +; GFX9-NEXT: s_add_i32 s0, s0, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_and_b32 s0, s2, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 -; GFX9-NEXT: s_add_i32 s0, s0, 4 +; GFX9-NEXT: s_add_i32 s0, s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -573,8 +573,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX10-NEXT: s_and_b32 s0, s2, 15 ; GFX10-NEXT: s_lshl_b32 s1, s2, 2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-NEXT: s_add_i32 s1, s1, 4 -; GFX10-NEXT: s_add_i32 s0, s0, 4 +; GFX10-NEXT: s_add_i32 s1, s1, 0 +; GFX10-NEXT: s_add_i32 s0, s0, 0 ; GFX10-NEXT: scratch_store_dword off, v0, s1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc @@ -587,8 +587,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-NEXT: s_add_i32 s0, s0, 4 -; GFX11-NEXT: s_add_i32 s1, s1, 4 +; GFX11-NEXT: s_add_i32 s0, s0, 0 +; GFX11-NEXT: s_add_i32 s1, s1, 0 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -601,8 +601,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-NEXT: s_add_co_i32 s0, s0, 4 -; GFX12-NEXT: s_add_co_i32 s1, s1, 4 +; GFX12-NEXT: s_add_co_i32 s0, s0, 0 +; GFX12-NEXT: s_add_co_i32 s1, s1, 0 ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -621,11 +621,11 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 -; GFX9-PAL-NEXT: s_add_i32 s1, s1, 4 +; GFX9-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 +; GFX9-PAL-NEXT: s_add_i32 s0, s0, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm @@ -634,12 +634,12 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_add_i32 s1, s1, 4 +; GFX940-NEXT: s_add_i32 s1, s1, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, 15 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_add_i32 s0, s0, 4 +; GFX940-NEXT: s_add_i32 s0, s0, 0 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -659,8 +659,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-PAL-NEXT: s_add_i32 s0, s0, 4 -; GFX10-PAL-NEXT: s_add_i32 s1, s1, 4 +; GFX10-PAL-NEXT: s_add_i32 s0, s0, 0 +; GFX10-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -673,8 +673,8 @@ define 
amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-PAL-NEXT: s_add_i32 s0, s0, 4 -; GFX11-PAL-NEXT: s_add_i32 s1, s1, 4 +; GFX11-PAL-NEXT: s_add_i32 s0, s0, 0 +; GFX11-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -687,8 +687,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 4 -; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 4 +; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 0 +; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 0 ; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -710,11 +710,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-NEXT: v_add_u32_e32 v1, 4, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -727,8 +727,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, 4, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc @@ -739,8 +739,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 4, v0 -; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v0 +; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -750,8 +750,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_sub_nc_u32_e32 v2, 4, v0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0, v0 +; GFX12-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -763,9 +763,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-PAL-NEXT: v_add_u32_e32 v1, 4, v0 +; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0, v0 ; GFX9-PAL-NEXT: 
v_mov_b32_e32 v2, 15 -; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 4, v0 +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 @@ -780,9 +780,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_sub_u32_e32 v0, 4, v0 +; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -800,8 +800,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, 4, v0 -; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, 4, v0 +; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, 0, v0 +; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX10-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc @@ -812,8 +812,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 4, v0 -; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc +; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0, v0 +; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) @@ -823,8 +823,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 4, v0 -; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0, v0 +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 @@ -1064,7 +1064,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc +; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s1, s0 ; GFX9-NEXT: s_mov_b32 s2, s0 @@ -1073,10 +1073,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:260 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:292 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:308 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 
offset:288 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:304 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zero_init_small_offset_kernel: @@ -1085,7 +1085,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: s_mov_b32 s1, s0 @@ -1095,15 +1095,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: zero_init_small_offset_kernel: ; GFX11: ; %bb.0: -; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1113,15 +1113,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:260 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:276 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:292 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:308 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:256 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:272 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:288 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:304 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: zero_init_small_offset_kernel: ; GFX12: ; %bb.0: -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1131,10 +1131,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:260 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:276 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:292 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:308 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:256 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:272 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:288 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:304 ; GFX12-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: zero_init_small_offset_kernel: @@ -1147,7 +1147,7 @@ define amdgpu_kernel void 
@zero_init_small_offset_kernel() { ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_mov_b32 s1, s0 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 @@ -1156,15 +1156,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:260 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:292 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:308 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:304 ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: zero_init_small_offset_kernel: ; GFX940: ; %bb.0: -; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_mov_b32 s0, 0 ; GFX940-NEXT: s_mov_b32 s1, s0 @@ -1172,10 +1172,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX940-NEXT: s_mov_b32 s3, s0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 sc0 sc1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 sc0 sc1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 sc0 sc1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 sc0 sc1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: @@ -1190,7 +1190,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 -; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc dlc +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 @@ -1199,10 +1199,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:260 -; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276 -; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:292 -; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:308 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, 
v[0:3], s0 offset:304 ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: zero_init_small_offset_kernel: @@ -1216,7 +1216,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 @@ -1226,15 +1226,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260 -; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 -; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 -; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: zero_init_small_offset_kernel: ; GFX11-PAL: ; %bb.0: -; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1244,15 +1244,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_clause 0x3 -; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:260 -; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:276 -; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:292 -; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:308 +; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:256 +; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:272 +; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:288 +; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:304 ; GFX11-PAL-NEXT: s_endpgm ; ; GFX12-PAL-LABEL: zero_init_small_offset_kernel: ; GFX12-PAL: ; %bb.0: -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_mov_b32 s0, 0 ; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1262,10 +1262,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-PAL-NEXT: s_clause 0x3 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:260 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:276 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:292 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:308 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:256 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:272 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], 
off offset:288 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:304 ; GFX12-PAL-NEXT: s_endpgm %padding = alloca [64 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) @@ -1470,16 +1470,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 -; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc +; GFX9-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: s_addk_i32 s1, 0x104 +; GFX9-NEXT: s_addk_i32 s1, 0x100 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_addk_i32 s0, 0x104 +; GFX9-NEXT: s_addk_i32 s0, 0x100 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -1491,15 +1491,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-NEXT: s_addk_i32 s0, 0x104 -; GFX10-NEXT: s_addk_i32 s1, 0x104 +; GFX10-NEXT: s_addk_i32 s0, 0x100 +; GFX10-NEXT: s_addk_i32 s1, 0x100 ; GFX10-NEXT: scratch_store_dword off, v0, s0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -1509,15 +1509,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX11-LABEL: store_load_sindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-NEXT: s_addk_i32 s0, 0x104 -; GFX11-NEXT: s_addk_i32 s1, 0x104 +; GFX11-NEXT: s_addk_i32 s0, 0x100 +; GFX11-NEXT: s_addk_i32 s1, 0x100 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -1527,15 +1527,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX12-LABEL: store_load_sindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-NEXT: s_addk_co_i32 s0, 0x104 -; GFX12-NEXT: s_addk_co_i32 s1, 0x104 +; GFX12-NEXT: s_addk_co_i32 s0, 0x100 +; GFX12-NEXT: s_addk_co_i32 s1, 0x100 ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; 
GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -1553,16 +1553,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104 +; GFX9-PAL-NEXT: s_addk_i32 s1, 0x100 ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 +; GFX9-PAL-NEXT: s_addk_i32 s0, 0x100 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm @@ -1570,17 +1570,17 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX940-LABEL: store_load_sindex_small_offset_kernel: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_addk_i32 s1, 0x104 +; GFX940-NEXT: s_addk_i32 s1, 0x100 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_addk_i32 s0, 0x104 +; GFX940-NEXT: s_addk_i32 s0, 0x100 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -1598,15 +1598,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 ; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 -; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc dlc +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104 -; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104 +; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x100 +; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x100 ; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -1625,15 +1625,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 ; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104 -; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104 
+; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x100 +; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x100 ; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -1643,15 +1643,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-PAL-NEXT: s_addk_i32 s0, 0x104 -; GFX11-PAL-NEXT: s_addk_i32 s1, 0x104 +; GFX11-PAL-NEXT: s_addk_i32 s0, 0x100 +; GFX11-PAL-NEXT: s_addk_i32 s1, 0x100 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -1661,15 +1661,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX12-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x104 -; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x104 +; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x100 +; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x100 ; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -1694,16 +1694,16 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc +; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_lshl_b32 s0, s2, 2 -; GFX9-NEXT: s_addk_i32 s0, 0x104 +; GFX9-NEXT: s_addk_i32 s0, 0x100 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_and_b32 s0, s2, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 -; GFX9-NEXT: s_addk_i32 s0, 0x104 +; GFX9-NEXT: s_addk_i32 s0, 0x100 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -1714,14 +1714,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_and_b32 s0, s2, 15 ; GFX10-NEXT: s_lshl_b32 s1, s2, 2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-NEXT: s_addk_i32 s1, 0x104 -; GFX10-NEXT: s_addk_i32 s0, 0x104 +; GFX10-NEXT: s_addk_i32 s1, 0x100 +; 
GFX10-NEXT: s_addk_i32 s0, 0x100 ; GFX10-NEXT: scratch_store_dword off, v0, s1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc @@ -1730,14 +1730,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; ; GFX11-LABEL: store_load_sindex_small_offset_foo: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-NEXT: s_addk_i32 s0, 0x104 -; GFX11-NEXT: s_addk_i32 s1, 0x104 +; GFX11-NEXT: s_addk_i32 s0, 0x100 +; GFX11-NEXT: s_addk_i32 s1, 0x100 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -1746,14 +1746,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; ; GFX12-LABEL: store_load_sindex_small_offset_foo: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-NEXT: s_addk_co_i32 s0, 0x104 -; GFX12-NEXT: s_addk_co_i32 s1, 0x104 +; GFX12-NEXT: s_addk_co_i32 s0, 0x100 +; GFX12-NEXT: s_addk_co_i32 s1, 0x100 ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -1770,32 +1770,32 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 -; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104 +; GFX9-PAL-NEXT: s_addk_i32 s1, 0x100 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 +; GFX9-PAL-NEXT: s_addk_i32 s0, 0x100 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_small_offset_foo: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_addk_i32 s1, 0x104 +; GFX940-NEXT: s_addk_i32 s1, 0x100 ; GFX940-NEXT: v_mov_b32_e32 v0, 15 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_addk_i32 s0, 0x104 +; GFX940-NEXT: s_addk_i32 s0, 0x100 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -1812,14 +1812,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), 
s3 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 -; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc dlc +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104 -; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104 +; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x100 +; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x100 ; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -1837,14 +1837,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104 -; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104 +; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x100 +; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x100 ; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -1853,14 +1853,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; ; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-PAL-NEXT: s_addk_i32 s0, 0x104 -; GFX11-PAL-NEXT: s_addk_i32 s1, 0x104 +; GFX11-PAL-NEXT: s_addk_i32 s0, 0x100 +; GFX11-PAL-NEXT: s_addk_i32 s1, 0x100 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -1869,14 +1869,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; ; GFX12-PAL-LABEL: store_load_sindex_small_offset_foo: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x104 -; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x104 +; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x100 +; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x100 ; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -1901,14 +1901,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: scratch_load_dword v1, off, s0 
offset:4 glc +; GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x100, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 0x100, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -1921,10 +1921,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc +; GFX10-NEXT: scratch_load_dword v3, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x100, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x100, v0 ; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc @@ -1934,10 +1934,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11-LABEL: store_load_vindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc +; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 -; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 +; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:256 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1946,10 +1946,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX12-LABEL: store_load_vindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:260 scope:SCOPE_SYS +; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1967,25 +1967,25 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 offset:4 glc +; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x104, v0 +; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x100, v0 ; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0 +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x100, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: 
store_load_vindex_small_offset_kernel:
 ; GFX940: ; %bb.0: ; %bb
-; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1
+; GFX940-NEXT: scratch_load_dword v1, off, off sc0 sc1
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-NEXT: v_mov_b32_e32 v1, 15
-; GFX940-NEXT: scratch_store_dword v0, v1, off offset:260 sc0 sc1
+; GFX940-NEXT: scratch_store_dword v0, v1, off offset:256 sc0 sc1
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_sub_u32_e32 v0, 0x104, v0
+; GFX940-NEXT: v_sub_u32_e32 v0, 0x100, v0
 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: s_endpgm
@@ -2004,10 +2004,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15
 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
-; GFX1010-PAL-NEXT: scratch_load_dword v3, off, s0 offset:4 glc dlc
+; GFX1010-PAL-NEXT: scratch_load_dword v3, off, s0 glc dlc
 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0
-; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0
+; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x100, v0
+; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x100, v0
 ; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off
 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
@@ -2027,10 +2027,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15
-; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off glc dlc
 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0
-; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0
+; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x100, v0
+; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x100, v0
 ; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off
 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
@@ -2040,10 +2040,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel:
 ; GFX11-PAL: ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc
+; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off glc dlc
 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0
-; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc
+; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0
+; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:256 dlc
 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
@@ -2052,10 +2052,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX12-PAL-LABEL: store_load_vindex_small_offset_kernel:
 ; GFX12-PAL: ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0
-; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:260 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0
+; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
@@ -2305,7 +2305,7 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ;
 ; GFX12-LABEL: zero_init_large_offset_kernel:
 ; GFX12: ; %bb.0:
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: s_mov_b32 s0, 0
 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -2315,10 +2315,10 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16388
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16404
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16420
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16436
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16384
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16400
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16416
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16432
 ; GFX12-NEXT: s_endpgm
 ;
 ; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
@@ -2441,7 +2441,7 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ;
 ; GFX12-PAL-LABEL: zero_init_large_offset_kernel:
 ; GFX12-PAL: ; %bb.0:
-; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT: s_mov_b32 s0, 0
 ; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -2451,10 +2451,10 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-PAL-NEXT: s_clause 0x3
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16388
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16404
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16420
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16436
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16384
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16400
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16416
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16432
 ; GFX12-PAL-NEXT: s_endpgm
 %padding = alloca [4096 x i32], align 4, addrspace(5)
 %alloca = alloca [32 x i16], align 2, addrspace(5)
@@ -2768,15 +2768,15 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX12-LABEL: store_load_sindex_large_offset_kernel:
 ; GFX12: ; %bb.0: ; %bb
 ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_mov_b32_e32 v0, 15
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: s_and_b32 s1, s0, 15
 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2
 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-NEXT: s_addk_co_i32 s0, 0x4004
-; GFX12-NEXT: s_addk_co_i32 s1, 0x4004
+; GFX12-NEXT: s_addk_co_i32 s0, 0x4000
+; GFX12-NEXT: s_addk_co_i32 s1, 0x4000
 ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
 ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -2902,15 +2902,15 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX12-PAL-LABEL: store_load_sindex_large_offset_kernel:
 ; GFX12-PAL: ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15
 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15
 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2
 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4004
-; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4004
+; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4000
+; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4000
 ; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -2987,14 +2987,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ;
 ; GFX12-LABEL: store_load_sindex_large_offset_foo:
 ; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_mov_b32_e32 v0, 15
 ; GFX12-NEXT: s_and_b32 s1, s0, 15
 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2
 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-NEXT: s_addk_co_i32 s0, 0x4004
-; GFX12-NEXT: s_addk_co_i32 s1, 0x4004
+; GFX12-NEXT: s_addk_co_i32 s0, 0x4000
+; GFX12-NEXT: s_addk_co_i32 s1, 0x4000
 ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
 ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -3110,14 +3110,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ;
 ; GFX12-PAL-LABEL: store_load_sindex_large_offset_foo:
 ; GFX12-PAL: ; %bb.0: ; %bb
-; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15
 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15
 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2
 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4004
-; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4004
+; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4000
+; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4000
 ; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -3188,10 +3188,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX12-LABEL: store_load_vindex_large_offset_kernel:
 ; GFX12: ; %bb.0: ; %bb
 ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16388 scope:SCOPE_SYS
+; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x4000, v0
+; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
 ; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3296,10 +3296,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX12-PAL-LABEL: store_load_vindex_large_offset_kernel:
 ; GFX12-PAL: ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0
-; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16388 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4000, v0
+; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
@@ -3537,11 +3537,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX12-LABEL: store_load_large_imm_offset_kernel:
 ; GFX12: ; %bb.0: ; %bb
 ; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX12-NEXT: scratch_store_b32 off, v0, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 off, v0, off scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16004 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16000 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16004 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16000 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: s_endpgm
 ;
@@ -3642,11 +3642,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX12-PAL-LABEL: store_load_large_imm_offset_kernel:
 ; GFX12-PAL: ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX12-PAL-NEXT: scratch_store_b32 off, v0, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_store_b32 off, v0, off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
-; GFX12-PAL-NEXT: scratch_store_b32 off, v1, off offset:16004 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_store_b32 off, v1, off offset:16000 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
-; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:16004 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:16000 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT: s_endpgm
 bb:
@@ -3812,7 +3812,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9: ; %bb.0: ; %bb
 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
-; GFX9-NEXT: v_mov_b32_e32 v1, 4
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
@@ -3834,7 +3834,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX10-NEXT: v_mov_b32_e32 v1, 15
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 4
+; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 0
 ; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc
@@ -3847,7 +3847,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, 4
+; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, 0
 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1024 glc dlc
@@ -3860,9 +3860,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX12-NEXT: v_mov_b32_e32 v1, 15
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1028 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1028 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: s_endpgm
 ;
@@ -3871,7 +3871,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5]
 ; GFX9-PAL-NEXT: s_mov_b32 s4, s0
 ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
@@ -3889,7 +3889,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX940-LABEL: store_load_vidx_sidx_offset:
 ; GFX940: ; %bb.0: ; %bb
 ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-NEXT: v_mov_b32_e32 v1, 0
 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT: v_add_u32_e32 v0, s0, v0
 ; GFX940-NEXT: v_lshl_add_u32 v0, v0, 2, v1
@@ -3915,7 +3915,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4
+; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 0
 ; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024
 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc
@@ -3928,7 +3928,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0
 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4
+; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 0
 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc
 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1024 glc dlc
@@ -3941,9 +3941,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 15
 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2
-; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1028 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
-; GFX12-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1028 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT: s_endpgm
 bb:
@@ -4732,11 +4732,11 @@ define amdgpu_ps void @large_offset() {
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
 ; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:3008 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b128 v[0:3], off, off offset:3008 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x800
 ; GFX12-NEXT: ;;#ASMSTART
 ; GFX12-NEXT: ; use v0
 ; GFX12-NEXT: ;;#ASMEND
@@ -4850,11 +4850,11 @@ define amdgpu_ps void @large_offset() {
 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
 ; GFX12-PAL-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:3008 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
-; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3008 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810
+; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x800
 ; GFX12-PAL-NEXT: ;;#ASMSTART
 ; GFX12-PAL-NEXT: ; use v0
 ; GFX12-PAL-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination-tied-operand.mir b/llvm/test/CodeGen/AMDGPU/frame-index-elimination-tied-operand.mir
index d7a1b2d..17ec6f5 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination-tied-operand.mir
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination-tied-operand.mir
@@ -22,7 +22,7 @@ body: |
     ; GFX11: liveins: $sgpr0_sgpr1
     ; GFX11-NEXT: {{ $}}
     ; GFX11-NEXT: renamable $vgpr0 = V_MOV_B32_e32 123, implicit $exec
-    ; GFX11-NEXT: renamable $vgpr0 = SCRATCH_LOAD_SHORT_D16_HI_ST 4, 0, killed renamable $vgpr0, implicit $exec, implicit $flat_scr
+    ; GFX11-NEXT: renamable $vgpr0 = SCRATCH_LOAD_SHORT_D16_HI_ST 0, 0, killed renamable $vgpr0, implicit $exec, implicit $flat_scr
     ; GFX11-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr0_sgpr1, 4, 0
     ; GFX11-NEXT: renamable $sgpr0 = S_LSHL_B32 killed renamable $sgpr0, 1, implicit-def dead $scc
     ; GFX11-NEXT: renamable $vgpr1 = COPY killed renamable $sgpr0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 028f328..eeddc22 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -310,7 +310,7 @@ ret:
 ; GFX11-LABEL: tied_operand_test:
 ; GFX11: ; %bb.0: ; %entry
-; GFX11-DAG: scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off offset:4
+; GFX11-DAG: scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off
 ; GFX11-DAG: v_mov_b32_e32 [[C:v[0-9]+]], 0x7b
 ; GFX11-DAG: ds_store_b16 v{{[0-9]+}}, [[LDRESULT]] offset:10
 ; GFX11-DAG: ds_store_b16 v{{[0-9]+}}, [[C]] offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 6eec8d5..9c7ce39 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -1227,9 +1227,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -1271,9 +1271,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX1132-NEXT: s_mov_b32 s2, 0
 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT: s_cbranch_execz .LBB2_3
 ; GFX1132-NEXT: ; %bb.1:
@@ -1431,9 +1431,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -1475,9 +1475,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
 ; GFX1132-DPP-NEXT: ; %bb.1:
@@ -2457,9 +2457,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -2501,9 +2501,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT: s_mov_b32 s2, 0
 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT: s_cbranch_execz .LBB4_3
 ; GFX1132-NEXT: ; %bb.1:
@@ -2661,9 +2661,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -2705,9 +2705,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
 ; GFX1132-DPP-NEXT: ; %bb.1:
@@ -4355,9 +4355,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -4399,9 +4399,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX1132-NEXT: s_mov_b32 s2, 0
 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT: s_cbranch_execz .LBB7_3
 ; GFX1132-NEXT: ; %bb.1:
@@ -4559,9 +4559,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -4603,9 +4603,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index c927a0e..11d35c5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -1331,9 +1331,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -1375,9 +1375,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1132-NEXT: s_mov_b32 s2, 0
 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT: s_cbranch_execz .LBB2_3
 ; GFX1132-NEXT: ; %bb.1:
@@ -1535,9 +1535,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -1579,9 +1579,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
 ; GFX1132-DPP-NEXT: ; %bb.1:
@@ -2561,9 +2561,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -2605,9 +2605,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT: s_mov_b32 s2, 0
 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT: s_cbranch_execz .LBB4_3
 ; GFX1132-NEXT: ; %bb.1:
@@ -2765,9 +2765,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -2809,9 +2809,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
 ; GFX1132-DPP-NEXT: ; %bb.1:
@@ -4563,9 +4563,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -4607,9 +4607,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX1132-NEXT: s_mov_b32 s2, 0
 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT: s_cbranch_execz .LBB7_3
 ; GFX1132-NEXT: ; %bb.1:
@@ -4767,9 +4767,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -4811,9 +4811,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
index 5882043..b9269e2 100644
--- a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
@@ -7,12 +7,12 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH2048K %s
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xfffc, [[FI]]
 ; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
 define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %alloca
+  store volatile i32 15, ptr addrspace(5) %alloca
   %toint = ptrtoint ptr addrspace(5) %alloca to i32
   %masked = and i32 %toint, 65535
   store volatile i32 %masked, ptr addrspace(1) undef
@@ -20,7 +20,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
 }
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; SCRATCH128K-NOT: v_and_b32
 ; SCRATCH256K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
 ; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
@@ -28,7 +28,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
 ; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
 define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %alloca
+  store volatile i32 15, ptr addrspace(5) %alloca
   %toint = ptrtoint ptr addrspace(5) %alloca to i32
   %masked = and i32 %toint, 131071
   store volatile i32 %masked, ptr addrspace(1) undef
@@ -36,7 +36,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
 }
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo18:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; SCRATCH128K-NOT: v_and_b32
 ; SCRATCH256K-NOT: v_and_b32
 ; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
@@ -44,7 +44,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
 ; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
 define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %alloca
+  store volatile i32 15, ptr addrspace(5) %alloca
   %toint = ptrtoint ptr addrspace(5) %alloca to i32
   %masked = and i32 %toint, 262143
   store volatile i32 %masked, ptr addrspace(1) undef
@@ -52,7 +52,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
 }
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo20:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; SCRATCH128K-NOT: v_and_b32
 ; SCRATCH256K-NOT: v_and_b32
 ; SCRATCH1024K-NOT: v_and_b32
@@ -60,7 +60,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
 ; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
 define amdgpu_kernel void @scratch_buffer_known_high_masklo20() {
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %alloca
+  store volatile i32 15, ptr addrspace(5) %alloca
   %toint = ptrtoint ptr addrspace(5) %alloca to i32
   %masked = and i32 %toint, 1048575
   store volatile i32 %masked, ptr addrspace(1) undef
@@ -68,12 +68,12 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo20() {
 }
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo21:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; GCN-NOT: v_and_b32
 ; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
 define amdgpu_kernel void @scratch_buffer_known_high_masklo21() {
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %alloca
+  store volatile i32 15, ptr addrspace(5) %alloca
   %toint = ptrtoint ptr addrspace(5) %alloca to i32
   %masked = and i32 %toint, 2097151
   store volatile i32 %masked, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 823c444..f736ca7 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -969,7 +969,7 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32
 ; GCN-NEXT: s_add_u32 s4, s4, s3
 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT: s_addc_u32 s5, s5, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 4
+; GCN-NEXT: v_mov_b32_e32 v0, 0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_and_b32 s3, s3, 3
 ; GCN-NEXT: v_mov_b32_e32 v1, s2
@@ -980,16 +980,16 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32
 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2
 ; GCN-NEXT: v_and_b32_e32 v3, 3, v3
 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:4
-; GCN-NEXT: buffer_store_byte v4, off, s[4:7], 0 offset:7
-; GCN-NEXT: buffer_store_byte v3, off, s[4:7], 0 offset:6
-; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:5
+; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0
+; GCN-NEXT: buffer_store_byte v4, off, s[4:7], 0 offset:3
+; GCN-NEXT: buffer_store_byte v3, off, s[4:7], 0 offset:2
+; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:1
 ; GCN-NEXT: v_mov_b32_e32 v1, 1
 ; GCN-NEXT: buffer_store_byte v1, v0, s[4:7], 0 offen
-; GCN-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:4
-; GCN-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:5
-; GCN-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:6
-; GCN-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:7
+; GCN-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
+; GCN-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:1
+; GCN-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:2
+; GCN-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:3
 ; GCN-NEXT: s_waitcnt vmcnt(3)
 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT: s_waitcnt vmcnt(2)
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll b/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
index 5873e9c..6f61179 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
@@ -4,7 +4,7 @@
 ; alignment of the stack
 ; CHECK-LABEL: {{^}}no_args:
-; CHECK: ScratchSize: 5{{$}}
+; CHECK: ScratchSize: 8{{$}}
 define amdgpu_kernel void @no_args() {
   %alloca = alloca i8, addrspace(5)
   store volatile i8 0, ptr addrspace(5) %alloca
@@ -12,7 +12,7 @@ define amdgpu_kernel void @no_args() {
 }
 ; CHECK-LABEL: {{^}}force_align32:
-; CHECK: ScratchSize: 5{{$}}
+; CHECK: ScratchSize: 8{{$}}
 define amdgpu_kernel void @force_align32(<8 x i32>) {
   %alloca = alloca i8, addrspace(5)
   store volatile i8 0, ptr addrspace(5) %alloca
@@ -20,7 +20,7 @@ define amdgpu_kernel void @force_align32(<8 x i32>) {
 }
 ; CHECK-LABEL: {{^}}force_align64:
-; CHECK: ScratchSize: 5{{$}}
+; CHECK: ScratchSize: 8{{$}}
 define amdgpu_kernel void @force_align64(<16 x i32>) {
   %alloca = alloca i8, addrspace(5)
   store volatile i8 0, ptr addrspace(5) %alloca
@@ -28,7 +28,7 @@ define amdgpu_kernel void @force_align64(<16 x i32>) {
 }
 ; CHECK-LABEL: {{^}}force_align128:
-; CHECK: ScratchSize: 5{{$}}
+; CHECK: ScratchSize: 8{{$}}
 define amdgpu_kernel void @force_align128(<32 x i32>) {
   %alloca = alloca i8, addrspace(5)
   store volatile i8 0, ptr addrspace(5) %alloca
@@ -36,7 +36,7 @@ define amdgpu_kernel void @force_align128(<32 x i32>) {
 }
 ; CHECK-LABEL: {{^}}force_align256:
-; CHECK: ScratchSize: 5{{$}}
+; CHECK: ScratchSize: 8{{$}}
 define amdgpu_kernel void @force_align256(<64 x i32>) {
   %alloca = alloca i8, addrspace(5)
   store volatile i8 0, ptr addrspace(5) %alloca
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
index 13a8033..a209dcf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -444,7 +444,7 @@ main_body:
 ; for stack access.
 ; CHECK-LABEL: {{^}}no_fold_fi_imm_soffset:
-; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen
 define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) {
   %alloca = alloca i32, addrspace(5)
@@ -455,7 +455,7 @@ define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) {
 }
 ; CHECK-LABEL: {{^}}no_fold_fi_reg_soffset:
-; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 4{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 0{{$}}
 ; CHECK-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s
 ; CHECK: buffer_load_dword v0, v[[[FI]]:[[HI]]
 define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll
index e9d9b66..8598b78 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll
@@ -3,7 +3,7 @@
 ; FIXME: Requires stack object to not assert
 ; GCN-LABEL: {{^}}test_ps:
 ; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GCN: buffer_store_dword v0, off, s[4:7], 0 offset:4
+; GCN: buffer_store_dword v0, off, s[4:7], 0{{$}}
 ; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: ; return
@@ -17,7 +17,7 @@ define amdgpu_ps i32 @test_ps() #1 {
 ; GCN-LABEL: {{^}}test_cs:
 ; GCN: s_mov_b64 s[4:5], s[0:1]
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0 offset:4
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0{{$}}
 ; GCN: s_load_dword s0, s[0:1], 0x0
 define amdgpu_cs i32 @test_cs() #1 {
   %alloca = alloca i32, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index e789db1..0284f44 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -58,7 +58,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(ptr addrspace(1) %o
 ; Make sure this doesn't crash.
 ; CHECK-LABEL: {{^}}test_readfirstlane_fi:
-; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 4
+; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 0
 define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 {
   %alloca = alloca i32, addrspace(5)
   %int = ptrtoint ptr addrspace(5) %alloca to i32
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index b288535..21e27bf 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -3478,19 +3478,19 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xffff, v15
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v13
 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2)
 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8
@@ -3562,17 +3562,17 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT: s_endpgm
@@ -3801,20 +3801,20 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v14
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xffff, v15
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v13
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v13
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v19, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v20, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v21, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v19, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v20, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v21, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
@@ -3885,16 +3885,16 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -4289,11 +4289,11 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v10
 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v11, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v10, 0, 16
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v8
 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 0, 16
@@ -4370,10 +4370,10 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT: s_endpgm
@@ -4602,20 +4602,20 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v14
 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16
 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v14, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v13
 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 16, v12
 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16
 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v14, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v14, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v11
 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v10
 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v11, 0, 16
@@ -4686,16 +4686,16 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -7270,11 +7270,11 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v14
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v16
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v3
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v15
@@ -7334,16 +7334,16 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v39
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v39
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v39
@@ -7363,10 +7363,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index af96165..0f9cc33 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -3031,11 +3031,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v29
 ; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v30
 ; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v31
-; SI-NOHSA-NEXT: buffer_store_dword v44, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT: buffer_store_dword v44, off, s[12:15], 0 ; 4-byte Folded Spill
 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOHSA-NEXT: buffer_store_dword v45, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; SI-NOHSA-NEXT: buffer_store_dword v46, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; SI-NOHSA-NEXT: buffer_store_dword v47, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT: buffer_store_dword v45, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT: buffer_store_dword v46, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT: buffer_store_dword v47, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v7
 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v6
 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
@@ -3090,10 +3090,10 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192
-; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
@@ -3611,11 +3611,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[8:11], 0 ; 4-byte Folded Spill
 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v10
@@ -3654,11 +3654,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 ; 4-byte Folded Reload
 ; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:8 ; 4-byte Folded
Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index ee2c590..940287d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -390,7 +390,7 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; GCN-NEXT: s_add_u32 s16, s16, s3 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 -; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_brev_b32 s0, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -401,7 +401,7 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4 +; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: image_sample v0, v[0:1], s[4:11], s[0:3] dmask:0x1 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -421,11 +421,11 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1 ; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8 -; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off offset:4 +; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-SCRATCH-NEXT: ;;#ASMSTART ; GCN-SCRATCH-NEXT: ;;#ASMEND -; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off offset:4 +; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s10 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s11 @@ -460,15 +460,15 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32 ; GCN-NEXT: s_add_u32 s4, s4, s3 ; GCN-NEXT: s_addc_u32 s5, s5, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 0x40d00000 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4 -; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:8 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-NEXT: exp mrt0 v0, off, off, off done vm @@ -482,15 +482,15 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32 ; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40d00000 -; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off offset:4 +; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-SCRATCH-NEXT: scratch_store_dword off, v1, off offset:8 +; GCN-SCRATCH-NEXT: scratch_store_dword off, v1, off offset:4 ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-SCRATCH-NEXT: 
;;#ASMSTART ; GCN-SCRATCH-NEXT: ;;#ASMEND ; GCN-SCRATCH-NEXT: s_clause 0x1 -; GCN-SCRATCH-NEXT: scratch_load_dword v0, off, off offset:4 -; GCN-SCRATCH-NEXT: scratch_load_dword v1, off, off offset:8 +; GCN-SCRATCH-NEXT: scratch_load_dword v0, off, off +; GCN-SCRATCH-NEXT: scratch_load_dword v1, off, off offset:4 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GCN-SCRATCH-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-SCRATCH-NEXT: exp mrt0 v0, off, off, off done vm diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll index 301f971..4ba5f3a 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll @@ -137,11 +137,11 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { ; so a possibly negative base index can't be used for the vgpr offset. ; GCN-LABEL: {{^}}store_private_unknown_bits_vaddr: -; SICIVI: v_add_{{i|u}}32_e32 [[ADDR0:v[0-9]+]], vcc, 4 +; SICIVI: v_add_{{i|u}}32_e32 [[ADDR0:v[0-9]+]], vcc, 0 ; SICIVI: v_add_{{i|u}}32_e32 [[ADDR1:v[0-9]+]], vcc, 32, [[ADDR0]] ; SICIVI: buffer_store_dword v{{[0-9]+}}, [[ADDR1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} -; GFX9: v_add_u32_e32 [[ADDR:v[0-9]+]], 4, +; GFX9: v_add_u32_e32 [[ADDR:v[0-9]+]], 0, ; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen offset:32 define amdgpu_kernel void @store_private_unknown_bits_vaddr() #0 { %alloca = alloca [16 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 75da11b..45fbaaa 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -39,7 +39,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 - ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) + ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) @@ -48,7 +48,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec ; PEI-GFX908-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit 
$mode, implicit $exec - ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) + ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1 ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec @@ -86,7 +86,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1 - ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) + ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) @@ -94,7 +94,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec ; PEI-GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) + ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll 
b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll index d6d559b..d898a13 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -110,7 +110,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: v_writelane_b32 v2, s10, 62 ; GCN-NEXT: v_writelane_b32 v2, s11, 63 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v2, off, s[92:95], 0 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] @@ -201,7 +201,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: v_writelane_b32 v1, s10, 62 ; GCN-NEXT: v_writelane_b32 v1, s11, 63 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v1, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] @@ -215,7 +215,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: v_writelane_b32 v0, s10, 6 ; GCN-NEXT: v_writelane_b32 v0, s11, 7 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -223,10 +223,10 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_readlane_b32 s8, v2, 56 @@ -319,7 +319,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: v_readlane_b32 s6, v1, 6 ; GCN-NEXT: v_readlane_b32 s7, v1, 7 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] @@ -423,13 +423,13 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB0_2: ; %ret ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword 
v2, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ; kill: killed $vgpr2 ; GCN-NEXT: ; kill: killed $vgpr1 @@ -570,7 +570,7 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: v_writelane_b32 v1, s18, 62 ; GCN-NEXT: v_writelane_b32 v1, s19, 63 ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] @@ -589,7 +589,7 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: v_writelane_b32 v0, s2, 8 ; GCN-NEXT: v_writelane_b32 v0, s3, 9 ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -597,10 +597,10 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_readlane_b32 s16, v1, 8 @@ -698,10 +698,10 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB1_2: ; %ret ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: ; kill: killed $vgpr1 ; GCN-NEXT: ; kill: killed $vgpr0 @@ -747,10 +747,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND @@ -840,7 +840,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: v_writelane_b32 v1, s18, 62 ; GCN-NEXT: v_writelane_b32 v1, s19, 63 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: 
buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] @@ -849,7 +849,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -857,7 +857,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_cbranch_scc1 .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s36, v1, 32 @@ -909,7 +909,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: v_readlane_b32 s30, v1, 14 ; GCN-NEXT: v_readlane_b32 s31, v1, 15 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[16:31] @@ -947,10 +947,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB2_2: ; %ret ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ; kill: killed $vgpr1 ; GCN-NEXT: ; kill: killed $vgpr0 @@ -999,10 +999,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND @@ -1092,7 +1092,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: v_writelane_b32 v1, s18, 62 ; GCN-NEXT: v_writelane_b32 v1, s19, 63 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] @@ -1101,7 +1101,7 @@ define 
amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1109,7 +1109,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s36, v2, 32 @@ -1161,7 +1161,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: v_readlane_b32 s30, v2, 14 ; GCN-NEXT: v_readlane_b32 s31, v2, 15 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def v0 @@ -1205,10 +1205,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB3_2: ; %ret ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ; kill: killed $vgpr1 ; GCN-NEXT: ; kill: killed $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir index d349339..8e2a56b 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir @@ -36,12 +36,12 @@ body: | ; GCN-LABEL: name: preserve_active_lanes_above_args ; GCN: liveins: $sgpr0, $vgpr8, $vgpr9, $vgpr10 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr10, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr10, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec ; GCN-NEXT: $vgpr8 = COPY killed renamable $vgpr10 ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) - ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_ST 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_ST 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable 
$sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec $vgpr8 = COPY renamable killed $vgpr10 @@ -70,8 +70,8 @@ body: | ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr10, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr11, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr10, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr11, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: renamable $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10 ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 @@ -81,8 +81,8 @@ body: | ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 renamable $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10 @@ -142,8 +142,8 @@ body: | ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 @@ -152,8 +152,8 @@ body: | ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 
0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr, implicit $vgpr9(tied-def 0) :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr9(tied-def 0) :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir index ae920f9..765597fe 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir @@ -38,14 +38,14 @@ body: | ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc @@ -72,8 +72,8 @@ body: | ; GCN: 
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 @@ -86,8 +86,8 @@ body: | ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir b/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir index 23c6afd..e4cbae6 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir @@ -39,7 +39,7 @@ body: | ; GFX908-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr7, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec @@ -148,74 +148,74 @@ body: | ; GFX908-NEXT: $vgpr35 = V_MOV_B32_e32 killed $sgpr4, implicit $exec ; GFX908-NEXT: 
$agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr35, implicit $exec, implicit $exec ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr8, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr10, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr12, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr13, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) + ; GFX908-NEXT: 
BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr16, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr17, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr20, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr21, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr22, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec 
:: (store (s32) into %stack.18, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr23, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec :: (store (s32) into %stack.19, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec :: (store (s32) into %stack.19, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr24, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec :: (store (s32) into %stack.20, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec :: (store (s32) into %stack.20, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr25, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec :: (store (s32) into %stack.21, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec :: (store (s32) into %stack.21, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr26, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec :: (store (s32) into %stack.22, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec :: (store (s32) into %stack.22, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr27, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec :: (store (s32) into %stack.23, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec :: (store (s32) into %stack.23, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr28, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec :: (store (s32) into %stack.24, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec :: (store (s32) into %stack.24, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr29, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec :: (store (s32) into %stack.25, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec :: (store (s32) into %stack.25, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr30, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec :: (store (s32) into %stack.26, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec :: (store (s32) into %stack.26, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr31, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET 
killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec :: (store (s32) into %stack.27, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec :: (store (s32) into %stack.27, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr34, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec :: (store (s32) into %stack.28, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec :: (store (s32) into %stack.28, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr35, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec :: (store (s32) into %stack.29, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec :: (store (s32) into %stack.29, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr36, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec :: (store (s32) into %stack.30, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec :: (store (s32) into %stack.30, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr37, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec :: (store (s32) into %stack.31, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec :: (store (s32) into %stack.31, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr38, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec :: (store (s32) into %stack.32, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec :: (store (s32) into %stack.32, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr39, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, implicit $exec :: (store (s32) into %stack.33, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec :: (store (s32) into %stack.33, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr40, implicit $exec, implicit $exec ; GFX908-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19, implicit $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, implicit $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, implicit $vgpr35 - ; GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, 
implicit $exec, implicit $exec ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec @@ -287,39 +287,39 @@ body: | ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec, implicit $exec ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec - ; GFX908-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GFX908-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) - ; GFX908-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) - ; GFX908-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) - ; GFX908-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) - ; GFX908-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) - ; GFX908-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5) - ; GFX908-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5) - ; GFX908-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) - ; GFX908-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5) - ; GFX908-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5) - ; GFX908-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5) - ; GFX908-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5) - ; GFX908-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5) - ; GFX908-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5) - ; GFX908-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5) - ; GFX908-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5) - ; GFX908-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5) - ; GFX908-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec :: (load (s32) from %stack.19, addrspace 5) - ; GFX908-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec :: (load (s32) from %stack.20, addrspace 5) - ; GFX908-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec :: (load 
- ; GFX908-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec :: (load (s32) from %stack.22, addrspace 5)
- ; GFX908-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec :: (load (s32) from %stack.23, addrspace 5)
- ; GFX908-NEXT: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec :: (load (s32) from %stack.24, addrspace 5)
- ; GFX908-NEXT: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec :: (load (s32) from %stack.25, addrspace 5)
- ; GFX908-NEXT: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec :: (load (s32) from %stack.26, addrspace 5)
- ; GFX908-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec :: (load (s32) from %stack.27, addrspace 5)
- ; GFX908-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec :: (load (s32) from %stack.28, addrspace 5)
- ; GFX908-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec :: (load (s32) from %stack.29, addrspace 5)
- ; GFX908-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec :: (load (s32) from %stack.30, addrspace 5)
- ; GFX908-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec :: (load (s32) from %stack.31, addrspace 5)
- ; GFX908-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec :: (load (s32) from %stack.32, addrspace 5)
- ; GFX908-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, implicit $exec :: (load (s32) from %stack.33, addrspace 5)
+ ; GFX908-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+ ; GFX908-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+ ; GFX908-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+ ; GFX908-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+ ; GFX908-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+ ; GFX908-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+ ; GFX908-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+ ; GFX908-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; GFX908-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; GFX908-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+ ; GFX908-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+ ; GFX908-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+ ; GFX908-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+ ; GFX908-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+ ; GFX908-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+ ; GFX908-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+ ; GFX908-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+ ; GFX908-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
+ ; GFX908-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec :: (load (s32) from %stack.19, addrspace 5)
+ ; GFX908-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec :: (load (s32) from %stack.20, addrspace 5)
+ ; GFX908-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec :: (load (s32) from %stack.21, addrspace 5)
+ ; GFX908-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec :: (load (s32) from %stack.22, addrspace 5)
+ ; GFX908-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec :: (load (s32) from %stack.23, addrspace 5)
+ ; GFX908-NEXT: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec :: (load (s32) from %stack.24, addrspace 5)
+ ; GFX908-NEXT: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec :: (load (s32) from %stack.25, addrspace 5)
+ ; GFX908-NEXT: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec :: (load (s32) from %stack.26, addrspace 5)
+ ; GFX908-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec :: (load (s32) from %stack.27, addrspace 5)
+ ; GFX908-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec :: (load (s32) from %stack.28, addrspace 5)
+ ; GFX908-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec :: (load (s32) from %stack.29, addrspace 5)
+ ; GFX908-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec :: (load (s32) from %stack.30, addrspace 5)
+ ; GFX908-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec :: (load (s32) from %stack.31, addrspace 5)
+ ; GFX908-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec :: (load (s32) from %stack.32, addrspace 5)
+ ; GFX908-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec :: (load (s32) from %stack.33, addrspace 5)
 ; GFX908-NEXT: S_NOP 0, implicit renamable $agpr0, implicit killed renamable $vgpr1, implicit killed renamable $vgpr2, implicit killed renamable $vgpr3, implicit killed renamable $vgpr4, implicit killed renamable $vgpr5, implicit killed renamable $vgpr6, implicit killed renamable $vgpr7, implicit killed renamable $vgpr8, implicit killed renamable $vgpr9, implicit killed renamable $vgpr10, implicit killed renamable $vgpr11, implicit killed renamable $vgpr12, implicit killed renamable $vgpr13, implicit killed renamable $vgpr14, implicit killed renamable $vgpr15, implicit killed renamable $vgpr16, implicit killed renamable $vgpr17, implicit killed renamable $vgpr18, implicit killed renamable $vgpr19, implicit killed renamable $vgpr20, implicit killed renamable $vgpr21, implicit killed renamable $vgpr22, implicit killed renamable $vgpr23, implicit killed renamable $vgpr24, implicit killed renamable $vgpr25, implicit killed renamable $vgpr26, implicit killed renamable $vgpr27, implicit killed renamable $vgpr28, implicit killed renamable $vgpr29, implicit killed renamable $vgpr30, implicit killed renamable $vgpr31, implicit killed renamable $vgpr32, implicit killed renamable $vgpr33, implicit killed renamable $vgpr34
 ; GFX908-NEXT: S_ENDPGM 0, implicit killed renamable $agpr0
 %v0:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index 272daac..4cc469b 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -73,14 +73,14 @@
 ; GFX10-FLATSCR-PAL: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
 ; GFX11-FLATSCR: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
-; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]]
-; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]]
-; FLATSCR: v_add{{_|_nc_}}{{u32|b32}}_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]]
+; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x200, [[CLAMP_IDX]]
+; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0}}, [[CLAMP_IDX]]
+; FLATSCR: v_add{{_|_nc_}}{{u32|b32}}_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0}}, [[CLAMP_IDX]]
 ; MUBUF: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 ; MUBUF: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 ; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[LO_OFF]], off
-; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, [[CLAMP_IDX]], off offset:128
+; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, [[CLAMP_IDX]], off{{$}}
 define amdgpu_ps float @ps_main(i32 %idx) {
 %v1 = extractelement <81 x float> , i32 %idx
 %v2 = extractelement <81 x float> , i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
index 573fa7ac..242ecd8 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
@@ -15,10 +15,10 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[24:25]
 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[24:25]
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ;;#ASMEND
@@ -106,7 +106,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT: v_writelane_b32 v1, s22, 62
 ; GCN-NEXT: v_writelane_b32 v1, s23, 63
 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, s[24:25]
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ; def s[6:7]
@@ -115,7 +115,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT: v_writelane_b32 v0, s6, 0
 ; GCN-NEXT: v_writelane_b32 v0, s7, 1
 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, s[24:25]
 ; GCN-NEXT: s_mov_b32 s5, 0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -123,7 +123,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2
 ; GCN-NEXT: ; %bb.1: ; %bb0
 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[24:25]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_readlane_b32 s4, v1, 0
@@ -143,7 +143,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT: v_readlane_b32 s18, v1, 14
 ; GCN-NEXT: v_readlane_b32 s19, v1, 15
 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[24:25]
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ; use s[4:19]
@@ -213,10 +213,10 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: .LBB0_2: ; %ret
 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[24:25]
 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[24:25]
 ; GCN-NEXT: ; kill: killed $vgpr1
 ; GCN-NEXT: ; kill: killed $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
index 059eb6d..f19b0a5 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
@@ -68,74 +68,74 @@ body: |
 ; GCN64-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr12, 0, undef $vgpr0
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit killed $sgpr12_sgpr13
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 16, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 15, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 28, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr16, 4, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 44, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
@@ -145,12 +145,12 @@ body: |
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr18, 6, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr19, 7, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 64, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 65535, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
@@ -168,12 +168,12 @@ body: |
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr26, 14, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr27, 15, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 96, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr64, 0, undef $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr65, 1, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr66, 2, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
@@ -207,16 +207,16 @@ body: |
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr94, 30, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr95, 31, $vgpr0, implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 160, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
 ; GCN64-MUBUF-NEXT: $sgpr2 = S_ADD_I32 $sgpr33, 262144, implicit-def dead $scc
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, killed $sgpr2, 0, 0, 0, implicit $exec :: (store (s32) into %stack.8, align 4096, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ;
 ; GCN32-MUBUF-LABEL: name: check_spill
@@ -232,74 +232,74 @@ body: |
 ; GCN32-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr12, 0, undef $vgpr0
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 3, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit killed $sgpr12_sgpr13
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 3, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 7, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 16, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 15, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 28, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 31, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr16, 4, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 44, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 255, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
@@ -309,12 +309,12 @@ body: |
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr18, 6, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr19, 7, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 64, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 65535, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
@@ -332,12 +332,12 @@ body: |
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr26, 14, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr27, 15, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 96, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 4294967295, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr64, 0, undef $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr65, 1, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr66, 2, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
@@ -371,16 +371,16 @@ body: |
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr94, 30, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr95, 31, $vgpr0, implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 160, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
 ; GCN32-MUBUF-NEXT: $sgpr1 = S_ADD_I32 $sgpr33, 131072, implicit-def dead $scc
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, killed $sgpr1, 0, 0, 0, implicit $exec :: (store (s32) into %stack.8, align 4096, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ;
 ; GCN64-FLATSCR-LABEL: name: check_spill
@@ -392,74 +392,74 @@ body: |
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr12, 0, undef $vgpr0
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit killed $sgpr12_sgpr13
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 15, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 28, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr16, 4, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 44, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.4, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
@@ -469,12 +469,12 @@ body: |
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr18, 6, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr19, 7, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 64, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.5, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 65535, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
@@ -492,12 +492,12 @@ body: |
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr26, 14, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr27, 15, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 96, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.6, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr64, 0, undef $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr65, 1, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr66, 2, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
@@ -531,16 +531,16 @@ body: |
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr94, 30, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr95, 31, $vgpr0, implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 160, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.7, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
 ; GCN64-FLATSCR-NEXT: $sgpr2 = S_ADD_I32 $sgpr33, 4096, implicit-def dead $scc
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, killed $sgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.8, align 4096, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 renamable $sgpr12 = IMPLICIT_DEF
 SI_SPILL_S32_SAVE killed $sgpr12, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32
@@ -626,52 +626,52 @@ body: |
 ;
GCN64-MUBUF-NEXT: $sgpr29 = S_ADDC_U32 $sgpr29, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr28_sgpr29_sgpr30_sgpr31 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13 ; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 16, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14 ; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN64-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 15, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 28, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN64-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2 ; GCN64-MUBUF-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 3 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 44, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 ; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN64-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2 ; GCN64-MUBUF-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 3 ; GCN64-MUBUF-NEXT: $sgpr16 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 4 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET 
$sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 64, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -681,11 +681,11 @@ body: | ; GCN64-MUBUF-NEXT: $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5 ; GCN64-MUBUF-NEXT: $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 6 ; GCN64-MUBUF-NEXT: $sgpr19 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 7 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 65535, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 96, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 ; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -703,11 +703,11 @@ body: | ; GCN64-MUBUF-NEXT: $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 13 ; GCN64-MUBUF-NEXT: $sgpr26 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 14 ; GCN64-MUBUF-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 15 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 160, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN64-MUBUF-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -741,15 +741,15 @@ body: | ; 
GCN64-MUBUF-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 29 ; GCN64-MUBUF-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 30 ; GCN64-MUBUF-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 31 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr2 = S_ADD_I32 $sgpr33, 262144, implicit-def dead $scc ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, killed $sgpr2, 0, 0, 0, implicit $exec :: (load (s32) from %stack.8, align 4096, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; ; GCN32-MUBUF-LABEL: name: check_reload @@ -764,52 +764,52 @@ body: | ; GCN32-MUBUF-NEXT: $sgpr97 = S_ADDC_U32 $sgpr97, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 3, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, 
$sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13 ; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 7, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 16, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14 ; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN32-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 15, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 28, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN32-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2 ; GCN32-MUBUF-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 3 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; 
GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 31, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 44, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 ; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN32-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2 ; GCN32-MUBUF-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 3 ; GCN32-MUBUF-NEXT: $sgpr16 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 4 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 255, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 64, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -819,11 +819,11 @@ body: | ; GCN32-MUBUF-NEXT: $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5 ; GCN32-MUBUF-NEXT: $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 6 ; GCN32-MUBUF-NEXT: $sgpr19 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 7 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 65535, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 96, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR 
$vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 ; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -841,11 +841,11 @@ body: | ; GCN32-MUBUF-NEXT: $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 13 ; GCN32-MUBUF-NEXT: $sgpr26 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 14 ; GCN32-MUBUF-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 15 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 4294967295, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 160, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN32-MUBUF-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -879,15 +879,15 @@ body: | ; GCN32-MUBUF-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 29 ; GCN32-MUBUF-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 30 ; GCN32-MUBUF-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 31 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr1 = S_ADD_I32 $sgpr33, 131072, implicit-def dead $scc ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, killed $sgpr1, 0, 0, 0, implicit $exec :: (load (s32) from %stack.8, align 4096, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET 
$sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; ; GCN64-FLATSCR-LABEL: name: check_reload @@ -898,52 +898,52 @@ body: | ; GCN64-FLATSCR-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13 ; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14 ; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN64-FLATSCR-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr 
:: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 15, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN64-FLATSCR-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2 ; GCN64-FLATSCR-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 3 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.4, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 ; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN64-FLATSCR-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2 ; GCN64-FLATSCR-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 3 ; GCN64-FLATSCR-NEXT: $sgpr16 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 4 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 64, 0, implicit $exec, 
implicit $flat_scr :: (load (s32) from %stack.5, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -953,11 +953,11 @@ body: | ; GCN64-FLATSCR-NEXT: $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5 ; GCN64-FLATSCR-NEXT: $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 6 ; GCN64-FLATSCR-NEXT: $sgpr19 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 7 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 65535, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 96, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.6, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 ; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -975,11 +975,11 @@ body: | ; GCN64-FLATSCR-NEXT: $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 13 ; GCN64-FLATSCR-NEXT: $sgpr26 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 14 ; GCN64-FLATSCR-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 15 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 160, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.7, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN64-FLATSCR-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -1013,15 +1013,15 @@ body: | ; GCN64-FLATSCR-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 29 ; GCN64-FLATSCR-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR 
$vgpr0, 30 ; GCN64-FLATSCR-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 31 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr2 = S_ADD_I32 $sgpr33, 4096, implicit-def dead $scc ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.8, align 4096, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 renamable $sgpr12 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll index e67b5e4..c5a5a52 100644 --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -8,7 +8,7 @@ ; Make sure we are handling hazards correctly. 
; SGPR: v_mov_b32_e32 v0, vcc_lo ; SGPR-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 -; SGPR-NEXT: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; 4-byte Folded Reload +; SGPR-NEXT: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 ; 4-byte Folded Reload ; SGPR-NEXT: s_mov_b64 exec, [[EXEC_COPY]] ; SGPR-NEXT: s_waitcnt vmcnt(0) ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index 5871a78..c9413b6 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -75,31 +75,31 @@ use: ; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GFX908-DAG: v_accvgpr_read_b32 v5, a0 ; Reload Reuse -; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill +; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill ; GFX908-DAG: v_accvgpr_read_b32 v5, a1 ; Reload Reuse -; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill +; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill ; GFX908-DAG: v_accvgpr_read_b32 v5, a2 ; Reload Reuse -; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill +; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill ; GFX908-DAG: v_accvgpr_read_b32 v5, a3 ; Reload Reuse -; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill +; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill -; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill -; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill -; GFX90A-DAG: buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill -; GFX90A-DAG: buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill +; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill +; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill +; GFX90A-DAG: buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill +; GFX90A-DAG: buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill ; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] -; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload -; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload -; GFX908-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload -; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Reload +; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Reload +; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload +; GFX908-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload +; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload ; GFX908: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off -; GFX90A-DAG: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload -; GFX90A-DAG: buffer_load_dword v3, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload -; GFX90A-DAG: buffer_load_dword v4, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload -; GFX90A-DAG: buffer_load_dword v5, 
off, s[4:7], 0 offset:16 ; 4-byte Folded Reload +; GFX90A-DAG: buffer_load_dword v2, off, s[4:7], 0 ; 4-byte Folded Reload +; GFX90A-DAG: buffer_load_dword v3, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload +; GFX90A-DAG: buffer_load_dword v4, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload +; GFX90A-DAG: buffer_load_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload ; GFX90A: global_store_dwordx4 v[0:1], v[2:5], off ; GCN: ScratchSize: 20 diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll index 4a13a74..f192f25 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll @@ -17,7 +17,7 @@ ; TOVMEM: s_mov_b64 [[COPY_EXEC:s\[[0-9]+:[0-9]+\]]], exec ; TOVMEM: s_mov_b64 exec, 1 ; TOVMEM: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], 0 -; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 ; 4-byte Folded Spill +; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 ; 4-byte Folded Spill ; TOVMEM: s_mov_b64 exec, [[COPY_EXEC]] ; GCN: s_cbranch_scc1 [[ENDIF:.LBB[0-9]+_[0-9]+]] @@ -26,7 +26,7 @@ ; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], [[M0_LANE]] ; TOVGPR: s_mov_b32 m0, [[M0_RESTORE]] -; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 ; 4-byte Folded Reload +; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 ; 4-byte Folded Reload ; TOVMEM: s_waitcnt vmcnt(0) ; TOVMEM: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]], 0 ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]] diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll index 7ad3520..baca66a 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -11,14 +11,14 @@ define amdgpu_kernel void @test_inst_offset_kernel() { ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_endpgm ; @@ -27,16 +27,16 @@ define amdgpu_kernel void @test_inst_offset_kernel() { ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 -; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc +; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s0, 0xffc +; FLATSCR-NEXT: s_movk_i32 s0, 0xff8 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: 
scratch_store_dword off, v0, s0 offset:8 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_endpgm entry: @@ -277,19 +277,19 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 glc +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Spill ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4092 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload -; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; v[0:1] @@ -301,16 +301,16 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 -; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc +; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s0, 0xff8 +; FLATSCR-NEXT: s_movk_i32 s0, 0xff4 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc +; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s0, 0xff8 +; FLATSCR-NEXT: s_movk_i32 s0, 0xff4 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 1458a93..bea2e6d 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10315,8 +10315,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s34, 0x84800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s34, 0x84800 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10351,8 +10351,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: 
s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s34, 0x85000 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s34, 0x85000 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10387,8 +10387,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s34, 0x85800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s34, 0x85800 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10431,8 +10431,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 exec, s[0:1] ; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s36, 0x86000 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s36, 0x86000 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10449,8 +10449,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 exec, s[34:35] ; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: s_mov_b32 s44, 0x86800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s44, 0x86800 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s44 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10463,8 +10463,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 exec, s[34:35] ; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 3 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x21b0 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: v_mov_b32_e32 v7, 0x21b0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10494,8 +10494,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_or_b64 exec, exec, vcc ; GFX6-NEXT: s_mov_b64 s[4:5], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: s_mov_b32 s6, 0x80400 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s6, 0x80400 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10509,8 +10509,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 s[36:37], s[0:1] ; GFX6-NEXT: s_mov_b64 s[4:5], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: s_mov_b32 s6, 0x80800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s6, 0x80800 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir index 3892ceb..537aca1 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir @@ -50,28 +50,28 @@ body: | ; GFX9-NEXT: $vcc = 
IMPLICIT_DEF ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GFX9-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc ; GFX9-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit $vcc - ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GFX9-NEXT: $vcc = IMPLICIT_DEF ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GFX9-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc ; GFX9-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit killed $vcc - ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) - ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GFX9-NEXT: $vcc_lo = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $vcc ; GFX9-NEXT: $vcc_hi 
= SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1 - ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; ; GFX10-LABEL: name: check_vcc @@ -87,28 +87,28 @@ body: | ; GFX10-NEXT: $vcc = IMPLICIT_DEF ; GFX10-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GFX10-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc ; GFX10-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit $vcc - ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX10-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GFX10-NEXT: $vcc = IMPLICIT_DEF ; GFX10-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GFX10-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc ; GFX10-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit killed $vcc - ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX10-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GFX10-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) - ; GFX10-NEXT: 
$vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GFX10-NEXT: $vcc_lo = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $vcc ; GFX10-NEXT: $vcc_hi = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1 - ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX10-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; ; GFX11-LABEL: name: check_vcc @@ -118,28 +118,28 @@ body: | ; GFX11-NEXT: $vcc = IMPLICIT_DEF ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) ; GFX11-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc ; GFX11-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit $vcc - ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) - ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) ; GFX11-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GFX11-NEXT: $vcc = IMPLICIT_DEF ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) ; GFX11-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc ; GFX11-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit killed $vcc - ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) - ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: 
(load (s32) from %stack.1, addrspace 5) ; GFX11-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) - ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) ; GFX11-NEXT: $vcc_lo = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $vcc ; GFX11-NEXT: $vcc_hi = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1 - ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) ; GFX11-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 $vcc = IMPLICIT_DEF SI_SPILL_S64_SAVE $vcc, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll index 3720933..eb211f7 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll @@ -8,6 +8,9 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; VI: ; %bb.0: ; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, 3 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -56,6 +59,9 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -101,6 +107,8 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; GFX9-NEXT: .amdhsa_exception_int_div_zero 0 ; GFX9-NEXT: .end_amdhsa_kernel ; GFX9-NEXT: .text + %clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca + store volatile i8 3, ptr addrspace(5) %clutter %alloca.align = alloca i32, align 128, addrspace(5) store volatile i32 9, ptr addrspace(5) %alloca.align, align 128 ret void @@ -111,6 +119,9 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; VI: ; %bb.0: ; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, 3 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -119,7 +130,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; VI-NEXT: .p2align 6 ; VI-NEXT: .amdhsa_kernel stackrealign_attr ; VI-NEXT: .amdhsa_group_segment_fixed_size 0 -; VI-NEXT: .amdhsa_private_segment_fixed_size 8 +; VI-NEXT: 
.amdhsa_private_segment_fixed_size 12 ; VI-NEXT: .amdhsa_kernarg_size 0 ; VI-NEXT: .amdhsa_user_sgpr_count 6 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 @@ -159,6 +170,9 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -167,7 +181,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; GFX9-NEXT: .p2align 6 ; GFX9-NEXT: .amdhsa_kernel stackrealign_attr ; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0 -; GFX9-NEXT: .amdhsa_private_segment_fixed_size 8 +; GFX9-NEXT: .amdhsa_private_segment_fixed_size 12 ; GFX9-NEXT: .amdhsa_kernarg_size 0 ; GFX9-NEXT: .amdhsa_user_sgpr_count 6 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 @@ -204,6 +218,8 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; GFX9-NEXT: .amdhsa_exception_int_div_zero 0 ; GFX9-NEXT: .end_amdhsa_kernel ; GFX9-NEXT: .text + %clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca + store volatile i8 3, ptr addrspace(5) %clutter %alloca.align = alloca i32, align 4, addrspace(5) store volatile i32 9, ptr addrspace(5) %alloca.align, align 4 ret void @@ -214,6 +230,9 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; VI: ; %bb.0: ; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, 3 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -262,6 +281,9 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -307,6 +329,8 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; GFX9-NEXT: .amdhsa_exception_int_div_zero 0 ; GFX9-NEXT: .end_amdhsa_kernel ; GFX9-NEXT: .text + %clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca + store volatile i8 3, ptr addrspace(5) %clutter %alloca.align = alloca i32, align 4, addrspace(5) store volatile i32 9, ptr addrspace(5) %alloca.align, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll b/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll index b1a939d..e378a83 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll @@ -3,8 +3,8 @@ declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture, i8, i32, i32, i1) #1 -; ERROR: error: <unknown>:0:0: stack frame size (131061) exceeds limit (131056) in function 'stack_size_limit_wave64' -; GCN: ; ScratchSize: 131061 +; ERROR: error: <unknown>:0:0: stack frame size (131064) exceeds limit (131056) in function 'stack_size_limit_wave64' +; GCN: ; ScratchSize: 131064 define amdgpu_kernel void @stack_size_limit_wave64() #0 { entry: %alloca = alloca [131057 x i8], align 1, addrspace(5) @@ -12,8 +12,8 @@ entry: ret void } -; ERROR: error: <unknown>:0:0: stack frame size (262117) exceeds limit (262112) in function 'stack_size_limit_wave32' -; GCN: ; ScratchSize: 262117 +; ERROR: error: <unknown>:0:0: stack frame
size (262120) exceeds limit (262112) in function 'stack_size_limit_wave32' +; GCN: ; ScratchSize: 262120 define amdgpu_kernel void @stack_size_limit_wave32() #1 { entry: %alloca = alloca [262113 x i8], align 1, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index d8db2d5..8c5b894 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -878,7 +878,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-OPT-NEXT: s_lshr_b32 s6, s0, 5 ; WAVE32-OPT-NEXT: s_mov_b64 s[0:1], s[8:9] ; WAVE32-OPT-NEXT: s_mov_b64 s[2:3], s[10:11] -; WAVE32-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 +; WAVE32-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; WAVE32-OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-OPT-NEXT: buffer_store_dword v1, off, s[8:11], s32 offset:4 ; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -904,7 +904,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-OPT-NEXT: s_lshr_b32 s6, s0, 6 ; WAVE64-OPT-NEXT: s_mov_b64 s[0:1], s[8:9] ; WAVE64-OPT-NEXT: s_mov_b64 s[2:3], s[10:11] -; WAVE64-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 +; WAVE64-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; WAVE64-OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE64-OPT-NEXT: buffer_store_dword v1, off, s[8:11], s32 offset:4 ; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -935,10 +935,10 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: s_lshr_b32 s0, s0, 5 ; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 1 ; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:132 ; 4-byte Folded Spill +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19 ; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 42 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:4 +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 ; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-O0-NEXT: s_mov_b64 s[0:1], s[20:21] ; WAVE32-O0-NEXT: s_mov_b64 s[2:3], s[22:23] @@ -1020,7 +1020,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: v_mov_b32_e32 v30, s18 ; WAVE32-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1 -; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:132 ; 4-byte Folded Reload +; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:128 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: v_readlane_b32 s1, v0, 1 @@ -1053,10 +1053,10 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: s_lshr_b32 s0, s0, 6 ; WAVE64-O0-NEXT: v_writelane_b32 v3, s0, 1 ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:132 ; 4-byte Folded Spill +; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:128 ; 4-byte Folded Spill ; WAVE64-O0-NEXT: s_mov_b64 exec, s[20:21] ; WAVE64-O0-NEXT: v_mov_b32_e32 v3, 42 -; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:4 +; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 ; WAVE64-O0-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE64-O0-NEXT: s_mov_b64 s[0:1], s[24:25] ; 
WAVE64-O0-NEXT: s_mov_b64 s[2:3], s[26:27] @@ -1138,7 +1138,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: v_mov_b32_e32 v30, s18 ; WAVE64-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:132 ; 4-byte Folded Reload +; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[20:21] ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE64-O0-NEXT: v_readlane_b32 s1, v0, 1 @@ -1172,7 +1172,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s0, s0, 5 ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s0, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 42 -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:4 +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], 0 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[0:1], s[20:21] ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[2:3], s[22:23] diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll index 808f006..137bd0f 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll @@ -16,33 +16,33 @@ define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" { ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:160 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v0, off offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v0, off offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: global_store_b32 v[0:1], v5, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload ; 
CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 @@ -83,16 +83,16 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:336 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v3, off offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v3, off offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:448 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v3, off offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v3, off offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:576 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v3, off offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v3, off ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:720 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v3, off offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v3, off offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: v_cmpx_eq_u32_e32 0, v2 ; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0 ; CHECK-NEXT: s_cbranch_execz .LBB1_2 @@ -101,23 +101,27 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:16 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:160 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:240 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v0, off offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: global_store_b32 v[0:1], v10, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS +; CHECK-NEXT: 
s_wait_storecnt 0x0 ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS @@ -134,7 +138,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:36 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 @@ -142,10 +146,6 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload -; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS -; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: ; implicit-def: $vgpr0 @@ -159,23 +159,27 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:16 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:160 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:240 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v0, off offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: global_store_b32 v[0:1], v10, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS @@ -192,7 +196,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; 
CHECK-NEXT: scratch_load_b32 v0, off, off offset:36 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 @@ -200,17 +204,13 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload -; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS -; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: .LBB1_4: ; %.exit ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index 7aaf945..0cabfa9 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -25,7 +25,7 @@ ; GCN: buffer_store_dword {{v[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload ; GCN: NumVgprs: 256 -; GCN: ScratchSize: 768 +; GCN: ScratchSize: 640 define amdgpu_vs void @main(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, ptr addrspace(4) inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { bb: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll index 3a3860d..a1d3e2a 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll @@ -16,12 +16,12 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; CHECK-NEXT: v_mov_b32_e32 v2, v0 ; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: global_load_ushort v3, v1, s[4:5] offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 @@ -32,7 +32,7 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-NEXT: 
v_writelane_b32 v0, s4, 0 ; CHECK-NEXT: v_writelane_b32 v0, s5, 1 ; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; CHECK-NEXT: s_mov_b64 exec, s[4:5] @@ -40,20 +40,20 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-NEXT: ; %bb.1: ; %bb193 ; CHECK-NEXT: .LBB0_2: ; %bb194 ; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_readlane_b32 s4, v1, 0 ; CHECK-NEXT: v_readlane_b32 s5, v1, 1 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], v0, s4 ; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_4 ; CHECK-NEXT: ; %bb.3: ; %bb201 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, V2@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, V2@rel32@hi+12 @@ -66,7 +66,7 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-NEXT: ; divergent unreachable ; CHECK-NEXT: .LBB0_4: ; %UnifiedReturnBlock ; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index 78e8ab1..f78b408 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -433,416 +433,416 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_mov_b32_e32 v4, 0 ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, 
s[8:11], 0 offset:32 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, 
off, s[8:11], 0 offset:84 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte 
Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill ; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[4:5] offset:176 ; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[4:5] offset:160 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GFX906-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23 -; 
GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[4:5] offset:144
 ; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[4:5] offset:128
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[4:5] offset:112
 ; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[4:5] offset:96
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[4:5] offset:80
 ; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[4:5] offset:64
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[4:5] offset:48
 ; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[4:5] offset:32
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[4:5] offset:16
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[4:5]
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1
 ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1
-; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1
-; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0
 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT: s_cbranch_execz .LBB6_2
@@ -853,494 +853,494 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: global_load_dwordx4 v[13:16], v63, s[6:7] offset:192
 ; GFX906-NEXT: s_waitcnt vmcnt(3)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v3
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v3
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v3
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v2
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v2
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v1
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v1
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v0
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v0
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v0
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[6:7] offset:176
 ; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[6:7] offset:160
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[6:7] offset:144
 ; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[6:7] offset:128
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[6:7] offset:112
 ; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[6:7] offset:96
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[6:7] offset:80
 ; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[6:7] offset:64
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[6:7] offset:48
 ; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[6:7] offset:32
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[6:7] offset:16
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[6:7]
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1
 ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1
-; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1
-; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0
 ; GFX906-NEXT: .LBB6_2: ; %bb.2
 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
 ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
 ; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62
 ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v63, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v63, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_waitcnt vmcnt(2)
 ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
 ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_waitcnt vmcnt(0)
 ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:772 ; 4-byte Folded
Reload +; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 ; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62 ; GFX906-NEXT: v_or_b32_sdwa v62, v63, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v59, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v59, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v58, 8, v58 ; GFX906-NEXT: v_or_b32_sdwa v58, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1349,42 +1349,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload +; 
GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v55, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v55, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v54, 8, v54 ; GFX906-NEXT: v_or_b32_sdwa v54, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1393,42 +1393,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, 
v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:32 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v51, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v51, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v50, 8, v50 ; GFX906-NEXT: v_or_b32_sdwa v50, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload -; GFX906-NEXT: 
buffer_load_dword v50, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1437,42 +1437,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:48 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v47, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v47, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v46, 8, v46 ; GFX906-NEXT: v_or_b32_sdwa v46, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, 
v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1481,42 +1481,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:64 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v43, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v43, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
-; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v42, 8, v42 ; GFX906-NEXT: v_or_b32_sdwa v42, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1525,42 +1525,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:80 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:488 ; 4-byte 
Folded Reload +; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v39, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v39, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v38, 8, v38 ; GFX906-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1569,42 +1569,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:96 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload +; GFX906-NEXT: 
buffer_load_dword v3, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX906-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1613,42 +1613,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: 
global_store_dwordx4 v4, v[0:3], s[2:3] offset:112 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30 ; GFX906-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload +; 
GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1657,42 +1657,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:128 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v27, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v27, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26 ; GFX906-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; 
GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1701,42 +1701,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:144 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v23, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v23, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:280 ; 4-byte 
Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v22, 8, v22 ; GFX906-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1745,42 +1745,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:160 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 
offset:244 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v19, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v19, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v18, 8, v18 ; GFX906-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1789,36 +1789,36 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:176 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload ; 
GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v13, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v13, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX906-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -1826,27 +1826,27 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:192 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 
offset:160 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1854,9 +1854,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v10, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v10, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1864,8 +1864,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload +; 
GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1874,21 +1874,21 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:208 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1896,9 +1896,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1906,8 +1906,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr 
addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1916,15 +1916,15 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:224 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(7) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: s_waitcnt vmcnt(3) @@ -1934,9 +1934,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 
offset:32 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1944,9 +1944,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1954,8 +1954,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 46ff9a9..95dfb12 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -2035,9 +2035,9 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec -; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, v1 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] @@ -2059,11 +2059,11 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { ; GFX10-W32-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: v_lshl_add_u32 v2, v2, 2, 4 +; GFX10-W32-NEXT: v_lshl_add_u32 v2, v2, 2, 0 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX10-W32-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-W32-NEXT: 
buffer_load_dword v1, v2, s[8:11], 0 offen ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 6003d03..47c976d 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -124,7 +124,7 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 0 @@ -150,9 +150,9 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_nop 2 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], s0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec @@ -165,22 +165,22 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v3, s0 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[0:1], exec ; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 5 ; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 6 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O0-NEXT: ; %bb.1: ; %if -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -195,10 +195,10 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, 
s[16:19], 0 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: .LBB1_2: ; %merge ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 5 @@ -208,8 +208,8 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_readlane_b32 s3, v0, 2 ; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 3 ; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 4 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v4 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -349,7 +349,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: v_writelane_b32 v7, s4, 2 ; GFX9-O0-NEXT: v_writelane_b32 v7, s5, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O0-NEXT: v_readlane_b32 s0, v7, 2 @@ -358,7 +358,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 @@ -388,7 +388,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9 ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 @@ -415,7 +415,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 @@ -427,7 +427,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 
4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: v_add_u32_e64 v3, v3, v6 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] @@ -584,7 +584,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: v_writelane_b32 v12, s4, 2 ; GFX9-O0-NEXT: v_writelane_b32 v12, s5, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O0-NEXT: v_readlane_b32 s0, v12, 2 @@ -593,7 +593,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 @@ -624,7 +624,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8 ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 @@ -659,7 +659,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 @@ -670,7 +670,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 @@ -994,7 +994,7 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 0 @@ -1020,9 +1020,9 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_nop 2 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], s0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 
4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec @@ -1035,22 +1035,22 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v3, s0 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[0:1], exec ; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 5 ; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 6 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O0-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-O0-NEXT: ; %bb.1: ; %if -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -1065,10 +1065,10 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: .LBB8_2: ; %merge ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 5 @@ -1078,8 +1078,8 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_readlane_b32 s3, v0, 2 ; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 3 ; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 4 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v4 ; GFX9-O0-NEXT: 
v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -1219,7 +1219,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: v_writelane_b32 v7, s4, 2 ; GFX9-O0-NEXT: v_writelane_b32 v7, s5, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O0-NEXT: v_readlane_b32 s0, v7, 2 @@ -1228,7 +1228,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 @@ -1258,7 +1258,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9 ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 @@ -1297,7 +1297,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: v_add_u32_e64 v3, v3, v6 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] @@ -1454,7 +1454,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: v_writelane_b32 v12, s4, 2 ; GFX9-O0-NEXT: v_writelane_b32 v12, s5, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O0-NEXT: v_readlane_b32 s0, v12, 2 @@ -1463,7 +1463,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; 
GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 @@ -1494,7 +1494,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8 ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 @@ -1529,7 +1529,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 @@ -1540,7 +1540,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll index abb9806..2588d88 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll @@ -38,7 +38,7 @@ ; CHECK-NEXT: fp64-fp16-output-denormals: true ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 5 -; CHECK-NEXT: scavengeFI: '%fixed-stack.0' +; CHECK-NEXT: scavengeFI: '%stack.0' ; CHECK-NEXT: vgprForAGPRCopy: '' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '' @@ -303,7 +303,7 @@ ; CHECK-NEXT: fp64-fp16-output-denormals: true ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 5 -; CHECK-NEXT: scavengeFI: '%fixed-stack.0' +; CHECK-NEXT: scavengeFI: '%stack.0' ; CHECK-NEXT: vgprForAGPRCopy: '' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '' diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll index f2144b8..9939366 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll @@ -38,7 +38,7 @@ ; AFTER-PEI-NEXT: fp64-fp16-output-denormals: true ; AFTER-PEI-NEXT: highBitsOf32BitAddress: 0 ; AFTER-PEI-NEXT: occupancy: 5 -; AFTER-PEI-NEXT: scavengeFI: '%fixed-stack.0' +; AFTER-PEI-NEXT: scavengeFI: '%stack.3' ; AFTER-PEI-NEXT: vgprForAGPRCopy: '' ; AFTER-PEI-NEXT: sgprForEXECCopy: '' ; AFTER-PEI-NEXT: longBranchReservedReg: '' diff --git a/llvm/test/DebugInfo/AMDGPU/variable-locations.ll b/llvm/test/DebugInfo/AMDGPU/variable-locations.ll index b3bdf96..b795ad1 100644 --- a/llvm/test/DebugInfo/AMDGPU/variable-locations.ll +++ 
b/llvm/test/DebugInfo/AMDGPU/variable-locations.ll @@ -36,15 +36,15 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) define amdgpu_kernel void @kernel1( ; CHECK: {{.*}}DW_TAG_formal_parameter -; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +4, DW_OP_lit1, DW_OP_swap, DW_OP_xderef) +; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +0, DW_OP_lit1, DW_OP_swap, DW_OP_xderef) ; CHECK-NEXT: DW_AT_name {{.*}}"ArgN" i32 %ArgN, ; CHECK: {{.*}}DW_TAG_formal_parameter -; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +8, DW_OP_lit1, DW_OP_swap, DW_OP_xderef) +; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +4, DW_OP_lit1, DW_OP_swap, DW_OP_xderef) ; CHECK-NEXT: DW_AT_name {{.*}}"ArgA" ptr addrspace(1) %ArgA, ; CHECK: {{.*}}DW_TAG_formal_parameter -; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +16, DW_OP_lit1, DW_OP_swap, DW_OP_xderef) +; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +12, DW_OP_lit1, DW_OP_swap, DW_OP_xderef) ; CHECK-NEXT: DW_AT_name {{.*}}"ArgB" ptr addrspace(1) %ArgB) !dbg !13 { entry: -- cgit v1.1 From 0d9decc6694c188e2f7fa17d140ba9bd7cc98b6b Mon Sep 17 00:00:00 2001 From: Timm Bäder Date: Thu, 8 Feb 2024 22:38:28 +0100 Subject: [clang][Interp] Handle invalid CXXCtorInitializer expressions Their type might be a null type, in which case we need to abort here. --- clang/lib/AST/Interp/ByteCodeStmtGen.cpp | 4 ++++ clang/test/AST/Interp/records.cpp | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp index b0ec90a..bedcc78d 100644 --- a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp @@ -144,6 +144,10 @@ bool ByteCodeStmtGen<Emitter>::visitFunc(const FunctionDecl *F) { auto emitFieldInitializer = [&](const Record::Field *F, unsigned FieldOffset, const Expr *InitExpr) -> bool { + // We don't know what to do with these, so just return false. + if (InitExpr->getType().isNull()) + return false; + if (std::optional<PrimType> T = this->classify(InitExpr)) { if (!this->visit(InitExpr)) return false; diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp index fb50d1c..93da831 100644 --- a/clang/test/AST/Interp/records.cpp +++ b/clang/test/AST/Interp/records.cpp @@ -1228,3 +1228,14 @@ namespace InheritedConstructor { constexpr S s(1); } } + +namespace InvalidCtorInitializer { + struct X { + int Y; + constexpr X() // expected-note {{declared here}} + : Y(fo_o_()) {} // both-error {{use of undeclared identifier 'fo_o_'}} + }; + // no crash on evaluating the constexpr ctor.
+ constexpr int Z = X().Y; // both-error {{constexpr variable 'Z' must be initialized by a constant expression}} \ + // expected-note {{undefined constructor 'X'}} +} -- cgit v1.1 From 173e674ba55eb93e8af43f2eece7feffe9954b34 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Fri, 9 Feb 2024 00:19:52 -0800 Subject: [clang-format] Fix an out-of-bounds bug uncovered by 763139afc19d --- clang/unittests/Format/QualifierFixerTest.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/clang/unittests/Format/QualifierFixerTest.cpp b/clang/unittests/Format/QualifierFixerTest.cpp index 324366c..4e1768d 100644 --- a/clang/unittests/Format/QualifierFixerTest.cpp +++ b/clang/unittests/Format/QualifierFixerTest.cpp @@ -1100,8 +1100,6 @@ TEST_F(QualifierFixerTest, IsQualifierType) { NotTokens[3], ConfiguredTokens)); EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( NotTokens[4], ConfiguredTokens)); - EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - NotTokens[5], ConfiguredTokens)); EXPECT_FALSE( LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[0])); @@ -1113,8 +1111,6 @@ TEST_F(QualifierFixerTest, IsQualifierType) { LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[3])); EXPECT_FALSE( LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[4])); - EXPECT_FALSE( - LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[5])); } TEST_F(QualifierFixerTest, IsMacro) { -- cgit v1.1 From b9079baaddfed5e604fbfaa1d81a7a1c38e78c26 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Fri, 9 Feb 2024 09:27:04 +0100 Subject: [NFC] clang-format utils/TableGen (#80973) ``` find llvm/utils/TableGen -iname "*.h" -o -iname "*.cpp" | xargs clang-format-16 -i ``` Split from #80847 --- llvm/utils/TableGen/AsmMatcherEmitter.cpp | 593 +++++---- llvm/utils/TableGen/AsmWriterEmitter.cpp | 242 ++-- llvm/utils/TableGen/AsmWriterInst.cpp | 69 +- llvm/utils/TableGen/AsmWriterInst.h | 166 +-- llvm/utils/TableGen/CTagsEmitter.cpp | 7 +- llvm/utils/TableGen/CallingConvEmitter.cpp | 48 +- llvm/utils/TableGen/CodeEmitterGen.cpp | 41 +- llvm/utils/TableGen/CodeGenDAGPatterns.cpp | 579 ++++----- llvm/utils/TableGen/CodeGenDAGPatterns.h | 209 ++- llvm/utils/TableGen/CodeGenHwModes.cpp | 16 +- llvm/utils/TableGen/CodeGenHwModes.h | 70 +- llvm/utils/TableGen/CodeGenInstruction.cpp | 147 ++- llvm/utils/TableGen/CodeGenInstruction.h | 602 +++++---- llvm/utils/TableGen/CodeGenIntrinsics.h | 16 +- llvm/utils/TableGen/CodeGenMapTable.cpp | 132 +- llvm/utils/TableGen/CodeGenRegisters.cpp | 286 +++-- llvm/utils/TableGen/CodeGenRegisters.h | 1354 ++++++++++---------- llvm/utils/TableGen/CodeGenSchedule.cpp | 338 +++-- llvm/utils/TableGen/CodeGenSchedule.h | 49 +- llvm/utils/TableGen/CodeGenTarget.cpp | 62 +- llvm/utils/TableGen/CodeGenTarget.h | 36 +- llvm/utils/TableGen/DAGISelEmitter.cpp | 35 +- llvm/utils/TableGen/DAGISelMatcher.cpp | 74 +- llvm/utils/TableGen/DAGISelMatcher.h | 518 ++++---- llvm/utils/TableGen/DAGISelMatcherEmitter.cpp | 167 ++- llvm/utils/TableGen/DAGISelMatcherGen.cpp | 304 +++-- llvm/utils/TableGen/DAGISelMatcherOpt.cpp | 5 +- llvm/utils/TableGen/DFAEmitter.cpp | 10 +- llvm/utils/TableGen/DFAPacketizerEmitter.cpp | 6 +- llvm/utils/TableGen/DXILEmitter.cpp | 25 +- llvm/utils/TableGen/DecoderEmitter.cpp | 136 +- llvm/utils/TableGen/DisassemblerEmitter.cpp | 4 +- llvm/utils/TableGen/FastISelEmitter.cpp | 138 +- llvm/utils/TableGen/InfoByHwMode.cpp | 63 +- llvm/utils/TableGen/InfoByHwMode.h | 41 +- llvm/utils/TableGen/InstrDocsEmitter.cpp 
| 26 +- llvm/utils/TableGen/InstrInfoEmitter.cpp | 178 ++- llvm/utils/TableGen/IntrinsicEmitter.cpp | 42 +- llvm/utils/TableGen/OptParserEmitter.cpp | 9 +- llvm/utils/TableGen/PredicateExpander.cpp | 4 +- llvm/utils/TableGen/PseudoLoweringEmitter.cpp | 94 +- llvm/utils/TableGen/RegisterBankEmitter.cpp | 10 +- llvm/utils/TableGen/RegisterInfoEmitter.cpp | 226 ++-- llvm/utils/TableGen/SDNodeProperties.h | 2 +- llvm/utils/TableGen/SearchableTableEmitter.cpp | 42 +- llvm/utils/TableGen/SequenceToOffsetTable.h | 12 +- llvm/utils/TableGen/SubtargetEmitter.cpp | 379 +++--- llvm/utils/TableGen/SubtargetFeatureInfo.cpp | 2 +- llvm/utils/TableGen/SubtargetFeatureInfo.h | 3 +- llvm/utils/TableGen/TableGenBackends.h | 2 - llvm/utils/TableGen/Types.cpp | 4 +- llvm/utils/TableGen/Types.h | 2 +- .../TableGen/WebAssemblyDisassemblerEmitter.cpp | 6 +- .../TableGen/X86CompressEVEXTablesEmitter.cpp | 11 +- llvm/utils/TableGen/X86DisassemblerShared.h | 4 +- llvm/utils/TableGen/X86DisassemblerTables.cpp | 21 +- llvm/utils/TableGen/X86DisassemblerTables.h | 52 +- llvm/utils/TableGen/X86ModRMFilters.cpp | 12 +- llvm/utils/TableGen/X86ModRMFilters.h | 29 +- llvm/utils/TableGen/X86RecognizableInstr.cpp | 2 +- llvm/utils/TableGen/X86RecognizableInstr.h | 2 +- 61 files changed, 3923 insertions(+), 3841 deletions(-) diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp index 011d96a..9065885 100644 --- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp +++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp @@ -140,10 +140,11 @@ class AsmMatcherInfo; // RegisterSets can be seen in the outputted AsmMatcher tables occasionally, and // can even affect compiler output (at least seen in diagnostics produced when // all matches fail). So we use a type that sorts them consistently. -typedef std::set RegisterSet; +typedef std::set RegisterSet; class AsmMatcherEmitter { RecordKeeper &Records; + public: AsmMatcherEmitter(RecordKeeper &R) : Records(R) {} @@ -166,7 +167,7 @@ struct ClassInfo { /// The (first) user defined class, subsequent user defined classes are /// UserClass0+1, and so on. - UserClass0 = 1<<16 + UserClass0 = 1 << 16 }; /// Kind - The class kind, which is either a predefined kind, or (UserClass0 + @@ -176,7 +177,7 @@ struct ClassInfo { /// SuperClasses - The super classes of this class. Note that for simplicities /// sake user operands only record their immediate super class, while register /// operands include all superclasses. - std::vector SuperClasses; + std::vector SuperClasses; /// Name - The full class name, suitable for use in an enum. std::string Name; @@ -204,10 +205,12 @@ struct ClassInfo { /// For register classes: the records for all the registers in this class. RegisterSet Registers; - /// For custom match classes: the diagnostic kind for when the predicate fails. + /// For custom match classes: the diagnostic kind for when the predicate + /// fails. std::string DiagnosticType; - /// For custom match classes: the diagnostic string for when the predicate fails. + /// For custom match classes: the diagnostic string for when the predicate + /// fails. std::string DiagnosticString; /// Is this operand optional and not always required. @@ -224,9 +227,7 @@ public: } /// isUserClass() - Check if this is a user defined class. - bool isUserClass() const { - return Kind >= UserClass0; - } + bool isUserClass() const { return Kind >= UserClass0; } /// isRelatedTo - Check whether this class is "related" to \p RHS. Classes /// are related if they are in the same class hierarchy. 
@@ -244,8 +245,8 @@ public: RegisterSet Tmp; std::insert_iterator II(Tmp, Tmp.begin()); std::set_intersection(Registers.begin(), Registers.end(), - RHS.Registers.begin(), RHS.Registers.end(), - II, LessRecordByID()); + RHS.Registers.begin(), RHS.Registers.end(), II, + LessRecordByID()); return !Tmp.empty(); } @@ -469,7 +470,7 @@ struct MatchableInfo { unsigned SrcOperand2) { ResOperand X; X.Kind = TiedOperand; - X.TiedOperands = { TiedOperandNum, SrcOperand1, SrcOperand2 }; + X.TiedOperands = {TiedOperandNum, SrcOperand1, SrcOperand2}; X.MINumOperands = 1; return X; } @@ -503,7 +504,7 @@ struct MatchableInfo { Record *const TheDef; /// DefRec - This is the definition that it came from. - PointerUnion DefRec; + PointerUnion DefRec; const CodeGenInstruction *getResultInst() const { if (isa(DefRec)) @@ -542,16 +543,13 @@ struct MatchableInfo { bool UseInstAsmMatchConverter; MatchableInfo(const CodeGenInstruction &CGI) - : AsmVariantID(0), AsmString(CGI.AsmString), TheDef(CGI.TheDef), DefRec(&CGI), - UseInstAsmMatchConverter(true) { - } + : AsmVariantID(0), AsmString(CGI.AsmString), TheDef(CGI.TheDef), + DefRec(&CGI), UseInstAsmMatchConverter(true) {} MatchableInfo(std::unique_ptr Alias) - : AsmVariantID(0), AsmString(Alias->AsmString), TheDef(Alias->TheDef), - DefRec(Alias.release()), - UseInstAsmMatchConverter( - TheDef->getValueAsBit("UseInstAsmMatchConverter")) { - } + : AsmVariantID(0), AsmString(Alias->AsmString), TheDef(Alias->TheDef), + DefRec(Alias.release()), UseInstAsmMatchConverter(TheDef->getValueAsBit( + "UseInstAsmMatchConverter")) {} // Could remove this and the dtor if PointerUnion supported unique_ptr // elements with a dynamic failure/assertion (like the one below) in the case @@ -576,9 +574,8 @@ struct MatchableInfo { void formTwoOperandAlias(StringRef Constraint); void initialize(const AsmMatcherInfo &Info, - SmallPtrSetImpl &SingletonRegisters, - AsmVariantInfo const &Variant, - bool HasMnemonicFirst); + SmallPtrSetImpl &SingletonRegisters, + AsmVariantInfo const &Variant, bool HasMnemonicFirst); /// validate - Return true if this matchable is a valid thing to match against /// and perform a bunch of validity checking. @@ -603,9 +600,9 @@ struct MatchableInfo { } int findAsmOperandOriginallyNamed(StringRef N) const { - auto I = - find_if(AsmOperands, - [&](const AsmOperand &Op) { return Op.OrigSrcOpName == N; }); + auto I = find_if(AsmOperands, [&](const AsmOperand &Op) { + return Op.OrigSrcOpName == N; + }); return (I != AsmOperands.end()) ? I - AsmOperands.begin() : -1; } @@ -706,7 +703,7 @@ private: struct OperandMatchEntry { unsigned OperandMask; - const MatchableInfo* MI; + const MatchableInfo *MI; ClassInfo *CI; static OperandMatchEntry create(const MatchableInfo *mi, ClassInfo *ci, @@ -740,21 +737,21 @@ public: std::vector OperandMatchInfo; /// Map of Register records to their class information. - typedef std::map RegisterClassesTy; + typedef std::map RegisterClassesTy; RegisterClassesTy RegisterClasses; /// Map of Predicate records to their subtarget information. std::map SubtargetFeatures; /// Map of AsmOperandClass records to their class information. - std::map AsmOperandClasses; + std::map AsmOperandClasses; /// Map of RegisterClass records to their class information. - std::map RegisterClassClasses; + std::map RegisterClassClasses; private: /// Map of token to class information which has already been constructed. - std::map TokenClasses; + std::map TokenClasses; private: /// getTokenClass - Lookup or create the class for the given token. 
@@ -767,7 +764,7 @@ private: /// buildRegisterClasses - Build the ClassInfo* instances for register /// classes. - void buildRegisterClasses(SmallPtrSetImpl &SingletonRegisters); + void buildRegisterClasses(SmallPtrSetImpl &SingletonRegisters); /// buildOperandClasses - Build the ClassInfo* instances for user defined /// operand classes. @@ -779,8 +776,7 @@ private: MatchableInfo::AsmOperand &Op); public: - AsmMatcherInfo(Record *AsmParser, - CodeGenTarget &Target, + AsmMatcherInfo(Record *AsmParser, CodeGenTarget &Target, RecordKeeper &Records); /// Construct the various tables used during matching. @@ -798,9 +794,7 @@ public: return I == SubtargetFeatures.end() ? nullptr : &I->second; } - RecordKeeper &getRecords() const { - return Records; - } + RecordKeeper &getRecords() const { return Records; } bool hasOptionalOperands() const { return any_of(Classes, @@ -812,7 +806,8 @@ public: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void MatchableInfo::dump() const { - errs() << TheDef->getName() << " -- " << "flattened:\"" << AsmString <<"\"\n"; + errs() << TheDef->getName() << " -- " + << "flattened:\"" << AsmString << "\"\n"; errs() << " variant: " << AsmVariantID << "\n"; @@ -850,7 +845,7 @@ parseTwoOperandConstraint(StringRef S, ArrayRef Loc) { void MatchableInfo::formTwoOperandAlias(StringRef Constraint) { // Figure out which operands are aliased and mark them as tied. std::pair Ops = - parseTwoOperandConstraint(Constraint, TheDef->getLoc()); + parseTwoOperandConstraint(Constraint, TheDef->getLoc()); // Find the AsmOperands that refer to the operands we're aliasing. int SrcAsmOperand = findAsmOperandNamed(Ops.first); @@ -858,11 +853,11 @@ void MatchableInfo::formTwoOperandAlias(StringRef Constraint) { if (SrcAsmOperand == -1) PrintFatalError(TheDef->getLoc(), "unknown source two-operand alias operand '" + Ops.first + - "'."); + "'."); if (DstAsmOperand == -1) PrintFatalError(TheDef->getLoc(), "unknown destination two-operand alias operand '" + - Ops.second + "'."); + Ops.second + "'."); // Find the ResOperand that refers to the operand we're aliasing away // and update it to refer to the combined operand instead. @@ -878,7 +873,7 @@ void MatchableInfo::formTwoOperandAlias(StringRef Constraint) { // Adjust the ResOperand references to any AsmOperands that followed // the one we just deleted. for (ResOperand &Op : ResOperands) { - switch(Op.Kind) { + switch (Op.Kind) { default: // Nothing to do for operands that don't reference AsmOperands. break; @@ -892,10 +887,9 @@ void MatchableInfo::formTwoOperandAlias(StringRef Constraint) { /// extractSingletonRegisterForAsmOperand - Extract singleton register, /// if present, from specified token. 
-static void -extractSingletonRegisterForAsmOperand(MatchableInfo::AsmOperand &Op, - const AsmMatcherInfo &Info, - StringRef RegisterPrefix) { +static void extractSingletonRegisterForAsmOperand(MatchableInfo::AsmOperand &Op, + const AsmMatcherInfo &Info, + StringRef RegisterPrefix) { StringRef Tok = Op.Token; // If this token is not an isolated token, i.e., it isn't separated from @@ -922,13 +916,12 @@ extractSingletonRegisterForAsmOperand(MatchableInfo::AsmOperand &Op, } void MatchableInfo::initialize(const AsmMatcherInfo &Info, - SmallPtrSetImpl &SingletonRegisters, + SmallPtrSetImpl &SingletonRegisters, AsmVariantInfo const &Variant, bool HasMnemonicFirst) { AsmVariantID = Variant.AsmVariantNo; - AsmString = - CodeGenInstruction::FlattenAsmStringVariants(AsmString, - Variant.AsmVariantNo); + AsmString = CodeGenInstruction::FlattenAsmStringVariants( + AsmString, Variant.AsmVariantNo); tokenizeAsmString(Info, Variant); @@ -936,7 +929,7 @@ void MatchableInfo::initialize(const AsmMatcherInfo &Info, // simple string, not a $foo variable or a singleton register. if (AsmOperands.empty()) PrintFatalError(TheDef->getLoc(), - "Instruction '" + TheDef->getName() + "' has no tokens"); + "Instruction '" + TheDef->getName() + "' has no tokens"); assert(!AsmOperands[0].Token.empty()); if (HasMnemonicFirst) { @@ -1045,7 +1038,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info, size_t EndPos = String.find('}', i); assert(EndPos != StringRef::npos && "Missing brace in operand reference!"); - addAsmOperand(String.slice(i, EndPos+1), IsIsolatedToken); + addAsmOperand(String.slice(i, EndPos + 1), IsIsolatedToken); Prev = EndPos + 1; i = EndPos; IsIsolatedToken = false; @@ -1070,16 +1063,16 @@ bool MatchableInfo::validate(StringRef CommentDelimiter, bool IsAlias) const { // isCodeGenOnly if they are pseudo instructions. if (AsmString.find('\n') != std::string::npos) PrintFatalError(TheDef->getLoc(), - "multiline instruction is not valid for the asmparser, " - "mark it isCodeGenOnly"); + "multiline instruction is not valid for the asmparser, " + "mark it isCodeGenOnly"); // Remove comments from the asm string. We know that the asmstring only // has one line. if (!CommentDelimiter.empty() && StringRef(AsmString).contains(CommentDelimiter)) PrintFatalError(TheDef->getLoc(), - "asmstring for instruction has comment character in it, " - "mark it isCodeGenOnly"); + "asmstring for instruction has comment character in it, " + "mark it isCodeGenOnly"); // Reject matchables with operand modifiers, these aren't something we can // handle, the target should be refactored to use operands instead of @@ -1092,17 +1085,17 @@ bool MatchableInfo::validate(StringRef CommentDelimiter, bool IsAlias) const { for (const AsmOperand &Op : AsmOperands) { StringRef Tok = Op.Token; if (Tok[0] == '$' && Tok.contains(':')) - PrintFatalError(TheDef->getLoc(), - "matchable with operand modifier '" + Tok + - "' not supported by asm matcher. Mark isCodeGenOnly!"); + PrintFatalError( + TheDef->getLoc(), + "matchable with operand modifier '" + Tok + + "' not supported by asm matcher. Mark isCodeGenOnly!"); // Verify that any operand is only mentioned once. // We reject aliases and ignore instructions for now. 
if (!IsAlias && TheDef->getValueAsString("AsmMatchConverter").empty() && Tok[0] == '$' && !OperandNames.insert(std::string(Tok)).second) { LLVM_DEBUG({ errs() << "warning: '" << TheDef->getName() << "': " - << "ignoring instruction with tied operand '" - << Tok << "'\n"; + << "ignoring instruction with tied operand '" << Tok << "'\n"; }); return false; } @@ -1116,15 +1109,33 @@ static std::string getEnumNameForToken(StringRef Str) { for (char C : Str) { switch (C) { - case '*': Res += "_STAR_"; break; - case '%': Res += "_PCT_"; break; - case ':': Res += "_COLON_"; break; - case '!': Res += "_EXCLAIM_"; break; - case '.': Res += "_DOT_"; break; - case '<': Res += "_LT_"; break; - case '>': Res += "_GT_"; break; - case '-': Res += "_MINUS_"; break; - case '#': Res += "_HASH_"; break; + case '*': + Res += "_STAR_"; + break; + case '%': + Res += "_PCT_"; + break; + case ':': + Res += "_COLON_"; + break; + case '!': + Res += "_EXCLAIM_"; + break; + case '.': + Res += "_DOT_"; + break; + case '<': + Res += "_LT_"; + break; + case '>': + Res += "_GT_"; + break; + case '-': + Res += "_MINUS_"; + break; + case '#': + Res += "_HASH_"; + break; default: if (isAlnum(C)) Res += C; @@ -1166,8 +1177,7 @@ AsmMatcherInfo::getOperandClass(const CGIOperandList::OperandInfo &OI, return getOperandClass(Rec, SubOpIdx); } -ClassInfo * -AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) { +ClassInfo *AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) { if (Rec->isSubClassOf("RegisterOperand")) { // RegisterOperand may have an associated ParserMatchClass. If it does, // use it, else just fall back to the underlying register class. @@ -1177,7 +1187,7 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) { "Record `" + Rec->getName() + "' does not have a ParserMatchClass!\n"); - if (DefInit *DI= dyn_cast(R->getValue())) { + if (DefInit *DI = dyn_cast(R->getValue())) { Record *MatchClass = DI->getDef(); if (ClassInfo *CI = AsmOperandClasses[MatchClass]) return CI; @@ -1186,8 +1196,9 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) { // No custom match class. Just use the register class. Record *ClassRec = Rec->getValueAsDef("RegClass"); if (!ClassRec) - PrintFatalError(Rec->getLoc(), "RegisterOperand `" + Rec->getName() + - "' has no associated register class!\n"); + PrintFatalError(Rec->getLoc(), + "RegisterOperand `" + Rec->getName() + + "' has no associated register class!\n"); if (ClassInfo *CI = RegisterClassClasses[ClassRec]) return CI; PrintFatalError(Rec->getLoc(), "register class has no class info!"); @@ -1200,8 +1211,9 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) { } if (!Rec->isSubClassOf("Operand")) - PrintFatalError(Rec->getLoc(), "Operand `" + Rec->getName() + - "' does not derive from class Operand!\n"); + PrintFatalError(Rec->getLoc(), + "Operand `" + Rec->getName() + + "' does not derive from class Operand!\n"); Record *MatchClass = Rec->getValueAsDef("ParserMatchClass"); if (ClassInfo *CI = AsmOperandClasses[MatchClass]) return CI; @@ -1210,19 +1222,18 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) { } struct LessRegisterSet { - bool operator() (const RegisterSet &LHS, const RegisterSet & RHS) const { + bool operator()(const RegisterSet &LHS, const RegisterSet &RHS) const { // std::set defines its own compariso "operator<", but it // performs a lexicographical comparison by T's innate comparison // for some reason. We don't want non-deterministic pointer // comparisons so use this instead. 
- return std::lexicographical_compare(LHS.begin(), LHS.end(), - RHS.begin(), RHS.end(), - LessRecordByID()); + return std::lexicographical_compare(LHS.begin(), LHS.end(), RHS.begin(), + RHS.end(), LessRecordByID()); } }; -void AsmMatcherInfo:: -buildRegisterClasses(SmallPtrSetImpl &SingletonRegisters) { +void AsmMatcherInfo::buildRegisterClasses( + SmallPtrSetImpl &SingletonRegisters) { const auto &Registers = Target.getRegBank().getRegisters(); auto &RegClassList = Target.getRegBank().getRegClasses(); @@ -1244,7 +1255,7 @@ buildRegisterClasses(SmallPtrSetImpl &SingletonRegisters) { // Introduce derived sets where necessary (when a register does not determine // a unique register set class), and build the mapping of registers to the set // they should classify to. - std::map RegisterMap; + std::map RegisterMap; for (const CodeGenRegister &CGR : Registers) { // Compute the intersection of all sets containing this register. RegisterSet ContainingSet; @@ -1273,7 +1284,7 @@ buildRegisterClasses(SmallPtrSetImpl &SingletonRegisters) { } // Construct the register classes. - std::map RegisterSetClasses; + std::map RegisterSetClasses; unsigned Index = 0; for (const RegisterSet &RS : RegisterSets) { Classes.emplace_front(); @@ -1298,9 +1309,8 @@ buildRegisterClasses(SmallPtrSetImpl &SingletonRegisters) { for (const RegisterSet &RS : RegisterSets) { ClassInfo *CI = RegisterSetClasses[RS]; for (const RegisterSet &RS2 : RegisterSets) - if (RS != RS2 && - std::includes(RS2.begin(), RS2.end(), RS.begin(), RS.end(), - LessRecordByID())) + if (RS != RS2 && std::includes(RS2.begin(), RS2.end(), RS.begin(), + RS.end(), LessRecordByID())) CI->SuperClasses.push_back(RegisterSetClasses[RS2]); } @@ -1354,8 +1364,8 @@ buildRegisterClasses(SmallPtrSetImpl &SingletonRegisters) { } void AsmMatcherInfo::buildOperandClasses() { - std::vector AsmOperands = - Records.getAllDerivedDefinitions("AsmOperandClass"); + std::vector AsmOperands = + Records.getAllDerivedDefinitions("AsmOperandClass"); // Pre-populate AsmOperandClasses map. for (Record *Rec : AsmOperands) { @@ -1438,11 +1448,9 @@ void AsmMatcherInfo::buildOperandClasses() { } } -AsmMatcherInfo::AsmMatcherInfo(Record *asmParser, - CodeGenTarget &target, +AsmMatcherInfo::AsmMatcherInfo(Record *asmParser, CodeGenTarget &target, RecordKeeper &records) - : Records(records), AsmParser(asmParser), Target(target) { -} + : Records(records), AsmParser(asmParser), Target(target) {} /// buildOperandMatchInfo - Build the necessary information to handle user /// defined operand parsing methods. @@ -1476,8 +1484,8 @@ void AsmMatcherInfo::buildOperandMatchInfo() { for (const auto &OCM : OpClassMask) { unsigned OpMask = OCM.second; ClassInfo *CI = OCM.first; - OperandMatchInfo.push_back(OperandMatchEntry::create(MI.get(), CI, - OpMask)); + OperandMatchInfo.push_back( + OperandMatchEntry::create(MI.get(), CI, OpMask)); } } } @@ -1499,7 +1507,7 @@ void AsmMatcherInfo::buildInfo() { // Parse the instructions; we need to do this first so that we can gather the // singleton register classes. 
- SmallPtrSet SingletonRegisters; + SmallPtrSet SingletonRegisters; unsigned VariantCount = Target.getAsmParserVariantCount(); for (unsigned VC = 0; VC != VariantCount; ++VC) { Record *AsmVariant = Target.getAsmParserVariant(VC); @@ -1511,8 +1519,7 @@ void AsmMatcherInfo::buildInfo() { AsmVariant->getValueAsString("TokenizingCharacters"); Variant.SeparatorCharacters = AsmVariant->getValueAsString("SeparatorCharacters"); - Variant.BreakCharacters = - AsmVariant->getValueAsString("BreakCharacters"); + Variant.BreakCharacters = AsmVariant->getValueAsString("BreakCharacters"); Variant.Name = AsmVariant->getValueAsString("Name"); Variant.AsmVariantNo = AsmVariant->getValueAsInt("Variant"); @@ -1546,8 +1553,8 @@ void AsmMatcherInfo::buildInfo() { // Parse all of the InstAlias definitions and stick them in the list of // matchables. - std::vector AllInstAliases = - Records.getAllDerivedDefinitions("InstAlias"); + std::vector AllInstAliases = + Records.getAllDerivedDefinitions("InstAlias"); for (Record *InstAlias : AllInstAliases) { auto Alias = std::make_unique(InstAlias, Target); @@ -1654,14 +1661,14 @@ void AsmMatcherInfo::buildInfo() { // Process token alias definitions and set up the associated superclass // information. - std::vector AllTokenAliases = - Records.getAllDerivedDefinitions("TokenAlias"); + std::vector AllTokenAliases = + Records.getAllDerivedDefinitions("TokenAlias"); for (Record *Rec : AllTokenAliases) { ClassInfo *FromClass = getTokenClass(Rec->getValueAsString("FromToken")); ClassInfo *ToClass = getTokenClass(Rec->getValueAsString("ToToken")); if (FromClass == ToClass) PrintFatalError(Rec->getLoc(), - "error: Destination value identical to source value."); + "error: Destination value identical to source value."); FromClass->SuperClasses.push_back(ToClass); } @@ -1681,10 +1688,9 @@ void AsmMatcherInfo::buildInfo() { /// buildInstructionOperandReference - The specified operand is a reference to a /// named operand such as $src. Resolve the Class and OperandInfo pointers. -void AsmMatcherInfo:: -buildInstructionOperandReference(MatchableInfo *II, - StringRef OperandName, - unsigned AsmOpIdx) { +void AsmMatcherInfo::buildInstructionOperandReference(MatchableInfo *II, + StringRef OperandName, + unsigned AsmOpIdx) { const CodeGenInstruction &CGI = *cast(II->DefRec); const CGIOperandList &Operands = CGI.Operands; MatchableInfo::AsmOperand *Op = &II->AsmOperands[AsmOpIdx]; @@ -1708,7 +1714,8 @@ buildInstructionOperandReference(MatchableInfo *II, for (unsigned SI = 1, SE = Operands[Idx].MINumOperands; SI != SE; ++SI) { MatchableInfo::AsmOperand NewAsmOp(/*IsIsolatedToken=*/true, Token); NewAsmOp.SubOpIdx = SI; - II->AsmOperands.insert(II->AsmOperands.begin()+AsmOpIdx+SI, NewAsmOp); + II->AsmOperands.insert(II->AsmOperands.begin() + AsmOpIdx + SI, + NewAsmOp); } // Replace Op with first suboperand. Op = &II->AsmOperands[AsmOpIdx]; // update the pointer in case it moved @@ -1760,8 +1767,8 @@ void AsmMatcherInfo::buildAliasOperandReference(MatchableInfo *II, // Use the match class from the Alias definition, not the // destination instruction, as we may have an immediate that's // being munged by the match class. - Op.Class = getOperandClass(CGA.ResultOperands[i].getRecord(), - Op.SubOpIdx); + Op.Class = + getOperandClass(CGA.ResultOperands[i].getRecord(), Op.SubOpIdx); Op.SrcOpName = OperandName; Op.OrigSrcOpName = OperandName; return; @@ -1812,8 +1819,8 @@ void MatchableInfo::buildInstructionResultOperands() { // Add a separate ResOperand for each suboperand. 
for (unsigned AI = 0; AI < NumOperands; ++AI) { - assert(AsmOperands[SrcOperand+AI].SubOpIdx == (int)AI && - AsmOperands[SrcOperand+AI].SrcOpName == OpInfo.Name && + assert(AsmOperands[SrcOperand + AI].SubOpIdx == (int)AI && + AsmOperands[SrcOperand + AI].SrcOpName == OpInfo.Name && "unexpected AsmOperands for suboperands"); ResOperands.push_back(ResOperand::getRenderedOp(SrcOperand + AI, 1)); } @@ -1886,8 +1893,9 @@ void MatchableInfo::buildAliasResultOperands(bool AliasConstraintsAreChecked) { // Handle all the suboperands for this operand. const std::string &OpName = OpInfo->Name; - for ( ; AliasOpNo < LastOpNo && - CGA.ResultInstOperandIndex[AliasOpNo].first == i; ++AliasOpNo) { + for (; AliasOpNo < LastOpNo && + CGA.ResultInstOperandIndex[AliasOpNo].first == i; + ++AliasOpNo) { int SubIdx = CGA.ResultInstOperandIndex[AliasOpNo].second; // Find out what operand from the asmparser that this MCInst operand @@ -1897,17 +1905,18 @@ void MatchableInfo::buildAliasResultOperands(bool AliasConstraintsAreChecked) { StringRef Name = CGA.ResultOperands[AliasOpNo].getName(); int SrcOperand = findAsmOperand(Name, SubIdx); if (SrcOperand == -1) - PrintFatalError(TheDef->getLoc(), "Instruction '" + - TheDef->getName() + "' has operand '" + OpName + - "' that doesn't appear in asm string!"); + PrintFatalError(TheDef->getLoc(), + "Instruction '" + TheDef->getName() + + "' has operand '" + OpName + + "' that doesn't appear in asm string!"); // Add it to the operand references. If it is added a second time, the // record won't be updated and it will fail later on. OperandRefs.try_emplace(Name, SrcOperand); unsigned NumOperands = (SubIdx == -1 ? OpInfo->MINumOperands : 1); - ResOperands.push_back(ResOperand::getRenderedOp(SrcOperand, - NumOperands)); + ResOperands.push_back( + ResOperand::getRenderedOp(SrcOperand, NumOperands)); break; } case CodeGenInstAlias::ResultOperand::K_Imm: { @@ -1952,7 +1961,7 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, raw_ostream &OS) { SmallSetVector OperandConversionKinds; SmallSetVector InstructionConversionKinds; - std::vector > ConversionTable; + std::vector> ConversionTable; size_t MaxRowLength = 2; // minimum is custom converter plus terminator. // TargetOperandClass - This is the target's operand class, like X86Operand. @@ -2009,7 +2018,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, CvtOS << " break;\n"; CvtOS << " case CVT_Tied: {\n"; CvtOS << " assert(OpIdx < (size_t)(std::end(TiedAsmOperandTable) -\n"; - CvtOS << " std::begin(TiedAsmOperandTable)) &&\n"; + CvtOS + << " std::begin(TiedAsmOperandTable)) &&\n"; CvtOS << " \"Tied operand not found\");\n"; CvtOS << " unsigned TiedResOpnd = TiedAsmOperandTable[OpIdx][0];\n"; CvtOS << " if (TiedResOpnd != (uint8_t)-1)\n"; @@ -2048,7 +2058,7 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, // Map of e.g. <0, 2, 3> -> "Tie_0_2_3" enum label. std::map, std::string> - TiedOperandsEnumMap; + TiedOperandsEnumMap; for (auto &II : Infos) { // Check if we have a custom match function. @@ -2073,8 +2083,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, ConversionTable.back().push_back(CVT_Done); // Add the handler to the conversion driver function. 
- CvtOS << " case CVT_" - << getEnumNameForToken(AsmMatchConverter) << ":\n" + CvtOS << " case CVT_" << getEnumNameForToken(AsmMatchConverter) + << ":\n" << " " << AsmMatchConverter << "(Inst, Operands);\n" << " break;\n"; @@ -2088,7 +2098,7 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, std::vector ConversionRow; // Compute the convert enum and the case body. - MaxRowLength = std::max(MaxRowLength, II->ResOperands.size()*2 + 1 ); + MaxRowLength = std::max(MaxRowLength, II->ResOperands.size() * 2 + 1); for (unsigned i = 0, e = II->ResOperands.size(); i != e; ++i) { const MatchableInfo::ResOperand &OpInfo = II->ResOperands[i]; @@ -2098,7 +2108,7 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, case MatchableInfo::ResOperand::RenderAsmOperand: { // This comes from something we parsed. const MatchableInfo::AsmOperand &Op = - II->AsmOperands[OpInfo.AsmOperandNum]; + II->AsmOperands[OpInfo.AsmOperandNum]; // Registers are always converted the same, don't duplicate the // conversion function based on them. @@ -2111,8 +2121,9 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, // Add the conversion kind, if necessary, and get the associated ID // the index of its entry in the vector). - std::string Name = "CVT_" + (Op.Class->isRegisterClass() ? "Reg" : - Op.Class->RenderMethod); + std::string Name = + "CVT_" + + (Op.Class->isRegisterClass() ? "Reg" : Op.Class->RenderMethod); if (Op.Class->IsOptional) { // For optional operands we must also care about DefaultMethod assert(HasOptionalOperands); @@ -2121,8 +2132,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, Name = getEnumNameForToken(Name); bool IsNewConverter = false; - unsigned ID = getConverterOperandID(Name, OperandConversionKinds, - IsNewConverter); + unsigned ID = + getConverterOperandID(Name, OperandConversionKinds, IsNewConverter); // Add the operand entry to the instruction kind conversion row. ConversionRow.push_back(ID); @@ -2171,10 +2182,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, // operand from the earlier one.We can only tie single MCOperand values. assert(OpInfo.MINumOperands == 1 && "Not a singular MCOperand"); uint8_t TiedOp = OpInfo.TiedOperands.ResOpnd; - uint8_t SrcOp1 = - OpInfo.TiedOperands.SrcOpnd1Idx + HasMnemonicFirst; - uint8_t SrcOp2 = - OpInfo.TiedOperands.SrcOpnd2Idx + HasMnemonicFirst; + uint8_t SrcOp1 = OpInfo.TiedOperands.SrcOpnd1Idx + HasMnemonicFirst; + uint8_t SrcOp2 = OpInfo.TiedOperands.SrcOpnd2Idx + HasMnemonicFirst; assert((i > TiedOp || TiedOp == (uint8_t)-1) && "Tied operand precedes its target!"); auto TiedTupleName = std::string("Tie") + utostr(TiedOp) + '_' + @@ -2198,8 +2207,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, std::string Name = "CVT_" + Ty; bool IsNewConverter = false; - unsigned ID = getConverterOperandID(Name, OperandConversionKinds, - IsNewConverter); + unsigned ID = + getConverterOperandID(Name, OperandConversionKinds, IsNewConverter); // Add the operand entry to the instruction kind conversion row. ConversionRow.push_back(ID); ConversionRow.push_back(0); @@ -2230,8 +2239,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, Signature += "__" + Name; Name = "CVT_" + Name; bool IsNewConverter = false; - unsigned ID = getConverterOperandID(Name, OperandConversionKinds, - IsNewConverter); + unsigned ID = + getConverterOperandID(Name, OperandConversionKinds, IsNewConverter); // Add the operand entry to the instruction kind conversion row. 
ConversionRow.push_back(ID); ConversionRow.push_back(0); @@ -2289,9 +2298,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, OS << "static const uint8_t TiedAsmOperandTable[][3] = {\n"; for (auto &KV : TiedOperandsEnumMap) { - OS << " /* " << KV.second << " */ { " - << utostr(std::get<0>(KV.first)) << ", " - << utostr(std::get<1>(KV.first)) << ", " + OS << " /* " << KV.second << " */ { " << utostr(std::get<0>(KV.first)) + << ", " << utostr(std::get<1>(KV.first)) << ", " << utostr(std::get<2>(KV.first)) << " },\n"; } OS << "};\n\n"; @@ -2402,7 +2410,8 @@ static void emitMatchClassEnumeration(CodeGenTarget &Target, /// emitOperandMatchErrorDiagStrings - Emit a function to get the diagnostic text to be /// used when an assembly operand does not match the expected operand class. -static void emitOperandMatchErrorDiagStrings(AsmMatcherInfo &Info, raw_ostream &OS) { +static void emitOperandMatchErrorDiagStrings(AsmMatcherInfo &Info, + raw_ostream &OS) { // If the target does not use DiagnosticString for any operands, don't emit // an unused function. if (llvm::all_of(Info.Classes, [](const ClassInfo &CI) { @@ -2415,12 +2424,12 @@ static void emitOperandMatchErrorDiagStrings(AsmMatcherInfo &Info, raw_ostream & << "MatchResultTy MatchResult) {\n"; OS << " switch (MatchResult) {\n"; - for (const auto &CI: Info.Classes) { + for (const auto &CI : Info.Classes) { if (!CI.DiagnosticString.empty()) { assert(!CI.DiagnosticType.empty() && "DiagnosticString set without DiagnosticType"); - OS << " case " << Info.Target.getName() - << "AsmParser::Match_" << CI.DiagnosticType << ":\n"; + OS << " case " << Info.Target.getName() << "AsmParser::Match_" + << CI.DiagnosticType << ":\n"; OS << " return \"" << CI.DiagnosticString << "\";\n"; } } @@ -2441,7 +2450,7 @@ static void emitRegisterMatchErrorFunc(AsmMatcherInfo &Info, raw_ostream &OS) { OS << " return MCTargetAsmParser::Match_InvalidOperand;\n"; } else { OS << " switch (RegisterClass) {\n"; - for (const auto &CI: Info.Classes) { + for (const auto &CI : Info.Classes) { if (CI.isRegisterClass() && !CI.DiagnosticType.empty()) { OS << " case " << CI.Name << ":\n"; OS << " return " << Info.Target.getName() << "AsmParser::Match_" @@ -2458,8 +2467,7 @@ static void emitRegisterMatchErrorFunc(AsmMatcherInfo &Info, raw_ostream &OS) { } /// emitValidateOperandClass - Emit the function to validate an operand class. -static void emitValidateOperandClass(AsmMatcherInfo &Info, - raw_ostream &OS) { +static void emitValidateOperandClass(AsmMatcherInfo &Info, raw_ostream &OS) { OS << "static unsigned validateOperandClass(MCParsedAsmOperand &GOp, " << "MatchClassKind Kind) {\n"; OS << " " << Info.Target.getName() << "Operand &Operand = (" @@ -2495,8 +2503,7 @@ static void emitValidateOperandClass(AsmMatcherInfo &Info, OS << " return " << Info.Target.getName() << "AsmParser::Match_" << CI.DiagnosticType << ";\n"; OS << " break;\n"; - } - else + } else OS << " break;\n"; OS << " }\n"; } @@ -2508,8 +2515,8 @@ static void emitValidateOperandClass(AsmMatcherInfo &Info, OS << " switch (Operand.getReg()) {\n"; OS << " default: OpKind = InvalidMatchClass; break;\n"; for (const auto &RC : Info.RegisterClasses) - OS << " case " << RC.first->getValueAsString("Namespace") << "::" - << RC.first->getName() << ": OpKind = " << RC.second->Name + OS << " case " << RC.first->getValueAsString("Namespace") + << "::" << RC.first->getName() << ": OpKind = " << RC.second->Name << "; break;\n"; OS << " }\n"; OS << " return isSubclass(OpKind, Kind) ?
" @@ -2676,7 +2683,8 @@ static void emitOperandDiagnosticTypes(AsmMatcherInfo &Info, raw_ostream &OS) { Types.insert(OpClassEntry.second->DiagnosticType); } - if (Types.empty()) return; + if (Types.empty()) + return; // Now emit the enum entries. for (StringRef Type : Types) @@ -2709,7 +2717,7 @@ static void emitGetSubtargetFeatureName(AsmMatcherInfo &Info, raw_ostream &OS) { static std::string GetAliasRequiredFeatures(Record *R, const AsmMatcherInfo &Info) { - std::vector ReqFeatures = R->getValueAsListOfDefs("Predicates"); + std::vector ReqFeatures = R->getValueAsListOfDefs("Predicates"); std::string Result; if (ReqFeatures.empty()) @@ -2719,8 +2727,9 @@ static std::string GetAliasRequiredFeatures(Record *R, const SubtargetFeatureInfo *F = Info.getSubtargetFeature(ReqFeatures[i]); if (!F) - PrintFatalError(R->getLoc(), "Predicate '" + ReqFeatures[i]->getName() + - "' is not marked as an AssemblerPredicate!"); + PrintFatalError(R->getLoc(), + "Predicate '" + ReqFeatures[i]->getName() + + "' is not marked as an AssemblerPredicate!"); if (i) Result += " && "; @@ -2731,21 +2740,21 @@ static std::string GetAliasRequiredFeatures(Record *R, return Result; } -static void emitMnemonicAliasVariant(raw_ostream &OS,const AsmMatcherInfo &Info, - std::vector &Aliases, - unsigned Indent = 0, - StringRef AsmParserVariantName = StringRef()){ +static void +emitMnemonicAliasVariant(raw_ostream &OS, const AsmMatcherInfo &Info, + std::vector &Aliases, unsigned Indent = 0, + StringRef AsmParserVariantName = StringRef()) { // Keep track of all the aliases from a mnemonic. Use an std::map so that the // iteration order of the map is stable. - std::map > AliasesFromMnemonic; + std::map> AliasesFromMnemonic; for (Record *R : Aliases) { // FIXME: Allow AssemblerVariantName to be a comma separated list. StringRef AsmVariantName = R->getValueAsString("AsmVariantName"); if (AsmVariantName != AsmParserVariantName) continue; - AliasesFromMnemonic[R->getValueAsString("FromMnemonic").lower()] - .push_back(R); + AliasesFromMnemonic[R->getValueAsString("FromMnemonic").lower()].push_back( + R); } if (AliasesFromMnemonic.empty()) return; @@ -2754,7 +2763,7 @@ static void emitMnemonicAliasVariant(raw_ostream &OS,const AsmMatcherInfo &Info, // by the string remapper. std::vector Cases; for (const auto &AliasEntry : AliasesFromMnemonic) { - const std::vector &ToVec = AliasEntry.second; + const std::vector &ToVec = AliasEntry.second; // Loop through each alias and emit code that handles each case. If there // are two instructions without predicates, emit an error. 
If there is one, @@ -2818,12 +2827,13 @@ static bool emitMnemonicAliases(raw_ostream &OS, const AsmMatcherInfo &Info, if (!MatchPrefix.empty()) return false; - std::vector Aliases = - Info.getRecords().getAllDerivedDefinitions("MnemonicAlias"); - if (Aliases.empty()) return false; + std::vector Aliases = + Info.getRecords().getAllDerivedDefinitions("MnemonicAlias"); + if (Aliases.empty()) + return false; OS << "static void applyMnemonicAliases(StringRef &Mnemonic, " - "const FeatureBitset &Features, unsigned VariantID) {\n"; + "const FeatureBitset &Features, unsigned VariantID) {\n"; OS << " switch (VariantID) {\n"; unsigned VariantCount = Target.getAsmParserVariantCount(); for (unsigned VC = 0; VC != VariantCount; ++VC) { @@ -2859,17 +2869,15 @@ emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target, // Emit the static custom operand parsing table; OS << "namespace {\n"; OS << " struct OperandMatchEntry {\n"; - OS << " " << getMinimalTypeForRange(MaxMnemonicIndex) - << " Mnemonic;\n"; - OS << " " << getMinimalTypeForRange(MaxMask) - << " OperandMask;\n"; + OS << " " << getMinimalTypeForRange(MaxMnemonicIndex) << " Mnemonic;\n"; + OS << " " << getMinimalTypeForRange(MaxMask) << " OperandMask;\n"; OS << " " << getMinimalTypeForRange( std::distance(Info.Classes.begin(), Info.Classes.end()) + 2 /* Include 'InvalidMatchClass' and 'OptionalMatchClass' */) << " Class;\n"; OS << " " << getMinimalTypeForRange(MaxFeaturesIndex) - << " RequiredFeaturesIdx;\n\n"; + << " RequiredFeaturesIdx;\n\n"; OS << " StringRef getMnemonic() const {\n"; OS << " return StringRef(MnemonicTable + Mnemonic + 1,\n"; OS << " MnemonicTable[Mnemonic]);\n"; @@ -2903,13 +2911,13 @@ emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target, // Store a pascal-style length byte in the mnemonic. std::string LenMnemonic = char(II.Mnemonic.size()) + II.Mnemonic.lower(); - OS << StringTable.GetOrAddStringOffset(LenMnemonic, false) - << " /* " << II.Mnemonic << " */, "; + OS << StringTable.GetOrAddStringOffset(LenMnemonic, false) << " /* " + << II.Mnemonic << " */, "; OS << OMI.OperandMask; OS << " /* "; ListSeparator LS; - for (int i = 0, e = 31; i !=e; ++i) + for (int i = 0, e = 31; i != e; ++i) if (OMI.OperandMask & (1 << i)) OS << LS << i; OS << " */, "; @@ -2958,7 +2966,8 @@ emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target, // Emit code to get the available features. 
OS << " // Get the current feature set.\n"; - OS << " const FeatureBitset &AvailableFeatures = getAvailableFeatures();\n\n"; + OS << " const FeatureBitset &AvailableFeatures = " + "getAvailableFeatures();\n\n"; OS << " // Get the next operand index.\n"; OS << " unsigned NextOpNum = Operands.size()" @@ -3064,7 +3073,7 @@ static void emitMnemonicSpellChecker(raw_ostream &OS, CodeGenTarget &Target, << "MnemonicSpellCheck(StringRef S, const FeatureBitset &FBS," << " unsigned VariantID) {\n"; if (!VariantCount) - OS << " return \"\";"; + OS << " return \"\";"; else { OS << " const unsigned MaxEditDist = 2;\n"; OS << " std::vector Candidates;\n"; @@ -3112,10 +3121,8 @@ static void emitMnemonicSpellChecker(raw_ostream &OS, CodeGenTarget &Target, OS << "\n"; } -static void emitMnemonicChecker(raw_ostream &OS, - CodeGenTarget &Target, - unsigned VariantCount, - bool HasMnemonicFirst, +static void emitMnemonicChecker(raw_ostream &OS, CodeGenTarget &Target, + unsigned VariantCount, bool HasMnemonicFirst, bool HasMnemonicAliases) { OS << "static bool " << Target.getName() << "CheckMnemonic(StringRef Mnemonic,\n"; @@ -3125,7 +3132,7 @@ static void emitMnemonicChecker(raw_ostream &OS, << "unsigned VariantID) {\n"; if (!VariantCount) { - OS << " return false;\n"; + OS << " return false;\n"; } else { if (HasMnemonicAliases) { OS << " // Process all MnemonicAliases to remap the mnemonic.\n"; @@ -3232,9 +3239,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { #endif DEBUG_WITH_TYPE("instruction_info", { - for (const auto &MI : Info.Matchables) - MI->dump(); - }); + for (const auto &MI : Info.Matchables) + MI->dump(); + }); // Check for ambiguous matchables. DEBUG_WITH_TYPE("ambiguous_instrs", { @@ -3256,8 +3263,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { } } if (NumAmbiguous) - errs() << "warning: " << NumAmbiguous - << " ambiguous matchables!\n"; + errs() << "warning: " << NumAmbiguous << " ambiguous matchables!\n"; }); // Compute the information on the custom operand parsing. 
@@ -3275,12 +3281,14 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << "#undef GET_ASSEMBLER_HEADER\n"; OS << " // This should be included into the middle of the declaration of\n"; OS << " // your subclasses implementation of MCTargetAsmParser.\n"; - OS << " FeatureBitset ComputeAvailableFeatures(const FeatureBitset &FB) const;\n"; + OS << " FeatureBitset ComputeAvailableFeatures(const FeatureBitset &FB) " + "const;\n"; if (HasOptionalOperands) { OS << " void convertToMCInst(unsigned Kind, MCInst &Inst, " << "unsigned Opcode,\n" << " const OperandVector &Operands,\n" - << " const SmallBitVector &OptionalOperandsMask);\n"; + << " const SmallBitVector " + "&OptionalOperandsMask);\n"; } else { OS << " void convertToMCInst(unsigned Kind, MCInst &Inst, " << "unsigned Opcode,\n" @@ -3291,7 +3299,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " unsigned MatchInstructionImpl(const OperandVector &Operands,\n" << " MCInst &Inst,\n"; if (ReportMultipleNearMisses) - OS << " SmallVectorImpl *NearMisses,\n"; + OS << " SmallVectorImpl " + "*NearMisses,\n"; else OS << " uint64_t &ErrorInfo,\n" << " FeatureBitset &MissingFeatures,\n"; @@ -3304,11 +3313,11 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { << " bool matchingInlineAsm,\n" << " unsigned VariantID = 0) {\n" << " FeatureBitset MissingFeatures;\n" - << " return MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures,\n" + << " return MatchInstructionImpl(Operands, Inst, ErrorInfo, " + "MissingFeatures,\n" << " matchingInlineAsm, VariantID);\n" << " }\n\n"; - if (!Info.OperandMatchInfo.empty()) { OS << " ParseStatus MatchOperandParserImpl(\n"; OS << " OperandVector &Operands,\n"; @@ -3362,9 +3371,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { // Generate the convertToMCInst function to convert operands into an MCInst. // Also, generate the convertToMapAndConstraints function for MS-style inline // assembly. The latter doesn't actually generate a MCInst. - unsigned NumConverters = emitConvertFuncs(Target, ClassName, Info.Matchables, - HasMnemonicFirst, - HasOptionalOperands, OS); + unsigned NumConverters = + emitConvertFuncs(Target, ClassName, Info.Matchables, HasMnemonicFirst, + HasOptionalOperands, OS); // Emit the enumeration for classes which participate in matching. emitMatchClassEnumeration(Target, Info.Classes, OS); @@ -3406,8 +3415,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { // Store a pascal-style length byte in the mnemonic. std::string LenMnemonic = char(MI->Mnemonic.size()) + MI->Mnemonic.lower(); - MaxMnemonicIndex = std::max(MaxMnemonicIndex, - StringTable.GetOrAddStringOffset(LenMnemonic, false)); + MaxMnemonicIndex = std::max( + MaxMnemonicIndex, StringTable.GetOrAddStringOffset(LenMnemonic, false)); } OS << "static const char MnemonicTable[] =\n"; @@ -3476,13 +3485,11 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { // following the mnemonic. 
OS << "namespace {\n"; OS << " struct MatchEntry {\n"; - OS << " " << getMinimalTypeForRange(MaxMnemonicIndex) - << " Mnemonic;\n"; + OS << " " << getMinimalTypeForRange(MaxMnemonicIndex) << " Mnemonic;\n"; OS << " uint16_t Opcode;\n"; - OS << " " << getMinimalTypeForRange(NumConverters) - << " ConvertFn;\n"; + OS << " " << getMinimalTypeForRange(NumConverters) << " ConvertFn;\n"; OS << " " << getMinimalTypeForRange(FeatureBitsets.size()) - << " RequiredFeaturesIdx;\n"; + << " RequiredFeaturesIdx;\n"; OS << " " << getMinimalTypeForRange( std::distance(Info.Classes.begin(), Info.Classes.end()) + @@ -3524,9 +3531,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { std::string LenMnemonic = char(MI->Mnemonic.size()) + MI->Mnemonic.lower(); OS << " { " << StringTable.GetOrAddStringOffset(LenMnemonic, false) - << " /* " << MI->Mnemonic << " */, " - << Target.getInstNamespace() << "::" - << MI->getResultInst()->TheDef->getName() << ", " + << " /* " << MI->Mnemonic << " */, " << Target.getInstNamespace() + << "::" << MI->getResultInst()->TheDef->getName() << ", " << MI->ConversionFnKind << ", "; // Write the required features mask. @@ -3563,17 +3569,17 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { if (!ReportMultipleNearMisses) { OS << " // Eliminate obvious mismatches.\n"; - OS << " if (Operands.size() > " - << (MaxNumOperands + HasMnemonicFirst) << ") {\n"; - OS << " ErrorInfo = " - << (MaxNumOperands + HasMnemonicFirst) << ";\n"; + OS << " if (Operands.size() > " << (MaxNumOperands + HasMnemonicFirst) + << ") {\n"; + OS << " ErrorInfo = " << (MaxNumOperands + HasMnemonicFirst) << ";\n"; OS << " return Match_InvalidOperand;\n"; OS << " }\n\n"; } // Emit code to get the available features. OS << " // Get the current feature set.\n"; - OS << " const FeatureBitset &AvailableFeatures = getAvailableFeatures();\n\n"; + OS << " const FeatureBitset &AvailableFeatures = " + "getAvailableFeatures();\n\n"; OS << " // Get the instruction mnemonic, which is the first token.\n"; if (HasMnemonicFirst) { @@ -3632,7 +3638,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { "std::equal_range(Start, End, Mnemonic.lower(), LessOpcode());\n\n"; } - OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"AsmMatcher: found \" <<\n" + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"AsmMatcher: found \" " + "<<\n" << " std::distance(MnemonicRange.first, MnemonicRange.second) <<\n" << " \" encodings with mnemonic '\" << Mnemonic << \"'\\n\");\n\n"; @@ -3647,15 +3654,20 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { "FeatureBitsets[it->RequiredFeaturesIdx];\n"; OS << " bool HasRequiredFeatures =\n"; OS << " (AvailableFeatures & RequiredFeatures) == RequiredFeatures;\n"; - OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Trying to match opcode \"\n"; - OS << " << MII.getName(it->Opcode) << \"\\n\");\n"; + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Trying to match " + "opcode \"\n"; + OS << " << MII.getName(it->Opcode) " + "<< \"\\n\");\n"; if (ReportMultipleNearMisses) { - OS << " // Some state to record ways in which this instruction did not match.\n"; + OS << " // Some state to record ways in which this instruction did not " + "match.\n"; OS << " NearMissInfo OperandNearMiss = NearMissInfo::getSuccess();\n"; OS << " NearMissInfo FeaturesNearMiss = NearMissInfo::getSuccess();\n"; - OS << " NearMissInfo EarlyPredicateNearMiss = NearMissInfo::getSuccess();\n"; - OS << " NearMissInfo LatePredicateNearMiss = NearMissInfo::getSuccess();\n"; + OS << " NearMissInfo EarlyPredicateNearMiss = " + 
"NearMissInfo::getSuccess();\n"; + OS << " NearMissInfo LatePredicateNearMiss = " + "NearMissInfo::getSuccess();\n"; OS << " bool MultipleInvalidOperands = false;\n"; } @@ -3676,30 +3688,39 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " auto Formal = " << "static_cast(it->Classes[FormalIdx]);\n"; OS << " DEBUG_WITH_TYPE(\"asm-matcher\",\n"; - OS << " dbgs() << \" Matching formal operand class \" << getMatchClassName(Formal)\n"; - OS << " << \" against actual operand at index \" << ActualIdx);\n"; + OS << " dbgs() << \" Matching formal operand class \" " + "<< getMatchClassName(Formal)\n"; + OS << " << \" against actual operand at index \" " + "<< ActualIdx);\n"; OS << " if (ActualIdx < Operands.size())\n"; OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \" (\";\n"; - OS << " Operands[ActualIdx]->print(dbgs()); dbgs() << \"): \");\n"; + OS << " Operands[ActualIdx]->print(dbgs()); dbgs() << " + "\"): \");\n"; OS << " else\n"; OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \": \");\n"; OS << " if (ActualIdx >= Operands.size()) {\n"; OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"actual operand " "index out of range\\n\");\n"; if (ReportMultipleNearMisses) { - OS << " bool ThisOperandValid = (Formal == " <<"InvalidMatchClass) || " - "isSubclass(Formal, OptionalMatchClass);\n"; + OS << " bool ThisOperandValid = (Formal == " + << "InvalidMatchClass) || " + "isSubclass(Formal, OptionalMatchClass);\n"; OS << " if (!ThisOperandValid) {\n"; OS << " if (!OperandNearMiss) {\n"; OS << " // Record info about match failure for later use.\n"; - OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"recording too-few-operands near miss\\n\");\n"; + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"recording " + "too-few-operands near miss\\n\");\n"; OS << " OperandNearMiss =\n"; - OS << " NearMissInfo::getTooFewOperands(Formal, it->Opcode);\n"; - OS << " } else if (OperandNearMiss.getKind() != NearMissInfo::NearMissTooFewOperands) {\n"; - OS << " // If more than one operand is invalid, give up on this match entry.\n"; + OS << " NearMissInfo::getTooFewOperands(Formal, " + "it->Opcode);\n"; + OS << " } else if (OperandNearMiss.getKind() != " + "NearMissInfo::NearMissTooFewOperands) {\n"; + OS << " // If more than one operand is invalid, give up on this " + "match entry.\n"; OS << " DEBUG_WITH_TYPE(\n"; OS << " \"asm-matcher\",\n"; - OS << " dbgs() << \"second invalid operand, giving up on this opcode\\n\");\n"; + OS << " dbgs() << \"second invalid operand, giving up on " + "this opcode\\n\");\n"; OS << " MultipleInvalidOperands = true;\n"; OS << " break;\n"; OS << " }\n"; @@ -3731,17 +3752,20 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " unsigned Diag = validateOperandClass(Actual, Formal);\n"; OS << " if (Diag == Match_Success) {\n"; OS << " DEBUG_WITH_TYPE(\"asm-matcher\",\n"; - OS << " dbgs() << \"match success using generic matcher\\n\");\n"; + OS << " dbgs() << \"match success using generic " + "matcher\\n\");\n"; OS << " ++ActualIdx;\n"; OS << " continue;\n"; OS << " }\n"; OS << " // If the generic handler indicates an invalid operand\n"; OS << " // failure, check for a special case.\n"; OS << " if (Diag != Match_Success) {\n"; - OS << " unsigned TargetDiag = validateTargetOperandClass(Actual, Formal);\n"; + OS << " unsigned TargetDiag = validateTargetOperandClass(Actual, " + "Formal);\n"; OS << " if (TargetDiag == Match_Success) {\n"; OS << " DEBUG_WITH_TYPE(\"asm-matcher\",\n"; - OS << " dbgs() << \"match success using target matcher\\n\");\n"; + OS << " dbgs() << 
\"match success using target " + "matcher\\n\");\n"; OS << " ++ActualIdx;\n"; OS << " continue;\n"; OS << " }\n"; @@ -3758,38 +3782,46 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { if (HasOptionalOperands) { OS << " OptionalOperandsMask.set(FormalIdx);\n"; } - OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"ignoring optional operand\\n\");\n"; + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"ignoring " + "optional operand\\n\");\n"; OS << " continue;\n"; OS << " }\n"; if (ReportMultipleNearMisses) { OS << " if (!OperandNearMiss) {\n"; - OS << " // If this is the first invalid operand we have seen, record some\n"; + OS << " // If this is the first invalid operand we have seen, " + "record some\n"; OS << " // information about it.\n"; OS << " DEBUG_WITH_TYPE(\n"; OS << " \"asm-matcher\",\n"; OS << " dbgs()\n"; - OS << " << \"operand match failed, recording near-miss with diag code \"\n"; + OS << " << \"operand match failed, recording near-miss with " + "diag code \"\n"; OS << " << Diag << \"\\n\");\n"; OS << " OperandNearMiss =\n"; - OS << " NearMissInfo::getMissedOperand(Diag, Formal, it->Opcode, ActualIdx);\n"; + OS << " NearMissInfo::getMissedOperand(Diag, Formal, " + "it->Opcode, ActualIdx);\n"; OS << " ++ActualIdx;\n"; OS << " } else {\n"; - OS << " // If more than one operand is invalid, give up on this match entry.\n"; + OS << " // If more than one operand is invalid, give up on this " + "match entry.\n"; OS << " DEBUG_WITH_TYPE(\n"; OS << " \"asm-matcher\",\n"; - OS << " dbgs() << \"second operand mismatch, skipping this opcode\\n\");\n"; + OS << " dbgs() << \"second operand mismatch, skipping this " + "opcode\\n\");\n"; OS << " MultipleInvalidOperands = true;\n"; OS << " break;\n"; OS << " }\n"; OS << " }\n\n"; } else { - OS << " // If this operand is broken for all of the instances of this\n"; + OS << " // If this operand is broken for all of the instances of " + "this\n"; OS << " // mnemonic, keep track of it so we can report loc info.\n"; OS << " // If we already had a match that only failed due to a\n"; OS << " // target predicate, that diagnostic is preferred.\n"; OS << " if (!HadMatchOtherThanPredicate &&\n"; - OS << " (it == MnemonicRange.first || ErrorInfo <= ActualIdx)) {\n"; + OS << " (it == MnemonicRange.first || ErrorInfo <= ActualIdx)) " + "{\n"; OS << " if (HasRequiredFeatures && (ErrorInfo != ActualIdx || Diag " "!= Match_InvalidOperand))\n"; OS << " RetCode = Diag;\n"; @@ -3805,8 +3837,10 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " if (MultipleInvalidOperands) {\n"; else OS << " if (!OperandsValid) {\n"; - OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: multiple \"\n"; - OS << " \"operand mismatches, ignoring \"\n"; + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: " + "multiple \"\n"; + OS << " \"operand mismatches, " + "ignoring \"\n"; OS << " \"this opcode\\n\");\n"; OS << " continue;\n"; OS << " }\n"; @@ -3817,13 +3851,16 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " HadMatchOtherThanFeatures = true;\n"; OS << " FeatureBitset NewMissingFeatures = RequiredFeatures & " "~AvailableFeatures;\n"; - OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Missing target features:\";\n"; - OS << " for (unsigned I = 0, E = NewMissingFeatures.size(); I != E; ++I)\n"; + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Missing target " + "features:\";\n"; + OS << " for (unsigned I = 0, E = " + "NewMissingFeatures.size(); I != E; ++I)\n"; OS << " if (NewMissingFeatures[I])\n"; OS << " dbgs() << 
' ' << I;\n"; OS << " dbgs() << \"\\n\");\n"; if (ReportMultipleNearMisses) { - OS << " FeaturesNearMiss = NearMissInfo::getMissedFeature(NewMissingFeatures);\n"; + OS << " FeaturesNearMiss = " "NearMissInfo::getMissedFeature(NewMissingFeatures);\n"; } else { OS << " if (NewMissingFeatures.count() <=\n" " MissingFeatures.count())\n"; @@ -3848,10 +3885,12 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { << " Inst.clear();\n"; OS << " DEBUG_WITH_TYPE(\n"; OS << " \"asm-matcher\",\n"; - OS << " dbgs() << \"Early target match predicate failed with diag code \"\n"; + OS << " dbgs() << \"Early target match predicate failed with diag " "code \"\n"; OS << " << MatchResult << \"\\n\");\n"; if (ReportMultipleNearMisses) { - OS << " EarlyPredicateNearMiss = NearMissInfo::getMissedPredicate(MatchResult);\n"; + OS << " EarlyPredicateNearMiss = " "NearMissInfo::getMissedPredicate(MatchResult);\n"; } else { OS << " RetCode = MatchResult;\n" << " HadMatchOtherThanPredicate = true;\n" @@ -3860,20 +3899,27 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " }\n\n"; if (ReportMultipleNearMisses) { - OS << " // If we did not successfully match the operands, then we can't convert to\n"; + OS << " // If we did not successfully match the operands, then we can't " "convert to\n"; OS << " // an MCInst, so bail out on this instruction variant now.\n"; OS << " if (OperandNearMiss) {\n"; - OS << " // If the operand mismatch was the only problem, report it as " "a near-miss.\n"; + OS << " if (NearMisses && !FeaturesNearMiss && " "!EarlyPredicateNearMiss) {\n"; OS << " DEBUG_WITH_TYPE(\n"; OS << " \"asm-matcher\",\n"; OS << " dbgs()\n"; - OS << " << \"Opcode result: one mismatched operand, adding near-miss\\n\");\n"; + OS << " << \"Opcode result: one mismatched operand, adding " "near-miss\\n\");\n"; OS << " NearMisses->push_back(OperandNearMiss);\n"; OS << " } else {\n"; - OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: multiple \"\n"; - OS << " \"types of mismatch, so not \"\n"; - OS << " \"reporting near-miss\\n\");\n"; + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: " "multiple \"\n"; + OS << " \"types of " "mismatch, so not \"\n"; + OS << " \"reporting " "near-miss\\n\");\n"; OS << " }\n"; OS << " continue;\n"; OS << " }\n\n"; @@ -3905,11 +3951,13 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { << " if ((MatchResult = checkTargetMatchPredicate(Inst)) !=" << " Match_Success) {\n" << " DEBUG_WITH_TYPE(\"asm-matcher\",\n" - << " dbgs() << \"Target match predicate failed with diag code \"\n" + << " dbgs() << \"Target match predicate failed with " "diag code \"\n" << " << MatchResult << \"\\n\");\n" << " Inst.clear();\n"; if (ReportMultipleNearMisses) { - OS << " LatePredicateNearMiss = NearMissInfo::getMissedPredicate(MatchResult);\n"; + OS << " LatePredicateNearMiss = " "NearMissInfo::getMissedPredicate(MatchResult);\n"; } else { OS << " RetCode = MatchResult;\n" << " HadMatchOtherThanPredicate = true;\n" @@ -3923,10 +3971,14 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " (int)(bool)EarlyPredicateNearMiss +\n"; OS << " (int)(bool)LatePredicateNearMiss);\n"; OS << " if (NumNearMisses == 1) {\n"; - OS << " // We had exactly one type of near-miss, so add that to the list.\n"; - OS << " assert(!OperandNearMiss && \"OperandNearMiss was handled earlier\");\n"; - OS << " 
DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: found one type of \"\n"; - OS << " \"mismatch, so reporting a \"\n"; + OS << " // We had exactly one type of near-miss, so add that to the " + "list.\n"; + OS << " assert(!OperandNearMiss && \"OperandNearMiss was handled " + "earlier\");\n"; + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: " + "found one type of \"\n"; + OS << " \"mismatch, so " + "reporting a \"\n"; OS << " \"near-miss\\n\");\n"; OS << " if (NearMisses && FeaturesNearMiss)\n"; OS << " NearMisses->push_back(FeaturesNearMiss);\n"; @@ -3937,10 +3989,14 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << "\n"; OS << " continue;\n"; OS << " } else if (NumNearMisses > 1) {\n"; - OS << " // This instruction missed in more than one way, so ignore it.\n"; - OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: multiple \"\n"; - OS << " \"types of mismatch, so not \"\n"; - OS << " \"reporting near-miss\\n\");\n"; + OS << " // This instruction missed in more than one way, so ignore " + "it.\n"; + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: " + "multiple \"\n"; + OS << " \"types of mismatch, " + "so not \"\n"; + OS << " \"reporting " + "near-miss\\n\");\n"; OS << " continue;\n"; OS << " }\n"; } @@ -3952,7 +4008,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { if (HasDeprecation) { OS << " std::string Info;\n"; - OS << " if (!getParser().getTargetParser().getTargetOptions().MCNoDeprecatedWarn &&\n"; + OS << " if " + "(!getParser().getTargetParser().getTargetOptions()." + "MCNoDeprecatedWarn &&\n"; OS << " MII.getDeprecatedInfo(Inst, getSTI(), Info)) {\n"; OS << " SMLoc Loc = ((" << Target.getName() << "Operand &)*Operands[0]).getStartLoc();\n"; @@ -3969,7 +4027,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " DEBUG_WITH_TYPE(\n"; OS << " \"asm-matcher\",\n"; - OS << " dbgs() << \"Opcode result: complete match, selecting this opcode\\n\");\n"; + OS << " dbgs() << \"Opcode result: complete match, selecting this " + "opcode\\n\");\n"; OS << " return Match_Success;\n"; OS << " }\n\n"; @@ -4002,8 +4061,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << "\n#ifdef GET_MNEMONIC_CHECKER\n"; OS << "#undef GET_MNEMONIC_CHECKER\n\n"; - emitMnemonicChecker(OS, Target, VariantCount, - HasMnemonicFirst, HasMnemonicAliases); + emitMnemonicChecker(OS, Target, VariantCount, HasMnemonicFirst, + HasMnemonicAliases); OS << "#endif // GET_MNEMONIC_CHECKER\n\n"; } diff --git a/llvm/utils/TableGen/AsmWriterEmitter.cpp b/llvm/utils/TableGen/AsmWriterEmitter.cpp index e0cd5fa..c05991f 100644 --- a/llvm/utils/TableGen/AsmWriterEmitter.cpp +++ b/llvm/utils/TableGen/AsmWriterEmitter.cpp @@ -64,6 +64,7 @@ public: AsmWriterEmitter(RecordKeeper &R); void run(raw_ostream &o); + private: void EmitGetMnemonic( raw_ostream &o, @@ -84,9 +85,9 @@ private: } // end anonymous namespace -static void PrintCases(std::vector> &OpsToPrint, raw_ostream &O, - bool PassSubtarget) { +static void +PrintCases(std::vector> &OpsToPrint, + raw_ostream &O, bool PassSubtarget) { O << " case " << OpsToPrint.back().first << ":"; AsmWriterOperand TheOp = OpsToPrint.back().second; OpsToPrint.pop_back(); @@ -94,9 +95,9 @@ static void PrintCases(std::vector &Insts, - raw_ostream &O, bool PassSubtarget) { +static void EmitInstructions(std::vector &Insts, raw_ostream &O, + bool PassSubtarget) { AsmWriterInst FirstInst = Insts.back(); Insts.pop_back(); std::vector SimilarInsts; unsigned DifferingOperand = ~0; for (unsigned i = Insts.size(); i != 0; --i) { - 
unsigned DiffOp = Insts[i-1].MatchesAllButOneOp(FirstInst); + unsigned DiffOp = Insts[i - 1].MatchesAllButOneOp(FirstInst); if (DiffOp != ~1U) { - if (DifferingOperand == ~0U) // First match! + if (DifferingOperand == ~0U) // First match! DifferingOperand = DiffOp; // If this differs in the same operand as the rest of the instructions in // this class, move it to the SimilarInsts list. if (DifferingOperand == DiffOp || DiffOp == ~0U) { - SimilarInsts.push_back(Insts[i-1]); - Insts.erase(Insts.begin()+i-1); + SimilarInsts.push_back(Insts[i - 1]); + Insts.erase(Insts.begin() + i - 1); } } } - O << " case " << FirstInst.CGI->Namespace << "::" - << FirstInst.CGI->TheDef->getName() << ":\n"; + O << " case " << FirstInst.CGI->Namespace + << "::" << FirstInst.CGI->TheDef->getName() << ":\n"; for (const AsmWriterInst &AWI : SimilarInsts) - O << " case " << AWI.CGI->Namespace << "::" - << AWI.CGI->TheDef->getName() << ":\n"; + O << " case " << AWI.CGI->Namespace << "::" << AWI.CGI->TheDef->getName() + << ":\n"; for (unsigned i = 0, e = FirstInst.Operands.size(); i != e; ++i) { if (i != DifferingOperand) { // If the operand is the same for all instructions, just print it. @@ -143,14 +144,15 @@ static void EmitInstructions(std::vector &Insts, O << " switch (MI->getOpcode()) {\n"; O << " default: llvm_unreachable(\"Unexpected opcode.\");\n"; std::vector> OpsToPrint; - OpsToPrint.push_back(std::make_pair(FirstInst.CGI->Namespace.str() + "::" + - FirstInst.CGI->TheDef->getName().str(), - FirstInst.Operands[i])); + OpsToPrint.push_back( + std::make_pair(FirstInst.CGI->Namespace.str() + + "::" + FirstInst.CGI->TheDef->getName().str(), + FirstInst.Operands[i])); for (const AsmWriterInst &AWI : SimilarInsts) { - OpsToPrint.push_back(std::make_pair(AWI.CGI->Namespace.str()+"::" + - AWI.CGI->TheDef->getName().str(), - AWI.Operands[i])); + OpsToPrint.push_back(std::make_pair( + AWI.CGI->Namespace.str() + "::" + AWI.CGI->TheDef->getName().str(), + AWI.Operands[i])); } std::reverse(OpsToPrint.begin(), OpsToPrint.end()); while (!OpsToPrint.empty()) @@ -162,11 +164,10 @@ static void EmitInstructions(std::vector &Insts, O << " break;\n"; } -void AsmWriterEmitter:: -FindUniqueOperandCommands(std::vector &UniqueOperandCommands, - std::vector> &InstIdxs, - std::vector &InstOpsUsed, - bool PassSubtarget) const { +void AsmWriterEmitter::FindUniqueOperandCommands( + std::vector &UniqueOperandCommands, + std::vector> &InstIdxs, + std::vector &InstOpsUsed, bool PassSubtarget) const { // This vector parallels UniqueOperandCommands, keeping track of which // instructions each case are used for. It is a comma separated string of // enums. @@ -177,9 +178,10 @@ FindUniqueOperandCommands(std::vector &UniqueOperandCommands, for (size_t i = 0, e = Instructions.size(); i != e; ++i) { const AsmWriterInst &Inst = Instructions[i]; if (Inst.Operands.empty()) - continue; // Instruction already done. + continue; // Instruction already done. - std::string Command = " "+Inst.Operands[0].getCode(PassSubtarget)+"\n"; + std::string Command = + " " + Inst.Operands[0].getCode(PassSubtarget) + "\n"; // Check to see if we already have 'Command' in UniqueOperandCommands. // If not, add it. @@ -203,12 +205,12 @@ FindUniqueOperandCommands(std::vector &UniqueOperandCommands, // For each entry of UniqueOperandCommands, there is a set of instructions // that uses it. If the next command of all instructions in the set are // identical, fold it into the command. 
- for (size_t CommandIdx = 0, e = UniqueOperandCommands.size(); - CommandIdx != e; ++CommandIdx) { + for (size_t CommandIdx = 0, e = UniqueOperandCommands.size(); CommandIdx != e; + ++CommandIdx) { const auto &Idxs = InstIdxs[CommandIdx]; - for (unsigned Op = 1; ; ++Op) { + for (unsigned Op = 1;; ++Op) { // Find the first instruction in the set. const AsmWriterInst &FirstInst = Instructions[Idxs.front()]; // If this instruction has no more operands, there isn't anything to merge // into this command. @@ -227,8 +229,8 @@ FindUniqueOperandCommands(std::vector &UniqueOperandCommands, // Okay, everything in this command set has the same next operand. Add it // to UniqueOperandCommands and remember that it was consumed. - std::string Command = " " + - FirstInst.Operands[Op].getCode(PassSubtarget) + "\n"; + std::string Command = + " " + FirstInst.Operands[Op].getCode(PassSubtarget) + "\n"; UniqueOperandCommands[CommandIdx] += Command; InstOpsUsed[CommandIdx]++; @@ -239,35 +241,58 @@ FindUniqueOperandCommands(std::vector &UniqueOperandCommands, for (unsigned i = 0, e = InstrsForCase.size(); i != e; ++i) { std::string Instrs = InstrsForCase[i]; if (Instrs.size() > 70) { - Instrs.erase(Instrs.begin()+70, Instrs.end()); + Instrs.erase(Instrs.begin() + 70, Instrs.end()); Instrs += "..."; } if (!Instrs.empty()) - UniqueOperandCommands[i] = " // " + Instrs + "\n" + - UniqueOperandCommands[i]; + UniqueOperandCommands[i] = + " // " + Instrs + "\n" + UniqueOperandCommands[i]; } } static void UnescapeString(std::string &Str) { for (unsigned i = 0; i != Str.size(); ++i) { - if (Str[i] == '\\' && i != Str.size()-1) { - switch (Str[i+1]) { - default: continue; // Don't execute the code after the switch. - case 'a': Str[i] = '\a'; break; - case 'b': Str[i] = '\b'; break; - case 'e': Str[i] = 27; break; - case 'f': Str[i] = '\f'; break; - case 'n': Str[i] = '\n'; break; - case 'r': Str[i] = '\r'; break; - case 't': Str[i] = '\t'; break; - case 'v': Str[i] = '\v'; break; - case '"': Str[i] = '\"'; break; - case '\'': Str[i] = '\''; break; - case '\\': Str[i] = '\\'; break; + if (Str[i] == '\\' && i != Str.size() - 1) { + switch (Str[i + 1]) { + default: + continue; // Don't execute the code after the switch. + case 'a': + Str[i] = '\a'; + break; + case 'b': + Str[i] = '\b'; + break; + case 'e': + Str[i] = 27; + break; + case 'f': + Str[i] = '\f'; + break; + case 'n': + Str[i] = '\n'; + break; + case 'r': + Str[i] = '\r'; + break; + case 't': + Str[i] = '\t'; + break; + case 'v': + Str[i] = '\v'; + break; + case '"': + Str[i] = '\"'; + break; + case '\'': + Str[i] = '\''; + break; + case '\\': + Str[i] = '\\'; + break; } // Nuke the second character. - Str.erase(Str.begin()+i+1); } } } @@ -281,14 +306,19 @@ static void UnescapeString(std::string &Str) { /// causes non-standard escape character warnings. static void UnescapeAliasString(std::string &Str) { for (unsigned i = 0; i != Str.size(); ++i) { - if (Str[i] == '\\' && i != Str.size()-1) { - switch (Str[i+1]) { - default: continue; // Don't execute the code after the switch. - case '{': Str[i] = '{'; break; - case '}': Str[i] = '}'; break; + if (Str[i] == '\\' && i != Str.size() - 1) { + switch (Str[i + 1]) { + default: + continue; // Don't execute the code after the switch. + case '{': + Str[i] = '{'; + break; + case '}': + Str[i] = '}'; + break; } // Nuke the second character.
- Str.erase(Str.begin()+i+1); + Str.erase(Str.begin() + i + 1); } } } @@ -318,8 +348,7 @@ void AsmWriterEmitter::EmitGetMnemonic( // Add all strings to the string table upfront so it can generate an optimized // representation. for (AsmWriterInst &AWI : Instructions) { - if (AWI.Operands[0].OperandType == - AsmWriterOperand::isLiteralTextOperand && + if (AWI.Operands[0].OperandType == AsmWriterOperand::isLiteralTextOperand && !AWI.Operands[0].Str.empty()) { std::string Str = AWI.Operands[0].Str; UnescapeString(Str); @@ -347,7 +376,7 @@ void AsmWriterEmitter::EmitGetMnemonic( } // Bias offset by one since we want 0 as a sentinel. - OpcodeInfo[AWI.CGIIndex] = Idx+1; + OpcodeInfo[AWI.CGIIndex] = Idx + 1; } // Figure out how many bits we used for the string index. @@ -365,7 +394,8 @@ void AsmWriterEmitter::EmitGetMnemonic( NumInstOpsHandled, PassSubtarget); // If we ran out of operands to print, we're done. - if (UniqueOperandCommands.empty()) break; + if (UniqueOperandCommands.empty()) + break; // Compute the number of bits we need to represent these cases, this is // ceil(log2(numentries)). @@ -383,14 +413,14 @@ void AsmWriterEmitter::EmitGetMnemonic( unsigned NumOps = NumInstOpsHandled[i]; for (unsigned Idx : InstIdxs[i]) { OpcodeInfo[Instructions[Idx].CGIIndex] |= - (uint64_t)i << (OpcodeInfoBits-BitsLeft); + (uint64_t)i << (OpcodeInfoBits - BitsLeft); // Remove the info about this operand from the instruction. AsmWriterInst &Inst = Instructions[Idx]; if (!Inst.Operands.empty()) { assert(NumOps <= Inst.Operands.size() && "Can't remove this many ops!"); Inst.Operands.erase(Inst.Operands.begin(), - Inst.Operands.begin()+NumOps); + Inst.Operands.begin() + NumOps); } } } @@ -487,7 +517,7 @@ void AsmWriterEmitter::EmitPrintInstruction( << " assert(Bits != 0 && \"Cannot print this instruction.\");\n"; // Output the table driven operand information. - BitsLeft = OpcodeInfoBits-AsmStrBits; + BitsLeft = OpcodeInfoBits - AsmStrBits; for (unsigned i = 0, e = TableDrivenOperandPrinters.size(); i != e; ++i) { std::vector &Commands = TableDrivenOperandPrinters[i]; assert(NumBits <= BitsLeft && "consistency error"); // Emit code to extract this field from Bits. - O << "\n // Fragment " << i << " encoded into " << NumBits - << " bits for " << Commands.size() << " unique commands.\n"; + O << "\n // Fragment " << i << " encoded into " << NumBits << " bits for " + << Commands.size() << " unique commands.\n"; if (Commands.size() == 2) { // Emit two possibilities with if/else. - O << " if ((Bits >> " - << (OpcodeInfoBits-BitsLeft) << ") & " - << ((1 << NumBits)-1) << ") {\n" - << Commands[1] - << " } else {\n" - << Commands[0] - << " }\n\n"; + O << " if ((Bits >> " << (OpcodeInfoBits - BitsLeft) << ") & " + << ((1 << NumBits) - 1) << ") {\n" + << Commands[1] << " } else {\n" + << Commands[0] << " }\n\n"; } else if (Commands.size() == 1) { // Emit a single possibility. O << Commands[0] << "\n\n"; } else { - O << " switch ((Bits >> " - << (OpcodeInfoBits-BitsLeft) << ") & " - << ((1 << NumBits)-1) << ") {\n" + O << " switch ((Bits >> " << (OpcodeInfoBits - BitsLeft) << ") & " + << ((1 << NumBits) - 1) << ") {\n" << " default: llvm_unreachable(\"Invalid command number.\");\n"; // Print out all the cases. @@ -537,7 +563,6 @@ void AsmWriterEmitter::EmitPrintInstruction( // elements in the vector.
std::reverse(Instructions.begin(), Instructions.end()); - // Now that we've emitted all of the operand info that fit into 64 bits, emit // information for those instructions that are left. This is a less dense // encoding, but we expect the main 64-bit table to handle the majority of @@ -572,22 +597,21 @@ emitRegisterNameString(raw_ostream &O, StringRef AltName, AsmName = std::string(Reg.getName()); } else { // Make sure the register has an alternate name for this index. - std::vector AltNameList = - Reg.TheDef->getValueAsListOfDefs("RegAltNameIndices"); + std::vector AltNameList = + Reg.TheDef->getValueAsListOfDefs("RegAltNameIndices"); unsigned Idx = 0, e; for (e = AltNameList.size(); - Idx < e && (AltNameList[Idx]->getName() != AltName); - ++Idx) + Idx < e && (AltNameList[Idx]->getName() != AltName); ++Idx) ; // If the register has an alternate name for this index, use it. // Otherwise, leave it empty as an error flag. if (Idx < e) { std::vector AltNames = - Reg.TheDef->getValueAsListOfStrings("AltNames"); + Reg.TheDef->getValueAsListOfStrings("AltNames"); if (AltNames.size() <= Idx) PrintFatalError(Reg.TheDef->getLoc(), "Register definition missing alt name for '" + - AltName + "'."); + AltName + "'."); AsmName = std::string(AltNames[Idx]); } } @@ -613,15 +637,17 @@ void AsmWriterEmitter::EmitGetRegisterName(raw_ostream &O) { Record *AsmWriter = Target.getAsmWriter(); StringRef ClassName = AsmWriter->getValueAsString("AsmWriterClassName"); const auto &Registers = Target.getRegBank().getRegisters(); - const std::vector &AltNameIndices = Target.getRegAltNameIndices(); + const std::vector &AltNameIndices = Target.getRegAltNameIndices(); bool hasAltNames = AltNameIndices.size() > 1; StringRef Namespace = Registers.front().TheDef->getValueAsString("Namespace"); - O << - "\n\n/// getRegisterName - This method is automatically generated by tblgen\n" - "/// from the register set description. This returns the assembler name\n" - "/// for the specified register.\n" - "const char *" << Target.getName() << ClassName << "::"; + O << "\n\n/// getRegisterName - This method is automatically generated by " + "tblgen\n" + "/// from the register set description. This returns the assembler " + "name\n" + "/// for the specified register.\n" + "const char *" + << Target.getName() << ClassName << "::"; if (hasAltNames) O << "\ngetRegisterName(MCRegister Reg, unsigned AltIdx) {\n"; else @@ -695,8 +721,7 @@ public: void addOperand(StringRef Op, int OpIdx, int PrintMethodIdx = -1) { assert(OpIdx >= 0 && OpIdx < 0xFE && "Idx out of range"); - assert(PrintMethodIdx >= -1 && PrintMethodIdx < 0xFF && - "Idx out of range"); + assert(PrintMethodIdx >= -1 && PrintMethodIdx < 0xFF && "Idx out of range"); OpMap[Op] = std::make_pair(OpIdx, PrintMethodIdx); } @@ -791,7 +816,7 @@ namespace { struct AliasPriorityComparator { typedef std::pair ValueType; bool operator()(const ValueType &LHS, const ValueType &RHS) const { - if (LHS.second == RHS.second) { + if (LHS.second == RHS.second) { // We don't actually care about the order, but for consistency it // shouldn't depend on pointer comparisons. 
return LessRecordByID()(LHS.first.TheDef, RHS.first.TheDef); @@ -819,8 +844,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { unsigned Variant = AsmWriter->getValueAsInt("Variant"); bool PassSubtarget = AsmWriter->getValueAsInt("PassSubtarget"); - std::vector AllInstAliases = - Records.getAllDerivedDefinitions("InstAlias"); + std::vector AllInstAliases = + Records.getAllDerivedDefinitions("InstAlias"); // Create a map from the qualified name to a list of potential matches. typedef std::set, AliasPriorityComparator> @@ -843,8 +868,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { std::vector> PrintMethods; // A list of MCOperandPredicates for all operands in use, and the reverse map - std::vector MCOpPredicates; - DenseMap MCOpPredicateMap; + std::vector MCOpPredicates; + DenseMap MCOpPredicateMap; for (auto &Aliases : AliasMap) { // Collection of instruction alias rules. May contain ambiguous rules. @@ -854,8 +879,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { const CodeGenInstAlias &CGA = Alias.first; unsigned LastOpNo = CGA.ResultInstOperandIndex.size(); std::string FlatInstAsmString = - CodeGenInstruction::FlattenAsmStringVariants(CGA.ResultInst->AsmString, - Variant); + CodeGenInstruction::FlattenAsmStringVariants( + CGA.ResultInst->AsmString, Variant); unsigned NumResultOps = CountNumOperands(FlatInstAsmString, Variant); std::string FlatAliasAsmString = @@ -881,8 +906,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { unsigned OpNum = Operands.getSubOperandNumber(MIOpNum).first; if (Operands[OpNum].MINumOperands == 1 && Operands[OpNum].getTiedRegister() != -1) { - // Tied operands of different RegisterClass should be explicit within - // an instruction's syntax and so cannot be skipped. + // Tied operands of different RegisterClass should be explicit + // within an instruction's syntax and so cannot be skipped. int TiedOpNum = Operands[OpNum].getTiedRegister(); if (Operands[OpNum].Rec->getName() == Operands[TiedOpNum].Rec->getName()) { @@ -1083,7 +1108,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { if (It == IAPrinterMap.end()) continue; std::vector &IAPs = It->second; - std::vector UniqueIAPs; + std::vector UniqueIAPs; // Remove any ambiguous alias rules. 
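AliasPriorityComparator orders aliases by priority and breaks ties with a stable record ID so iteration order never depends on heap addresses. The same idea in standalone form, with simplified stand-ins for CodeGenInstAlias and Record:

#include <cstdio>
#include <set>
#include <string>
#include <utility>

struct FakeRecord {
  unsigned ID; // stands in for Record::getID()
  std::string Name;
};

struct PriorityThenID {
  using ValueType = std::pair<const FakeRecord *, int>; // (alias, priority)
  bool operator()(const ValueType &LHS, const ValueType &RHS) const {
    if (LHS.second == RHS.second)
      return LHS.first->ID < RHS.first->ID; // deterministic tie-break
    return LHS.second > RHS.second;         // larger priorities first
  }
};

int main() {
  FakeRecord A{1, "aliasA"}, B{2, "aliasB"};
  std::set<PriorityThenID::ValueType, PriorityThenID> Aliases{
      {&B, 3}, {&A, 3}, {&A, 5}};
  for (const auto &[Rec, Prio] : Aliases)
    printf("%s (priority %d)\n", Rec->Name.c_str(), Prio);
}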
for (auto &LHS : IAPs) { @@ -1099,7 +1124,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { UniqueIAPs.push_back(&LHS); } - if (UniqueIAPs.empty()) continue; + if (UniqueIAPs.empty()) + continue; unsigned PatternStart = PatternCount; @@ -1193,7 +1219,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { if (MCOpPredicates.empty()) O.indent(2) << " nullptr,\n"; else - O.indent(2) << " &" << Target.getName() << ClassName << "ValidateMCOperand,\n"; + O.indent(2) << " &" << Target.getName() << ClassName + << "ValidateMCOperand,\n"; O.indent(2) << "};\n"; O.indent(2) << "const char *AsmString = matchAliasPatterns(MI, " @@ -1262,21 +1289,22 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { << " break;\n"; } O << " }\n"; - } + } O << "}\n\n"; if (!MCOpPredicates.empty()) { O << "static bool " << Target.getName() << ClassName << "ValidateMCOperand(const MCOperand &MCOp,\n" << " const MCSubtargetInfo &STI,\n" - << " unsigned PredicateIndex) {\n" + << " unsigned PredicateIndex) {\n" << " switch (PredicateIndex) {\n" << " default:\n" << " llvm_unreachable(\"Unknown MCOperandPredicate kind\");\n" << " break;\n"; for (unsigned i = 0; i < MCOpPredicates.size(); ++i) { - StringRef MCOpPred = MCOpPredicates[i]->getValueAsString("MCOperandPredicate"); + StringRef MCOpPred = + MCOpPredicates[i]->getValueAsString("MCOperandPredicate"); O << " case " << i + 1 << ": {\n" << MCOpPred.data() << "\n" << " }\n"; diff --git a/llvm/utils/TableGen/AsmWriterInst.cpp b/llvm/utils/TableGen/AsmWriterInst.cpp index c955859..1fa609e 100644 --- a/llvm/utils/TableGen/AsmWriterInst.cpp +++ b/llvm/utils/TableGen/AsmWriterInst.cpp @@ -57,54 +57,55 @@ AsmWriterInst::AsmWriterInst(const CodeGenInstruction &CGI, unsigned CGIIndex, std::string::size_type LastEmitted = 0; while (LastEmitted != AsmString.size()) { std::string::size_type DollarPos = - AsmString.find_first_of("$\\", LastEmitted); - if (DollarPos == std::string::npos) DollarPos = AsmString.size(); + AsmString.find_first_of("$\\", LastEmitted); + if (DollarPos == std::string::npos) + DollarPos = AsmString.size(); // Emit a constant string fragment. 
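The `case i + 1` bodies emitted above are biased because predicate index 0 is reserved to mean "no predicate attached". A simplified model of such a validator; the operand stub and the uimm5 range check are invented, and the real generated function leaves index 0 to its callers rather than handling it in the switch:

#include <cassert>

struct MCOperandStub {
  int Imm;
};

static bool validateMCOperand(const MCOperandStub &MCOp,
                              unsigned PredicateIndex) {
  switch (PredicateIndex) {
  default:
    assert(false && "Unknown MCOperandPredicate kind");
    return false;
  case 0:
    return true; // no predicate attached to this operand
  case 1:
    return MCOp.Imm >= 0 && MCOp.Imm < 32; // e.g. a uimm5 range check
  }
}

int main() {
  assert(validateMCOperand({7}, 1));
  assert(!validateMCOperand({64}, 1));
  assert(validateMCOperand({-5}, 0));
}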
if (DollarPos != LastEmitted) { for (; LastEmitted != DollarPos; ++LastEmitted) switch (AsmString[LastEmitted]) { - case '\n': - AddLiteralString("\\n"); - break; - case '\t': - AddLiteralString("\\t"); - break; - case '"': - AddLiteralString("\\\""); - break; - case '\\': - AddLiteralString("\\\\"); - break; - default: - AddLiteralString(std::string(1, AsmString[LastEmitted])); - break; + case '\n': + AddLiteralString("\\n"); + break; + case '\t': + AddLiteralString("\\t"); + break; + case '"': + AddLiteralString("\\\""); + break; + case '\\': + AddLiteralString("\\\\"); + break; + default: + AddLiteralString(std::string(1, AsmString[LastEmitted])); + break; } } else if (AsmString[DollarPos] == '\\') { - if (DollarPos+1 != AsmString.size()) { - if (AsmString[DollarPos+1] == 'n') { + if (DollarPos + 1 != AsmString.size()) { + if (AsmString[DollarPos + 1] == 'n') { AddLiteralString("\\n"); - } else if (AsmString[DollarPos+1] == 't') { + } else if (AsmString[DollarPos + 1] == 't') { AddLiteralString("\\t"); - } else if (std::string("${|}\\").find(AsmString[DollarPos+1]) - != std::string::npos) { - AddLiteralString(std::string(1, AsmString[DollarPos+1])); + } else if (std::string("${|}\\").find(AsmString[DollarPos + 1]) != + std::string::npos) { + AddLiteralString(std::string(1, AsmString[DollarPos + 1])); } else { PrintFatalError( CGI.TheDef->getLoc(), "Non-supported escaped character found in instruction '" + CGI.TheDef->getName() + "'!"); } - LastEmitted = DollarPos+2; + LastEmitted = DollarPos + 2; continue; } - } else if (DollarPos+1 != AsmString.size() && - AsmString[DollarPos+1] == '$') { - AddLiteralString("$"); // "$$" -> $ - LastEmitted = DollarPos+2; + } else if (DollarPos + 1 != AsmString.size() && + AsmString[DollarPos + 1] == '$') { + AddLiteralString("$"); // "$$" -> $ + LastEmitted = DollarPos + 2; } else { // Get the name of the variable. - std::string::size_type VarEnd = DollarPos+1; + std::string::size_type VarEnd = DollarPos + 1; // handle ${foo}bar as $foo by detecting whether the character following // the dollar sign is a curly brace. If so, advance VarEnd and DollarPos @@ -118,7 +119,8 @@ AsmWriterInst::AsmWriterInst(const CodeGenInstruction &CGI, unsigned CGIIndex, while (VarEnd < AsmString.size() && isIdentChar(AsmString[VarEnd])) ++VarEnd; - StringRef VarName(AsmString.data()+DollarPos+1, VarEnd-DollarPos-1); + StringRef VarName(AsmString.data() + DollarPos + 1, + VarEnd - DollarPos - 1); // Modifier - Support ${foo:modifier} syntax, where "modifier" is passed // into printOperand. Also support ${:feature}, which is passed into @@ -190,13 +192,14 @@ AsmWriterInst::AsmWriterInst(const CodeGenInstruction &CGI, unsigned CGIIndex, /// specified instruction except for one differing operand, return the differing /// operand number. If more than one operand mismatches, return ~1, otherwise /// if the instructions are identical return ~0. -unsigned AsmWriterInst::MatchesAllButOneOp(const AsmWriterInst &Other)const{ - if (Operands.size() != Other.Operands.size()) return ~1; +unsigned AsmWriterInst::MatchesAllButOneOp(const AsmWriterInst &Other) const { + if (Operands.size() != Other.Operands.size()) + return ~1; unsigned MismatchOperand = ~0U; for (unsigned i = 0, e = Operands.size(); i != e; ++i) { if (Operands[i] != Other.Operands[i]) { - if (MismatchOperand != ~0U) // Already have one mismatch? + if (MismatchOperand != ~0U) // Already have one mismatch? 
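As a standalone illustration of what this scanning loop accomplishes, the reduced version below handles only literal text, backslash escapes, and the "$$" escape; operand references such as $dst, which the real constructor turns into AsmWriterOperand entries, are replaced by '?' in this sketch:

#include <cctype>
#include <cstdio>
#include <string>

static std::string flatten(const std::string &AsmString) {
  std::string Out;
  std::string::size_type LastEmitted = 0;
  while (LastEmitted != AsmString.size()) {
    std::string::size_type DollarPos =
        AsmString.find_first_of("$\\", LastEmitted);
    if (DollarPos == std::string::npos)
      DollarPos = AsmString.size();

    Out.append(AsmString, LastEmitted, DollarPos - LastEmitted);
    if (DollarPos == AsmString.size())
      break;

    if (AsmString[DollarPos] == '\\' && DollarPos + 1 != AsmString.size()) {
      char C = AsmString[DollarPos + 1];
      Out += (C == 'n') ? '\n' : (C == 't') ? '\t' : C;
      LastEmitted = DollarPos + 2;
    } else if (DollarPos + 1 != AsmString.size() &&
               AsmString[DollarPos + 1] == '$') {
      Out += '$'; // "$$" -> $
      LastEmitted = DollarPos + 2;
    } else {
      Out += '?'; // operand reference like $src; elided in this sketch
      LastEmitted = DollarPos + 1;
      while (LastEmitted < AsmString.size() &&
             (std::isalnum((unsigned char)AsmString[LastEmitted]) ||
              AsmString[LastEmitted] == '_'))
        ++LastEmitted;
    }
  }
  return Out;
}

int main() {
  // Prints "addi ?, ?, $5" followed by a newline.
  printf("%s", flatten("addi $dst, $src1, $$5\\n").c_str());
}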
return ~1U; MismatchOperand = i; } diff --git a/llvm/utils/TableGen/AsmWriterInst.h b/llvm/utils/TableGen/AsmWriterInst.h index 9c93e82..f0ebf79 100644 --- a/llvm/utils/TableGen/AsmWriterInst.h +++ b/llvm/utils/TableGen/AsmWriterInst.h @@ -20,88 +20,88 @@ #include namespace llvm { - class CodeGenInstruction; - - struct AsmWriterOperand { - enum OpType { - // Output this text surrounded by quotes to the asm. - isLiteralTextOperand, - // This is the name of a routine to call to print the operand. - isMachineInstrOperand, - // Output this text verbatim to the asm writer. It is code that - // will output some text to the asm. - isLiteralStatementOperand - } OperandType; - - /// MiOpNo - For isMachineInstrOperand, this is the operand number of the - /// machine instruction. - unsigned MIOpNo = 0; - - /// Str - For isLiteralTextOperand, this IS the literal text. For - /// isMachineInstrOperand, this is the PrinterMethodName for the operand.. - /// For isLiteralStatementOperand, this is the code to insert verbatim - /// into the asm writer. - std::string Str; - - /// MiModifier - For isMachineInstrOperand, this is the modifier string for - /// an operand, specified with syntax like ${opname:modifier}. - std::string MiModifier; - - bool PCRel = false; - - // To make VS STL happy - AsmWriterOperand(OpType op = isLiteralTextOperand):OperandType(op) {} - - AsmWriterOperand(const std::string &LitStr, - OpType op = isLiteralTextOperand) - : OperandType(op), Str(LitStr) {} - - AsmWriterOperand(const std::string &Printer, unsigned _MIOpNo, - const std::string &Modifier, - OpType op = isMachineInstrOperand, bool PCRel = false) - : OperandType(op), MIOpNo(_MIOpNo), Str(Printer), MiModifier(Modifier), - PCRel(PCRel) {} - - bool operator!=(const AsmWriterOperand &Other) const { - if (OperandType != Other.OperandType || Str != Other.Str) return true; - if (OperandType == isMachineInstrOperand) - return MIOpNo != Other.MIOpNo || MiModifier != Other.MiModifier || - PCRel != Other.PCRel; - return false; - } - bool operator==(const AsmWriterOperand &Other) const { - return !operator!=(Other); - } - - /// getCode - Return the code that prints this operand. - std::string getCode(bool PassSubtarget) const; - }; - - class AsmWriterInst { - public: - std::vector Operands; - const CodeGenInstruction *CGI; - unsigned CGIIndex; - - AsmWriterInst(const CodeGenInstruction &CGI, unsigned CGIIndex, - unsigned Variant); - - /// MatchesAllButOneOp - If this instruction is exactly identical to the - /// specified instruction except for one differing operand, return the - /// differing operand number. Otherwise return ~0. - unsigned MatchesAllButOneOp(const AsmWriterInst &Other) const; - - private: - void AddLiteralString(const std::string &Str) { - // If the last operand was already a literal text string, append this to - // it, otherwise add a new operand. - if (!Operands.empty() && - Operands.back().OperandType == AsmWriterOperand::isLiteralTextOperand) - Operands.back().Str.append(Str); - else - Operands.push_back(AsmWriterOperand(Str)); - } - }; -} +class CodeGenInstruction; + +struct AsmWriterOperand { + enum OpType { + // Output this text surrounded by quotes to the asm. + isLiteralTextOperand, + // This is the name of a routine to call to print the operand. + isMachineInstrOperand, + // Output this text verbatim to the asm writer. It is code that + // will output some text to the asm. 
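MatchesAllButOneOp folds three outcomes into one unsigned return value: ~0U for "identical", ~1U for "more than one mismatch", otherwise the index of the single differing operand. The same convention in isolation, over plain int vectors:

#include <cassert>
#include <vector>

static unsigned matchesAllButOne(const std::vector<int> &A,
                                 const std::vector<int> &B) {
  if (A.size() != B.size())
    return ~1U;
  unsigned Mismatch = ~0U;
  for (unsigned i = 0, e = A.size(); i != e; ++i)
    if (A[i] != B[i]) {
      if (Mismatch != ~0U) // already have one mismatch?
        return ~1U;
      Mismatch = i;
    }
  return Mismatch;
}

int main() {
  assert(matchesAllButOne({1, 2, 3}, {1, 2, 3}) == ~0U);
  assert(matchesAllButOne({1, 2, 3}, {1, 9, 3}) == 1);
  assert(matchesAllButOne({1, 2, 3}, {7, 9, 3}) == ~1U);
}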
+ isLiteralStatementOperand + } OperandType; + + /// MiOpNo - For isMachineInstrOperand, this is the operand number of the + /// machine instruction. + unsigned MIOpNo = 0; + + /// Str - For isLiteralTextOperand, this IS the literal text. For + /// isMachineInstrOperand, this is the PrinterMethodName for the operand.. + /// For isLiteralStatementOperand, this is the code to insert verbatim + /// into the asm writer. + std::string Str; + + /// MiModifier - For isMachineInstrOperand, this is the modifier string for + /// an operand, specified with syntax like ${opname:modifier}. + std::string MiModifier; + + bool PCRel = false; + + // To make VS STL happy + AsmWriterOperand(OpType op = isLiteralTextOperand) : OperandType(op) {} + + AsmWriterOperand(const std::string &LitStr, OpType op = isLiteralTextOperand) + : OperandType(op), Str(LitStr) {} + + AsmWriterOperand(const std::string &Printer, unsigned _MIOpNo, + const std::string &Modifier, + OpType op = isMachineInstrOperand, bool PCRel = false) + : OperandType(op), MIOpNo(_MIOpNo), Str(Printer), MiModifier(Modifier), + PCRel(PCRel) {} + + bool operator!=(const AsmWriterOperand &Other) const { + if (OperandType != Other.OperandType || Str != Other.Str) + return true; + if (OperandType == isMachineInstrOperand) + return MIOpNo != Other.MIOpNo || MiModifier != Other.MiModifier || + PCRel != Other.PCRel; + return false; + } + bool operator==(const AsmWriterOperand &Other) const { + return !operator!=(Other); + } + + /// getCode - Return the code that prints this operand. + std::string getCode(bool PassSubtarget) const; +}; + +class AsmWriterInst { +public: + std::vector Operands; + const CodeGenInstruction *CGI; + unsigned CGIIndex; + + AsmWriterInst(const CodeGenInstruction &CGI, unsigned CGIIndex, + unsigned Variant); + + /// MatchesAllButOneOp - If this instruction is exactly identical to the + /// specified instruction except for one differing operand, return the + /// differing operand number. Otherwise return ~0. + unsigned MatchesAllButOneOp(const AsmWriterInst &Other) const; + +private: + void AddLiteralString(const std::string &Str) { + // If the last operand was already a literal text string, append this to + // it, otherwise add a new operand. + if (!Operands.empty() && + Operands.back().OperandType == AsmWriterOperand::isLiteralTextOperand) + Operands.back().Str.append(Str); + else + Operands.push_back(AsmWriterOperand(Str)); + } +}; +} // namespace llvm #endif diff --git a/llvm/utils/TableGen/CTagsEmitter.cpp b/llvm/utils/TableGen/CTagsEmitter.cpp index b8e27d0..bda18936 100644 --- a/llvm/utils/TableGen/CTagsEmitter.cpp +++ b/llvm/utils/TableGen/CTagsEmitter.cpp @@ -1,4 +1,4 @@ -//===- CTagsEmitter.cpp - Generate ctags-compatible index ------------------===// +//===- CTagsEmitter.cpp - Generate ctags-compatible index -----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -30,6 +30,7 @@ private: StringRef Id; StringRef BufferIdentifier; unsigned Line; + public: Tag(StringRef Name, const SMLoc Location) : Id(Name) { const MemoryBuffer *CurMB = @@ -39,7 +40,8 @@ public: Line = LineAndColumn.first; } int operator<(const Tag &B) const { - return std::make_tuple(Id, BufferIdentifier, Line) < std::make_tuple(B.Id, B.BufferIdentifier, B.Line); + return std::make_tuple(Id, BufferIdentifier, Line) < + std::make_tuple(B.Id, B.BufferIdentifier, B.Line); } void emit(raw_ostream &OS) const { OS << Id << "\t" << BufferIdentifier << "\t" << Line << "\n"; @@ -49,6 +51,7 @@ public: class CTagsEmitter { private: RecordKeeper &Records; + public: CTagsEmitter(RecordKeeper &R) : Records(R) {} diff --git a/llvm/utils/TableGen/CallingConvEmitter.cpp b/llvm/utils/TableGen/CallingConvEmitter.cpp index 02e7000..3c3a287 100644 --- a/llvm/utils/TableGen/CallingConvEmitter.cpp +++ b/llvm/utils/TableGen/CallingConvEmitter.cpp @@ -117,23 +117,24 @@ void CallingConvEmitter::EmitCallingConv(Record *CC, raw_ostream &O) { O << "\n"; EmitAction(Action, 2, O); } - + O << "\n return true; // CC didn't match.\n"; O << "}\n"; } -void CallingConvEmitter::EmitAction(Record *Action, - unsigned Indent, raw_ostream &O) { +void CallingConvEmitter::EmitAction(Record *Action, unsigned Indent, + raw_ostream &O) { std::string IndentStr = std::string(Indent, ' '); if (Action->isSubClassOf("CCPredicateAction")) { O << IndentStr << "if ("; - + if (Action->isSubClassOf("CCIfType")) { ListInit *VTs = Action->getValueAsListInit("VTs"); for (unsigned i = 0, e = VTs->size(); i != e; ++i) { Record *VT = VTs->getElementAsRecord(i); - if (i != 0) O << " ||\n " << IndentStr; + if (i != 0) + O << " ||\n " << IndentStr; O << "LocVT == " << getEnumName(getValueType(VT)); } @@ -143,9 +144,9 @@ void CallingConvEmitter::EmitAction(Record *Action, errs() << *Action; PrintFatalError(Action->getLoc(), "Unknown CCPredicateAction!"); } - + O << ") {\n"; - EmitAction(Action->getValueAsDef("SubAction"), Indent+2, O); + EmitAction(Action->getValueAsDef("SubAction"), Indent + 2, O); O << IndentStr << "}\n"; } else { if (Action->isSubClassOf("CCDelegateTo")) { @@ -241,8 +242,8 @@ void CallingConvEmitter::EmitAction(Record *Action, O << "\n" << IndentStr << "};\n"; O << IndentStr << "if (unsigned Reg = State.AllocateReg(RegList" - << RegListNumber << ", " << "RegList" << ShadowRegListNumber - << ")) {\n"; + << RegListNumber << ", " + << "RegList" << ShadowRegListNumber << ")) {\n"; } O << IndentStr << " State.addLoc(CCValAssign::getReg(ValNo, ValVT, " << "Reg, LocVT, LocInfo));\n"; @@ -257,7 +258,8 @@ void CallingConvEmitter::EmitAction(Record *Action, if (Size) O << Size << ", "; else - O << "\n" << IndentStr + O << "\n" + << IndentStr << " State.getMachineFunction().getDataLayout()." "getTypeAllocSize(EVT(LocVT).getTypeForEVT(State.getContext()))," " "; @@ -269,8 +271,8 @@ void CallingConvEmitter::EmitAction(Record *Action, << " State.getMachineFunction().getDataLayout()." 
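The Tag comparison above leans on std::tuple's lexicographic operator< instead of hand-chained field comparisons. The same idiom standalone, with simplified fields (the real class declares operator< returning int, a quirk the reformat preserves):

#include <cassert>
#include <string>
#include <tuple>

struct Tag {
  std::string Id, Buffer;
  unsigned Line;
  bool operator<(const Tag &B) const {
    return std::make_tuple(Id, Buffer, Line) <
           std::make_tuple(B.Id, B.Buffer, B.Line);
  }
};

int main() {
  Tag A{"foo", "a.td", 10}, B{"foo", "a.td", 12}, C{"bar", "b.td", 1};
  assert(A < B); // same Id and buffer, earlier line wins
  assert(C < A); // "bar" sorts before "foo"
}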
"getABITypeAlign(EVT(LocVT).getTypeForEVT(State.getContext()" "))"; - O << ");\n" << IndentStr - << "State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset" + O << ");\n" + << IndentStr << "State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset" << Counter << ", LocVT, LocInfo));\n"; O << IndentStr << "return false;\n"; } else if (Action->isSubClassOf("CCAssignToStackWithShadow")) { @@ -281,7 +283,7 @@ void CallingConvEmitter::EmitAction(Record *Action, unsigned ShadowRegListNumber = ++Counter; O << IndentStr << "static const MCPhysReg ShadowRegList" - << ShadowRegListNumber << "[] = {\n"; + << ShadowRegListNumber << "[] = {\n"; O << IndentStr << " "; ListSeparator LS; for (unsigned i = 0, e = ShadowRegList->size(); i != e; ++i) @@ -297,7 +299,7 @@ void CallingConvEmitter::EmitAction(Record *Action, } else if (Action->isSubClassOf("CCPromoteToType")) { Record *DestTy = Action->getValueAsDef("DestTy"); MVT::SimpleValueType DestVT = getValueType(DestTy); - O << IndentStr << "LocVT = " << getEnumName(DestVT) <<";\n"; + O << IndentStr << "LocVT = " << getEnumName(DestVT) << ";\n"; if (MVT(DestVT).isFloatingPoint()) { O << IndentStr << "LocInfo = CCValAssign::FPExt;\n"; } else { @@ -326,15 +328,18 @@ void CallingConvEmitter::EmitAction(Record *Action, } } else if (Action->isSubClassOf("CCBitConvertToType")) { Record *DestTy = Action->getValueAsDef("DestTy"); - O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) <<";\n"; + O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) + << ";\n"; O << IndentStr << "LocInfo = CCValAssign::BCvt;\n"; } else if (Action->isSubClassOf("CCTruncToType")) { Record *DestTy = Action->getValueAsDef("DestTy"); - O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) <<";\n"; + O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) + << ";\n"; O << IndentStr << "LocInfo = CCValAssign::Trunc;\n"; } else if (Action->isSubClassOf("CCPassIndirect")) { Record *DestTy = Action->getValueAsDef("DestTy"); - O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) <<";\n"; + O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) + << ";\n"; O << IndentStr << "LocInfo = CCValAssign::Indirect;\n"; } else if (Action->isSubClassOf("CCPassByVal")) { int Size = Action->getValueAsInt("Size"); @@ -343,8 +348,8 @@ void CallingConvEmitter::EmitAction(Record *Action, << Size << ", Align(" << Align << "), ArgFlags);\n"; O << IndentStr << "return false;\n"; } else if (Action->isSubClassOf("CCCustom")) { - O << IndentStr - << "if (" << Action->getValueAsString("FuncName") << "(ValNo, ValVT, " + O << IndentStr << "if (" << Action->getValueAsString("FuncName") + << "(ValNo, ValVT, " << "LocVT, LocInfo, ArgFlags, State))\n"; O << IndentStr << " return false;\n"; } else { @@ -376,9 +381,8 @@ void CallingConvEmitter::EmitArgRegisterLists(raw_ostream &O) { std::set &InnerRegisters = InnerEntry.second; if (InnerRegisters.find(CCName) != InnerRegisters.end()) { - AssignedRegsMap[InnerCCName].insert( - AssignedRegsMap[CCName].begin(), - AssignedRegsMap[CCName].end()); + AssignedRegsMap[InnerCCName].insert(AssignedRegsMap[CCName].begin(), + AssignedRegsMap[CCName].end()); InnerRegisters.erase(CCName); } } diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp index 48ed319..d7020d1 100644 --- a/llvm/utils/TableGen/CodeEmitterGen.cpp +++ b/llvm/utils/TableGen/CodeEmitterGen.cpp @@ -78,8 +78,8 @@ private: // If the VarBitInit at position 'bit' matches the specified variable then // return the variable 
bit position. Otherwise return -1. -int CodeEmitterGen::getVariableBit(const std::string &VarName, - BitsInit *BI, int bit) { +int CodeEmitterGen::getVariableBit(const std::string &VarName, BitsInit *BI, + int bit) { if (VarBitInit *VBI = dyn_cast(BI->getBit(bit))) { if (VarInit *VI = dyn_cast(VBI->getBitVar())) if (VI->getName() == VarName) @@ -101,16 +101,16 @@ bool CodeEmitterGen::addCodeToMergeInOperand(Record *R, BitsInit *BI, CodeGenInstruction &CGI = Target.getInstruction(R); // Determine if VarName actually contributes to the Inst encoding. - int bit = BI->getNumBits()-1; + int bit = BI->getNumBits() - 1; // Scan for a bit that this contributed to. - for (; bit >= 0; ) { + for (; bit >= 0;) { if (getVariableBit(VarName, BI, bit) != -1) break; - + --bit; } - + // If we found no bits, ignore this value, otherwise emit the call to get the // operand encoding. if (bit < 0) @@ -127,12 +127,14 @@ bool CodeEmitterGen::addCodeToMergeInOperand(Record *R, BitsInit *BI, // Get the machine operand number for the indicated operand. OpIdx = CGI.Operands[OpIdx].MIOperandNo; } else { - PrintError(R, Twine("No operand named ") + VarName + " in record " + R->getName()); + PrintError(R, Twine("No operand named ") + VarName + " in record " + + R->getName()); return false; } if (CGI.Operands.isFlatOperandNotEmitted(OpIdx)) { - PrintError(R, "Operand " + VarName + " used but also marked as not emitted!"); + PrintError(R, + "Operand " + VarName + " used but also marked as not emitted!"); return false; } @@ -156,10 +158,12 @@ bool CodeEmitterGen::addCodeToMergeInOperand(Record *R, BitsInit *BI, Case += ", Fixups, STI);\n"; } else { if (UseAPInt) { - Case += " getMachineOpValue(MI, MI.getOperand(" + utostr(OpIdx) + ")"; + Case += + " getMachineOpValue(MI, MI.getOperand(" + utostr(OpIdx) + ")"; Case += ", op, Fixups, STI"; } else { - Case += " op = getMachineOpValue(MI, MI.getOperand(" + utostr(OpIdx) + ")"; + Case += " op = getMachineOpValue(MI, MI.getOperand(" + + utostr(OpIdx) + ")"; Case += ", Fixups, STI"; } Case += ");\n"; @@ -193,9 +197,9 @@ bool CodeEmitterGen::addCodeToMergeInOperand(Record *R, BitsInit *BI, } unsigned BitOffset = -1; - for (; bit >= 0; ) { + for (; bit >= 0;) { int varBit = getVariableBit(VarName, BI, bit); - + // If this bit isn't from a variable, skip it. 
if (varBit == -1) { --bit; continue; } @@ -209,7 +213,8 @@ int N = 1; for (--bit; bit >= 0;) { varBit = getVariableBit(VarName, BI, bit); - if (varBit == -1 || varBit != (beginVarBit - N)) break; + if (varBit == -1 || varBit != (beginVarBit - N)) + break; ++N; --bit; } @@ -368,7 +373,9 @@ void CodeEmitterGen::emitInstructionBaseValues( if (R->getValueAsString("Namespace") == "TargetOpcode" || R->getValueAsBit("isPseudo")) { - o << " "; emitInstBits(o, APInt(BitWidth, 0)); o << ",\n"; + o << " "; + emitInstBits(o, APInt(BitWidth, 0)); + o << ",\n"; continue; } @@ -419,13 +426,13 @@ void CodeEmitterGen::run(raw_ostream &o) { emitSourceFileHeader("Machine Code Emitter", o); CodeGenTarget Target(Records); - std::vector<Record*> Insts = Records.getAllDerivedDefinitions("Instruction"); + std::vector<Record *> Insts = Records.getAllDerivedDefinitions("Instruction"); // For little-endian instruction bit encodings, reverse the bit order Target.reverseBitsForLittleEndianEncoding(); - ArrayRef<const CodeGenInstruction*> NumberedInstructions = - Target.getInstructionsByEnumValue(); + ArrayRef<const CodeGenInstruction *> NumberedInstructions = + Target.getInstructionsByEnumValue(); if (any_of(NumberedInstructions, [](const CodeGenInstruction *CGI) { Record *R = CGI->TheDef; diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp index f88e25e..62e0482 100644 --- a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp @@ -38,18 +38,10 @@ using namespace llvm; static inline bool isIntegerOrPtr(MVT VT) { return VT.isInteger() || VT == MVT::iPTR; } -static inline bool isFloatingPoint(MVT VT) { - return VT.isFloatingPoint(); -} -static inline bool isVector(MVT VT) { - return VT.isVector(); -} -static inline bool isScalar(MVT VT) { - return !VT.isVector(); -} -static inline bool isScalarInteger(MVT VT) { - return VT.isScalarInteger(); -} +static inline bool isFloatingPoint(MVT VT) { return VT.isFloatingPoint(); } +static inline bool isVector(MVT VT) { return VT.isVector(); } +static inline bool isScalar(MVT VT) { return !VT.isVector(); } +static inline bool isScalarInteger(MVT VT) { return VT.isScalarInteger(); } template <typename Predicate> static bool berase_if(MachineValueTypeSet &S, Predicate P) { @@ -173,8 +165,7 @@ bool TypeSetByHwMode::constrain(const TypeSetByHwMode &VTS) { return Changed; } -template <typename Predicate> -bool TypeSetByHwMode::constrain(Predicate P) { +template <typename Predicate> bool TypeSetByHwMode::constrain(Predicate P) { bool Changed = false; for (auto &I : *this) Changed |= berase_if(I.second, [&P](MVT VT) { return !P(VT); }); @@ -257,20 +248,18 @@ bool TypeSetByHwMode::operator==(const TypeSetByHwMode &VTS) const { } namespace llvm { - raw_ostream &operator<<(raw_ostream &OS, const MachineValueTypeSet &T) { - T.writeToStream(OS); - return OS; - } - raw_ostream &operator<<(raw_ostream &OS, const TypeSetByHwMode &T) { - T.writeToStream(OS); - return OS; - } +raw_ostream &operator<<(raw_ostream &OS, const MachineValueTypeSet &T) { + T.writeToStream(OS); + return OS; +} +raw_ostream &operator<<(raw_ostream &OS, const TypeSetByHwMode &T) { + T.writeToStream(OS); + return OS; } +} // namespace llvm LLVM_DUMP_METHOD -void TypeSetByHwMode::dump() const { - dbgs() << *this << '\n'; -} +void TypeSetByHwMode::dump() const { dbgs() << *this << '\n'; } bool TypeSetByHwMode::intersect(SetType &Out, const SetType &In) { bool OutP = Out.count(MVT::iPTR), InP = In.count(MVT::iPTR); @@ -335,7 +324,7 @@ bool TypeSetByHwMode::intersect(SetType &Out, const SetType &In) { // OutP == true
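The loop with `varBit != (beginVarBit - N)` scans instruction bits from the MSB and groups bits whose field positions fall in lockstep, so one contiguous run becomes a single emitted operation. The same logic over a toy bit map in which Inst{12-8} carries Field{4-0} (the array stands in for a BitsInit query):

#include <cstdio>

int main() {
  // Maps instruction bit -> field bit, or -1 when the bit is fixed.
  const int BitMap[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
                          0,  1,  2,  3,  4,  -1, -1, -1};
  int bit = 15;
  while (bit >= 0) {
    if (BitMap[bit] == -1) {
      --bit;
      continue;
    }

    // Found the start of a run; extend it while field bits stay contiguous.
    int beginInstBit = bit, beginVarBit = BitMap[bit], N = 1;
    for (--bit; bit >= 0; --bit) {
      if (BitMap[bit] == -1 || BitMap[bit] != beginVarBit - N)
        break;
      ++N;
    }
    printf("Inst{%d-%d} <- Field{%d-%d}\n", beginInstBit,
           beginInstBit - N + 1, beginVarBit, beginVarBit - N + 1);
  }
}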
SetType InOnly = subtract(In, Out); unsigned SizeOut = Out.size(); - berase_if(Out, CompIn); // This will remove at least the iPTR. + berase_if(Out, CompIn); // This will remove at least the iPTR. unsigned NumI = llvm::count_if(InOnly, isScalarInteger); if (NumI == 0) { // iPTR deleted from Out. @@ -568,29 +557,29 @@ bool TypeInfer::EnforceSmallerThan(TypeSetByHwMode &Small, TypeSetByHwMode &Big, // smaller-or-equal than MinS. auto MinS = min_if(S.begin(), S.end(), isScalar, LT); if (MinS != S.end()) - Changed |= berase_if(B, std::bind(SameKindLE, - std::placeholders::_1, *MinS)); + Changed |= + berase_if(B, std::bind(SameKindLE, std::placeholders::_1, *MinS)); // MaxS = max scalar in Big, remove all scalars from Small that are // larger than MaxS. auto MaxS = max_if(B.begin(), B.end(), isScalar, LT); if (MaxS != B.end()) - Changed |= berase_if(S, std::bind(SameKindLE, - *MaxS, std::placeholders::_1)); + Changed |= + berase_if(S, std::bind(SameKindLE, *MaxS, std::placeholders::_1)); // MinV = min vector in Small, remove all vectors from Big that are // smaller-or-equal than MinV. auto MinV = min_if(S.begin(), S.end(), isVector, LT); if (MinV != S.end()) - Changed |= berase_if(B, std::bind(SameKindLE, - std::placeholders::_1, *MinV)); + Changed |= + berase_if(B, std::bind(SameKindLE, std::placeholders::_1, *MinV)); // MaxV = max vector in Big, remove all vectors from Small that are // larger than MaxV. auto MaxV = max_if(B.begin(), B.end(), isVector, LT); if (MaxV != B.end()) - Changed |= berase_if(S, std::bind(SameKindLE, - *MaxV, std::placeholders::_1)); + Changed |= + berase_if(S, std::bind(SameKindLE, *MaxV, std::placeholders::_1)); } return Changed; @@ -618,8 +607,8 @@ bool TypeInfer::EnforceVectorEltTypeIs(TypeSetByHwMode &Vec, TypeSetByHwMode::SetType &V = Vec.get(M); TypeSetByHwMode::SetType &E = Elem.get(M); - Changed |= berase_if(V, isScalar); // Scalar = !vector - Changed |= berase_if(E, isVector); // Vector = !scalar + Changed |= berase_if(V, isScalar); // Scalar = !vector + Changed |= berase_if(E, isVector); // Vector = !scalar assert(!V.empty() && !E.empty()); MachineValueTypeSet VT, ST; @@ -632,8 +621,8 @@ bool TypeInfer::EnforceVectorEltTypeIs(TypeSetByHwMode &Vec, // Remove from V all (vector) types whose element type is not in S. Changed |= berase_if(V, [&ST](MVT T) -> bool { - return !ST.count(T.getVectorElementType()); - }); + return !ST.count(T.getVectorElementType()); + }); // Remove from E all (scalar) types, for which there is no corresponding // type in V. 
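berase_if, used heavily throughout this type-inference code, erases every element matching a predicate and reports whether anything was removed, so callers can accumulate a Changed flag. A minimal version over std::set rather than MachineValueTypeSet:

#include <cassert>
#include <set>

template <typename T, typename Predicate>
bool berase_if(std::set<T> &S, Predicate P) {
  bool Erased = false;
  for (auto I = S.begin(); I != S.end();) {
    if (P(*I)) {
      I = S.erase(I);
      Erased = true;
    } else {
      ++I;
    }
  }
  return Erased;
}

int main() {
  std::set<int> Sizes = {8, 16, 32, 64};
  bool Changed = berase_if(Sizes, [](int N) { return N < 32; });
  assert(Changed && Sizes.size() == 2);                     // 8 and 16 removed
  assert(!berase_if(Sizes, [](int N) { return N > 64; }));  // nothing matches
}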
Changed |= berase_if(E, [&VT](MVT T) -> bool { return !VT.count(T); }); @@ -887,7 +876,6 @@ TypeInfer::ValidateOnExit::~ValidateOnExit() { } } - //===----------------------------------------------------------------------===// // ScopedName Implementation //===----------------------------------------------------------------------===// @@ -896,10 +884,7 @@ bool ScopedName::operator==(const ScopedName &o) const { return Scope == o.Scope && Identifier == o.Identifier; } -bool ScopedName::operator!=(const ScopedName &o) const { - return !(*this == o); -} - +bool ScopedName::operator!=(const ScopedName &o) const { return !(*this == o); } //===----------------------------------------------------------------------===// // TreePredicateFn Implementation @@ -1011,8 +996,9 @@ std::string TreePredicateFn::getPredCode() const { PrintFatalError(getOrigPatFragRecord()->getRecord()->getLoc(), "IsAtomicOrderingAcquireRelease requires IsAtomic"); if (isAtomicOrderingSequentiallyConsistent()) - PrintFatalError(getOrigPatFragRecord()->getRecord()->getLoc(), - "IsAtomicOrderingSequentiallyConsistent requires IsAtomic"); + PrintFatalError( + getOrigPatFragRecord()->getRecord()->getLoc(), + "IsAtomicOrderingSequentiallyConsistent requires IsAtomic"); if (isAtomicOrderingAcquireOrStronger()) PrintFatalError(getOrigPatFragRecord()->getRecord()->getLoc(), "IsAtomicOrderingAcquireOrStronger requires IsAtomic"); @@ -1027,7 +1013,7 @@ std::string TreePredicateFn::getPredCode() const { if (isLoad() || isStore() || isAtomic()) { if (ListInit *AddressSpaces = getAddressSpaces()) { Code += "unsigned AddrSpace = cast(N)->getAddressSpace();\n" - " if ("; + " if ("; ListSeparator LS(" && "); for (Init *Val : AddressSpaces->getValues()) { @@ -1077,18 +1063,22 @@ std::string TreePredicateFn::getPredCode() const { "AtomicOrdering::SequentiallyConsistent) return false;\n"; if (isAtomic() && isAtomicOrderingAcquireOrStronger()) - Code += "if (!isAcquireOrStronger(cast(N)->getMergedOrdering())) " - "return false;\n"; + Code += + "if (!isAcquireOrStronger(cast(N)->getMergedOrdering())) " + "return false;\n"; if (isAtomic() && isAtomicOrderingWeakerThanAcquire()) - Code += "if (isAcquireOrStronger(cast(N)->getMergedOrdering())) " - "return false;\n"; + Code += + "if (isAcquireOrStronger(cast(N)->getMergedOrdering())) " + "return false;\n"; if (isAtomic() && isAtomicOrderingReleaseOrStronger()) - Code += "if (!isReleaseOrStronger(cast(N)->getMergedOrdering())) " - "return false;\n"; + Code += + "if (!isReleaseOrStronger(cast(N)->getMergedOrdering())) " + "return false;\n"; if (isAtomic() && isAtomicOrderingWeakerThanRelease()) - Code += "if (isReleaseOrStronger(cast(N)->getMergedOrdering())) " - "return false;\n"; + Code += + "if (isReleaseOrStronger(cast(N)->getMergedOrdering())) " + "return false;\n"; // TODO: Handle atomic sextload/zextload normally when ATOMIC_LOAD is removed. 
if (isAtomic() && (isZeroExtLoad() || isSignExtLoad())) @@ -1239,16 +1229,20 @@ bool TreePredicateFn::isAtomicOrderingSequentiallyConsistent() const { true); } bool TreePredicateFn::isAtomicOrderingAcquireOrStronger() const { - return isPredefinedPredicateEqualTo("IsAtomicOrderingAcquireOrStronger", true); + return isPredefinedPredicateEqualTo("IsAtomicOrderingAcquireOrStronger", + true); } bool TreePredicateFn::isAtomicOrderingWeakerThanAcquire() const { - return isPredefinedPredicateEqualTo("IsAtomicOrderingAcquireOrStronger", false); + return isPredefinedPredicateEqualTo("IsAtomicOrderingAcquireOrStronger", + false); } bool TreePredicateFn::isAtomicOrderingReleaseOrStronger() const { - return isPredefinedPredicateEqualTo("IsAtomicOrderingReleaseOrStronger", true); + return isPredefinedPredicateEqualTo("IsAtomicOrderingReleaseOrStronger", + true); } bool TreePredicateFn::isAtomicOrderingWeakerThanRelease() const { - return isPredefinedPredicateEqualTo("IsAtomicOrderingReleaseOrStronger", false); + return isPredefinedPredicateEqualTo("IsAtomicOrderingReleaseOrStronger", + false); } Record *TreePredicateFn::getMemoryVT() const { Record *R = getOrigPatFragRecord()->getRecord(); @@ -1428,7 +1422,7 @@ static bool isImmAllOnesAllZerosMatch(const TreePatternNode *P) { /// pattern. static unsigned getPatternSize(const TreePatternNode *P, const CodeGenDAGPatterns &CGP) { - unsigned Size = 3; // The node itself. + unsigned Size = 3; // The node itself. // If the root node is a ConstantSDNode, increases its size. // e.g. (set R32:$dst, 0). if (P->isLeaf() && isa(P->getLeafValue())) @@ -1459,7 +1453,7 @@ static unsigned getPatternSize(const TreePatternNode *P, } if (Child->isLeaf()) { if (isa(Child->getLeafValue())) - Size += 5; // Matches a ConstantSDNode (+3) and a specific value (+2). + Size += 5; // Matches a ConstantSDNode (+3) and a specific value (+2). else if (Child->getComplexPatternInfo(CGP)) Size += getPatternSize(Child, CGP); else if (isImmAllOnesAllZerosMatch(Child)) @@ -1474,8 +1468,7 @@ static unsigned getPatternSize(const TreePatternNode *P, /// Compute the complexity metric for the input pattern. This roughly /// corresponds to the number of nodes that are covered. 
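The getPatternSize scoring above can be mimicked in a few lines: every node costs 3, and a leaf pinned to a specific constant costs 5 (ConstantSDNode +3, specific value +2). A loose model over a stand-in tree type, ignoring the real function's complex-pattern and all-ones/all-zeros cases:

#include <cstdio>
#include <memory>
#include <vector>

struct Node {
  bool IsSpecificConstant = false;
  std::vector<std::unique_ptr<Node>> Children;
};

static unsigned patternSize(const Node &N) {
  unsigned Size = 3; // the node itself
  for (const auto &C : N.Children) {
    if (C->Children.empty() && C->IsSpecificConstant)
      Size += 5; // ConstantSDNode (+3) and a specific value (+2)
    else
      Size += patternSize(*C);
  }
  return Size;
}

int main() {
  // (add GPR:$x, 0): root + one generic leaf + one specific constant.
  Node Root;
  Root.Children.push_back(std::make_unique<Node>());
  auto Zero = std::make_unique<Node>();
  Zero->IsSpecificConstant = true;
  Root.Children.push_back(std::move(Zero));
  printf("complexity ~ %u\n", patternSize(Root)); // 3 + 3 + 5 = 11
}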
-int PatternToMatch:: -getPatternComplexity(const CodeGenDAGPatterns &CGP) const { +int PatternToMatch::getPatternComplexity(const CodeGenDAGPatterns &CGP) const { return getPatternSize(getSrcPattern(), CGP) + getAddedComplexity(); } @@ -1550,18 +1543,17 @@ SDTypeConstraint::SDTypeConstraint(Record *R, const CodeGenHwModes &CGH) { } else if (R->isSubClassOf("SDTCisVTSmallerThanOp")) { ConstraintType = SDTCisVTSmallerThanOp; x.SDTCisVTSmallerThanOp_Info.OtherOperandNum = - R->getValueAsInt("OtherOperandNum"); + R->getValueAsInt("OtherOperandNum"); } else if (R->isSubClassOf("SDTCisOpSmallerThanOp")) { ConstraintType = SDTCisOpSmallerThanOp; x.SDTCisOpSmallerThanOp_Info.BigOperandNum = - R->getValueAsInt("BigOperandNum"); + R->getValueAsInt("BigOperandNum"); } else if (R->isSubClassOf("SDTCisEltOfVec")) { ConstraintType = SDTCisEltOfVec; x.SDTCisEltOfVec_Info.OtherOperandNum = R->getValueAsInt("OtherOpNum"); } else if (R->isSubClassOf("SDTCisSubVecOfVec")) { ConstraintType = SDTCisSubVecOfVec; - x.SDTCisSubVecOfVec_Info.OtherOperandNum = - R->getValueAsInt("OtherOpNum"); + x.SDTCisSubVecOfVec_Info.OtherOperandNum = R->getValueAsInt("OtherOpNum"); } else if (R->isSubClassOf("SDTCVecEltisVT")) { ConstraintType = SDTCVecEltisVT; VVT = getValueTypeByHwMode(R->getValueAsDef("VT"), CGH); @@ -1577,11 +1569,11 @@ SDTypeConstraint::SDTypeConstraint(Record *R, const CodeGenHwModes &CGH) { } else if (R->isSubClassOf("SDTCisSameNumEltsAs")) { ConstraintType = SDTCisSameNumEltsAs; x.SDTCisSameNumEltsAs_Info.OtherOperandNum = - R->getValueAsInt("OtherOperandNum"); + R->getValueAsInt("OtherOperandNum"); } else if (R->isSubClassOf("SDTCisSameSizeAs")) { ConstraintType = SDTCisSameSizeAs; x.SDTCisSameSizeAs_Info.OtherOperandNum = - R->getValueAsInt("OtherOperandNum"); + R->getValueAsInt("OtherOperandNum"); } else { PrintFatalError(R->getLoc(), "Unrecognized SDTypeConstraint '" + R->getName() + "'!\n"); @@ -1604,8 +1596,8 @@ static TreePatternNode *getOperandNum(unsigned OpNo, TreePatternNode *N, if (OpNo >= N->getNumChildren()) { std::string S; raw_string_ostream OS(S); - OS << "Invalid operand number in type constraint " - << (OpNo+NumResults) << " "; + OS << "Invalid operand number in type constraint " << (OpNo + NumResults) + << " "; N->print(OS); PrintFatalError(S); } @@ -1635,7 +1627,7 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, return NodeToApply->UpdateNodeType(ResNo, MVT::iPTR, TP); case SDTCisInt: // Require it to be one of the legal integer VTs. - return TI.EnforceInteger(NodeToApply->getExtType(ResNo)); + return TI.EnforceInteger(NodeToApply->getExtType(ResNo)); case SDTCisFP: // Require it to be one of the legal fp VTs. return TI.EnforceFloatingPoint(NodeToApply->getExtType(ResNo)); @@ -1645,7 +1637,7 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, case SDTCisSameAs: { unsigned OResNo = 0; TreePatternNode *OtherNode = - getOperandNum(x.SDTCisSameAs_Info.OtherOperandNum, N, NodeInfo, OResNo); + getOperandNum(x.SDTCisSameAs_Info.OtherOperandNum, N, NodeInfo, OResNo); return (int)NodeToApply->UpdateNodeType(ResNo, OtherNode->getExtType(OResNo), TP) | (int)OtherNode->UpdateNodeType(OResNo, @@ -1654,10 +1646,10 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, case SDTCisVTSmallerThanOp: { // The NodeToApply must be a leaf node that is a VT. OtherOperandNum must // have an integer type that is smaller than the VT. 
- if (!NodeToApply->isLeaf() || - !isa(NodeToApply->getLeafValue()) || - !cast(NodeToApply->getLeafValue())->getDef() - ->isSubClassOf("ValueType")) { + if (!NodeToApply->isLeaf() || !isa(NodeToApply->getLeafValue()) || + !cast(NodeToApply->getLeafValue()) + ->getDef() + ->isSubClassOf("ValueType")) { TP.error(N->getOperator()->getName() + " expects a VT operand!"); return false; } @@ -1667,26 +1659,23 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, TypeSetByHwMode TypeListTmp(VVT); unsigned OResNo = 0; - TreePatternNode *OtherNode = - getOperandNum(x.SDTCisVTSmallerThanOp_Info.OtherOperandNum, N, NodeInfo, - OResNo); + TreePatternNode *OtherNode = getOperandNum( + x.SDTCisVTSmallerThanOp_Info.OtherOperandNum, N, NodeInfo, OResNo); return TI.EnforceSmallerThan(TypeListTmp, OtherNode->getExtType(OResNo), /*SmallIsVT*/ true); } case SDTCisOpSmallerThanOp: { unsigned BResNo = 0; - TreePatternNode *BigOperand = - getOperandNum(x.SDTCisOpSmallerThanOp_Info.BigOperandNum, N, NodeInfo, - BResNo); + TreePatternNode *BigOperand = getOperandNum( + x.SDTCisOpSmallerThanOp_Info.BigOperandNum, N, NodeInfo, BResNo); return TI.EnforceSmallerThan(NodeToApply->getExtType(ResNo), BigOperand->getExtType(BResNo)); } case SDTCisEltOfVec: { unsigned VResNo = 0; - TreePatternNode *VecOperand = - getOperandNum(x.SDTCisEltOfVec_Info.OtherOperandNum, N, NodeInfo, - VResNo); + TreePatternNode *VecOperand = getOperandNum( + x.SDTCisEltOfVec_Info.OtherOperandNum, N, NodeInfo, VResNo); // Filter vector types out of VecOperand that don't have the right element // type. return TI.EnforceVectorEltTypeIs(VecOperand->getExtType(VResNo), @@ -1694,9 +1683,8 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, } case SDTCisSubVecOfVec: { unsigned VResNo = 0; - TreePatternNode *BigVecOperand = - getOperandNum(x.SDTCisSubVecOfVec_Info.OtherOperandNum, N, NodeInfo, - VResNo); + TreePatternNode *BigVecOperand = getOperandNum( + x.SDTCisSubVecOfVec_Info.OtherOperandNum, N, NodeInfo, VResNo); // Filter vector types out of BigVecOperand that don't have the // right subvector type. @@ -1708,17 +1696,15 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, } case SDTCisSameNumEltsAs: { unsigned OResNo = 0; - TreePatternNode *OtherNode = - getOperandNum(x.SDTCisSameNumEltsAs_Info.OtherOperandNum, - N, NodeInfo, OResNo); + TreePatternNode *OtherNode = getOperandNum( + x.SDTCisSameNumEltsAs_Info.OtherOperandNum, N, NodeInfo, OResNo); return TI.EnforceSameNumElts(OtherNode->getExtType(OResNo), NodeToApply->getExtType(ResNo)); } case SDTCisSameSizeAs: { unsigned OResNo = 0; - TreePatternNode *OtherNode = - getOperandNum(x.SDTCisSameSizeAs_Info.OtherOperandNum, - N, NodeInfo, OResNo); + TreePatternNode *OtherNode = getOperandNum( + x.SDTCisSameSizeAs_Info.OtherOperandNum, N, NodeInfo, OResNo); return TI.EnforceSameSize(OtherNode->getExtType(OResNo), NodeToApply->getExtType(ResNo)); } @@ -1729,8 +1715,7 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, // Update the node type to match an instruction operand or result as specified // in the ins or outs lists on the instruction definition. Return true if the // type was actually changed. -bool TreePatternNode::UpdateNodeTypeFromInst(unsigned ResNo, - Record *Operand, +bool TreePatternNode::UpdateNodeTypeFromInst(unsigned ResNo, Record *Operand, TreePattern &TP) { // The 'unknown' operand indicates that types should be inferred from the // context. 
@@ -1808,7 +1793,7 @@ bool TreePatternNode::setDefaultMode(unsigned Mode) { // SDNodeInfo implementation // SDNodeInfo::SDNodeInfo(Record *R, const CodeGenHwModes &CGH) : Def(R) { - EnumName = R->getValueAsString("Opcode"); + EnumName = R->getValueAsString("Opcode"); SDClassName = R->getValueAsString("SDClass"); Record *TypeProfile = R->getValueAsDef("TypeProfile"); NumResults = TypeProfile->getValueAsInt("NumResults"); @@ -1818,8 +1803,8 @@ SDNodeInfo::SDNodeInfo(Record *R, const CodeGenHwModes &CGH) : Def(R) { Properties = parseSDPatternOperatorProperties(R); // Parse the type constraints. - std::vector ConstraintList = - TypeProfile->getValueAsListOfDefs("Constraints"); + std::vector ConstraintList = + TypeProfile->getValueAsListOfDefs("Constraints"); for (Record *R : ConstraintList) TypeConstraints.emplace_back(R, CGH); } @@ -1835,11 +1820,12 @@ MVT::SimpleValueType SDNodeInfo::getKnownType(unsigned ResNo) const { for (const SDTypeConstraint &Constraint : TypeConstraints) { // Make sure that this applies to the correct node result. - if (Constraint.OperandNo >= NumResults) // FIXME: need value # + if (Constraint.OperandNo >= NumResults) // FIXME: need value # continue; switch (Constraint.ConstraintType) { - default: break; + default: + break; case SDTypeConstraint::SDTCisVT: if (Constraint.VVT.isSimple()) return Constraint.VVT.getSimple().SimpleTy; @@ -1856,9 +1842,8 @@ MVT::SimpleValueType SDNodeInfo::getKnownType(unsigned ResNo) const { // static unsigned GetNumNodeResults(Record *Operator, CodeGenDAGPatterns &CDP) { - if (Operator->getName() == "set" || - Operator->getName() == "implicit") - return 0; // All return nothing. + if (Operator->getName() == "set" || Operator->getName() == "implicit") + return 0; // All return nothing. if (Operator->isSubClassOf("Intrinsic")) return CDP.getIntrinsic(Operator).IS.RetTys.size(); @@ -1908,16 +1893,17 @@ static unsigned GetNumNodeResults(Record *Operator, CodeGenDAGPatterns &CDP) { } // Add on one implicit def if it has a resolvable type. - if (InstInfo.HasOneImplicitDefWithKnownVT(CDP.getTargetInfo()) !=MVT::Other) + if (InstInfo.HasOneImplicitDefWithKnownVT(CDP.getTargetInfo()) != + MVT::Other) ++NumDefsToAdd; return NumDefsToAdd; } if (Operator->isSubClassOf("SDNodeXForm")) - return 1; // FIXME: Generalize SDNodeXForm + return 1; // FIXME: Generalize SDNodeXForm if (Operator->isSubClassOf("ValueType")) - return 1; // A type-cast of one result. + return 1; // A type-cast of one result. if (Operator->isSubClassOf("ComplexPattern")) return 1; @@ -1963,9 +1949,7 @@ void TreePatternNode::print(raw_ostream &OS) const { for (const ScopedName &Name : NamesAsPredicateArg) OS << ":$pred:" << Name.getScope() << ":" << Name.getIdentifier(); } -void TreePatternNode::dump() const { - print(errs()); -} +void TreePatternNode::dump() const { print(errs()); } /// isIsomorphicTo - Return true if this node is recursively /// isomorphic to the specified node. For this comparison, the node's @@ -1976,7 +1960,8 @@ void TreePatternNode::dump() const { /// isomorphic if the names match. bool TreePatternNode::isIsomorphicTo(const TreePatternNode *N, const MultipleUseVarSet &DepVars) const { - if (N == this) return true; + if (N == this) + return true; if (N->isLeaf() != isLeaf()) return false; @@ -2035,17 +2020,18 @@ TreePatternNodePtr TreePatternNode::clone() const { void TreePatternNode::RemoveAllTypes() { // Reset to unknown type. 
std::fill(Types.begin(), Types.end(), TypeSetByHwMode()); - if (isLeaf()) return; + if (isLeaf()) + return; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) getChild(i)->RemoveAllTypes(); } - /// SubstituteFormalArguments - Replace the formal arguments in this tree /// with actual values specified by ArgMap. void TreePatternNode::SubstituteFormalArguments( std::map &ArgMap) { - if (isLeaf()) return; + if (isLeaf()) + return; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) { TreePatternNode *Child = getChild(i); @@ -2053,8 +2039,9 @@ void TreePatternNode::SubstituteFormalArguments( Init *Val = Child->getLeafValue(); // Note that, when substituting into an output pattern, Val might be an // UnsetInit. - if (isa(Val) || (isa(Val) && - cast(Val)->getDef()->getName() == "node")) { + if (isa(Val) || + (isa(Val) && + cast(Val)->getDef()->getName() == "node")) { // We found a use of a formal argument, replace it with its value. TreePatternNodePtr NewChild = ArgMap[Child->getName()]; assert(NewChild && "Couldn't find formal argument!"); @@ -2069,7 +2056,6 @@ void TreePatternNode::SubstituteFormalArguments( } } - /// InlinePatternFragments - If this pattern refers to any pattern /// fragments, return the set of inlined versions (this can be more than /// one if a PatFrags record has multiple alternatives). @@ -2205,7 +2191,7 @@ void TreePatternNode::InlinePatternFragments( for (const TreePredicateCall &Pred : getPredicateCalls()) FragTree->addPredicateCall(Pred); - // The fragment we inlined could have recursive inlining that is needed. See + // The fragment we inlined could have recursive inlining that is needed. See // if there are any pattern fragments in it and inline them as needed. FragTree->InlinePatternFragments(TP, OutAlternatives); } @@ -2224,8 +2210,7 @@ void TreePatternNode::InlinePatternFragments( /// GPR:$src operand above. /// static TypeSetByHwMode getImplicitType(Record *R, unsigned ResNo, - bool NotRegisters, - bool Unnamed, + bool NotRegisters, bool Unnamed, TreePattern &TP) { CodeGenDAGPatterns &CDP = TP.getDAGPatterns(); @@ -2331,11 +2316,10 @@ static TypeSetByHwMode getImplicitType(Record *R, unsigned ResNo, return TypeSetByHwMode(MVT::Other); } - /// getIntrinsicInfo - If this node corresponds to an intrinsic, return the /// CodeGenIntrinsic information for it, otherwise return a null pointer. -const CodeGenIntrinsic *TreePatternNode:: -getIntrinsicInfo(const CodeGenDAGPatterns &CDP) const { +const CodeGenIntrinsic * +TreePatternNode::getIntrinsicInfo(const CodeGenDAGPatterns &CDP) const { if (getOperator() != CDP.get_intrinsic_void_sdnode() && getOperator() != CDP.get_intrinsic_w_chain_sdnode() && getOperator() != CDP.get_intrinsic_wo_chain_sdnode()) @@ -2406,9 +2390,6 @@ bool TreePatternNode::NodeHasProperty(SDNP Property, return CGP.getSDNodeInfo(getOperator()).hasProperty(Property); } - - - /// TreeHasProperty - Return true if any node in this tree has the specified /// property. bool TreePatternNode::TreeHasProperty(SDNP Property, @@ -2423,8 +2404,8 @@ bool TreePatternNode::TreeHasProperty(SDNP Property, /// isCommutativeIntrinsic - Return true if the node corresponds to a /// commutative intrinsic. 
-bool -TreePatternNode::isCommutativeIntrinsic(const CodeGenDAGPatterns &CDP) const { +bool TreePatternNode::isCommutativeIntrinsic( + const CodeGenDAGPatterns &CDP) const { if (const CodeGenIntrinsic *Int = getIntrinsicInfo(CDP)) return Int->isCommutative; return false; @@ -2441,19 +2422,16 @@ static bool isOperandClass(const TreePatternNode *N, StringRef Class) { return false; } -static void emitTooManyOperandsError(TreePattern &TP, - StringRef InstName, - unsigned Expected, - unsigned Actual) { +static void emitTooManyOperandsError(TreePattern &TP, StringRef InstName, + unsigned Expected, unsigned Actual) { TP.error("Instruction '" + InstName + "' was provided " + Twine(Actual) + " operands but expected only " + Twine(Expected) + "!"); } -static void emitTooFewOperandsError(TreePattern &TP, - StringRef InstName, +static void emitTooFewOperandsError(TreePattern &TP, StringRef InstName, unsigned Actual) { - TP.error("Instruction '" + InstName + - "' expects more than the provided " + Twine(Actual) + " operands!"); + TP.error("Instruction '" + InstName + "' expects more than the provided " + + Twine(Actual) + " operands!"); } /// ApplyTypeConstraints - Apply all of the type constraints relevant to @@ -2469,9 +2447,9 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { // If it's a regclass or something else known, include the type. bool MadeChange = false; for (unsigned i = 0, e = Types.size(); i != e; ++i) - MadeChange |= UpdateNodeType(i, getImplicitType(DI->getDef(), i, - NotRegisters, - !hasName(), TP), TP); + MadeChange |= UpdateNodeType( + i, getImplicitType(DI->getDef(), i, NotRegisters, !hasName(), TP), + TP); return MadeChange; } @@ -2530,8 +2508,8 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { // Apply type info to the intrinsic ID. MadeChange |= getChild(0)->UpdateNodeType(0, MVT::iPTR, TP); - for (unsigned i = 0, e = getNumChildren()-1; i != e; ++i) { - MadeChange |= getChild(i+1)->ApplyTypeConstraints(TP, NotRegisters); + for (unsigned i = 0, e = getNumChildren() - 1; i != e; ++i) { + MadeChange |= getChild(i + 1)->ApplyTypeConstraints(TP, NotRegisters); MVT::SimpleValueType OpVT = getValueType(Int->IS.ParamTys[i]->getValueAsDef("VT")); @@ -2562,14 +2540,14 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { if (getOperator()->isSubClassOf("Instruction")) { const DAGInstruction &Inst = CDP.getInstruction(getOperator()); CodeGenInstruction &InstInfo = - CDP.getTargetInfo().getInstruction(getOperator()); + CDP.getTargetInfo().getInstruction(getOperator()); bool MadeChange = false; // Apply the result types to the node, these come from the things in the // (outs) list of the instruction. - unsigned NumResultsToAdd = std::min(InstInfo.Operands.NumDefs, - Inst.getNumResults()); + unsigned NumResultsToAdd = + std::min(InstInfo.Operands.NumDefs, Inst.getNumResults()); for (unsigned ResNo = 0; ResNo != NumResultsToAdd; ++ResNo) MadeChange |= UpdateNodeTypeFromInst(ResNo, Inst.getResult(ResNo), TP); @@ -2581,7 +2559,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { // FIXME: Generalize to multiple possible types and multiple possible // ImplicitDefs. 
MVT::SimpleValueType VT = - InstInfo.HasOneImplicitDefWithKnownVT(CDP.getTargetInfo()); + InstInfo.HasOneImplicitDefWithKnownVT(CDP.getTargetInfo()); if (VT != MVT::Other) MadeChange |= UpdateNodeType(ResNo, VT, TP); @@ -2636,7 +2614,8 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { // the pattern was intended to override A or skip it. unsigned NonOverridableOperands = NumFixedOperands; while (NonOverridableOperands > NumResults && - CDP.operandHasDefault(InstInfo.Operands[NonOverridableOperands-1].Rec)) + CDP.operandHasDefault( + InstInfo.Operands[NonOverridableOperands - 1].Rec)) --NonOverridableOperands; unsigned ChildNo = 0; @@ -2659,7 +2638,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { } TreePatternNode *Child = getChild(ChildNo++); - unsigned ChildResNo = 0; // Instructions always use res #0 of their op. + unsigned ChildResNo = 0; // Instructions always use res #0 of their op. // If the operand has sub-operands, they may be provided by distinct // child patterns, so attempt to match each sub-operand separately. @@ -2672,8 +2651,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { if (Child->getNumMIResults(CDP) < NumArgs) { // Match first sub-operand against the child we already have. Record *SubRec = cast(MIOpInfo->getArg(0))->getDef(); - MadeChange |= - Child->UpdateNodeTypeFromInst(ChildResNo, SubRec, TP); + MadeChange |= Child->UpdateNodeTypeFromInst(ChildResNo, SubRec, TP); // And the remaining sub-operands against subsequent children. for (unsigned Arg = 1; Arg < NumArgs; ++Arg) { @@ -2686,7 +2664,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { SubRec = cast(MIOpInfo->getArg(Arg))->getDef(); MadeChange |= - Child->UpdateNodeTypeFromInst(ChildResNo, SubRec, TP); + Child->UpdateNodeTypeFromInst(ChildResNo, SubRec, TP); } continue; } @@ -2699,8 +2677,8 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { } if (!InstInfo.Operands.isVariadic && ChildNo != getNumChildren()) { - emitTooManyOperandsError(TP, getOperator()->getName(), - ChildNo, getNumChildren()); + emitTooManyOperandsError(TP, getOperator()->getName(), ChildNo, + getNumChildren()); return false; } @@ -2759,7 +2737,6 @@ static bool OnlyOnRHSOfCommutative(TreePatternNode *N) { return false; } - /// canPatternMatch - If it is impossible for this pattern to match on this /// target, fill in Reason and return false. Otherwise, return true. This is /// used as a sanity check for .td files (to prevent people from writing stuff @@ -2767,7 +2744,8 @@ static bool OnlyOnRHSOfCommutative(TreePatternNode *N) { /// generating stuff that is useless. bool TreePatternNode::canPatternMatch(std::string &Reason, const CodeGenDAGPatterns &CDP) { - if (isLeaf()) return true; + if (isLeaf()) + return true; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) if (!getChild(i)->canPatternMatch(Reason, CDP)) @@ -2790,11 +2768,12 @@ bool TreePatternNode::canPatternMatch(std::string &Reason, if (NodeInfo.hasProperty(SDNPCommutative) || isCommIntrinsic) { // Scan all of the operands of the node and make sure that only the last one // is a constant node, unless the RHS also is. - if (!OnlyOnRHSOfCommutative(getChild(getNumChildren()-1))) { + if (!OnlyOnRHSOfCommutative(getChild(getNumChildren() - 1))) { unsigned Skip = isCommIntrinsic ? 1 : 0; // First operand is intrinsic id. 
- for (unsigned i = Skip, e = getNumChildren()-1; i != e; ++i) + for (unsigned i = Skip, e = getNumChildren() - 1; i != e; ++i) if (OnlyOnRHSOfCommutative(getChild(i))) { - Reason="Immediate value must be on the RHS of commutative operators!"; + Reason = + "Immediate value must be on the RHS of commutative operators!"; return false; } } @@ -2808,17 +2787,17 @@ bool TreePatternNode::canPatternMatch(std::string &Reason, // TreePattern::TreePattern(Record *TheRec, ListInit *RawPat, bool isInput, - CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp), - isInputPattern(isInput), HasError(false), - Infer(*this) { + CodeGenDAGPatterns &cdp) + : TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false), + Infer(*this) { for (Init *I : RawPat->getValues()) Trees.push_back(ParseTreePattern(I, "")); } TreePattern::TreePattern(Record *TheRec, DagInit *Pat, bool isInput, - CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp), - isInputPattern(isInput), HasError(false), - Infer(*this) { + CodeGenDAGPatterns &cdp) + : TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false), + Infer(*this) { Trees.push_back(ParseTreePattern(Pat, "")); } @@ -2861,9 +2840,9 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, /// (foo GPR, imm) -> (foo GPR, (imm)) if (R->isSubClassOf("SDNode") || R->isSubClassOf("PatFrags")) return ParseTreePattern( - DagInit::get(DI, nullptr, - std::vector >()), - OpName); + DagInit::get(DI, nullptr, + std::vector>()), + OpName); // Input argument? TreePatternNodePtr Res = makeIntrusiveRefCnt(DI, 1); @@ -2943,8 +2922,7 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, !Operator->isSubClassOf("SDNodeXForm") && !Operator->isSubClassOf("Intrinsic") && !Operator->isSubClassOf("ComplexPattern") && - Operator->getName() != "set" && - Operator->getName() != "implicit") + Operator->getName() != "set" && Operator->getName() != "implicit") error("Unrecognized node '" + Operator->getName() + "'!"); // Check to see if this is something that is illegal in an input pattern. @@ -2956,20 +2934,16 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, if (Operator->isSubClassOf("Intrinsic")) error("Cannot use '" + Operator->getName() + "' in an output pattern!"); - if (Operator->isSubClassOf("SDNode") && - Operator->getName() != "imm" && - Operator->getName() != "timm" && - Operator->getName() != "fpimm" && + if (Operator->isSubClassOf("SDNode") && Operator->getName() != "imm" && + Operator->getName() != "timm" && Operator->getName() != "fpimm" && Operator->getName() != "tglobaltlsaddr" && Operator->getName() != "tconstpool" && Operator->getName() != "tjumptable" && Operator->getName() != "tframeindex" && Operator->getName() != "texternalsym" && Operator->getName() != "tblockaddress" && - Operator->getName() != "tglobaladdr" && - Operator->getName() != "bb" && - Operator->getName() != "vt" && - Operator->getName() != "mcsym") + Operator->getName() != "tglobaladdr" && Operator->getName() != "bb" && + Operator->getName() != "vt" && Operator->getName() != "mcsym") error("Cannot use '" + Operator->getName() + "' in an output pattern!"); } @@ -2979,8 +2953,8 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, for (unsigned i = 0, e = Dag->getNumArgs(); i != e; ++i) Children.push_back(ParseTreePattern(Dag->getArg(i), Dag->getArgNameStr(i))); - // Get the actual number of results before Operator is converted to an intrinsic - // node (which is hard-coded to have either zero or one result). 
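This check enforces that, on a commutative node, an immediate may appear only as the last operand unless the last operand is itself an immediate, since variant generation canonicalizes constants to the RHS. Distilled into a standalone predicate over a simplified operand-kind list:

#include <cassert>
#include <vector>

enum OpKind { Reg, Imm };

static bool immOnlyOnRHS(const std::vector<OpKind> &Ops) {
  if (Ops.empty() || Ops.back() == Imm)
    return true; // last operand already an immediate: nothing to enforce
  for (size_t i = 0; i + 1 < Ops.size(); ++i)
    if (Ops[i] == Imm)
      return false; // immediate must be on the RHS of commutative operators
  return true;
}

int main() {
  assert(immOnlyOnRHS({Reg, Imm}));  // (add GPR:$x, 7)  -- accepted
  assert(!immOnlyOnRHS({Imm, Reg})); // (add 7, GPR:$x)  -- rejected
}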
+ // Get the actual number of results before Operator is converted to an + // intrinsic node (which is hard-coded to have either zero or one result). unsigned NumResults = GetNumNodeResults(Operator, CDP); // If the operator is an intrinsic, then this is just syntactic sugar for @@ -2988,7 +2962,7 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, // convert the intrinsic name to a number. if (Operator->isSubClassOf("Intrinsic")) { const CodeGenIntrinsic &Int = getDAGPatterns().getIntrinsic(Operator); - unsigned IID = getDAGPatterns().getIntrinsicID(Operator)+1; + unsigned IID = getDAGPatterns().getIntrinsicID(Operator) + 1; // If this intrinsic returns void, it must have side-effects and thus a // chain. @@ -3072,13 +3046,11 @@ static bool SimplifyTree(TreePatternNodePtr &N) { return MadeChange; } - - /// InferAllTypes - Infer/propagate as many types throughout the expression /// patterns as possible. Return true if all types are inferred, false /// otherwise. Flags an error if a type contradiction is found. -bool TreePattern:: -InferAllTypes(const StringMap > *InNamedTypes) { +bool TreePattern::InferAllTypes( + const StringMap> *InNamedTypes) { if (NamedNodes.empty()) ComputeNamedNodes(); @@ -3092,7 +3064,7 @@ InferAllTypes(const StringMap > *InNamedTypes) { // If there are constraints on our named nodes, apply them. for (auto &Entry : NamedNodes) { - SmallVectorImpl &Nodes = Entry.second; + SmallVectorImpl &Nodes = Entry.second; // If we have input named node types, propagate their types to the named // values here. @@ -3103,8 +3075,8 @@ InferAllTypes(const StringMap > *InNamedTypes) { return true; } - const SmallVectorImpl &InNodes = - InNamedTypes->find(Entry.getKey())->second; + const SmallVectorImpl &InNodes = + InNamedTypes->find(Entry.getKey())->second; // The input types should be fully resolved by now. for (TreePatternNode *Node : Nodes) { @@ -3120,19 +3092,18 @@ InferAllTypes(const StringMap > *InNamedTypes) { continue; } - assert(Node->getNumTypes() == 1 && - InNodes[0]->getNumTypes() == 1 && + assert(Node->getNumTypes() == 1 && InNodes[0]->getNumTypes() == 1 && "FIXME: cannot name multiple result nodes yet"); - MadeChange |= Node->UpdateNodeType(0, InNodes[0]->getExtType(0), - *this); + MadeChange |= + Node->UpdateNodeType(0, InNodes[0]->getExtType(0), *this); } } // If there are multiple nodes with the same name, they must all have the // same type. if (Entry.second.size() > 1) { - for (unsigned i = 0, e = Nodes.size()-1; i != e; ++i) { - TreePatternNode *N1 = Nodes[i], *N2 = Nodes[i+1]; + for (unsigned i = 0, e = Nodes.size() - 1; i != e; ++i) { + TreePatternNode *N1 = Nodes[i], *N2 = Nodes[i + 1]; assert(N1->getNumTypes() == 1 && N2->getNumTypes() == 1 && "FIXME: cannot name multiple result nodes yet"); @@ -3190,7 +3161,7 @@ CodeGenDAGPatterns::CodeGenDAGPatterns(RecordKeeper &R, ParsePatternFragments(); ParseDefaultOperands(); ParseInstructions(); - ParsePatternFragments(/*OutFrags*/true); + ParsePatternFragments(/*OutFrags*/ true); ParsePatterns(); // Generate variants. For example, commutative patterns can match @@ -3221,7 +3192,7 @@ Record *CodeGenDAGPatterns::getSDNodeNamed(StringRef Name) const { // Parse all of the SDNode definitions for the target, populating SDNodes. 
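A note on the getIntrinsicID(Operator) + 1 above: intrinsic IDs are exposed off by one because slot 0 is reserved to mean "not an intrinsic", and getIntrinsicInfo undoes the shift with IID - 1. The convention in miniature (IntrinsicTable is an invented toy, not the CodeGenIntrinsicTable API):

#include <cassert>
#include <string>
#include <vector>

struct IntrinsicTable {
  std::vector<std::string> Names;
  // Table index i is published as ID i + 1, leaving 0 free.
  unsigned idFor(unsigned TableIndex) const { return TableIndex + 1; }
  const std::string &infoFor(unsigned ID) const {
    assert(ID != 0 && ID - 1 < Names.size() && "bad intrinsic ID");
    return Names[ID - 1];
  }
};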
void CodeGenDAGPatterns::ParseNodeInfo() { - std::vector Nodes = Records.getAllDerivedDefinitions("SDNode"); + std::vector Nodes = Records.getAllDerivedDefinitions("SDNode"); const CodeGenHwModes &CGH = getTargetInfo().getHwModes(); while (!Nodes.empty()) { @@ -3231,15 +3202,16 @@ void CodeGenDAGPatterns::ParseNodeInfo() { } // Get the builtin intrinsic nodes. - intrinsic_void_sdnode = getSDNodeNamed("intrinsic_void"); - intrinsic_w_chain_sdnode = getSDNodeNamed("intrinsic_w_chain"); + intrinsic_void_sdnode = getSDNodeNamed("intrinsic_void"); + intrinsic_w_chain_sdnode = getSDNodeNamed("intrinsic_w_chain"); intrinsic_wo_chain_sdnode = getSDNodeNamed("intrinsic_wo_chain"); } /// ParseNodeTransforms - Parse all SDNodeXForm instances into the SDNodeXForms /// map, and emit them to the file as functions. void CodeGenDAGPatterns::ParseNodeTransforms() { - std::vector Xforms = Records.getAllDerivedDefinitions("SDNodeXForm"); + std::vector Xforms = + Records.getAllDerivedDefinitions("SDNodeXForm"); while (!Xforms.empty()) { Record *XFormNode = Xforms.back(); Record *SDNode = XFormNode->getValueAsDef("Opcode"); @@ -3252,21 +3224,22 @@ void CodeGenDAGPatterns::ParseNodeTransforms() { } void CodeGenDAGPatterns::ParseComplexPatterns() { - std::vector AMs = Records.getAllDerivedDefinitions("ComplexPattern"); + std::vector AMs = + Records.getAllDerivedDefinitions("ComplexPattern"); while (!AMs.empty()) { ComplexPatterns.insert(std::make_pair(AMs.back(), AMs.back())); AMs.pop_back(); } } - /// ParsePatternFragments - Parse all of the PatFrag definitions in the .td /// file, building up the PatternFragments map. After we've collected them all, /// inline fragments together as necessary, so that there are no references left /// inside a pattern fragment to a pattern fragment. /// void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) { - std::vector Fragments = Records.getAllDerivedDefinitions("PatFrags"); + std::vector Fragments = + Records.getAllDerivedDefinitions("PatFrags"); // First step, parse all of the fragments. for (Record *Frag : Fragments) { @@ -3274,10 +3247,9 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) { continue; ListInit *LI = Frag->getValueAsListInit("Fragments"); - TreePattern *P = - (PatternFragments[Frag] = std::make_unique( - Frag, LI, !Frag->isSubClassOf("OutPatFrag"), - *this)).get(); + TreePattern *P = (PatternFragments[Frag] = std::make_unique( + Frag, LI, !Frag->isSubClassOf("OutPatFrag"), *this)) + .get(); // Validate the argument list, converting it to set, to discard duplicates. std::vector &Args = P->getArgList(); @@ -3294,10 +3266,9 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) { DefInit *OpsOp = dyn_cast(OpsList->getOperator()); // Special cases: ops == outs == ins. Different names are used to // improve readability. - if (!OpsOp || - (OpsOp->getDef()->getName() != "ops" && - OpsOp->getDef()->getName() != "outs" && - OpsOp->getDef()->getName() != "ins")) + if (!OpsOp || (OpsOp->getDef()->getName() != "ops" && + OpsOp->getDef()->getName() != "outs" && + OpsOp->getDef()->getName() != "ins")) P->error("Operands list should start with '(ops ... '!"); // Copy over the arguments. @@ -3323,7 +3294,7 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) { // If there is a node transformation corresponding to this, keep track of // it. Record *Transform = Frag->getValueAsDef("OperandTransform"); - if (!getSDNodeTransform(Transform).second.empty()) // not noop xform? 
+ if (!getSDNodeTransform(Transform).second.empty()) // not noop xform? for (const auto &T : P->getTrees()) T->setTransformFn(Transform); } @@ -3354,7 +3325,7 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) { } void CodeGenDAGPatterns::ParseDefaultOperands() { - std::vector DefaultOps; + std::vector DefaultOps; DefaultOps = Records.getAllDerivedDefinitions("OperandWithDefaultOps"); // Find some SDNode. @@ -3366,10 +3337,10 @@ void CodeGenDAGPatterns::ParseDefaultOperands() { // Clone the DefaultInfo dag node, changing the operator from 'ops' to // SomeSDnode so that we can parse this. - std::vector > Ops; + std::vector> Ops; for (unsigned op = 0, e = DefaultInfo->getNumArgs(); op != e; ++op) - Ops.push_back(std::make_pair(DefaultInfo->getArg(op), - DefaultInfo->getArgName(op))); + Ops.push_back( + std::make_pair(DefaultInfo->getArg(op), DefaultInfo->getArgName(op))); DagInit *DI = DagInit::get(SomeSDNode, nullptr, Ops); // Create a TreePattern to parse this. @@ -3520,7 +3491,7 @@ void CodeGenDAGPatterns::FindPatternInputsAndOutputs( I.error("Cannot specify a transform function on a set node!"); // Check the set destinations. - unsigned NumDests = Pat->getNumChildren()-1; + unsigned NumDests = Pat->getNumChildren() - 1; for (unsigned i = 0; i != NumDests; ++i) { TreePatternNodePtr Dest = Pat->getChildShared(i); // For set destinations we also must resolve fragments here. @@ -3565,6 +3536,7 @@ void CodeGenDAGPatterns::FindPatternInputsAndOutputs( class InstAnalyzer { const CodeGenDAGPatterns &CDP; + public: bool hasSideEffects; bool mayStore; @@ -3574,8 +3546,8 @@ public: bool hasChain; InstAnalyzer(const CodeGenDAGPatterns &cdp) - : CDP(cdp), hasSideEffects(false), mayStore(false), mayLoad(false), - isBitcast(false), isVariadic(false), hasChain(false) {} + : CDP(cdp), hasSideEffects(false), mayStore(false), mayLoad(false), + isBitcast(false), isVariadic(false), hasChain(false) {} void Analyze(const PatternToMatch &Pat) { const TreePatternNode *N = Pat.getSrcPattern(); @@ -3611,9 +3583,12 @@ public: // Handle ComplexPattern leaves. if (LeafRec->isSubClassOf("ComplexPattern")) { const ComplexPattern &CP = CDP.getComplexPattern(LeafRec); - if (CP.hasProperty(SDNPMayStore)) mayStore = true; - if (CP.hasProperty(SDNPMayLoad)) mayLoad = true; - if (CP.hasProperty(SDNPSideEffect)) hasSideEffects = true; + if (CP.hasProperty(SDNPMayStore)) + mayStore = true; + if (CP.hasProperty(SDNPMayLoad)) + mayLoad = true; + if (CP.hasProperty(SDNPSideEffect)) + hasSideEffects = true; } } return; @@ -3624,11 +3599,16 @@ public: AnalyzeNode(N->getChild(i)); // Notice properties of the node. 
- if (N->NodeHasProperty(SDNPMayStore, CDP)) mayStore = true; - if (N->NodeHasProperty(SDNPMayLoad, CDP)) mayLoad = true; - if (N->NodeHasProperty(SDNPSideEffect, CDP)) hasSideEffects = true; - if (N->NodeHasProperty(SDNPVariadic, CDP)) isVariadic = true; - if (N->NodeHasProperty(SDNPHasChain, CDP)) hasChain = true; + if (N->NodeHasProperty(SDNPMayStore, CDP)) + mayStore = true; + if (N->NodeHasProperty(SDNPMayLoad, CDP)) + mayLoad = true; + if (N->NodeHasProperty(SDNPSideEffect, CDP)) + hasSideEffects = true; + if (N->NodeHasProperty(SDNPVariadic, CDP)) + isVariadic = true; + if (N->NodeHasProperty(SDNPHasChain, CDP)) + hasChain = true; if (const CodeGenIntrinsic *IntInfo = N->getIntrinsicInfo(CDP)) { ModRefInfo MR = IntInfo->ME.getModRef(); @@ -3645,17 +3625,15 @@ public: hasSideEffects = true; } } - }; static bool InferFromPattern(CodeGenInstruction &InstInfo, - const InstAnalyzer &PatInfo, - Record *PatDef) { + const InstAnalyzer &PatInfo, Record *PatDef) { bool Error = false; // Remember where InstInfo got its flags. if (InstInfo.hasUndefFlags()) - InstInfo.InferredFrom = PatDef; + InstInfo.InferredFrom = PatDef; // Check explicitly set flags for consistency. if (InstInfo.hasSideEffects != PatInfo.hasSideEffects && @@ -3666,14 +3644,14 @@ static bool InferFromPattern(CodeGenInstruction &InstInfo, if (!InstInfo.hasSideEffects) { Error = true; PrintError(PatDef->getLoc(), "Pattern doesn't match hasSideEffects = " + - Twine(InstInfo.hasSideEffects)); + Twine(InstInfo.hasSideEffects)); } } if (InstInfo.mayStore != PatInfo.mayStore && !InstInfo.mayStore_Unset) { Error = true; - PrintError(PatDef->getLoc(), "Pattern doesn't match mayStore = " + - Twine(InstInfo.mayStore)); + PrintError(PatDef->getLoc(), + "Pattern doesn't match mayStore = " + Twine(InstInfo.mayStore)); } if (InstInfo.mayLoad != PatInfo.mayLoad && !InstInfo.mayLoad_Unset) { @@ -3681,8 +3659,8 @@ static bool InferFromPattern(CodeGenInstruction &InstInfo, // Some targets translate immediates to loads. if (!InstInfo.mayLoad) { Error = true; - PrintError(PatDef->getLoc(), "Pattern doesn't match mayLoad = " + - Twine(InstInfo.mayLoad)); + PrintError(PatDef->getLoc(), + "Pattern doesn't match mayLoad = " + Twine(InstInfo.mayLoad)); } } @@ -3712,11 +3690,13 @@ static bool InferFromPattern(CodeGenInstruction &InstInfo, /// null_frag operator. static bool hasNullFragReference(DagInit *DI) { DefInit *OpDef = dyn_cast(DI->getOperator()); - if (!OpDef) return false; + if (!OpDef) + return false; Record *Operator = OpDef->getDef(); // If this is the null fragment, return true. - if (Operator->getName() == "null_frag") return true; + if (Operator->getName() == "null_frag") + return true; // If any of the arguments reference the null fragment, return true. for (unsigned i = 0, e = DI->getNumArgs(); i != e; ++i) { if (auto Arg = dyn_cast(DI->getArg(i))) @@ -3743,8 +3723,8 @@ static bool hasNullFragReference(ListInit *LI) { } /// Get all the instructions in a tree. -static void -getInstructionsInTree(TreePatternNode *Tree, SmallVectorImpl &Instrs) { +static void getInstructionsInTree(TreePatternNode *Tree, + SmallVectorImpl &Instrs) { if (Tree->isLeaf()) return; if (Tree->getOperator()->isSubClassOf("Instruction")) @@ -3755,8 +3735,7 @@ getInstructionsInTree(TreePatternNode *Tree, SmallVectorImpl &Instrs) { /// Check the class of a pattern leaf node against the instruction operand it /// represents. 
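For context: InstAnalyzer derives instruction flags as a plain OR over every node of the source pattern, as the hunk above shows for mayStore/mayLoad/hasSideEffects. The same accumulation over a toy tree, with the flags packed into a bitmask (ToyNode is illustrative only):

#include <vector>

struct ToyNode {
  unsigned Flags = 0; // e.g. 1 = mayLoad, 2 = mayStore, 4 = sideEffects
  std::vector<ToyNode> Children;
};

// OR together the flags of a node and everything beneath it.
static unsigned collectFlags(const ToyNode &N) {
  unsigned F = N.Flags;
  for (const ToyNode &C : N.Children)
    F |= collectFlags(C);
  return F;
}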
-static bool checkOperandClass(CGIOperandList::OperandInfo &OI, - Record *Leaf) { +static bool checkOperandClass(CGIOperandList::OperandInfo &OI, Record *Leaf) { if (OI.Rec == Leaf) return true; @@ -3772,8 +3751,9 @@ static bool checkOperandClass(CGIOperandList::OperandInfo &OI, return false; } -void CodeGenDAGPatterns::parseInstructionPattern( - CodeGenInstruction &CGI, ListInit *Pat, DAGInstMap &DAGInsts) { +void CodeGenDAGPatterns::parseInstructionPattern(CodeGenInstruction &CGI, + ListInit *Pat, + DAGInstMap &DAGInsts) { assert(!DAGInsts.count(CGI.TheDef) && "Instruction already parsed!"); @@ -3789,7 +3769,7 @@ void CodeGenDAGPatterns::parseInstructionPattern( MapVector> InstResults; - std::vector InstImpResults; + std::vector InstImpResults; // Verify that the top-level forms in the instruction are of void type, and // fill in the InstResults map. @@ -3805,8 +3785,8 @@ void CodeGenDAGPatterns::parseInstructionPattern( Pat->getExtType(k).writeToStream(OS); } I.error("Top-level forms in instruction pattern should have" - " void types, has types " + - OS.str()); + " void types, has types " + + OS.str()); } // Find inputs and outputs, and verify the structure of the uses/defs. @@ -3823,7 +3803,7 @@ void CodeGenDAGPatterns::parseInstructionPattern( assert(I.getArgList().empty() && "Args list should still be empty here!"); // Check that all of the results occur first in the list. - std::vector Results; + std::vector Results; std::vector ResultIndices; SmallVector ResNodes; for (unsigned i = 0; i != NumResults; ++i) { @@ -3850,8 +3830,9 @@ void CodeGenDAGPatterns::parseInstructionPattern( Record *R = cast(RNode->getLeafValue())->getDef(); ResNodes.push_back(std::move(RNode)); if (!R) - I.error("Operand $" + OpName + " should be a set destination: all " - "outputs must occur before inputs in operand list!"); + I.error("Operand $" + OpName + + " should be a set destination: all " + "outputs must occur before inputs in operand list!"); if (!checkOperandClass(CGI.Operands[i], R)) I.error("Operand $" + OpName + " class mismatch!"); @@ -3868,7 +3849,7 @@ void CodeGenDAGPatterns::parseInstructionPattern( // Loop over the inputs next. std::vector ResultNodeOperands; - std::vector Operands; + std::vector Operands; for (unsigned i = NumResults, e = CGI.Operands.size(); i != e; ++i) { CGIOperandList::OperandInfo &Op = CGI.Operands[i]; const std::string &OpName = Op.Name; @@ -3885,16 +3866,17 @@ void CodeGenDAGPatterns::parseInstructionPattern( continue; } I.error("Operand $" + OpName + - " does not appear in the instruction pattern"); + " does not appear in the instruction pattern"); } TreePatternNodePtr InVal = InstInputs[OpName]; - InstInputs.erase(OpName); // It occurred, remove from map. + InstInputs.erase(OpName); // It occurred, remove from map. 
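On the error message below ("all outputs must occur before inputs in operand list"): the parser requires the instruction's outputs to form a prefix of the operand list. That ordering rule in isolation (IsOutput is an invented stand-in for "operand i is a def"):

#include <vector>

// Outputs must all precede inputs; an output after any input is an
// error in parseInstructionPattern.
static bool outputsFormPrefix(const std::vector<bool> &IsOutput) {
  bool SeenInput = false;
  for (bool Out : IsOutput) {
    if (!Out)
      SeenInput = true;
    else if (SeenInput)
      return false; // rejected with I.error in the real code
  }
  return true;
}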
if (InVal->isLeaf() && isa(InVal->getLeafValue())) { Record *InRec = cast(InVal->getLeafValue())->getDef(); if (!checkOperandClass(Op, InRec)) - I.error("Operand $" + OpName + "'s register class disagrees" - " between the operand and pattern"); + I.error("Operand $" + OpName + + "'s register class disagrees" + " between the operand and pattern"); } Operands.push_back(Op.Rec); @@ -3910,7 +3892,7 @@ void CodeGenDAGPatterns::parseInstructionPattern( std::vector Children; Children.push_back(OpNode); OpNode = makeIntrusiveRefCnt(Xform, std::move(Children), - OpNode->getNumTypes()); + OpNode->getNumTypes()); } ResultNodeOperands.push_back(std::move(OpNode)); @@ -3935,8 +3917,8 @@ void CodeGenDAGPatterns::parseInstructionPattern( TreePatternNodePtr Pattern = I.getTree(0); TreePatternNodePtr SrcPattern; if (Pattern->getOperator()->getName() == "set") { - SrcPattern = Pattern->getChild(Pattern->getNumChildren()-1)->clone(); - } else{ + SrcPattern = Pattern->getChild(Pattern->getNumChildren() - 1)->clone(); + } else { // Not a set (store or something?) SrcPattern = Pattern; } @@ -3954,7 +3936,8 @@ void CodeGenDAGPatterns::parseInstructionPattern( /// any fragments involved. This populates the Instructions list with fully /// resolved instructions. void CodeGenDAGPatterns::ParseInstructions() { - std::vector Instrs = Records.getAllDerivedDefinitions("Instruction"); + std::vector Instrs = + Records.getAllDerivedDefinitions("Instruction"); for (Record *Instr : Instrs) { ListInit *LI = nullptr; @@ -3969,8 +3952,8 @@ void CodeGenDAGPatterns::ParseInstructions() { // is from a multiclass expansion w/ a SDPatternOperator passed in as // null_frag. if (!LI || LI->empty() || hasNullFragReference(LI)) { - std::vector Results; - std::vector Operands; + std::vector Results; + std::vector Operands; CodeGenInstruction &InstInfo = Target.getInstruction(Instr); @@ -3980,14 +3963,15 @@ void CodeGenDAGPatterns::ParseInstructions() { // The rest are inputs. for (unsigned j = InstInfo.Operands.NumDefs, - e = InstInfo.Operands.size(); j < e; ++j) + e = InstInfo.Operands.size(); + j < e; ++j) Operands.push_back(InstInfo.Operands[j].Rec); } // Create and insert the instruction. Instructions.try_emplace(Instr, std::move(Results), std::move(Operands), std::vector()); - continue; // no pattern. + continue; // no pattern. } CodeGenInstruction &CGI = Target.getInstruction(Instr); @@ -4036,19 +4020,18 @@ void CodeGenDAGPatterns::AddPatternToMatch(TreePattern *Pattern, std::string Reason; if (!PTM.getSrcPattern()->canPatternMatch(Reason, *this)) { PrintWarning(Pattern->getRecord()->getLoc(), - Twine("Pattern can never match: ") + Reason); + Twine("Pattern can never match: ") + Reason); return; } // If the source pattern's root is a complex pattern, that complex pattern // must specify the nodes it can potentially match. if (const ComplexPattern *CP = - PTM.getSrcPattern()->getComplexPatternInfo(*this)) + PTM.getSrcPattern()->getComplexPatternInfo(*this)) if (CP->getRootNodes().empty()) Pattern->error("ComplexPattern at root must specify list of opcodes it" " could match"); - // Find all of the named values in the input and output, ensure they have the // same type. 
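Context for the makeIntrusiveRefCnt(Xform, ...) call above: an operand that carries an SDNodeXForm is wrapped in a fresh single-child node whose operator is the transform. The same wrapping over a minimal ref-counted toy tree (MiniNode is illustrative, and shared_ptr stands in for the intrusive ref-counting of TreePatternNodePtr):

#include <memory>
#include <utility>
#include <vector>

struct MiniNode {
  const char *Op = nullptr;
  std::vector<std::shared_ptr<MiniNode>> Kids;
};

// Promote a transform to an explicit node with the operand as its
// only child, mirroring the ResultNodeOperands logic above.
static std::shared_ptr<MiniNode> wrapInXForm(const char *Xform,
                                             std::shared_ptr<MiniNode> N) {
  auto Wrapper = std::make_shared<MiniNode>();
  Wrapper->Op = Xform;
  Wrapper->Kids.push_back(std::move(N));
  return Wrapper;
}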
std::map SrcNames, DstNames; @@ -4074,8 +4057,8 @@ void CodeGenDAGPatterns::AddPatternToMatch(TreePattern *Pattern, } void CodeGenDAGPatterns::InferInstructionFlags() { - ArrayRef Instructions = - Target.getInstructionsByEnumValue(); + ArrayRef Instructions = + Target.getInstructionsByEnumValue(); unsigned Errors = 0; @@ -4085,7 +4068,7 @@ void CodeGenDAGPatterns::InferInstructionFlags() { for (const PatternToMatch &PTM : ptms()) { // We can only infer from single-instruction patterns, otherwise we won't // know which instruction should get the flags. - SmallVector PatInstrs; + SmallVector PatInstrs; getInstructionsInTree(PTM.getDstPattern(), PatInstrs); if (PatInstrs.size() != 1) continue; @@ -4109,7 +4092,7 @@ void CodeGenDAGPatterns::InferInstructionFlags() { if (Target.guessInstructionProperties()) { for (unsigned i = 0, e = Instructions.size(); i != e; ++i) { CodeGenInstruction *InstInfo = - const_cast(Instructions[i]); + const_cast(Instructions[i]); if (InstInfo->InferredFrom) continue; // The mayLoad and mayStore flags default to false. @@ -4123,7 +4106,7 @@ void CodeGenDAGPatterns::InferInstructionFlags() { // Complain about any flags that are still undefined. for (unsigned i = 0, e = Instructions.size(); i != e; ++i) { CodeGenInstruction *InstInfo = - const_cast(Instructions[i]); + const_cast(Instructions[i]); if (InstInfo->InferredFrom) continue; if (InstInfo->hasSideEffects_Unset) @@ -4138,12 +4121,11 @@ void CodeGenDAGPatterns::InferInstructionFlags() { } } - /// Verify instruction flags against pattern node properties. void CodeGenDAGPatterns::VerifyInstructionFlags() { unsigned Errors = 0; for (const PatternToMatch &PTM : ptms()) { - SmallVector Instrs; + SmallVector Instrs; getInstructionsInTree(PTM.getDstPattern(), Instrs); if (Instrs.empty()) continue; @@ -4186,16 +4168,16 @@ void CodeGenDAGPatterns::VerifyInstructionFlags() { ++Errors; for (const std::string &Msg : Msgs) - PrintError(PTM.getSrcRecord()->getLoc(), Twine(Msg) + " on the " + - (Instrs.size() == 1 ? - "instruction" : "output instructions")); + PrintError( + PTM.getSrcRecord()->getLoc(), + Twine(Msg) + " on the " + + (Instrs.size() == 1 ? "instruction" : "output instructions")); // Provide the location of the relevant instruction definitions. for (const Record *Instr : Instrs) { if (Instr != PTM.getSrcRecord()) PrintError(Instr->getLoc(), "defined here"); const CodeGenInstruction &InstInfo = Target.getInstruction(Instr); - if (InstInfo.InferredFrom && - InstInfo.InferredFrom != InstInfo.TheDef && + if (InstInfo.InferredFrom && InstInfo.InferredFrom != InstInfo.TheDef && InstInfo.InferredFrom != PTM.getSrcRecord()) PrintError(InstInfo.InferredFrom->getLoc(), "inferred from pattern"); } @@ -4237,11 +4219,11 @@ static bool ForceArbitraryInstResultType(TreePatternNode *N, TreePattern &TP) { // Promote xform function to be an explicit node wherever set. 
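A sketch of what VerifyInstructionFlags, earlier in this hunk, does when it batches messages: compare the declared instruction bits against the pattern-derived ones and collect every mismatch before reporting. Reduced to two plain flag sets (Flags is invented for illustration, and the *_Unset escape hatches the real code honors are ignored here):

#include <string>
#include <vector>

struct Flags {
  bool mayLoad = false, mayStore = false, hasSideEffects = false;
};

static std::vector<std::string> diffFlags(const Flags &Decl,
                                          const Flags &FromPat) {
  std::vector<std::string> Msgs;
  if (Decl.mayLoad != FromPat.mayLoad)
    Msgs.push_back("pattern disagrees on mayLoad");
  if (Decl.mayStore != FromPat.mayStore)
    Msgs.push_back("pattern disagrees on mayStore");
  if (Decl.hasSideEffects != FromPat.hasSideEffects)
    Msgs.push_back("pattern disagrees on hasSideEffects");
  return Msgs;
}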
static TreePatternNodePtr PromoteXForms(TreePatternNodePtr N) { if (Record *Xform = N->getTransformFn()) { - N->setTransformFn(nullptr); - std::vector Children; - Children.push_back(PromoteXForms(N)); - return makeIntrusiveRefCnt(Xform, std::move(Children), - N->getNumTypes()); + N->setTransformFn(nullptr); + std::vector Children; + Children.push_back(PromoteXForms(N)); + return makeIntrusiveRefCnt(Xform, std::move(Children), + N->getNumTypes()); } if (!N->isLeaf()) @@ -4252,9 +4234,9 @@ static TreePatternNodePtr PromoteXForms(TreePatternNodePtr N) { return N; } -void CodeGenDAGPatterns::ParseOnePattern(Record *TheDef, - TreePattern &Pattern, TreePattern &Result, - const std::vector &InstImpResults) { +void CodeGenDAGPatterns::ParseOnePattern( + Record *TheDef, TreePattern &Pattern, TreePattern &Result, + const std::vector &InstImpResults) { // Inline pattern fragments and expand multiple alternatives. Pattern.InlinePatternFragments(); @@ -4274,8 +4256,7 @@ void CodeGenDAGPatterns::ParseOnePattern(Record *TheDef, // Infer as many types as possible. If we cannot infer all of them, we // can never do anything with this pattern: report it to the user. - InferredAllResultTypes = - Result.InferAllTypes(&Pattern.getNamedNodesMap()); + InferredAllResultTypes = Result.InferAllTypes(&Pattern.getNamedNodesMap()); IterateInference = false; @@ -4286,11 +4267,11 @@ void CodeGenDAGPatterns::ParseOnePattern(Record *TheDef, for (const auto &T : Pattern.getTrees()) for (unsigned i = 0, e = std::min(Result.getOnlyTree()->getNumTypes(), T->getNumTypes()); - i != e; ++i) { - IterateInference |= T->UpdateNodeType( - i, Result.getOnlyTree()->getExtType(i), Result); - IterateInference |= Result.getOnlyTree()->UpdateNodeType( - i, T->getExtType(i), Result); + i != e; ++i) { + IterateInference |= + T->UpdateNodeType(i, Result.getOnlyTree()->getExtType(i), Result); + IterateInference |= + Result.getOnlyTree()->UpdateNodeType(i, T->getExtType(i), Result); } // If our iteration has converged and the input pattern's types are fully @@ -4302,8 +4283,7 @@ void CodeGenDAGPatterns::ParseOnePattern(Record *TheDef, // // In any case, to handle this, we just go through and disambiguate some // arbitrary types to the result pattern's nodes. - if (!IterateInference && InferredAllPatternTypes && - !InferredAllResultTypes) + if (!IterateInference && InferredAllPatternTypes && !InferredAllResultTypes) IterateInference = ForceArbitraryInstResultType(Result.getTree(0).get(), Result); } while (IterateInference); @@ -4357,7 +4337,7 @@ void CodeGenDAGPatterns::ParseOnePattern(Record *TheDef, } void CodeGenDAGPatterns::ParsePatterns() { - std::vector Patterns = Records.getAllDerivedDefinitions("Pattern"); + std::vector Patterns = Records.getAllDerivedDefinitions("Pattern"); for (Record *CurPattern : Patterns) { DagInit *Tree = CurPattern->getValueAsDag("PatternToMatch"); @@ -4369,7 +4349,8 @@ void CodeGenDAGPatterns::ParsePatterns() { TreePattern Pattern(CurPattern, Tree, true, *this); ListInit *LI = CurPattern->getValueAsListInit("ResultInstrs"); - if (LI->empty()) continue; // no pattern. + if (LI->empty()) + continue; // no pattern. // Parse the instruction. 
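The do/while loop above runs type inference to a fixed point: keep re-applying constraints while any node's type set still narrows. The control shape, with the whole pass abstracted into a callable (a sketch of the loop skeleton only, not the real signatures; ParseOnePattern also re-seeds inference between passes):

#include <functional>

static void inferToFixpoint(const std::function<bool()> &RunOnePass) {
  bool Changed;
  do {
    Changed = RunOnePass(); // true if any node type was narrowed
  } while (Changed);
}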
TreePattern Result(CurPattern, LI, false, *this); @@ -4382,7 +4363,7 @@ void CodeGenDAGPatterns::ParsePatterns() { std::map InstInputs; MapVector> InstResults; - std::vector InstImpResults; + std::vector InstImpResults; for (unsigned j = 0, ee = Pattern.getNumTrees(); j != ee; ++j) FindPatternInputsAndOutputs(Pattern, Pattern.getTree(j), InstInputs, InstResults, InstImpResults); @@ -4512,7 +4493,6 @@ static void DumpDepVars(MultipleUseVarSet &DepVars) { } #endif - /// CombineChildVariants - Given a bunch of permutations of each child of the /// 'operator' node, put them together in all possible ways. static void CombineChildVariants( @@ -4598,7 +4578,8 @@ static void CombineChildVariants(TreePatternNodePtr Orig, static void GatherChildrenOfAssociativeOpcode(TreePatternNodePtr N, std::vector &Children) { - assert(N->getNumChildren()==2 &&"Associative but doesn't have 2 children!"); + assert(N->getNumChildren() == 2 && + "Associative but doesn't have 2 children!"); Record *Operator = N->getOperator(); // Only permit raw nodes. @@ -4725,7 +4706,6 @@ static void GenerateVariantsOf(TreePatternNodePtr N, } } - // GenerateVariants - Generate variants. For example, commutative patterns can // match multiple ways. Add them to PatternsToMatch as well. void CodeGenDAGPatterns::GenerateVariants() { @@ -4782,7 +4762,8 @@ void CodeGenDAGPatterns::GenerateVariants() { } } // If we already have it, ignore the variant. - if (AlreadyExists) continue; + if (AlreadyExists) + continue; // Otherwise, add it to the list of patterns we have. PatternsToMatch.emplace_back( diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.h b/llvm/utils/TableGen/CodeGenDAGPatterns.h index 2611fe0..ea6219c 100644 --- a/llvm/utils/TableGen/CodeGenDAGPatterns.h +++ b/llvm/utils/TableGen/CodeGenDAGPatterns.h @@ -56,17 +56,15 @@ struct MachineValueTypeSet { static_assert(std::is_same, uint8_t>::value, "Change uint8_t here to the SimpleValueType's type"); - static unsigned constexpr Capacity = std::numeric_limits::max()+1; + static unsigned constexpr Capacity = std::numeric_limits::max() + 1; using WordType = uint64_t; - static unsigned constexpr WordWidth = CHAR_BIT*sizeof(WordType); - static unsigned constexpr NumWords = Capacity/WordWidth; - static_assert(NumWords*WordWidth == Capacity, + static unsigned constexpr WordWidth = CHAR_BIT * sizeof(WordType); + static unsigned constexpr NumWords = Capacity / WordWidth; + static_assert(NumWords * WordWidth == Capacity, "Capacity should be a multiple of WordWidth"); LLVM_ATTRIBUTE_ALWAYS_INLINE - MachineValueTypeSet() { - clear(); - } + MachineValueTypeSet() { clear(); } LLVM_ATTRIBUTE_ALWAYS_INLINE unsigned size() const { @@ -76,9 +74,7 @@ struct MachineValueTypeSet { return Count; } LLVM_ATTRIBUTE_ALWAYS_INLINE - void clear() { - std::memset(Words.data(), 0, NumWords*sizeof(WordType)); - } + void clear() { std::memset(Words.data(), 0, NumWords * sizeof(WordType)); } LLVM_ATTRIBUTE_ALWAYS_INLINE bool empty() const { for (WordType W : Words) @@ -90,7 +86,7 @@ struct MachineValueTypeSet { unsigned count(MVT T) const { return (Words[T.SimpleTy / WordWidth] >> (T.SimpleTy % WordWidth)) & 1; } - std::pair insert(MVT T) { + std::pair insert(MVT T) { bool V = count(T.SimpleTy); Words[T.SimpleTy / WordWidth] |= WordType(1) << (T.SimpleTy % WordWidth); return {*this, V}; @@ -113,8 +109,8 @@ struct MachineValueTypeSet { using iterator_category = std::forward_iterator_tag; using value_type = MVT; using difference_type = ptrdiff_t; - using pointer = const MVT*; - using reference = const MVT&; + using 
pointer = const MVT *; + using reference = const MVT &; LLVM_ATTRIBUTE_ALWAYS_INLINE MVT operator*() const { @@ -128,7 +124,7 @@ struct MachineValueTypeSet { LLVM_ATTRIBUTE_ALWAYS_INLINE const_iterator &operator++() { assert(Pos != Capacity); - Pos = find_from_pos(Pos+1); + Pos = find_from_pos(Pos + 1); return *this; } @@ -137,9 +133,7 @@ struct MachineValueTypeSet { return Set == It.Set && Pos == It.Pos; } LLVM_ATTRIBUTE_ALWAYS_INLINE - bool operator!=(const const_iterator &It) const { - return !operator==(It); - } + bool operator!=(const const_iterator &It) const { return !operator==(It); } private: unsigned find_from_pos(unsigned P) const { @@ -151,7 +145,7 @@ struct MachineValueTypeSet { // the trailing bits need to be masked off to use findFirstSet. if (SkipBits != 0) { WordType W = Set->Words[SkipWords]; - W &= maskLeadingOnes(WordWidth-SkipBits); + W &= maskLeadingOnes(WordWidth - SkipBits); if (W != 0) return Count + llvm::countr_zero(W); Count += WordWidth; @@ -174,20 +168,18 @@ struct MachineValueTypeSet { LLVM_ATTRIBUTE_ALWAYS_INLINE const_iterator begin() const { return const_iterator(this, false); } LLVM_ATTRIBUTE_ALWAYS_INLINE - const_iterator end() const { return const_iterator(this, true); } + const_iterator end() const { return const_iterator(this, true); } LLVM_ATTRIBUTE_ALWAYS_INLINE bool operator==(const MachineValueTypeSet &S) const { return Words == S.Words; } LLVM_ATTRIBUTE_ALWAYS_INLINE - bool operator!=(const MachineValueTypeSet &S) const { - return !operator==(S); - } + bool operator!=(const MachineValueTypeSet &S) const { return !operator==(S); } private: friend struct const_iterator; - std::array Words; + std::array Words; }; raw_ostream &operator<<(raw_ostream &OS, const MachineValueTypeSet &T); @@ -200,14 +192,12 @@ struct TypeSetByHwMode : public InfoByHwMode { TypeSetByHwMode(const TypeSetByHwMode &VTS) = default; TypeSetByHwMode &operator=(const TypeSetByHwMode &) = default; TypeSetByHwMode(MVT::SimpleValueType VT) - : TypeSetByHwMode(ValueTypeByHwMode(VT)) {} + : TypeSetByHwMode(ValueTypeByHwMode(VT)) {} TypeSetByHwMode(ValueTypeByHwMode VT) - : TypeSetByHwMode(ArrayRef(&VT, 1)) {} + : TypeSetByHwMode(ArrayRef(&VT, 1)) {} TypeSetByHwMode(ArrayRef VTList); - SetType &getOrCreate(unsigned Mode) { - return Map[Mode]; - } + SetType &getOrCreate(unsigned Mode) { return Map[Mode]; } bool isValueTypeByHwMode(bool AllowEmpty) const; ValueTypeByHwMode getValueTypeByHwMode() const; @@ -225,9 +215,7 @@ struct TypeSetByHwMode : public InfoByHwMode { bool isPossible() const; - bool isPointer() const { - return getValueTypeByHwMode().isPointer(); - } + bool isPointer() const { return getValueTypeByHwMode().isPointer(); } unsigned getPtrAddrSpace() const { assert(isPointer()); @@ -313,8 +301,7 @@ struct TypeInfer { /// Ensure that for each type T in \p Sub, T is a vector type, and there /// exists a type U in \p Vec such that U is a vector type with the same /// element type as T and at least as many elements as T. - bool EnforceVectorSubVectorTypeIs(TypeSetByHwMode &Vec, - TypeSetByHwMode &Sub); + bool EnforceVectorSubVectorTypeIs(TypeSetByHwMode &Vec, TypeSetByHwMode &Sub); /// 1. Ensure that \p V has a scalar type iff \p W has a scalar type. /// 2. Ensure that for each vector type T in \p V, there exists a vector /// type U in \p W, such that T and U have the same number of elements. 
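For readers unfamiliar with MachineValueTypeSet, the struct being reformatted above: it is a fixed 256-bit set indexed by SimpleValueType, and both count()/insert() and the iterator's find_from_pos reduce to word/bit arithmetic. A compact restatement over a bare std::array (TinyVTSet is a toy; __builtin_ctzll stands in for llvm::countr_zero and assumes GCC/Clang):

#include <array>
#include <cstdint>

struct TinyVTSet {
  // 256 bits = 4 x 64-bit words, as in MachineValueTypeSet.
  std::array<uint64_t, 4> Words{};

  bool count(unsigned T) const { return (Words[T / 64] >> (T % 64)) & 1; }

  bool insert(unsigned T) { // returns whether T was already present
    bool Was = count(T);
    Words[T / 64] |= uint64_t(1) << (T % 64);
    return Was;
  }

  // find_from_pos in miniature: mask off the bits below P in the first
  // word, then scan whole words for the next set bit.
  unsigned nextSetBit(unsigned P) const {
    for (std::size_t W = P / 64; W != Words.size(); ++W) {
      uint64_t Bits = Words[W];
      if (W == P / 64)
        Bits &= ~uint64_t(0) << (P % 64);
      if (Bits)
        return unsigned(W * 64 + __builtin_ctzll(Bits));
    }
    return 256; // Capacity, i.e. "end"
  }
};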
@@ -346,15 +333,13 @@ struct TypeInfer { SuppressValidation(TypeInfer &TI) : Infer(TI), SavedValidate(TI.Validate) { Infer.Validate = false; } - ~SuppressValidation() { - Infer.Validate = SavedValidate; - } + ~SuppressValidation() { Infer.Validate = SavedValidate; } TypeInfer &Infer; bool SavedValidate; }; TreePattern &TP; - bool Validate = true; // Indicate whether to validate types. + bool Validate = true; // Indicate whether to validate types. private: const TypeSetByHwMode &getLegalTypes() const; @@ -372,14 +357,24 @@ typedef StringSet<> MultipleUseVarSet; struct SDTypeConstraint { SDTypeConstraint(Record *R, const CodeGenHwModes &CGH); - unsigned OperandNo; // The operand # this constraint applies to. + unsigned OperandNo; // The operand # this constraint applies to. enum { - SDTCisVT, SDTCisPtrTy, SDTCisInt, SDTCisFP, SDTCisVec, SDTCisSameAs, - SDTCisVTSmallerThanOp, SDTCisOpSmallerThanOp, SDTCisEltOfVec, - SDTCisSubVecOfVec, SDTCVecEltisVT, SDTCisSameNumEltsAs, SDTCisSameSizeAs + SDTCisVT, + SDTCisPtrTy, + SDTCisInt, + SDTCisFP, + SDTCisVec, + SDTCisSameAs, + SDTCisVTSmallerThanOp, + SDTCisOpSmallerThanOp, + SDTCisEltOfVec, + SDTCisSubVecOfVec, + SDTCVecEltisVT, + SDTCisSameNumEltsAs, + SDTCisSameSizeAs } ConstraintType; - union { // The discriminated union. + union { // The discriminated union. struct { unsigned OtherOperandNum; } SDTCisSameAs_Info; @@ -422,6 +417,7 @@ struct SDTypeConstraint { class ScopedName { unsigned Scope; std::string Identifier; + public: ScopedName(unsigned Scope, StringRef Identifier) : Scope(Scope), Identifier(std::string(Identifier)) { @@ -447,6 +443,7 @@ class SDNodeInfo { unsigned NumResults; int NumOperands; std::vector TypeConstraints; + public: // Parse the specified record. SDNodeInfo(Record *R, const CodeGenHwModes &CGH); @@ -487,11 +484,11 @@ class TreePredicateFn { /// PatFragRec - This is the TreePattern for the PatFrag that we /// originally came from. TreePattern *PatFragRec; + public: /// TreePredicateFn constructor. Here 'N' is a subclass of PatFrag. TreePredicateFn(TreePattern *N); - TreePattern *getOrigPatFragRecord() const { return PatFragRec; } /// isAlwaysTrue - Return true if this is a noop predicate. @@ -582,7 +579,8 @@ public: bool isAtomicOrderingWeakerThanRelease() const; /// If non-null, indicates that this predicate is a predefined memory VT - /// predicate for a load/store and returns the ValueType record for the memory VT. + /// predicate for a load/store and returns the ValueType record for the memory + /// VT. 
Record *getMemoryVT() const; /// If non-null, indicates that this predicate is a predefined memory VT /// predicate (checking only the scalar type) for load/store and returns the @@ -615,14 +613,12 @@ struct TreePredicateCall { unsigned Scope; TreePredicateCall(const TreePredicateFn &Fn, unsigned Scope) - : Fn(Fn), Scope(Scope) {} + : Fn(Fn), Scope(Scope) {} bool operator==(const TreePredicateCall &o) const { return Fn == o.Fn && Scope == o.Scope; } - bool operator!=(const TreePredicateCall &o) const { - return !(*this == o); - } + bool operator!=(const TreePredicateCall &o) const { return !(*this == o); } }; class TreePatternNode : public RefCountedBase { @@ -681,7 +677,7 @@ public: const std::vector &getNamesAsPredicateArg() const { return NamesAsPredicateArg; } - void setNamesAsPredicateArg(const std::vector& Names) { + void setNamesAsPredicateArg(const std::vector &Names) { NamesAsPredicateArg = Names; } void addNameAsPredicateArg(const ScopedName &N) { @@ -733,9 +729,7 @@ public: const TreePatternNodePtr &getChildShared(unsigned N) const { return Children[N]; } - TreePatternNodePtr &getChildSharedPtr(unsigned N) { - return Children[N]; - } + TreePatternNodePtr &getChildSharedPtr(unsigned N) { return Children[N]; } void setChild(unsigned i, TreePatternNodePtr N) { Children[i] = N; } /// hasChild - Return true if N is any of our children. @@ -762,7 +756,8 @@ public: } void addPredicateCall(const TreePredicateCall &Call) { assert(!Call.Fn.isAlwaysTrue() && "Empty predicate string!"); - assert(!is_contained(PredicateCalls, Call) && "predicate applied recursively"); + assert(!is_contained(PredicateCalls, Call) && + "predicate applied recursively"); PredicateCalls.push_back(Call); } void addPredicateCall(const TreePredicateFn &Fn, unsigned Scope) { @@ -805,8 +800,7 @@ public: void print(raw_ostream &OS) const; void dump() const; -public: // Higher level manipulation routines. - +public: // Higher level manipulation routines. /// clone - Return a new copy of this tree. /// TreePatternNodePtr clone() const; @@ -845,8 +839,7 @@ public: // Higher level manipulation routines. TreePattern &TP); bool UpdateNodeType(unsigned ResNo, MVT::SimpleValueType InTy, TreePattern &TP); - bool UpdateNodeType(unsigned ResNo, ValueTypeByHwMode InTy, - TreePattern &TP); + bool UpdateNodeType(unsigned ResNo, ValueTypeByHwMode InTy, TreePattern &TP); // Update node type with types inferred from an instruction operand or result // def from the ins/outs lists. @@ -910,7 +903,6 @@ class TreePattern { TypeInfer Infer; public: - /// TreePattern constructor - Parse the specified DagInits into the /// current record. TreePattern(Record *TheRec, ListInit *RawPat, bool isInput, @@ -971,12 +963,8 @@ public: /// error - If this is the first error in the current resolution step, /// print it and set the error flag. Otherwise, continue silently. 
void error(const Twine &Msg); - bool hasError() const { - return HasError; - } - void resetError() { - HasError = false; - } + bool hasError() const { return HasError; } + void resetError() { HasError = false; } TypeInfer &getInfer() { return Infer; } @@ -989,7 +977,6 @@ private: void ComputeNamedNodes(TreePatternNode *N); }; - inline bool TreePatternNode::UpdateNodeType(unsigned ResNo, const TypeSetByHwMode &InTy, TreePattern &TP) { @@ -1014,7 +1001,6 @@ inline bool TreePatternNode::UpdateNodeType(unsigned ResNo, return TP.getInfer().MergeInTypeInfo(Types[ResNo], VTS); } - /// DAGDefaultOperand - One of these is created for each OperandWithDefaultOps /// that has a set ExecuteAlways / DefaultOps field. struct DAGDefaultOperand { @@ -1022,9 +1008,9 @@ struct DAGDefaultOperand { }; class DAGInstruction { - std::vector Results; - std::vector Operands; - std::vector ImpResults; + std::vector Results; + std::vector Operands; + std::vector ImpResults; TreePatternNodePtr SrcPattern; TreePatternNodePtr ResultPattern; @@ -1041,7 +1027,7 @@ public: unsigned getNumResults() const { return Results.size(); } unsigned getNumOperands() const { return Operands.size(); } unsigned getNumImpResults() const { return ImpResults.size(); } - const std::vector& getImpResults() const { return ImpResults; } + const std::vector &getImpResults() const { return ImpResults; } Record *getResult(unsigned RN) const { assert(RN < Results.size()); @@ -1065,34 +1051,33 @@ public: /// PatternToMatch - Used by CodeGenDAGPatterns to keep tab of patterns /// processed to produce isel. class PatternToMatch { - Record *SrcRecord; // Originating Record for the pattern. - ListInit *Predicates; // Top level predicate conditions to match. - TreePatternNodePtr SrcPattern; // Source pattern to match. - TreePatternNodePtr DstPattern; // Resulting pattern. - std::vector Dstregs; // Physical register defs being matched. - std::string HwModeFeatures; - int AddedComplexity; // Add to matching pattern complexity. - unsigned ID; // Unique ID for the record. + Record *SrcRecord; // Originating Record for the pattern. + ListInit *Predicates; // Top level predicate conditions to match. + TreePatternNodePtr SrcPattern; // Source pattern to match. + TreePatternNodePtr DstPattern; // Resulting pattern. + std::vector Dstregs; // Physical register defs being matched. + std::string HwModeFeatures; + int AddedComplexity; // Add to matching pattern complexity. + unsigned ID; // Unique ID for the record. 
public: PatternToMatch(Record *srcrecord, ListInit *preds, TreePatternNodePtr src, TreePatternNodePtr dst, std::vector dstregs, - int complexity, unsigned uid, - const Twine &hwmodefeatures = "") + int complexity, unsigned uid, const Twine &hwmodefeatures = "") : SrcRecord(srcrecord), Predicates(preds), SrcPattern(src), DstPattern(dst), Dstregs(std::move(dstregs)), HwModeFeatures(hwmodefeatures.str()), AddedComplexity(complexity), ID(uid) {} - Record *getSrcRecord() const { return SrcRecord; } - ListInit *getPredicates() const { return Predicates; } + Record *getSrcRecord() const { return SrcRecord; } + ListInit *getPredicates() const { return Predicates; } TreePatternNode *getSrcPattern() const { return SrcPattern.get(); } TreePatternNodePtr getSrcPatternShared() const { return SrcPattern; } TreePatternNode *getDstPattern() const { return DstPattern.get(); } TreePatternNodePtr getDstPatternShared() const { return DstPattern; } - const std::vector &getDstRegs() const { return Dstregs; } - StringRef getHwModeFeatures() const { return HwModeFeatures; } - int getAddedComplexity() const { return AddedComplexity; } + const std::vector &getDstRegs() const { return Dstregs; } + StringRef getHwModeFeatures() const { return HwModeFeatures; } + int getAddedComplexity() const { return AddedComplexity; } unsigned getID() const { return ID; } std::string getPredicateCheck() const; @@ -1108,14 +1093,14 @@ class CodeGenDAGPatterns { CodeGenTarget Target; CodeGenIntrinsicTable Intrinsics; - std::map SDNodes; - std::map, LessRecordByID> + std::map SDNodes; + std::map, LessRecordByID> SDNodeXForms; - std::map ComplexPatterns; + std::map ComplexPatterns; std::map, LessRecordByID> PatternFragments; - std::map DefaultOperands; - std::map Instructions; + std::map DefaultOperands; + std::map Instructions; // Specific SDNode definitions: Record *intrinsic_void_sdnode; @@ -1128,7 +1113,7 @@ class CodeGenDAGPatterns { TypeSetByHwMode LegalVTS; - using PatternRewriterFn = std::function; + using PatternRewriterFn = std::function; PatternRewriterFn PatternRewriter; unsigned NumScopes = 0; @@ -1150,7 +1135,7 @@ public: } // Node transformation lookups. 
- typedef std::pair NodeXForm; + typedef std::pair NodeXForm; const NodeXForm &getSDNodeTransform(Record *R) const { auto F = SDNodeXForms.find(R); assert(F != SDNodeXForms.end() && "Invalid transform!"); @@ -1165,25 +1150,27 @@ public: const CodeGenIntrinsic &getIntrinsic(Record *R) const { for (unsigned i = 0, e = Intrinsics.size(); i != e; ++i) - if (Intrinsics[i].TheDef == R) return Intrinsics[i]; + if (Intrinsics[i].TheDef == R) + return Intrinsics[i]; llvm_unreachable("Unknown intrinsic!"); } const CodeGenIntrinsic &getIntrinsicInfo(unsigned IID) const { - if (IID-1 < Intrinsics.size()) - return Intrinsics[IID-1]; + if (IID - 1 < Intrinsics.size()) + return Intrinsics[IID - 1]; llvm_unreachable("Bad intrinsic ID!"); } unsigned getIntrinsicID(Record *R) const { for (unsigned i = 0, e = Intrinsics.size(); i != e; ++i) - if (Intrinsics[i].TheDef == R) return i; + if (Intrinsics[i].TheDef == R) + return i; llvm_unreachable("Unknown intrinsic!"); } const DAGDefaultOperand &getDefaultOperand(Record *R) const { auto F = DefaultOperands.find(R); - assert(F != DefaultOperands.end() &&"Isn't an analyzed default operand!"); + assert(F != DefaultOperands.end() && "Isn't an analyzed default operand!"); return F->second; } @@ -1213,10 +1200,9 @@ public: iterator_range ptms() const { return PatternsToMatch; } /// Parse the Pattern for an instruction, and insert the result in DAGInsts. - typedef std::map DAGInstMap; - void parseInstructionPattern( - CodeGenInstruction &CGI, ListInit *Pattern, - DAGInstMap &DAGInsts); + typedef std::map DAGInstMap; + void parseInstructionPattern(CodeGenInstruction &CGI, ListInit *Pattern, + DAGInstMap &DAGInsts); const DAGInstruction &getInstruction(Record *R) const { auto F = Instructions.find(R); @@ -1224,9 +1210,7 @@ public: return F->second; } - Record *get_intrinsic_void_sdnode() const { - return intrinsic_void_sdnode; - } + Record *get_intrinsic_void_sdnode() const { return intrinsic_void_sdnode; } Record *get_intrinsic_w_chain_sdnode() const { return intrinsic_w_chain_sdnode; } @@ -1238,7 +1222,7 @@ public: bool operandHasDefault(Record *Op) const { return Op->isSubClassOf("OperandWithDefaultOps") && - !getDefaultOperand(Op).DefaultOps.empty(); + !getDefaultOperand(Op).DefaultOps.empty(); } private: @@ -1254,8 +1238,8 @@ private: void GenerateVariants(); void VerifyInstructionFlags(); - void ParseOnePattern(Record *TheDef, - TreePattern &Pattern, TreePattern &Result, + void ParseOnePattern(Record *TheDef, TreePattern &Pattern, + TreePattern &Result, const std::vector &InstImpResults); void AddPatternToMatch(TreePattern *Pattern, PatternToMatch &&PTM); void FindPatternInputsAndOutputs( @@ -1266,14 +1250,13 @@ private: std::vector &InstImpResults); }; - inline bool SDNodeInfo::ApplyTypeConstraints(TreePatternNode *N, TreePattern &TP) const { - bool MadeChange = false; - for (unsigned i = 0, e = TypeConstraints.size(); i != e; ++i) - MadeChange |= TypeConstraints[i].ApplyTypeConstraint(N, *this, TP); - return MadeChange; - } + bool MadeChange = false; + for (unsigned i = 0, e = TypeConstraints.size(); i != e; ++i) + MadeChange |= TypeConstraints[i].ApplyTypeConstraint(N, *this, TP); + return MadeChange; +} } // end namespace llvm diff --git a/llvm/utils/TableGen/CodeGenHwModes.cpp b/llvm/utils/TableGen/CodeGenHwModes.cpp index 2171507..7c08c75 100644 --- a/llvm/utils/TableGen/CodeGenHwModes.cpp +++ b/llvm/utils/TableGen/CodeGenHwModes.cpp @@ -37,17 +37,17 @@ HwMode::HwMode(Record *R) { } LLVM_DUMP_METHOD -void HwMode::dump() const { - dbgs() << Name << ": " << 
Features << '\n'; -} +void HwMode::dump() const { dbgs() << Name << ": " << Features << '\n'; } HwModeSelect::HwModeSelect(Record *R, CodeGenHwModes &CGH) { - std::vector<Record*> Modes = R->getValueAsListOfDefs("Modes"); - std::vector<Record*> Objects = R->getValueAsListOfDefs("Objects"); + std::vector<Record *> Modes = R->getValueAsListOfDefs("Modes"); + std::vector<Record *> Objects = R->getValueAsListOfDefs("Objects"); if (Modes.size() != Objects.size()) { - PrintError(R->getLoc(), "in record " + R->getName() + - " derived from HwModeSelect: the lists Modes and Objects should " - "have the same size"); + PrintError( + R->getLoc(), + "in record " + R->getName() + + " derived from HwModeSelect: the lists Modes and Objects should " + "have the same size"); report_fatal_error("error in target description."); } for (unsigned i = 0, e = Modes.size(); i != e; ++i) { diff --git a/llvm/utils/TableGen/CodeGenHwModes.h b/llvm/utils/TableGen/CodeGenHwModes.h index 09d20ad..9a5b7a8 100644 --- a/llvm/utils/TableGen/CodeGenHwModes.h +++ b/llvm/utils/TableGen/CodeGenHwModes.h @@ -22,46 +22,46 @@ // HwModeId -> list of predicates (definition) namespace llvm { - class Record; - class RecordKeeper; +class Record; +class RecordKeeper; - struct CodeGenHwModes; +struct CodeGenHwModes; - struct HwMode { - HwMode(Record *R); - StringRef Name; - std::string Features; - std::string Predicates; - void dump() const; - }; +struct HwMode { + HwMode(Record *R); + StringRef Name; + std::string Features; + std::string Predicates; + void dump() const; +}; - struct HwModeSelect { - HwModeSelect(Record *R, CodeGenHwModes &CGH); - typedef std::pair<unsigned,Record*> PairType; - std::vector<PairType> Items; - void dump() const; - }; +struct HwModeSelect { + HwModeSelect(Record *R, CodeGenHwModes &CGH); + typedef std::pair<unsigned, Record *> PairType; + std::vector<PairType> Items; + void dump() const; +}; - struct CodeGenHwModes { - enum : unsigned { DefaultMode = 0 }; - static StringRef DefaultModeName; +struct CodeGenHwModes { + enum : unsigned { DefaultMode = 0 }; + static StringRef DefaultModeName; - CodeGenHwModes(RecordKeeper &R); - unsigned getHwModeId(Record *R) const; - const HwMode &getMode(unsigned Id) const { - assert(Id != 0 && "Mode id of 0 is reserved for the default mode"); - return Modes[Id-1]; - } - const HwModeSelect &getHwModeSelect(Record *R) const; - unsigned getNumModeIds() const { return Modes.size()+1; } - void dump() const; + CodeGenHwModes(RecordKeeper &R); + unsigned getHwModeId(Record *R) const; + const HwMode &getMode(unsigned Id) const { + assert(Id != 0 && "Mode id of 0 is reserved for the default mode"); + return Modes[Id - 1]; + } + const HwModeSelect &getHwModeSelect(Record *R) const; + unsigned getNumModeIds() const { return Modes.size() + 1; } + void dump() const; - private: - RecordKeeper &Records; - DenseMap<Record*,unsigned> ModeIds; // HwMode Record -> HwModeId - std::vector<HwMode> Modes; - std::map<Record*,HwModeSelect> ModeSelects; - }; -} +private: + RecordKeeper &Records; + DenseMap<Record *, unsigned> ModeIds; // HwMode Record -> HwModeId + std::vector<HwMode> Modes; + std::map<Record *, HwModeSelect> ModeSelects; +}; +} // namespace llvm #endif // LLVM_UTILS_TABLEGEN_CODEGENHWMODES_H diff --git a/llvm/utils/TableGen/CodeGenInstruction.cpp b/llvm/utils/TableGen/CodeGenInstruction.cpp index 5cd8941..a569194 100644 --- a/llvm/utils/TableGen/CodeGenInstruction.cpp +++ b/llvm/utils/TableGen/CodeGenInstruction.cpp @@ -55,15 +55,15 @@ CGIOperandList::CGIOperandList(Record *R) : TheDef(R) { unsigned e = InDI->getNumArgs() + OutDI->getNumArgs(); OperandList.reserve(e); bool VariadicOuts = false; - for (unsigned i = 0; i != e; ++i){ + for (unsigned i = 0; i != e; ++i) { Init
*ArgInit; StringRef ArgName; if (i < NumDefs) { ArgInit = OutDI->getArg(i); ArgName = OutDI->getArgNameStr(i); } else { - ArgInit = InDI->getArg(i-NumDefs); - ArgName = InDI->getArgNameStr(i-NumDefs); + ArgInit = InDI->getArg(i - NumDefs); + ArgName = InDI->getArgNameStr(i - NumDefs); } DagInit *SubArgDag = dyn_cast(ArgInit); @@ -192,7 +192,6 @@ CGIOperandList::CGIOperandList(Record *R) : TheDef(R) { --NumDefs; } - /// getOperandNamed - Return the index of the operand with the specified /// non-empty name. If the instruction does not have an operand with the /// specified name, abort. @@ -230,7 +229,7 @@ bool CGIOperandList::hasSubOperandAlias( return false; } -std::pair +std::pair CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) { if (!Op.starts_with("$")) PrintFatalError(TheDef->getLoc(), @@ -242,7 +241,7 @@ CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) { // Check to see if this is $foo.bar. StringRef::size_type DotIdx = OpName.find_first_of('.'); if (DotIdx != StringRef::npos) { - SubOpName = OpName.substr(DotIdx+1); + SubOpName = OpName.substr(DotIdx + 1); if (SubOpName.empty()) PrintFatalError(TheDef->getLoc(), TheDef->getName() + @@ -266,7 +265,7 @@ CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) { OpIdx = getOperandNamed(OpName); - if (SubOpName.empty()) { // If no suboperand name was specified: + if (SubOpName.empty()) { // If no suboperand name was specified: // If one was needed, throw. if (OperandList[OpIdx].MINumOperands > 1 && !AllowWholeOp && SubOpName.empty()) @@ -299,82 +298,80 @@ CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) { return std::make_pair(0U, 0U); } -static void ParseConstraint(StringRef CStr, CGIOperandList &Ops, - Record *Rec) { +static void ParseConstraint(StringRef CStr, CGIOperandList &Ops, Record *Rec) { // EARLY_CLOBBER: @early $reg StringRef::size_type wpos = CStr.find_first_of(" \t"); StringRef::size_type start = CStr.find_first_not_of(" \t"); StringRef Tok = CStr.substr(start, wpos - start); if (Tok == "@earlyclobber") { - StringRef Name = CStr.substr(wpos+1); + StringRef Name = CStr.substr(wpos + 1); wpos = Name.find_first_not_of(" \t"); if (wpos == StringRef::npos) - PrintFatalError( - Rec->getLoc(), "Illegal format for @earlyclobber constraint in '" + - Rec->getName() + "': '" + CStr + "'"); + PrintFatalError(Rec->getLoc(), + "Illegal format for @earlyclobber constraint in '" + + Rec->getName() + "': '" + CStr + "'"); Name = Name.substr(wpos); - std::pair Op = Ops.ParseOperandName(Name, false); + std::pair Op = Ops.ParseOperandName(Name, false); // Build the string for the operand if (!Ops[Op.first].Constraints[Op.second].isNone()) - PrintFatalError( - Rec->getLoc(), "Operand '" + Name + "' of '" + Rec->getName() + - "' cannot have multiple constraints!"); + PrintFatalError(Rec->getLoc(), "Operand '" + Name + "' of '" + + Rec->getName() + + "' cannot have multiple constraints!"); Ops[Op.first].Constraints[Op.second] = - CGIOperandList::ConstraintInfo::getEarlyClobber(); + CGIOperandList::ConstraintInfo::getEarlyClobber(); return; } // Only other constraint is "TIED_TO" for now. 
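On ParseOperandName above: operand references arrive as "$foo" or "$foo.bar", and the first step is splitting off the optional sub-operand name at the first dot. That split by itself, over std::string rather than StringRef and without the error paths the real code takes for malformed names:

#include <string>
#include <utility>

// "$foo.bar" -> {"foo", "bar"}; "$foo" -> {"foo", ""}.
static std::pair<std::string, std::string> splitOpName(std::string Op) {
  if (!Op.empty() && Op[0] == '$')
    Op.erase(0, 1); // drop the leading '$'
  auto Dot = Op.find('.');
  if (Dot == std::string::npos)
    return {Op, ""};
  return {Op.substr(0, Dot), Op.substr(Dot + 1)};
}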
StringRef::size_type pos = CStr.find_first_of('='); if (pos == StringRef::npos) - PrintFatalError( - Rec->getLoc(), "Unrecognized constraint '" + CStr + - "' in '" + Rec->getName() + "'"); + PrintFatalError(Rec->getLoc(), "Unrecognized constraint '" + CStr + + "' in '" + Rec->getName() + "'"); start = CStr.find_first_not_of(" \t"); // TIED_TO: $src1 = $dst wpos = CStr.find_first_of(" \t", start); if (wpos == StringRef::npos || wpos > pos) - PrintFatalError( - Rec->getLoc(), "Illegal format for tied-to constraint in '" + - Rec->getName() + "': '" + CStr + "'"); + PrintFatalError(Rec->getLoc(), + "Illegal format for tied-to constraint in '" + + Rec->getName() + "': '" + CStr + "'"); StringRef LHSOpName = CStr.substr(start, wpos - start); - std::pair LHSOp = Ops.ParseOperandName(LHSOpName, false); + std::pair LHSOp = Ops.ParseOperandName(LHSOpName, false); wpos = CStr.find_first_not_of(" \t", pos + 1); if (wpos == StringRef::npos) - PrintFatalError( - Rec->getLoc(), "Illegal format for tied-to constraint: '" + CStr + "'"); + PrintFatalError(Rec->getLoc(), + "Illegal format for tied-to constraint: '" + CStr + "'"); StringRef RHSOpName = CStr.substr(wpos); - std::pair RHSOp = Ops.ParseOperandName(RHSOpName, false); + std::pair RHSOp = Ops.ParseOperandName(RHSOpName, false); // Sort the operands into order, which should put the output one // first. But keep the original order, for use in diagnostics. bool FirstIsDest = (LHSOp < RHSOp); - std::pair DestOp = (FirstIsDest ? LHSOp : RHSOp); + std::pair DestOp = (FirstIsDest ? LHSOp : RHSOp); StringRef DestOpName = (FirstIsDest ? LHSOpName : RHSOpName); - std::pair SrcOp = (FirstIsDest ? RHSOp : LHSOp); + std::pair SrcOp = (FirstIsDest ? RHSOp : LHSOp); StringRef SrcOpName = (FirstIsDest ? RHSOpName : LHSOpName); // Ensure one operand is a def and the other is a use. if (DestOp.first >= Ops.NumDefs) - PrintFatalError( - Rec->getLoc(), "Input operands '" + LHSOpName + "' and '" + RHSOpName + - "' of '" + Rec->getName() + "' cannot be tied!"); + PrintFatalError(Rec->getLoc(), "Input operands '" + LHSOpName + "' and '" + + RHSOpName + "' of '" + Rec->getName() + + "' cannot be tied!"); if (SrcOp.first < Ops.NumDefs) - PrintFatalError( - Rec->getLoc(), "Output operands '" + LHSOpName + "' and '" + RHSOpName + - "' of '" + Rec->getName() + "' cannot be tied!"); + PrintFatalError(Rec->getLoc(), "Output operands '" + LHSOpName + "' and '" + + RHSOpName + "' of '" + Rec->getName() + + "' cannot be tied!"); // The constraint has to go on the operand with higher index, i.e. // the source one. Check there isn't another constraint there // already. 
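The tied-to parsing above ("$src1 = $dst") sorts the two operands so the def comes first, then checks that one side is an output and the other an input. With flat operand indices (a simplification: the real code compares (operand, sub-operand) pairs, and outputs occupy indices [0, NumDefs)) the acceptance test is just:

#include <algorithm>

// A tie is legal only between one output and one input operand.
static bool validTie(unsigned LHS, unsigned RHS, unsigned NumDefs) {
  unsigned Dest = std::min(LHS, RHS); // candidate output
  unsigned Src = std::max(LHS, RHS);  // candidate input
  return Dest < NumDefs && Src >= NumDefs;
}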
if (!Ops[SrcOp.first].Constraints[SrcOp.second].isNone()) - PrintFatalError( - Rec->getLoc(), "Operand '" + SrcOpName + "' of '" + Rec->getName() + - "' cannot have multiple constraints!"); + PrintFatalError(Rec->getLoc(), "Operand '" + SrcOpName + "' of '" + + Rec->getName() + + "' cannot have multiple constraints!"); unsigned DestFlatOpNo = Ops.getFlattenedOperandNumber(DestOp); auto NewConstraint = CGIOperandList::ConstraintInfo::getTied(DestFlatOpNo); @@ -384,16 +381,17 @@ static void ParseConstraint(StringRef CStr, CGIOperandList &Ops, for (const CGIOperandList::OperandInfo &Op : Ops) { for (unsigned i = 0; i < Op.MINumOperands; i++) if (Op.Constraints[i] == NewConstraint) - PrintFatalError( - Rec->getLoc(), "Operand '" + DestOpName + "' of '" + Rec->getName() + - "' cannot have multiple operands tied to it!"); + PrintFatalError(Rec->getLoc(), + "Operand '" + DestOpName + "' of '" + Rec->getName() + + "' cannot have multiple operands tied to it!"); } Ops[SrcOp.first].Constraints[SrcOp.second] = NewConstraint; } static void ParseConstraints(StringRef CStr, CGIOperandList &Ops, Record *Rec) { - if (CStr.empty()) return; + if (CStr.empty()) + return; StringRef delims(","); StringRef::size_type bidx, eidx; @@ -413,15 +411,15 @@ void CGIOperandList::ProcessDisableEncoding(StringRef DisableEncoding) { while (true) { StringRef OpName; std::tie(OpName, DisableEncoding) = getToken(DisableEncoding, " ,\t"); - if (OpName.empty()) break; + if (OpName.empty()) + break; // Figure out which operand this is. - std::pair Op = ParseOperandName(OpName, false); + std::pair Op = ParseOperandName(OpName, false); // Mark the operand as not-to-be encoded. OperandList[Op.first].DoNotEncode[Op.second] = true; } - } //===----------------------------------------------------------------------===// @@ -429,27 +427,27 @@ void CGIOperandList::ProcessDisableEncoding(StringRef DisableEncoding) { //===----------------------------------------------------------------------===// CodeGenInstruction::CodeGenInstruction(Record *R) - : TheDef(R), Operands(R), InferredFrom(nullptr) { + : TheDef(R), Operands(R), InferredFrom(nullptr) { Namespace = R->getValueAsString("Namespace"); AsmString = std::string(R->getValueAsString("AsmString")); isPreISelOpcode = R->getValueAsBit("isPreISelOpcode"); - isReturn = R->getValueAsBit("isReturn"); + isReturn = R->getValueAsBit("isReturn"); isEHScopeReturn = R->getValueAsBit("isEHScopeReturn"); - isBranch = R->getValueAsBit("isBranch"); + isBranch = R->getValueAsBit("isBranch"); isIndirectBranch = R->getValueAsBit("isIndirectBranch"); - isCompare = R->getValueAsBit("isCompare"); - isMoveImm = R->getValueAsBit("isMoveImm"); - isMoveReg = R->getValueAsBit("isMoveReg"); - isBitcast = R->getValueAsBit("isBitcast"); - isSelect = R->getValueAsBit("isSelect"); - isBarrier = R->getValueAsBit("isBarrier"); - isCall = R->getValueAsBit("isCall"); - isAdd = R->getValueAsBit("isAdd"); - isTrap = R->getValueAsBit("isTrap"); + isCompare = R->getValueAsBit("isCompare"); + isMoveImm = R->getValueAsBit("isMoveImm"); + isMoveReg = R->getValueAsBit("isMoveReg"); + isBitcast = R->getValueAsBit("isBitcast"); + isSelect = R->getValueAsBit("isSelect"); + isBarrier = R->getValueAsBit("isBarrier"); + isCall = R->getValueAsBit("isCall"); + isAdd = R->getValueAsBit("isAdd"); + isTrap = R->getValueAsBit("isTrap"); canFoldAsLoad = R->getValueAsBit("canFoldAsLoad"); - isPredicable = !R->getValueAsBit("isUnpredicable") && ( - Operands.isPredicable || R->getValueAsBit("isPredicable")); + isPredicable = 
!R->getValueAsBit("isUnpredicable") && + (Operands.isPredicable || R->getValueAsBit("isPredicable")); isConvertibleToThreeAddress = R->getValueAsBit("isConvertibleToThreeAddress"); isCommutable = R->getValueAsBit("isCommutable"); isTerminator = R->getValueAsBit("isTerminator"); @@ -457,7 +455,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R) hasDelaySlot = R->getValueAsBit("hasDelaySlot"); usesCustomInserter = R->getValueAsBit("usesCustomInserter"); hasPostISelHook = R->getValueAsBit("hasPostISelHook"); - hasCtrlDep = R->getValueAsBit("hasCtrlDep"); + hasCtrlDep = R->getValueAsBit("hasCtrlDep"); isNotDuplicable = R->getValueAsBit("isNotDuplicable"); isRegSequence = R->getValueAsBit("isRegSequence"); isExtractSubreg = R->getValueAsBit("isExtractSubreg"); @@ -469,9 +467,9 @@ CodeGenInstruction::CodeGenInstruction(Record *R) isAuthenticated = R->getValueAsBit("isAuthenticated"); bool Unset; - mayLoad = R->getValueAsBitOrUnset("mayLoad", Unset); + mayLoad = R->getValueAsBitOrUnset("mayLoad", Unset); mayLoad_Unset = Unset; - mayStore = R->getValueAsBitOrUnset("mayStore", Unset); + mayStore = R->getValueAsBitOrUnset("mayStore", Unset); mayStore_Unset = Unset; mayRaiseFPException = R->getValueAsBit("mayRaiseFPException"); hasSideEffects = R->getValueAsBitOrUnset("hasSideEffects", Unset); @@ -494,8 +492,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R) ParseConstraints(R->getValueAsString("Constraints"), Operands, R); // Parse the DisableEncoding field. - Operands.ProcessDisableEncoding( - R->getValueAsString("DisableEncoding")); + Operands.ProcessDisableEncoding(R->getValueAsString("DisableEncoding")); // First check for a ComplexDeprecationPredicate. if (R->getValue("ComplexDeprecationPredicate")) { @@ -516,25 +513,25 @@ CodeGenInstruction::CodeGenInstruction(Record *R) /// HasOneImplicitDefWithKnownVT - If the instruction has at least one /// implicit def and it has a known VT, return the VT, otherwise return /// MVT::Other. -MVT::SimpleValueType CodeGenInstruction:: -HasOneImplicitDefWithKnownVT(const CodeGenTarget &TargetInfo) const { - if (ImplicitDefs.empty()) return MVT::Other; +MVT::SimpleValueType CodeGenInstruction::HasOneImplicitDefWithKnownVT( + const CodeGenTarget &TargetInfo) const { + if (ImplicitDefs.empty()) + return MVT::Other; // Check to see if the first implicit def has a resolvable type. Record *FirstImplicitDef = ImplicitDefs[0]; assert(FirstImplicitDef->isSubClassOf("Register")); const std::vector &RegVTs = - TargetInfo.getRegisterVTs(FirstImplicitDef); + TargetInfo.getRegisterVTs(FirstImplicitDef); if (RegVTs.size() == 1 && RegVTs[0].isSimple()) return RegVTs[0].getSimple().SimpleTy; return MVT::Other; } - /// FlattenAsmStringVariants - Flatten the specified AsmString to only /// include text from the specified variant, returning the new string. -std::string CodeGenInstruction:: -FlattenAsmStringVariants(StringRef Cur, unsigned Variant) { +std::string CodeGenInstruction::FlattenAsmStringVariants(StringRef Cur, + unsigned Variant) { std::string Res; for (;;) { @@ -542,8 +539,8 @@ FlattenAsmStringVariants(StringRef Cur, unsigned Variant) { size_t VariantsStart = 0; for (size_t e = Cur.size(); VariantsStart != e; ++VariantsStart) if (Cur[VariantsStart] == '{' && - (VariantsStart == 0 || (Cur[VariantsStart-1] != '$' && - Cur[VariantsStart-1] != '\\'))) + (VariantsStart == 0 || + (Cur[VariantsStart - 1] != '$' && Cur[VariantsStart - 1] != '\\'))) break; // Add the prefix to the result. 
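Aside (illustrative, not part of the patch): the hunks above mostly rewrap diagnostics in ParseConstraint(), but the logic being rewrapped is easy to lose in the noise. A constraint such as "$src1 = $dst" is resolved into two (operand, sub-operand) pairs, and the tie is recorded under the flattened MachineInstr operand number of the destination. The sketch below mirrors only that flattening arithmetic; OpInfo and the operand layout in main() are invented for illustration, not taken from any target.

// Minimal sketch of the arithmetic in
// CGIOperandList::getFlattenedOperandNumber(); OpInfo is a hypothetical
// stand-in for the two fields it reads.
#include <cassert>
#include <utility>
#include <vector>

struct OpInfo {
  unsigned MIOperandNo;   // first flat MI operand of this logical operand
  unsigned MINumOperands; // number of flat MI operands it expands to
};

// A logical (operand, sub-operand) pair maps to one flat MI operand index.
static unsigned flatten(const std::vector<OpInfo> &Ops,
                        std::pair<unsigned, unsigned> Op) {
  return Ops[Op.first].MIOperandNo + Op.second;
}

int main() {
  // Hypothetical layout: (outs GPR:$dst), (ins addr:$src1, GPR:$src2),
  // where "addr" expands to four MI operands (as an X86 address does).
  std::vector<OpInfo> Ops = {{0, 1}, {1, 4}, {5, 1}};
  assert(flatten(Ops, {0, 0}) == 0); // $dst
  assert(flatten(Ops, {1, 2}) == 3); // $src1.2
  assert(flatten(Ops, {2, 0}) == 5); // $src2
  return 0;
}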
@@ -557,7 +554,7 @@ FlattenAsmStringVariants(StringRef Cur, unsigned Variant) { size_t VariantsEnd = VariantsStart; unsigned NestedBraces = 1; for (size_t e = Cur.size(); VariantsEnd != e; ++VariantsEnd) { - if (Cur[VariantsEnd] == '}' && Cur[VariantsEnd-1] != '\\') { + if (Cur[VariantsEnd] == '}' && Cur[VariantsEnd - 1] != '\\') { if (--NestedBraces == 0) break; } else if (Cur[VariantsEnd] == '{') diff --git a/llvm/utils/TableGen/CodeGenInstruction.h b/llvm/utils/TableGen/CodeGenInstruction.h index 4a34c29..ca7b1e9 100644 --- a/llvm/utils/TableGen/CodeGenInstruction.h +++ b/llvm/utils/TableGen/CodeGenInstruction.h @@ -23,324 +23,320 @@ #include namespace llvm { - class Record; - class DagInit; - class CodeGenTarget; +class Record; +class DagInit; +class CodeGenTarget; - class CGIOperandList { - public: - class ConstraintInfo { - enum { None, EarlyClobber, Tied } Kind = None; - unsigned OtherTiedOperand = 0; - - public: - ConstraintInfo() = default; - - static ConstraintInfo getEarlyClobber() { - ConstraintInfo I; - I.Kind = EarlyClobber; - I.OtherTiedOperand = 0; - return I; - } - - static ConstraintInfo getTied(unsigned Op) { - ConstraintInfo I; - I.Kind = Tied; - I.OtherTiedOperand = Op; - return I; - } - - bool isNone() const { return Kind == None; } - bool isEarlyClobber() const { return Kind == EarlyClobber; } - bool isTied() const { return Kind == Tied; } +class CGIOperandList { +public: + class ConstraintInfo { + enum { None, EarlyClobber, Tied } Kind = None; + unsigned OtherTiedOperand = 0; - unsigned getTiedOperand() const { - assert(isTied()); - return OtherTiedOperand; - } - - bool operator==(const ConstraintInfo &RHS) const { - if (Kind != RHS.Kind) - return false; - if (Kind == Tied && OtherTiedOperand != RHS.OtherTiedOperand) - return false; - return true; - } - bool operator!=(const ConstraintInfo &RHS) const { - return !(*this == RHS); - } - }; - - /// OperandInfo - The information we keep track of for each operand in the - /// operand list for a tablegen instruction. - struct OperandInfo { - /// Rec - The definition this operand is declared as. - /// - Record *Rec; - - /// Name - If this operand was assigned a symbolic name, this is it, - /// otherwise, it's empty. - std::string Name; - - /// The names of sub-operands, if given, otherwise empty. - std::vector SubOpNames; - - /// PrinterMethodName - The method used to print operands of this type in - /// the asmprinter. - std::string PrinterMethodName; - - /// The method used to get the machine operand value for binary - /// encoding, per sub-operand. If empty, uses "getMachineOpValue". - std::vector EncoderMethodNames; - - /// OperandType - A value from MCOI::OperandType representing the type of - /// the operand. - std::string OperandType; - - /// MIOperandNo - Currently (this is meant to be phased out), some logical - /// operands correspond to multiple MachineInstr operands. In the X86 - /// target for example, one address operand is represented as 4 - /// MachineOperands. Because of this, the operand number in the - /// OperandList may not match the MachineInstr operand num. Until it - /// does, this contains the MI operand index of this operand. - unsigned MIOperandNo; - unsigned MINumOperands; // The number of operands. - - /// DoNotEncode - Bools are set to true in this vector for each operand in - /// the DisableEncoding list. These should not be emitted by the code - /// emitter. - BitVector DoNotEncode; - - /// MIOperandInfo - Default MI operand type. Note an operand may be made - /// up of multiple MI operands. 
- DagInit *MIOperandInfo; - - /// Constraint info for this operand. This operand can have pieces, so we - /// track constraint info for each. - std::vector Constraints; - - OperandInfo(Record *R, const std::string &N, const std::string &PMN, - const std::string &OT, unsigned MION, unsigned MINO, - DagInit *MIOI) - : Rec(R), Name(N), SubOpNames(MINO), PrinterMethodName(PMN), - EncoderMethodNames(MINO), OperandType(OT), MIOperandNo(MION), - MINumOperands(MINO), DoNotEncode(MINO), MIOperandInfo(MIOI), - Constraints(MINO) {} - - /// getTiedOperand - If this operand is tied to another one, return the - /// other operand number. Otherwise, return -1. - int getTiedRegister() const { - for (unsigned j = 0, e = Constraints.size(); j != e; ++j) { - const CGIOperandList::ConstraintInfo &CI = Constraints[j]; - if (CI.isTied()) return CI.getTiedOperand(); - } - return -1; - } - }; - - CGIOperandList(Record *D); - - Record *TheDef; // The actual record containing this OperandList. + public: + ConstraintInfo() = default; - /// NumDefs - Number of def operands declared, this is the number of - /// elements in the instruction's (outs) list. - /// - unsigned NumDefs; - - /// OperandList - The list of declared operands, along with their declared - /// type (which is a record). - std::vector OperandList; - - /// SubOpAliases - List of alias names for suboperands. - StringMap> SubOpAliases; - - // Information gleaned from the operand list. - bool isPredicable; - bool hasOptionalDef; - bool isVariadic; - - // Provide transparent accessors to the operand list. - bool empty() const { return OperandList.empty(); } - unsigned size() const { return OperandList.size(); } - const OperandInfo &operator[](unsigned i) const { return OperandList[i]; } - OperandInfo &operator[](unsigned i) { return OperandList[i]; } - OperandInfo &back() { return OperandList.back(); } - const OperandInfo &back() const { return OperandList.back(); } - - typedef std::vector::iterator iterator; - typedef std::vector::const_iterator const_iterator; - iterator begin() { return OperandList.begin(); } - const_iterator begin() const { return OperandList.begin(); } - iterator end() { return OperandList.end(); } - const_iterator end() const { return OperandList.end(); } - - /// getOperandNamed - Return the index of the operand with the specified - /// non-empty name. If the instruction does not have an operand with the - /// specified name, abort. - unsigned getOperandNamed(StringRef Name) const; - - /// hasOperandNamed - Query whether the instruction has an operand of the - /// given name. If so, return true and set OpIdx to the index of the - /// operand. Otherwise, return false. - bool hasOperandNamed(StringRef Name, unsigned &OpIdx) const; - - bool hasSubOperandAlias(StringRef Name, - std::pair &SubOp) const; - - /// ParseOperandName - Parse an operand name like "$foo" or "$foo.bar", - /// where $foo is a whole operand and $foo.bar refers to a suboperand. - /// This aborts if the name is invalid. If AllowWholeOp is true, references - /// to operands with suboperands are allowed, otherwise not. - std::pair ParseOperandName(StringRef Op, - bool AllowWholeOp = true); - - /// getFlattenedOperandNumber - Flatten a operand/suboperand pair into a - /// flat machineinstr operand #. 
- unsigned getFlattenedOperandNumber(std::pair Op) const { - return OperandList[Op.first].MIOperandNo + Op.second; + static ConstraintInfo getEarlyClobber() { + ConstraintInfo I; + I.Kind = EarlyClobber; + I.OtherTiedOperand = 0; + return I; } - /// getSubOperandNumber - Unflatten a operand number into an - /// operand/suboperand pair. - std::pair getSubOperandNumber(unsigned Op) const { - for (unsigned i = 0; ; ++i) { - assert(i < OperandList.size() && "Invalid flat operand #"); - if (OperandList[i].MIOperandNo+OperandList[i].MINumOperands > Op) - return std::make_pair(i, Op-OperandList[i].MIOperandNo); - } + static ConstraintInfo getTied(unsigned Op) { + ConstraintInfo I; + I.Kind = Tied; + I.OtherTiedOperand = Op; + return I; } + bool isNone() const { return Kind == None; } + bool isEarlyClobber() const { return Kind == EarlyClobber; } + bool isTied() const { return Kind == Tied; } - /// isFlatOperandNotEmitted - Return true if the specified flat operand # - /// should not be emitted with the code emitter. - bool isFlatOperandNotEmitted(unsigned FlatOpNo) const { - std::pair Op = getSubOperandNumber(FlatOpNo); - if (OperandList[Op.first].DoNotEncode.size() > Op.second) - return OperandList[Op.first].DoNotEncode[Op.second]; - return false; + unsigned getTiedOperand() const { + assert(isTied()); + return OtherTiedOperand; } - void ProcessDisableEncoding(StringRef Value); - }; - - - class CodeGenInstruction { - public: - Record *TheDef; // The actual record defining this instruction. - StringRef Namespace; // The namespace the instruction is in. - - /// AsmString - The format string used to emit a .s file for the - /// instruction. - std::string AsmString; - - /// Operands - This is information about the (ins) and (outs) list specified - /// to the instruction. - CGIOperandList Operands; - - /// ImplicitDefs/ImplicitUses - These are lists of registers that are - /// implicitly defined and used by the instruction. - std::vector ImplicitDefs, ImplicitUses; - - // Various boolean values we track for the instruction. - bool isPreISelOpcode : 1; - bool isReturn : 1; - bool isEHScopeReturn : 1; - bool isBranch : 1; - bool isIndirectBranch : 1; - bool isCompare : 1; - bool isMoveImm : 1; - bool isMoveReg : 1; - bool isBitcast : 1; - bool isSelect : 1; - bool isBarrier : 1; - bool isCall : 1; - bool isAdd : 1; - bool isTrap : 1; - bool canFoldAsLoad : 1; - bool mayLoad : 1; - bool mayLoad_Unset : 1; - bool mayStore : 1; - bool mayStore_Unset : 1; - bool mayRaiseFPException : 1; - bool isPredicable : 1; - bool isConvertibleToThreeAddress : 1; - bool isCommutable : 1; - bool isTerminator : 1; - bool isReMaterializable : 1; - bool hasDelaySlot : 1; - bool usesCustomInserter : 1; - bool hasPostISelHook : 1; - bool hasCtrlDep : 1; - bool isNotDuplicable : 1; - bool hasSideEffects : 1; - bool hasSideEffects_Unset : 1; - bool isAsCheapAsAMove : 1; - bool hasExtraSrcRegAllocReq : 1; - bool hasExtraDefRegAllocReq : 1; - bool isCodeGenOnly : 1; - bool isPseudo : 1; - bool isMeta : 1; - bool isRegSequence : 1; - bool isExtractSubreg : 1; - bool isInsertSubreg : 1; - bool isConvergent : 1; - bool hasNoSchedulingInfo : 1; - bool FastISelShouldIgnore : 1; - bool hasChain : 1; - bool hasChain_Inferred : 1; - bool variadicOpsAreDefs : 1; - bool isAuthenticated : 1; - - std::string DeprecatedReason; - bool HasComplexDeprecationPredicate; - - /// Are there any undefined flags? 
- bool hasUndefFlags() const { - return mayLoad_Unset || mayStore_Unset || hasSideEffects_Unset; - } - - // The record used to infer instruction flags, or NULL if no flag values - // have been inferred. - Record *InferredFrom; - - // The enum value assigned by CodeGenTarget::computeInstrsByEnum. - mutable unsigned EnumVal; - - CodeGenInstruction(Record *R); - - /// HasOneImplicitDefWithKnownVT - If the instruction has at least one - /// implicit def and it has a known VT, return the VT, otherwise return - /// MVT::Other. - MVT::SimpleValueType - HasOneImplicitDefWithKnownVT(const CodeGenTarget &TargetInfo) const; - - - /// FlattenAsmStringVariants - Flatten the specified AsmString to only - /// include text from the specified variant, returning the new string. - static std::string FlattenAsmStringVariants(StringRef AsmString, - unsigned Variant); - - // Is the specified operand in a generic instruction implicitly a pointer. - // This can be used on intructions that use typeN or ptypeN to identify - // operands that should be considered as pointers even though SelectionDAG - // didn't make a distinction between integer and pointers. - bool isInOperandAPointer(unsigned i) const { - return isOperandImpl("InOperandList", i, "IsPointer"); + bool operator==(const ConstraintInfo &RHS) const { + if (Kind != RHS.Kind) + return false; + if (Kind == Tied && OtherTiedOperand != RHS.OtherTiedOperand) + return false; + return true; } + bool operator!=(const ConstraintInfo &RHS) const { return !(*this == RHS); } + }; - bool isOutOperandAPointer(unsigned i) const { - return isOperandImpl("OutOperandList", i, "IsPointer"); + /// OperandInfo - The information we keep track of for each operand in the + /// operand list for a tablegen instruction. + struct OperandInfo { + /// Rec - The definition this operand is declared as. + /// + Record *Rec; + + /// Name - If this operand was assigned a symbolic name, this is it, + /// otherwise, it's empty. + std::string Name; + + /// The names of sub-operands, if given, otherwise empty. + std::vector SubOpNames; + + /// PrinterMethodName - The method used to print operands of this type in + /// the asmprinter. + std::string PrinterMethodName; + + /// The method used to get the machine operand value for binary + /// encoding, per sub-operand. If empty, uses "getMachineOpValue". + std::vector EncoderMethodNames; + + /// OperandType - A value from MCOI::OperandType representing the type of + /// the operand. + std::string OperandType; + + /// MIOperandNo - Currently (this is meant to be phased out), some logical + /// operands correspond to multiple MachineInstr operands. In the X86 + /// target for example, one address operand is represented as 4 + /// MachineOperands. Because of this, the operand number in the + /// OperandList may not match the MachineInstr operand num. Until it + /// does, this contains the MI operand index of this operand. + unsigned MIOperandNo; + unsigned MINumOperands; // The number of operands. + + /// DoNotEncode - Bools are set to true in this vector for each operand in + /// the DisableEncoding list. These should not be emitted by the code + /// emitter. + BitVector DoNotEncode; + + /// MIOperandInfo - Default MI operand type. Note an operand may be made + /// up of multiple MI operands. + DagInit *MIOperandInfo; + + /// Constraint info for this operand. This operand can have pieces, so we + /// track constraint info for each. 
+ std::vector Constraints; + + OperandInfo(Record *R, const std::string &N, const std::string &PMN, + const std::string &OT, unsigned MION, unsigned MINO, + DagInit *MIOI) + : Rec(R), Name(N), SubOpNames(MINO), PrinterMethodName(PMN), + EncoderMethodNames(MINO), OperandType(OT), MIOperandNo(MION), + MINumOperands(MINO), DoNotEncode(MINO), MIOperandInfo(MIOI), + Constraints(MINO) {} + + /// getTiedOperand - If this operand is tied to another one, return the + /// other operand number. Otherwise, return -1. + int getTiedRegister() const { + for (unsigned j = 0, e = Constraints.size(); j != e; ++j) { + const CGIOperandList::ConstraintInfo &CI = Constraints[j]; + if (CI.isTied()) + return CI.getTiedOperand(); + } + return -1; } + }; - /// Check if the operand is required to be an immediate. - bool isInOperandImmArg(unsigned i) const { - return isOperandImpl("InOperandList", i, "IsImmediate"); + CGIOperandList(Record *D); + + Record *TheDef; // The actual record containing this OperandList. + + /// NumDefs - Number of def operands declared, this is the number of + /// elements in the instruction's (outs) list. + /// + unsigned NumDefs; + + /// OperandList - The list of declared operands, along with their declared + /// type (which is a record). + std::vector OperandList; + + /// SubOpAliases - List of alias names for suboperands. + StringMap> SubOpAliases; + + // Information gleaned from the operand list. + bool isPredicable; + bool hasOptionalDef; + bool isVariadic; + + // Provide transparent accessors to the operand list. + bool empty() const { return OperandList.empty(); } + unsigned size() const { return OperandList.size(); } + const OperandInfo &operator[](unsigned i) const { return OperandList[i]; } + OperandInfo &operator[](unsigned i) { return OperandList[i]; } + OperandInfo &back() { return OperandList.back(); } + const OperandInfo &back() const { return OperandList.back(); } + + typedef std::vector::iterator iterator; + typedef std::vector::const_iterator const_iterator; + iterator begin() { return OperandList.begin(); } + const_iterator begin() const { return OperandList.begin(); } + iterator end() { return OperandList.end(); } + const_iterator end() const { return OperandList.end(); } + + /// getOperandNamed - Return the index of the operand with the specified + /// non-empty name. If the instruction does not have an operand with the + /// specified name, abort. + unsigned getOperandNamed(StringRef Name) const; + + /// hasOperandNamed - Query whether the instruction has an operand of the + /// given name. If so, return true and set OpIdx to the index of the + /// operand. Otherwise, return false. + bool hasOperandNamed(StringRef Name, unsigned &OpIdx) const; + + bool hasSubOperandAlias(StringRef Name, + std::pair &SubOp) const; + + /// ParseOperandName - Parse an operand name like "$foo" or "$foo.bar", + /// where $foo is a whole operand and $foo.bar refers to a suboperand. + /// This aborts if the name is invalid. If AllowWholeOp is true, references + /// to operands with suboperands are allowed, otherwise not. + std::pair ParseOperandName(StringRef Op, + bool AllowWholeOp = true); + + /// getFlattenedOperandNumber - Flatten a operand/suboperand pair into a + /// flat machineinstr operand #. + unsigned getFlattenedOperandNumber(std::pair Op) const { + return OperandList[Op.first].MIOperandNo + Op.second; + } + + /// getSubOperandNumber - Unflatten a operand number into an + /// operand/suboperand pair. 
+ std::pair getSubOperandNumber(unsigned Op) const { + for (unsigned i = 0;; ++i) { + assert(i < OperandList.size() && "Invalid flat operand #"); + if (OperandList[i].MIOperandNo + OperandList[i].MINumOperands > Op) + return std::make_pair(i, Op - OperandList[i].MIOperandNo); } - - private: - bool isOperandImpl(StringRef OpListName, unsigned i, - StringRef PropertyName) const; - }; + } + + /// isFlatOperandNotEmitted - Return true if the specified flat operand # + /// should not be emitted with the code emitter. + bool isFlatOperandNotEmitted(unsigned FlatOpNo) const { + std::pair Op = getSubOperandNumber(FlatOpNo); + if (OperandList[Op.first].DoNotEncode.size() > Op.second) + return OperandList[Op.first].DoNotEncode[Op.second]; + return false; + } + + void ProcessDisableEncoding(StringRef Value); +}; + +class CodeGenInstruction { +public: + Record *TheDef; // The actual record defining this instruction. + StringRef Namespace; // The namespace the instruction is in. + + /// AsmString - The format string used to emit a .s file for the + /// instruction. + std::string AsmString; + + /// Operands - This is information about the (ins) and (outs) list specified + /// to the instruction. + CGIOperandList Operands; + + /// ImplicitDefs/ImplicitUses - These are lists of registers that are + /// implicitly defined and used by the instruction. + std::vector ImplicitDefs, ImplicitUses; + + // Various boolean values we track for the instruction. + bool isPreISelOpcode : 1; + bool isReturn : 1; + bool isEHScopeReturn : 1; + bool isBranch : 1; + bool isIndirectBranch : 1; + bool isCompare : 1; + bool isMoveImm : 1; + bool isMoveReg : 1; + bool isBitcast : 1; + bool isSelect : 1; + bool isBarrier : 1; + bool isCall : 1; + bool isAdd : 1; + bool isTrap : 1; + bool canFoldAsLoad : 1; + bool mayLoad : 1; + bool mayLoad_Unset : 1; + bool mayStore : 1; + bool mayStore_Unset : 1; + bool mayRaiseFPException : 1; + bool isPredicable : 1; + bool isConvertibleToThreeAddress : 1; + bool isCommutable : 1; + bool isTerminator : 1; + bool isReMaterializable : 1; + bool hasDelaySlot : 1; + bool usesCustomInserter : 1; + bool hasPostISelHook : 1; + bool hasCtrlDep : 1; + bool isNotDuplicable : 1; + bool hasSideEffects : 1; + bool hasSideEffects_Unset : 1; + bool isAsCheapAsAMove : 1; + bool hasExtraSrcRegAllocReq : 1; + bool hasExtraDefRegAllocReq : 1; + bool isCodeGenOnly : 1; + bool isPseudo : 1; + bool isMeta : 1; + bool isRegSequence : 1; + bool isExtractSubreg : 1; + bool isInsertSubreg : 1; + bool isConvergent : 1; + bool hasNoSchedulingInfo : 1; + bool FastISelShouldIgnore : 1; + bool hasChain : 1; + bool hasChain_Inferred : 1; + bool variadicOpsAreDefs : 1; + bool isAuthenticated : 1; + + std::string DeprecatedReason; + bool HasComplexDeprecationPredicate; + + /// Are there any undefined flags? + bool hasUndefFlags() const { + return mayLoad_Unset || mayStore_Unset || hasSideEffects_Unset; + } + + // The record used to infer instruction flags, or NULL if no flag values + // have been inferred. + Record *InferredFrom; + + // The enum value assigned by CodeGenTarget::computeInstrsByEnum. + mutable unsigned EnumVal; + + CodeGenInstruction(Record *R); + + /// HasOneImplicitDefWithKnownVT - If the instruction has at least one + /// implicit def and it has a known VT, return the VT, otherwise return + /// MVT::Other. 
+ MVT::SimpleValueType + HasOneImplicitDefWithKnownVT(const CodeGenTarget &TargetInfo) const; + + /// FlattenAsmStringVariants - Flatten the specified AsmString to only + /// include text from the specified variant, returning the new string. + static std::string FlattenAsmStringVariants(StringRef AsmString, + unsigned Variant); + + // Is the specified operand in a generic instruction implicitly a pointer. + // This can be used on intructions that use typeN or ptypeN to identify + // operands that should be considered as pointers even though SelectionDAG + // didn't make a distinction between integer and pointers. + bool isInOperandAPointer(unsigned i) const { + return isOperandImpl("InOperandList", i, "IsPointer"); + } + + bool isOutOperandAPointer(unsigned i) const { + return isOperandImpl("OutOperandList", i, "IsPointer"); + } + + /// Check if the operand is required to be an immediate. + bool isInOperandImmArg(unsigned i) const { + return isOperandImpl("InOperandList", i, "IsImmediate"); + } + +private: + bool isOperandImpl(StringRef OpListName, unsigned i, + StringRef PropertyName) const; +}; } // namespace llvm #endif diff --git a/llvm/utils/TableGen/CodeGenIntrinsics.h b/llvm/utils/TableGen/CodeGenIntrinsics.h index f3452f5..da9e386 100644 --- a/llvm/utils/TableGen/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/CodeGenIntrinsics.h @@ -25,12 +25,12 @@ class Record; class RecordKeeper; struct CodeGenIntrinsic { - Record *TheDef; // The actual record defining this intrinsic. - std::string Name; // The name of the LLVM function "llvm.bswap.i32" - std::string EnumName; // The name of the enum "bswap_i32" + Record *TheDef; // The actual record defining this intrinsic. + std::string Name; // The name of the LLVM function "llvm.bswap.i32" + std::string EnumName; // The name of the enum "bswap_i32" std::string ClangBuiltinName; // Name of the corresponding GCC builtin, or "". - std::string MSBuiltinName; // Name of the corresponding MS builtin, or "". - std::string TargetPrefix; // Target prefix, e.g. "ppc" for t-s intrinsics. + std::string MSBuiltinName; // Name of the corresponding MS builtin, or "". + std::string TargetPrefix; // Target prefix, e.g. "ppc" for t-s intrinsics. /// This structure holds the return values and parameter values of an /// intrinsic. If the number of return values is > 1, then the intrinsic @@ -136,9 +136,7 @@ struct CodeGenIntrinsic { void addArgAttribute(unsigned Idx, ArgAttrKind AK, uint64_t V = 0); - bool hasProperty(enum SDNP Prop) const { - return Properties & (1 << Prop); - } + bool hasProperty(enum SDNP Prop) const { return Properties & (1 << Prop); } /// Goes through all IntrProperties that have IsDefault /// value set and sets the property. 
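Aside (illustrative, not part of the patch): the CodeGenIntrinsics.h hunk above only collapses hasProperty() onto one line; the scheme behind it is one bit per SDNP property OR-ed into an integer mask. A minimal sketch of that scheme follows. The enumerator names echo LLVM's SDNP values, but the numbering here is a made-up assumption.

#include <cassert>
#include <cstdint>

// Hypothetical stand-ins for the SDNP enumerators and their values.
enum SDNP { SDNPHasChain = 0, SDNPMayLoad = 1, SDNPMayStore = 2 };

struct IntrinsicSketch {
  std::uint32_t Properties = 0;
  void setProperty(SDNP P) { Properties |= (1u << P); }
  // Mirrors the bit test in CodeGenIntrinsic::hasProperty().
  bool hasProperty(SDNP P) const { return Properties & (1u << P); }
};

int main() {
  IntrinsicSketch I;
  I.setProperty(SDNPMayLoad);
  assert(I.hasProperty(SDNPMayLoad));
  assert(!I.hasProperty(SDNPMayStore));
  return 0;
}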
@@ -182,6 +180,6 @@ public:
     return Intrinsics[Pos];
   }
 };
-}
+} // namespace llvm
 
 #endif
diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp
index fd375735..03af0b4 100644
--- a/llvm/utils/TableGen/CodeGenMapTable.cpp
+++ b/llvm/utils/TableGen/CodeGenMapTable.cpp
@@ -80,9 +80,9 @@
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 using namespace llvm;
-typedef std::map<std::string, std::vector<Record*> > InstrRelMapTy;
+typedef std::map<std::string, std::vector<Record *>> InstrRelMapTy;
 
-typedef std::map<std::vector<Init*>, std::vector<Record*> > RowInstrMapTy;
+typedef std::map<std::vector<Init *>, std::vector<Record *>> RowInstrMapTy;
 
 namespace {
 
@@ -95,10 +95,10 @@ private:
   ListInit *RowFields;
   ListInit *ColFields;
   ListInit *KeyCol;
-  std::vector<ListInit*> ValueCols;
+  std::vector<ListInit *> ValueCols;
 
 public:
-  InstrMap(Record* MapRec) {
+  InstrMap(Record *MapRec) {
     Name = std::string(MapRec->getName());
 
     // FilterClass - It's used to reduce the search space only to the
@@ -130,7 +130,8 @@ public:
     // Each instruction map must specify at least one column for it to be valid.
     if (ColValList->empty())
       PrintFatalError(MapRec->getLoc(), "InstrMapping record `" +
-          MapRec->getName() + "' has empty " + "`ValueCols' field!");
+                                            MapRec->getName() + "' has empty " +
+                                            "`ValueCols' field!");
 
     for (Init *I : ColValList->getValues()) {
       auto *ColI = cast<ListInit>(I);
 
@@ -138,9 +139,10 @@ public:
       // Make sure that all the sub-lists in 'ValueCols' have same number of
       // elements as the fields in 'ColFields'.
       if (ColI->size() != ColFields->size())
-        PrintFatalError(MapRec->getLoc(), "Record `" + MapRec->getName() +
-          "', field `ValueCols' entries don't match with " +
-          " the entries in 'ColFields'!");
+        PrintFatalError(MapRec->getLoc(),
+                        "Record `" + MapRec->getName() +
+                            "', field `ValueCols' entries don't match with " +
+                            " the entries in 'ColFields'!");
       ValueCols.push_back(ColI);
     }
   }
@@ -155,13 +157,10 @@ public:
 
   ListInit *getKeyCol() const { return KeyCol; }
 
-  const std::vector<ListInit*> &getValueCols() const {
-    return ValueCols;
-  }
+  const std::vector<ListInit *> &getValueCols() const { return ValueCols; }
 };
 } // end anonymous namespace
-
 //===----------------------------------------------------------------------===//
 // class MapTableEmitter : It builds the instruction relation maps using
 // the information provided in InstrMapping records. It outputs these
@@ -171,26 +170,26 @@ public:
 namespace {
 class MapTableEmitter {
 private:
-// std::string TargetName;
+  // std::string TargetName;
   const CodeGenTarget &Target;
   // InstrMapDesc - InstrMapping record to be processed.
   InstrMap InstrMapDesc;
 
   // InstrDefs - list of instructions filtered using FilterClass defined
   // in InstrMapDesc.
-  std::vector<Record*> InstrDefs;
+  std::vector<Record *> InstrDefs;
 
   // RowInstrMap - maps RowFields values to the instructions. It's keyed by the
   // values of the row fields and contains vector of records as values.
   RowInstrMapTy RowInstrMap;
 
   // KeyInstrVec - list of key instructions.
-  std::vector<Record*> KeyInstrVec;
-  DenseMap<Record*, std::vector<Record*> > MapTable;
+  std::vector<Record *> KeyInstrVec;
+  DenseMap<Record *, std::vector<Record *>> MapTable;
 
 public:
-  MapTableEmitter(CodeGenTarget &Target, RecordKeeper &Records, Record *IMRec):
-                  Target(Target), InstrMapDesc(IMRec) {
+  MapTableEmitter(CodeGenTarget &Target, RecordKeeper &Records, Record *IMRec)
+      : Target(Target), InstrMapDesc(IMRec) {
     const std::string &FilterClass = InstrMapDesc.getFilterClass();
     InstrDefs = Records.getAllDerivedDefinitions(FilterClass);
   }
 
@@ -199,7 +198,7 @@ public:
 
   // Returns true if an instruction is a key instruction, i.e., its ColFields
   // have same values as KeyCol.
-  bool isKeyColInstr(Record* CurInstr);
+  bool isKeyColInstr(Record *CurInstr);
 
   // Find column instruction corresponding to a key instruction based on the
   // constraints for that column.
@@ -215,11 +214,9 @@ public:
 
   // Lookup functions to query binary search tables.
   void emitMapFuncBody(raw_ostream &OS, unsigned TableSize);
-
 };
 } // end anonymous namespace
-
 //===----------------------------------------------------------------------===//
 // Process all the instructions that model this relation (alreday present in
 // InstrDefs) and insert them into RowInstrMap which is keyed by the values of
@@ -230,14 +227,15 @@
 void MapTableEmitter::buildRowInstrMap() {
 
   for (Record *CurInstr : InstrDefs) {
-    std::vector<Init*> KeyValue;
+    std::vector<Init *> KeyValue;
     ListInit *RowFields = InstrMapDesc.getRowFields();
     for (Init *RowField : RowFields->getValues()) {
       RecordVal *RecVal = CurInstr->getValue(RowField);
       if (RecVal == nullptr)
-        PrintFatalError(CurInstr->getLoc(), "No value " +
-            RowField->getAsString() + " found in \"" +
-            CurInstr->getName() + "\" instruction description.");
+        PrintFatalError(CurInstr->getLoc(),
+                        "No value " + RowField->getAsString() + " found in \"" +
+                            CurInstr->getName() +
+                            "\" instruction description.");
       Init *CurInstrVal = RecVal->getValue();
       KeyValue.push_back(CurInstrVal);
     }
@@ -256,14 +254,14 @@ void MapTableEmitter::buildRowInstrMap() {
 // Return true if an instruction is a KeyCol instruction.
 //===----------------------------------------------------------------------===//
 
-bool MapTableEmitter::isKeyColInstr(Record* CurInstr) {
+bool MapTableEmitter::isKeyColInstr(Record *CurInstr) {
   ListInit *ColFields = InstrMapDesc.getColFields();
   ListInit *KeyCol = InstrMapDesc.getKeyCol();
 
   // Check if the instruction is a KeyCol instruction.
   bool MatchFound = true;
-  for (unsigned j = 0, endCF = ColFields->size();
-       (j < endCF) && MatchFound; j++) {
+  for (unsigned j = 0, endCF = ColFields->size(); (j < endCF) && MatchFound;
+       j++) {
     RecordVal *ColFieldName = CurInstr->getValue(ColFields->getElement(j));
     std::string CurInstrVal = ColFieldName->getValue()->getAsUnquotedString();
     std::string KeyColValue = KeyCol->getElement(j)->getAsUnquotedString();
@@ -280,10 +278,10 @@ bool MapTableEmitter::isKeyColInstr(Record* CurInstr) {
 void MapTableEmitter::buildMapTable() {
   // Find column instructions for a given key based on the ColField
   // constraints.
-  const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols();
+  const std::vector<ListInit *> &ValueCols = InstrMapDesc.getValueCols();
   unsigned NumOfCols = ValueCols.size();
   for (Record *CurKeyInstr : KeyInstrVec) {
-    std::vector<Record*> ColInstrVec(NumOfCols);
+    std::vector<Record *> ColInstrVec(NumOfCols);
 
     // Find the column instruction based on the constraints for the column.
     for (unsigned ColIdx = 0; ColIdx < NumOfCols; ColIdx++) {
@@ -302,7 +300,7 @@ void MapTableEmitter::buildMapTable() {
 Record *MapTableEmitter::getInstrForColumn(Record *KeyInstr,
                                            ListInit *CurValueCol) {
   ListInit *RowFields = InstrMapDesc.getRowFields();
-  std::vector<Init*> KeyValue;
+  std::vector<Init *> KeyValue;
 
   // Construct KeyValue using KeyInstr's values for RowFields.
   for (Init *RowField : RowFields->getValues()) {
@@ -314,15 +312,15 @@ Record *MapTableEmitter::getInstrForColumn(Record *KeyInstr,
 
   // in RowInstrMap. We search through these instructions to find a match
   // for the current column, i.e., the instruction which has the same values
   // as CurValueCol for all the fields in ColFields.
-  const std::vector<Record*> &RelatedInstrVec = RowInstrMap[KeyValue];
+  const std::vector<Record *> &RelatedInstrVec = RowInstrMap[KeyValue];
 
   ListInit *ColFields = InstrMapDesc.getColFields();
   Record *MatchInstr = nullptr;
 
   for (llvm::Record *CurInstr : RelatedInstrVec) {
     bool MatchFound = true;
-    for (unsigned j = 0, endCF = ColFields->size();
-         (j < endCF) && MatchFound; j++) {
+    for (unsigned j = 0, endCF = ColFields->size(); (j < endCF) && MatchFound;
+         j++) {
       Init *ColFieldJ = ColFields->getElement(j);
       Init *CurInstrInit = CurInstr->getValue(ColFieldJ)->getValue();
       std::string CurInstrVal = CurInstrInit->getAsUnquotedString();
@@ -360,21 +358,21 @@ Record *MapTableEmitter::getInstrForColumn(Record *KeyInstr,
 
 unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) {
 
-  ArrayRef<const CodeGenInstruction*> NumberedInstructions =
-                                            Target.getInstructionsByEnumValue();
+  ArrayRef<const CodeGenInstruction *> NumberedInstructions =
+      Target.getInstructionsByEnumValue();
   StringRef Namespace = Target.getInstNamespace();
-  const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols();
+  const std::vector<ListInit *> &ValueCols = InstrMapDesc.getValueCols();
   unsigned NumCol = ValueCols.size();
   unsigned TotalNumInstr = NumberedInstructions.size();
   unsigned TableSize = 0;
 
-  OS << "static const uint16_t "<<InstrMapDesc.getName();
+  OS << "static const uint16_t " << InstrMapDesc.getName();
   // Number of columns in the table are NumCol+1 because key instructions are
   // emitted as first column.
-  OS << "Table[][" << NumCol+1 << "] = {\n";
+  OS << "Table[][" << NumCol + 1 << "] = {\n";
   for (unsigned i = 0; i < TotalNumInstr; i++) {
     Record *CurInstr = NumberedInstructions[i]->TheDef;
-    std::vector<Record*> ColInstrs = MapTable[CurInstr];
+    std::vector<Record *> ColInstrs = MapTable[CurInstr];
     std::string OutStr;
     unsigned RelExists = 0;
     if (!ColInstrs.empty()) {
@@ -385,19 +383,23 @@ unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) {
           OutStr += Namespace;
           OutStr += "::";
           OutStr += ColInstrs[j]->getName();
-        } else { OutStr += ", (uint16_t)-1U";}
+        } else {
+          OutStr += ", (uint16_t)-1U";
+        }
       }
 
       if (RelExists) {
         OS << " { " << Namespace << "::" << CurInstr->getName();
-        OS << OutStr <<" },\n";
+        OS << OutStr << " },\n";
         TableSize++;
       }
     }
   }
 
   if (!TableSize) {
-    OS << " { " << Namespace << "::" << "INSTRUCTION_LIST_END, ";
-    OS << Namespace << "::" << "INSTRUCTION_LIST_END }";
+    OS << " { " << Namespace << "::"
+       << "INSTRUCTION_LIST_END, ";
+    OS << Namespace << "::"
+       << "INSTRUCTION_LIST_END }";
   }
   OS << "}; // End of " << InstrMapDesc.getName() << "Table\n\n";
   return TableSize;
@@ -430,11 +432,10 @@ void MapTableEmitter::emitBinSearch(raw_ostream &OS, unsigned TableSize) {
 // Emit functions to query relation tables.
 //===----------------------------------------------------------------------===//
 
-void MapTableEmitter::emitMapFuncBody(raw_ostream &OS,
-                                      unsigned TableSize) {
+void MapTableEmitter::emitMapFuncBody(raw_ostream &OS, unsigned TableSize) {
 
   ListInit *ColFields = InstrMapDesc.getColFields();
-  const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols();
+  const std::vector<ListInit *> &ValueCols = InstrMapDesc.getValueCols();
 
   // Emit binary search algorithm to locate instructions in the
   // relation table. If found, return opcode value from the appropriate column
@@ -455,14 +456,13 @@ void MapTableEmitter::emitMapFuncBody(raw_ostream &OS,
   }
   OS << ")\n";
   OS << "    return " << InstrMapDesc.getName();
-  OS << "Table[mid]["< &ValueCols = InstrMapDesc.getValueCols();
-  OS << "// "<< InstrMapDesc.getName() << "\nLLVM_READONLY\n";
-  OS << "int "<< InstrMapDesc.getName() << "(uint16_t Opcode";
+  const std::vector<ListInit *> &ValueCols = InstrMapDesc.getValueCols();
+  OS << "// " << InstrMapDesc.getName() << "\nLLVM_READONLY\n";
+  OS << "int " << InstrMapDesc.getName() << "(uint16_t Opcode";
   if (ValueCols.size() > 1) {
     for (Init *CF : ColFields->getValues()) {
       std::string ColName = CF->getAsUnquotedString();
@@ -501,9 +501,9 @@ void MapTableEmitter::emitTablesWithFunc(raw_ostream &OS) {
 
 static void emitEnums(raw_ostream &OS, RecordKeeper &Records) {
 
-  std::vector<Record*> InstrMapVec;
+  std::vector<Record *> InstrMapVec;
   InstrMapVec = Records.getAllDerivedDefinitions("InstrMapping");
-  std::map<std::string, std::vector<Init*> > ColFieldValueMap;
+  std::map<std::string, std::vector<Init *>> ColFieldValueMap;
 
   // Iterate over all InstrMapping records and create a map between column
   // fields and their possible values across all records.
@@ -511,20 +511,22 @@ static void emitEnums(raw_ostream &OS, RecordKeeper &Records) {
     ListInit *ColFields;
     ColFields = CurMap->getValueAsListInit("ColFields");
     ListInit *List = CurMap->getValueAsListInit("ValueCols");
-    std::vector<ListInit*> ValueCols;
+    std::vector<ListInit *> ValueCols;
     unsigned ListSize = List->size();
 
     for (unsigned j = 0; j < ListSize; j++) {
       auto *ListJ = cast<ListInit>(List->getElement(j));
 
       if (ListJ->size() != ColFields->size())
-        PrintFatalError("Record `" + CurMap->getName() + "', field "
-          "`ValueCols' entries don't match with the entries in 'ColFields' !");
+        PrintFatalError("Record `" + CurMap->getName() +
+                        "', field "
+                        "`ValueCols' entries don't match with the entries in "
+                        "'ColFields' !");
       ValueCols.push_back(ListJ);
     }
 
     for (unsigned j = 0, endCF = ColFields->size(); j < endCF; j++) {
-      for (unsigned k = 0; k < ListSize; k++){
+      for (unsigned k = 0; k < ListSize; k++) {
        std::string ColName = ColFields->getElement(j)->getAsUnquotedString();
        ColFieldValueMap[ColName].push_back((ValueCols[k])->getElement(j));
       }
@@ -532,14 +534,14 @@ static void emitEnums(raw_ostream &OS, RecordKeeper &Records) {
 
   for (auto &Entry : ColFieldValueMap) {
-    std::vector<Init*> FieldValues = Entry.second;
+    std::vector<Init *> FieldValues = Entry.second;
 
     // Delete duplicate entries from ColFieldValueMap
     for (unsigned i = 0; i < FieldValues.size() - 1; i++) {
       Init *CurVal = FieldValues[i];
-      for (unsigned j = i+1; j < FieldValues.size(); j++) {
+      for (unsigned j = i + 1; j < FieldValues.size(); j++) {
         if (CurVal == FieldValues[j]) {
-          FieldValues.erase(FieldValues.begin()+j);
+          FieldValues.erase(FieldValues.begin() + j);
           --j;
         }
       }
@@ -566,7 +568,7 @@ namespace llvm {
 void EmitMapTable(RecordKeeper &Records, raw_ostream &OS) {
   CodeGenTarget Target(Records);
   StringRef NameSpace = Target.getInstNamespace();
-  std::vector<Record*> InstrMapVec;
+  std::vector<Record *> InstrMapVec;
   InstrMapVec = Records.getAllDerivedDefinitions("InstrMapping");
 
   if (InstrMapVec.empty())
@@ -603,4 +605,4 @@ void EmitMapTable(RecordKeeper &Records, raw_ostream &OS) {
   OS << "#endif // GET_INSTRMAP_INFO\n\n";
 }
-} // End llvm namespace
+} // namespace llvm
diff --git a/llvm/utils/TableGen/CodeGenRegisters.cpp b/llvm/utils/TableGen/CodeGenRegisters.cpp
index d1abdb7..4b89540 100644
--- a/llvm/utils/TableGen/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/CodeGenRegisters.cpp
@@ -48,7 +48,7 @@ using namespace llvm;
//===----------------------------------------------------------------------===// CodeGenSubRegIndex::CodeGenSubRegIndex(Record *R, unsigned Enum) - : TheDef(R), EnumValue(Enum), AllSuperRegsCovered(true), Artificial(true) { + : TheDef(R), EnumValue(Enum), AllSuperRegsCovered(true), Artificial(true) { Name = std::string(R->getName()); if (R->getValue("Namespace")) Namespace = std::string(R->getValueAsString("Namespace")); @@ -74,7 +74,7 @@ void CodeGenSubRegIndex::updateComponents(CodeGenRegBank &RegBank) { if (!TheDef) return; - std::vector Comps = TheDef->getValueAsListOfDefs("ComposedOf"); + std::vector Comps = TheDef->getValueAsListOfDefs("ComposedOf"); if (!Comps.empty()) { if (Comps.size() != 2) PrintFatalError(TheDef->getLoc(), @@ -86,13 +86,13 @@ void CodeGenSubRegIndex::updateComponents(CodeGenRegBank &RegBank) { PrintFatalError(TheDef->getLoc(), "Ambiguous ComposedOf entries"); } - std::vector Parts = - TheDef->getValueAsListOfDefs("CoveringSubRegIndices"); + std::vector Parts = + TheDef->getValueAsListOfDefs("CoveringSubRegIndices"); if (!Parts.empty()) { if (Parts.size() < 2) PrintFatalError(TheDef->getLoc(), "CoveredBySubRegs must have two or more entries"); - SmallVector IdxParts; + SmallVector IdxParts; for (Record *Part : Parts) IdxParts.push_back(RegBank.getSubRegIdx(Part)); setConcatenationOf(IdxParts); @@ -117,17 +117,19 @@ LaneBitmask CodeGenSubRegIndex::computeLaneMask() const { } void CodeGenSubRegIndex::setConcatenationOf( - ArrayRef Parts) { + ArrayRef Parts) { if (ConcatenationOf.empty()) ConcatenationOf.assign(Parts.begin(), Parts.end()); else - assert(std::equal(Parts.begin(), Parts.end(), - ConcatenationOf.begin()) && "parts consistent"); + assert(std::equal(Parts.begin(), Parts.end(), ConcatenationOf.begin()) && + "parts consistent"); } void CodeGenSubRegIndex::computeConcatTransitiveClosure() { - for (SmallVectorImpl::iterator - I = ConcatenationOf.begin(); I != ConcatenationOf.end(); /*empty*/) { + for (SmallVectorImpl::iterator I = + ConcatenationOf.begin(); + I != ConcatenationOf.end(); + /*empty*/) { CodeGenSubRegIndex *SubIdx = *I; SubIdx->computeConcatTransitiveClosure(); #ifndef NDEBUG @@ -160,8 +162,8 @@ CodeGenRegister::CodeGenRegister(Record *R, unsigned Enum) } void CodeGenRegister::buildObjectGraph(CodeGenRegBank &RegBank) { - std::vector SRIs = TheDef->getValueAsListOfDefs("SubRegIndices"); - std::vector SRs = TheDef->getValueAsListOfDefs("SubRegs"); + std::vector SRIs = TheDef->getValueAsListOfDefs("SubRegIndices"); + std::vector SRs = TheDef->getValueAsListOfDefs("SubRegs"); if (SRIs.size() != SRs.size()) PrintFatalError(TheDef->getLoc(), @@ -182,7 +184,7 @@ void CodeGenRegister::buildObjectGraph(CodeGenRegBank &RegBank) { // Add ad hoc alias links. This is a symmetric relationship between two // registers, so build a symmetric graph by adding links in both ends. 
- std::vector Aliases = TheDef->getValueAsListOfDefs("Aliases"); + std::vector Aliases = TheDef->getValueAsListOfDefs("Aliases"); for (Record *Alias : Aliases) { CodeGenRegister *Reg = RegBank.getReg(Alias); ExplicitAliases.push_back(Reg); @@ -204,8 +206,8 @@ class RegUnitIterator { static CodeGenRegister::RegUnitList Sentinel; public: - RegUnitIterator(const CodeGenRegister::Vec &Regs): - RegI(Regs.begin()), RegE(Regs.end()) { + RegUnitIterator(const CodeGenRegister::Vec &Regs) + : RegI(Regs.begin()), RegE(Regs.end()) { if (RegI == RegE) { UnitI = Sentinel.end(); @@ -219,9 +221,15 @@ public: bool isValid() const { return UnitI != UnitE; } - unsigned operator* () const { assert(isValid()); return *UnitI; } + unsigned operator*() const { + assert(isValid()); + return *UnitI; + } - const CodeGenRegister *getReg() const { assert(isValid()); return *RegI; } + const CodeGenRegister *getReg() const { + assert(isValid()); + return *RegI; + } /// Preincrement. Move to the next unit. void operator++() { @@ -280,14 +288,15 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { Idx->Artificial = false; if (!SubRegs.insert(std::make_pair(Idx, SR)).second) PrintFatalError(TheDef->getLoc(), "SubRegIndex " + Idx->getName() + - " appears twice in Register " + getName()); + " appears twice in Register " + + getName()); // Map explicit sub-registers first, so the names take precedence. // The inherited sub-registers are mapped below. SubReg2Idx.insert(std::make_pair(SR, Idx)); } // Keep track of inherited subregs and how they can be reached. - SmallPtrSet Orphans; + SmallPtrSet Orphans; // Clone inherited subregs and place duplicate entries in Orphans. // Here the order is important - earlier subregs take precedence. @@ -305,7 +314,7 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { // If dsub_2 has ComposedOf = [qsub_1, dsub_0], and this register has a // qsub_1 subreg, add a dsub_2 subreg. Keep growing Indices and process // expanded subreg indices recursively. - SmallVector Indices = ExplicitSubRegIndices; + SmallVector Indices = ExplicitSubRegIndices; for (unsigned i = 0; i != Indices.size(); ++i) { CodeGenSubRegIndex *Idx = Indices[i]; const CodeGenSubRegIndex::CompMap &Comps = Idx->getComposites(); @@ -350,7 +359,8 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { const SubRegMap &Map = SR->computeSubRegs(RegBank); for (const auto &SubReg : Map) if (Orphans.erase(SubReg.second)) - SubRegs[RegBank.getCompositeSubRegIndex(Idx, SubReg.first)] = SubReg.second; + SubRegs[RegBank.getCompositeSubRegIndex(Idx, SubReg.first)] = + SubReg.second; } // Compute the inverse SubReg -> Idx map. @@ -360,7 +370,7 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { if (TheDef) Loc = TheDef->getLoc(); PrintFatalError(Loc, "Register " + getName() + - " has itself as a sub-register"); + " has itself as a sub-register"); } // Compute AllSuperRegsCovered. @@ -368,17 +378,18 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { SubReg.first->AllSuperRegsCovered = false; // Ensure that every sub-register has a unique name. - DenseMap::iterator Ins = - SubReg2Idx.insert(std::make_pair(SubReg.second, SubReg.first)).first; + DenseMap::iterator Ins = + SubReg2Idx.insert(std::make_pair(SubReg.second, SubReg.first)).first; if (Ins->second == SubReg.first) continue; // Trouble: Two different names for SubReg.second. 
ArrayRef Loc; if (TheDef) Loc = TheDef->getLoc(); - PrintFatalError(Loc, "Sub-register can't have two names: " + - SubReg.second->getName() + " available as " + - SubReg.first->getName() + " and " + Ins->second->getName()); + PrintFatalError( + Loc, "Sub-register can't have two names: " + SubReg.second->getName() + + " available as " + SubReg.first->getName() + " and " + + Ins->second->getName()); } // Derive possible names for sub-register concatenations from any explicit @@ -392,7 +403,7 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { continue; // SR is composed of multiple sub-regs. Find their names in this register. - SmallVector Parts; + SmallVector Parts; for (unsigned j = 0, e = SR->ExplicitSubRegs.size(); j != e; ++j) { CodeGenSubRegIndex &I = *SR->ExplicitSubRegIndices[j]; if (!I.Artificial) @@ -464,8 +475,8 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { void CodeGenRegister::computeSecondarySubRegs(CodeGenRegBank &RegBank) { SmallVector NewSubRegs; - std::queue> SubRegQueue; - for (std::pair P : SubRegs) + std::queue> SubRegQueue; + for (std::pair P : SubRegs) SubRegQueue.push(P); // Look at the leading super-registers of each sub-register. Those are the @@ -479,7 +490,7 @@ void CodeGenRegister::computeSecondarySubRegs(CodeGenRegBank &RegBank) { const CodeGenRegister::SuperRegList &Leads = SubReg->LeadingSuperRegs; for (unsigned i = 0, e = Leads.size(); i != e; ++i) { - CodeGenRegister *Cand = const_cast(Leads[i]); + CodeGenRegister *Cand = const_cast(Leads[i]); // Already got this sub-register? if (Cand == this || getSubRegIndex(Cand)) continue; @@ -488,7 +499,7 @@ void CodeGenRegister::computeSecondarySubRegs(CodeGenRegBank &RegBank) { "Super-register has no sub-registers"); if (Cand->ExplicitSubRegs.size() == 1) continue; - SmallVector Parts; + SmallVector Parts; // We know that the first component is (SubRegIdx,SubReg). However we // may still need to split it into smaller subregister parts. assert(Cand->ExplicitSubRegs[0] == SubReg && "LeadingSuperRegs correct"); @@ -513,7 +524,7 @@ void CodeGenRegister::computeSecondarySubRegs(CodeGenRegBank &RegBank) { // Each part of Cand is a sub-register of this. Make the full Cand also // a sub-register with a concatenated sub-register index. 
CodeGenSubRegIndex *Concat = RegBank.getConcatSubRegIndex(Parts); - std::pair NewSubReg = + std::pair NewSubReg = std::make_pair(Concat, Cand); if (!SubRegs.insert(NewSubReg).second) @@ -570,9 +581,8 @@ void CodeGenRegister::computeSuperRegs(CodeGenRegBank &RegBank) { TopoSig = RegBank.getTopoSig(Id); } -void -CodeGenRegister::addSubRegsPreOrder(SetVector &OSet, - CodeGenRegBank &RegBank) const { +void CodeGenRegister::addSubRegsPreOrder( + SetVector &OSet, CodeGenRegBank &RegBank) const { assert(SubRegsComplete && "Must precompute sub-registers"); for (unsigned i = 0, e = ExplicitSubRegs.size(); i != e; ++i) { CodeGenRegister *SR = ExplicitSubRegs[i]; @@ -611,7 +621,7 @@ struct TupleExpander : SetTheory::Expander { : SynthDefs(SynthDefs) {} void expand(SetTheory &ST, Record *Def, SetTheory::RecSet &Elts) override { - std::vector Indices = Def->getValueAsListOfDefs("SubRegIndices"); + std::vector Indices = Def->getValueAsListOfDefs("SubRegIndices"); unsigned Dim = Indices.size(); ListInit *SubRegs = Def->getValueAsListInit("SubRegs"); if (Dim != SubRegs->size()) @@ -635,17 +645,18 @@ struct TupleExpander : SetTheory::Expander { Record *RegisterCl = Def->getRecords().getClass("Register"); RecTy *RegisterRecTy = RecordRecTy::get(RegisterCl); std::vector RegNames = - Def->getValueAsListOfStrings("RegAsmNames"); + Def->getValueAsListOfStrings("RegAsmNames"); // Zip them up. RecordKeeper &RK = Def->getRecords(); for (unsigned n = 0; n != Length; ++n) { std::string Name; Record *Proto = Lists[0][n]; - std::vector Tuple; + std::vector Tuple; for (unsigned i = 0; i != Dim; ++i) { Record *Reg = Lists[i][n]; - if (i) Name += '_'; + if (i) + Name += '_'; Name += Reg->getName(); Tuple.push_back(DefInit::get(Reg)); } @@ -660,7 +671,7 @@ struct TupleExpander : SetTheory::Expander { if (RegNames.size() <= n) PrintFatalError(Def->getLoc(), "Register tuple definition missing name for '" + - Name + "'."); + Name + "'."); AsmName = StringInit::get(RK, RegNames[n]); } @@ -703,15 +714,13 @@ struct TupleExpander : SetTheory::Expander { RV.setValue(BitInit::get(RK, true)); // Copy fields from the RegisterTuples def. - if (Field == "SubRegIndices" || - Field == "CompositeIndices") { + if (Field == "SubRegIndices" || Field == "CompositeIndices") { NewReg->addValue(*Def->getValue(Field)); continue; } // Some fields get their default uninitialized value. 
- if (Field == "DwarfNumbers" || - Field == "DwarfAlias" || + if (Field == "DwarfNumbers" || Field == "DwarfAlias" || Field == "Aliases") { if (const RecordVal *DefRV = RegisterCl->getValue(Field)) NewReg->addValue(*DefRV); @@ -740,7 +749,7 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R) : TheDef(R), Name(std::string(R->getName())), TopoSigs(RegBank.getNumTopoSigs()), EnumValue(-1), TSFlags(0) { GeneratePressureSet = R->getValueAsBit("GeneratePressureSet"); - std::vector TypeList = R->getValueAsListOfDefs("RegTypes"); + std::vector TypeList = R->getValueAsListOfDefs("RegTypes"); if (TypeList.empty()) PrintFatalError(R->getLoc(), "RegTypes list must not be empty!"); for (unsigned i = 0, e = TypeList.size(); i != e; ++i) { @@ -779,7 +788,7 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R) Order.pop_back(); if (!contains(Reg)) PrintFatalError(R->getLoc(), " AltOrder register " + Reg->getName() + - " is not a class member"); + " is not a class member"); } } @@ -793,8 +802,8 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R) "Impossible to determine register size"); if (!RSI.hasDefault()) { RegSizeInfo RI; - RI.RegSize = RI.SpillSize = Size ? Size - : VTs[0].getSimple().getSizeInBits(); + RI.RegSize = RI.SpillSize = + Size ? Size : VTs[0].getSimple().getSizeInBits(); RI.SpillAlignment = R->getValueAsInt("Alignment"); RSI.insertRegSizeForMode(DefaultMode, RI); } @@ -890,7 +899,7 @@ bool CodeGenRegisterClass::contains(const CodeGenRegister *Reg) const { deref>()); } -unsigned CodeGenRegisterClass::getWeight(const CodeGenRegBank& RegBank) const { +unsigned CodeGenRegisterClass::getWeight(const CodeGenRegBank &RegBank) const { if (TheDef && !TheDef->isValueUnset("Weight")) return TheDef->getValueAsInt("Weight"); @@ -902,19 +911,19 @@ unsigned CodeGenRegisterClass::getWeight(const CodeGenRegBank& RegBank) const { namespace llvm { - raw_ostream &operator<<(raw_ostream &OS, const CodeGenRegisterClass::Key &K) { - OS << "{ " << K.RSI; - for (const auto R : *K.Members) - OS << ", " << R->getName(); - return OS << " }"; - } +raw_ostream &operator<<(raw_ostream &OS, const CodeGenRegisterClass::Key &K) { + OS << "{ " << K.RSI; + for (const auto R : *K.Members) + OS << ", " << R->getName(); + return OS << " }"; +} } // end namespace llvm // This is a simple lexicographical order that can be used to search for sets. // It is not the same as the topological order provided by TopoOrderRC. -bool CodeGenRegisterClass::Key:: -operator<(const CodeGenRegisterClass::Key &B) const { +bool CodeGenRegisterClass::Key::operator<( + const CodeGenRegisterClass::Key &B) const { assert(Members && B.Members); return std::tie(*Members, RSI) < std::tie(*B.Members, B.RSI); } @@ -1066,7 +1075,7 @@ CodeGenRegisterClass::getMatchingSubClassWithSubRegs( // Find all the subreg classes and order them by size too. std::vector> SuperRegClasses; - for (auto &RC: RegClasses) { + for (auto &RC : RegClasses) { BitVector SuperRegClassesBV(RegClasses.size()); RC.getSuperRegClasses(SubIdx, SuperRegClassesBV); if (SuperRegClassesBV.any()) @@ -1129,8 +1138,8 @@ void CodeGenRegisterClass::getSuperRegClasses(const CodeGenSubRegIndex *SubIdx, } // Populate a unique sorted list of units from a register set. 
-void CodeGenRegisterClass::buildRegUnitSet(const CodeGenRegBank &RegBank, - std::vector &RegUnits) const { +void CodeGenRegisterClass::buildRegUnitSet( + const CodeGenRegBank &RegBank, std::vector &RegUnits) const { std::vector TmpUnits; for (RegUnitIterator UnitI(Members); UnitI.isValid(); ++UnitI) { const RegUnit &RU = RegBank.getRegUnit(*UnitI); @@ -1158,7 +1167,8 @@ CodeGenRegisterCategory::CodeGenRegisterCategory(CodeGenRegBank &RegBank, //===----------------------------------------------------------------------===// CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, - const CodeGenHwModes &Modes) : CGH(Modes) { + const CodeGenHwModes &Modes) + : CGH(Modes) { // Configure register Sets to understand register classes and tuples. Sets.addFieldExpander("RegisterClass", "MemberList"); Sets.addFieldExpander("CalleeSavedRegs", "SaveList"); @@ -1167,7 +1177,7 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, // Read in the user-defined (named) sub-register indices. // More indices will be synthesized later. - std::vector SRIs = Records.getAllDerivedDefinitions("SubRegIndex"); + std::vector SRIs = Records.getAllDerivedDefinitions("SubRegIndex"); llvm::sort(SRIs, LessRecord()); for (unsigned i = 0, e = SRIs.size(); i != e; ++i) getSubRegIdx(SRIs[i]); @@ -1238,8 +1248,9 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, SRI.computeConcatTransitiveClosure(); if (!SRI.ConcatenationOf.empty()) ConcatIdx.insert(std::make_pair( - SmallVector(SRI.ConcatenationOf.begin(), - SRI.ConcatenationOf.end()), &SRI)); + SmallVector(SRI.ConcatenationOf.begin(), + SRI.ConcatenationOf.end()), + &SRI)); } // Infer even more sub-registers by combining leading super-registers. @@ -1269,7 +1280,7 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, NumNativeRegUnits = RegUnits.size(); // Read in register class definitions. - std::vector RCs = Records.getAllDerivedDefinitions("RegisterClass"); + std::vector RCs = Records.getAllDerivedDefinitions("RegisterClass"); if (RCs.empty()) PrintFatalError("No 'RegisterClass' subclasses defined!"); @@ -1299,8 +1310,8 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, } // Create a synthetic CodeGenSubRegIndex without a corresponding Record. -CodeGenSubRegIndex* -CodeGenRegBank::createSubRegIndex(StringRef Name, StringRef Namespace) { +CodeGenSubRegIndex *CodeGenRegBank::createSubRegIndex(StringRef Name, + StringRef Namespace) { SubRegIndices.emplace_back(Name, Namespace, SubRegIndices.size() + 1); return &SubRegIndices.back(); } @@ -1315,7 +1326,7 @@ CodeGenSubRegIndex *CodeGenRegBank::getSubRegIdx(Record *Def) { } const CodeGenSubRegIndex * -CodeGenRegBank::findSubRegIdx(const Record* Def) const { +CodeGenRegBank::findSubRegIdx(const Record *Def) const { return Def2SubRegIdx.lookup(Def); } @@ -1339,7 +1350,7 @@ void CodeGenRegBank::addToMaps(CodeGenRegisterClass *RC) { } // Create a synthetic sub-class if it is missing. -CodeGenRegisterClass* +CodeGenRegisterClass * CodeGenRegBank::getOrCreateSubClass(const CodeGenRegisterClass *RC, const CodeGenRegister::Vec *Members, StringRef Name) { @@ -1362,7 +1373,7 @@ CodeGenRegisterClass *CodeGenRegBank::getRegClass(const Record *Def) const { PrintFatalError(Def->getLoc(), "Not a known RegisterClass!"); } -CodeGenSubRegIndex* +CodeGenSubRegIndex * CodeGenRegBank::getCompositeSubRegIndex(CodeGenSubRegIndex *A, CodeGenSubRegIndex *B) { // Look for an existing entry. 
@@ -1377,8 +1388,8 @@ CodeGenRegBank::getCompositeSubRegIndex(CodeGenSubRegIndex *A, return Comp; } -CodeGenSubRegIndex *CodeGenRegBank:: -getConcatSubRegIndex(const SmallVector &Parts) { +CodeGenSubRegIndex *CodeGenRegBank::getConcatSubRegIndex( + const SmallVector &Parts) { assert(Parts.size() > 1 && "Need two parts to concatenate"); #ifndef NDEBUG for (CodeGenSubRegIndex *Idx : Parts) { @@ -1419,26 +1430,26 @@ getConcatSubRegIndex(const SmallVector &Parts) { } void CodeGenRegBank::computeComposites() { - using RegMap = std::map; + using RegMap = std::map; // Subreg -> { Reg->Reg }, where the right-hand side is the mapping from // register to (sub)register associated with the action of the left-hand // side subregister. - std::map SubRegAction; + std::map SubRegAction; for (const CodeGenRegister &R : Registers) { const CodeGenRegister::SubRegMap &SM = R.getSubRegs(); - for (std::pair P : SM) + for (std::pair P : SM) SubRegAction[P.first].insert({&R, P.second}); } // Calculate the composition of two subregisters as compositions of their // associated actions. - auto compose = [&SubRegAction] (const CodeGenSubRegIndex *Sub1, - const CodeGenSubRegIndex *Sub2) { + auto compose = [&SubRegAction](const CodeGenSubRegIndex *Sub1, + const CodeGenSubRegIndex *Sub2) { RegMap C; const RegMap &Img1 = SubRegAction.at(Sub1); const RegMap &Img2 = SubRegAction.at(Sub2); - for (std::pair P : Img1) { + for (std::pair P : Img1) { auto F = Img2.find(P.second); if (F != Img2.end()) C.insert({P.first, F->second}); @@ -1447,13 +1458,13 @@ void CodeGenRegBank::computeComposites() { }; // Check if the two maps agree on the intersection of their domains. - auto agree = [] (const RegMap &Map1, const RegMap &Map2) { + auto agree = [](const RegMap &Map1, const RegMap &Map2) { // Technically speaking, an empty map agrees with any other map, but // this could flag false positives. We're interested in non-vacuous // agreements. if (Map1.empty() || Map2.empty()) return false; - for (std::pair P : Map1) { + for (std::pair P : Map1) { auto F = Map2.find(P.first); if (F == Map2.end() || P.second != F->second) return false; @@ -1461,9 +1472,9 @@ void CodeGenRegBank::computeComposites() { return true; }; - using CompositePair = std::pair; - SmallSet UserDefined; + using CompositePair = + std::pair; + SmallSet UserDefined; for (const CodeGenSubRegIndex &Idx : SubRegIndices) for (auto P : Idx.getComposites()) UserDefined.insert(std::make_pair(&Idx, P.first)); @@ -1528,8 +1539,8 @@ void CodeGenRegBank::computeSubRegLaneMasks() { if (Idx.getComposites().empty()) { if (Bit > LaneBitmask::BitWidth) { PrintFatalError( - Twine("Ran out of lanemask bits to represent subregister ") - + Idx.getName()); + Twine("Ran out of lanemask bits to represent subregister ") + + Idx.getName()); } Idx.LaneMask = LaneBitmask::getLane(Bit); ++Bit; @@ -1556,7 +1567,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() { unsigned DstBit = Idx.LaneMask.getHighestLane(); assert(Idx.LaneMask == LaneBitmask::getLane(DstBit) && "Must be a leaf subregister"); - MaskRolPair MaskRol = { LaneBitmask::getLane(0), (uint8_t)DstBit }; + MaskRolPair MaskRol = {LaneBitmask::getLane(0), (uint8_t)DstBit}; LaneTransforms.push_back(MaskRol); } else { // Go through all leaf subregisters and find the ones that compose with @@ -1571,7 +1582,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() { // Replicate the behaviour from the lane mask generation loop above. 
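The compose and agree lambdas reformatted above treat a sub-register index as a partial map from registers to registers; composing two indices is ordinary composition of those maps, defined only where the second map covers the image of the first. A minimal sketch with ints standing in for registers (hypothetical names, not the real CodeGen types):

#include <map>

using RegMap = std::map<int, int>; // register -> (sub)register action

// Compose Img2 after Img1, keeping only points where both are defined.
RegMap composeActions(const RegMap &Img1, const RegMap &Img2) {
  RegMap C;
  for (const auto &P : Img1) {
    auto F = Img2.find(P.second);
    if (F != Img2.end())
      C.insert({P.first, F->second});
  }
  return C;
}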
unsigned SrcBit = NextBit; LaneBitmask SrcMask = LaneBitmask::getLane(SrcBit); - if (NextBit < LaneBitmask::BitWidth-1) + if (NextBit < LaneBitmask::BitWidth - 1) ++NextBit; assert(Idx2.LaneMask == SrcMask); @@ -1586,8 +1597,8 @@ void CodeGenRegBank::computeSubRegLaneMasks() { // Create Mask+Rotate operation and merge with existing ops if possible. unsigned DstBit = Composite->LaneMask.getHighestLane(); int Shift = DstBit - SrcBit; - uint8_t RotateLeft = Shift >= 0 ? (uint8_t)Shift - : LaneBitmask::BitWidth + Shift; + uint8_t RotateLeft = + Shift >= 0 ? (uint8_t)Shift : LaneBitmask::BitWidth + Shift; for (auto &I : LaneTransforms) { if (I.RotateLeft == RotateLeft) { I.Mask |= SrcMask; @@ -1595,7 +1606,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() { } } if (SrcMask.any()) { - MaskRolPair MaskRol = { SrcMask, RotateLeft }; + MaskRolPair MaskRol = {SrcMask, RotateLeft}; LaneTransforms.push_back(MaskRol); } } @@ -1611,7 +1622,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() { // in a sequence with 0 entries we can just pick any other. Choose // Mask 0xffffffff with Rotation 0. if (LaneTransforms.size() == 0) { - MaskRolPair P = { LaneBitmask::getAll(), 0 }; + MaskRolPair P = {LaneBitmask::getAll(), 0}; LaneTransforms.push_back(P); } } @@ -1679,7 +1690,7 @@ struct UberRegSet { // // UberRegSets[0] is a special non-allocatable set. static void computeUberSets(std::vector &UberSets, - std::vector &RegSets, + std::vector &RegSets, CodeGenRegBank &RegBank) { const auto &Registers = RegBank.getRegisters(); @@ -1742,7 +1753,8 @@ static void computeUberWeights(std::vector &UberSets, CodeGenRegBank &RegBank) { // Skip the first unallocatable set. for (std::vector::iterator I = std::next(UberSets.begin()), - E = UberSets.end(); I != E; ++I) { + E = UberSets.end(); + I != E; ++I) { // Initialize all unit weights in this set, and remember the max units/reg. const CodeGenRegister *Reg = nullptr; @@ -1797,7 +1809,7 @@ static void computeUberWeights(std::vector &UberSets, // - induces recomputation of UberWeights. static bool normalizeWeight(CodeGenRegister *Reg, std::vector &UberSets, - std::vector &RegSets, + std::vector &RegSets, BitVector &NormalRegs, CodeGenRegister::RegUnitList &NormalUnits, CodeGenRegBank &RegBank) { @@ -1830,15 +1842,14 @@ static bool normalizeWeight(CodeGenRegister *Reg, // for this register, has not been used to normalize a subregister's set, // and has not already been used to singularly determine this UberRegSet. unsigned AdjustUnit = *Reg->getRegUnits().begin(); - if (Reg->getRegUnits().count() != 1 - || hasRegUnit(NormalUnits, AdjustUnit) - || hasRegUnit(UberSet->SingularDeterminants, AdjustUnit)) { + if (Reg->getRegUnits().count() != 1 || + hasRegUnit(NormalUnits, AdjustUnit) || + hasRegUnit(UberSet->SingularDeterminants, AdjustUnit)) { // We don't have an adjustable unit, so adopt a new one. AdjustUnit = RegBank.newRegUnit(UberSet->Weight - RegWeight); Reg->adoptRegUnit(AdjustUnit); // Adopting a unit does not immediately require recomputing set weights. - } - else { + } else { // Adjust the existing single unit. if (!RegBank.getRegUnit(AdjustUnit).Artificial) RegBank.increaseRegUnitWeight(AdjustUnit, UberSet->Weight - RegWeight); @@ -1860,7 +1871,7 @@ static bool normalizeWeight(CodeGenRegister *Reg, // where each register's weight is defined as sum of its units' weights. 
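In the loop above, a negative Shift is folded into an equivalent left rotation: rotating left by BitWidth + Shift is the same as rotating right by -Shift. A standalone sketch of applying one such mask-and-rotate step to a 32-bit lane mask, mirroring the MaskRolPair semantics documented in the header (plain uint32_t instead of LaneBitmask; hypothetical helper name):

#include <cstdint>

// Keep the bits selected by Mask, then rotate them left by Rol with
// wraparound at 32 bits.
uint32_t applyMaskRol(uint32_t LaneMask, uint32_t Mask, uint8_t Rol) {
  uint32_t Kept = LaneMask & Mask;
  Rol &= 31; // keep the rotate amount well-defined
  if (Rol == 0)
    return Kept;
  return (Kept << Rol) | (Kept >> (32 - Rol));
}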
void CodeGenRegBank::computeRegUnitWeights() { std::vector UberSets; - std::vector RegSets(Registers.size()); + std::vector RegSets(Registers.size()); computeUberSets(UberSets, RegSets, *this); // UberSets and RegSets are now immutable. @@ -1871,7 +1882,7 @@ void CodeGenRegBank::computeRegUnitWeights() { unsigned NumIters = 0; for (bool Changed = true; Changed; ++NumIters) { assert(NumIters <= NumNativeRegUnits && "Runaway register unit weights"); - (void) NumIters; + (void)NumIters; Changed = false; for (auto &Reg : Registers) { CodeGenRegister::RegUnitList NormalUnits; @@ -1887,9 +1898,9 @@ void CodeGenRegBank::computeRegUnitWeights() { static std::vector::const_iterator findRegUnitSet(const std::vector &UniqueSets, const RegUnitSet &Set) { - std::vector::const_iterator - I = UniqueSets.begin(), E = UniqueSets.end(); - for(;I != E; ++I) { + std::vector::const_iterator I = UniqueSets.begin(), + E = UniqueSets.end(); + for (; I != E; ++I) { if (I->Units == Set.Units) break; } @@ -1899,8 +1910,8 @@ findRegUnitSet(const std::vector &UniqueSets, // Return true if the RUSubSet is a subset of RUSuperSet. static bool isRegUnitSubSet(const std::vector &RUSubSet, const std::vector &RUSuperSet) { - return std::includes(RUSuperSet.begin(), RUSuperSet.end(), - RUSubSet.begin(), RUSubSet.end()); + return std::includes(RUSuperSet.begin(), RUSuperSet.end(), RUSubSet.begin(), + RUSubSet.end()); } /// Iteratively prune unit sets. Prune subsets that are close to the superset, @@ -1925,8 +1936,8 @@ void CodeGenRegBank::pruneUnitSets() { // Form an equivalence class of UnitSets with no significant difference. std::vector SuperSetIDs; - for (unsigned SubIdx = 0, EndIdx = RegUnitSets.size(); - SubIdx != EndIdx; ++SubIdx) { + for (unsigned SubIdx = 0, EndIdx = RegUnitSets.size(); SubIdx != EndIdx; + ++SubIdx) { const RegUnitSet &SubSet = RegUnitSets[SubIdx]; unsigned SuperIdx = 0; for (; SuperIdx != EndIdx; ++SuperIdx) { @@ -1935,10 +1946,10 @@ void CodeGenRegBank::pruneUnitSets() { unsigned UnitWeight = RegUnits[SubSet.Units[0]].Weight; const RegUnitSet &SuperSet = RegUnitSets[SuperIdx]; - if (isRegUnitSubSet(SubSet.Units, SuperSet.Units) - && (SubSet.Units.size() + 3 > SuperSet.Units.size()) - && UnitWeight == RegUnits[SuperSet.Units[0]].Weight - && UnitWeight == RegUnits[SuperSet.Units.back()].Weight) { + if (isRegUnitSubSet(SubSet.Units, SuperSet.Units) && + (SubSet.Units.size() + 3 > SuperSet.Units.size()) && + UnitWeight == RegUnits[SuperSet.Units[0]].Weight && + UnitWeight == RegUnits[SuperSet.Units.back()].Weight) { LLVM_DEBUG(dbgs() << "UnitSet " << SubIdx << " subsumed by " << SuperIdx << "\n"); // We can pick any of the set names for the merged set. Go for the @@ -1988,7 +1999,7 @@ void CodeGenRegBank::computeRegUnitSets() { // Find an existing RegUnitSet. std::vector::const_iterator SetI = - findRegUnitSet(RegUnitSets, RegUnitSets.back()); + findRegUnitSet(RegUnitSets, RegUnitSets.back()); if (SetI != std::prev(RegUnitSets.end())) RegUnitSets.pop_back(); } @@ -2023,10 +2034,10 @@ void CodeGenRegBank::computeRegUnitSets() { // In theory, this is combinatorial. In practice, it needs to be bounded // by a small number of sets for regpressure to be efficient. // If the assert is hit, we need to implement pruning. - assert(Idx < (2*NumRegUnitSubSets) && "runaway unit set inference"); + assert(Idx < (2 * NumRegUnitSubSets) && "runaway unit set inference"); // Compare new sets with all original classes. - for (unsigned SearchIdx = (Idx >= NumRegUnitSubSets) ? 
0 : Idx+1; + for (unsigned SearchIdx = (Idx >= NumRegUnitSubSets) ? 0 : Idx + 1; SearchIdx != EndIdx; ++SearchIdx) { std::set Intersection; std::set_intersection(RegUnitSets[Idx].Units.begin(), @@ -2040,7 +2051,7 @@ void CodeGenRegBank::computeRegUnitSets() { // Speculatively grow the RegUnitSets to hold the new set. RegUnitSets.resize(RegUnitSets.size() + 1); RegUnitSets.back().Name = - RegUnitSets[Idx].Name + "_with_" + RegUnitSets[SearchIdx].Name; + RegUnitSets[Idx].Name + "_with_" + RegUnitSets[SearchIdx].Name; std::set_union(RegUnitSets[Idx].Units.begin(), RegUnitSets[Idx].Units.end(), @@ -2051,7 +2062,7 @@ void CodeGenRegBank::computeRegUnitSets() { // Find an existing RegUnitSet, or add the union to the unique sets. std::vector::const_iterator SetI = - findRegUnitSet(RegUnitSets, RegUnitSets.back()); + findRegUnitSet(RegUnitSets, RegUnitSets.back()); if (SetI != std::prev(RegUnitSets.end())) RegUnitSets.pop_back(); else { @@ -2098,8 +2109,8 @@ void CodeGenRegBank::computeRegUnitSets() { dbgs() << "\n UnitSetIDs:"); // Find all supersets. - for (unsigned USIdx = 0, USEnd = RegUnitSets.size(); - USIdx != USEnd; ++USIdx) { + for (unsigned USIdx = 0, USEnd = RegUnitSets.size(); USIdx != USEnd; + ++USIdx) { if (isRegUnitSubSet(RCRegUnits, RegUnitSets[USIdx].Units)) { LLVM_DEBUG(dbgs() << " " << USIdx); RegClassUnitSets[RCIdx].push_back(USIdx); @@ -2114,8 +2125,8 @@ void CodeGenRegBank::computeRegUnitSets() { // contain the unit. Normally, this matches an existing list of UnitSets for a // register class. If not, we create a new entry in RegClassUnitSets as a // "fake" register class. - for (unsigned UnitIdx = 0, UnitEnd = NumNativeRegUnits; - UnitIdx < UnitEnd; ++UnitIdx) { + for (unsigned UnitIdx = 0, UnitEnd = NumNativeRegUnits; UnitIdx < UnitEnd; + ++UnitIdx) { std::vector RUSets; for (unsigned i = 0, e = RegUnitSets.size(); i != e; ++i) { RegUnitSet &RUSet = RegUnitSets[i]; @@ -2124,8 +2135,8 @@ void CodeGenRegBank::computeRegUnitSets() { RUSets.push_back(i); } unsigned RCUnitSetsIdx = 0; - for (unsigned e = RegClassUnitSets.size(); - RCUnitSetsIdx != e; ++RCUnitSetsIdx) { + for (unsigned e = RegClassUnitSets.size(); RCUnitSetsIdx != e; + ++RCUnitSetsIdx) { if (RegClassUnitSets[RCUnitSetsIdx] == RUSets) { break; } @@ -2301,9 +2312,8 @@ void CodeGenRegBank::inferSubClassWithSubReg(CodeGenRegisterClass *RC) { continue; } // This is a real subset. See if we have a matching class. - CodeGenRegisterClass *SubRC = - getOrCreateSubClass(RC, &I->second, - RC->getName() + "_with_" + I->first->getName()); + CodeGenRegisterClass *SubRC = getOrCreateSubClass( + RC, &I->second, RC->getName() + "_with_" + I->first->getName()); RC->setSubClassWithSubReg(&SubIdx, SubRC); } } @@ -2315,8 +2325,9 @@ void CodeGenRegBank::inferSubClassWithSubReg(CodeGenRegisterClass *RC) { // has a maximal result for any SubIdx and any X >= FirstSubRegRC. // -void CodeGenRegBank::inferMatchingSuperRegClass(CodeGenRegisterClass *RC, - std::list::iterator FirstSubRegRC) { +void CodeGenRegBank::inferMatchingSuperRegClass( + CodeGenRegisterClass *RC, + std::list::iterator FirstSubRegRC) { DenseMap> SubToSuperRegs; BitVector TopoSigs(getNumTopoSigs()); @@ -2374,9 +2385,9 @@ void CodeGenRegBank::inferMatchingSuperRegClass(CodeGenRegisterClass *RC, // Only a subset of RC maps into SubRC. Make sure it is represented by a // class. 
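The sorted-range calls in these hunks (std::includes in isRegUnitSubSet, std::set_intersection and std::set_union in computeRegUnitSets) are only correct because every RegUnitSet keeps its Units vector sorted and duplicate-free; on unsorted input their results are meaningless. A small self-contained illustration with plain unsigned vectors:

#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

int main() {
  std::vector<unsigned> Super = {1, 3, 5, 7, 9}; // sorted, unique
  std::vector<unsigned> Sub = {3, 7};            // sorted, unique
  assert(std::includes(Super.begin(), Super.end(), Sub.begin(), Sub.end()));

  std::vector<unsigned> Union;
  std::set_union(Super.begin(), Super.end(), Sub.begin(), Sub.end(),
                 std::back_inserter(Union));
  assert(Union == Super); // Sub contributes nothing new
  return 0;
}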
- getOrCreateSubClass(RC, &SubSetVec, RC->getName() + "_with_" + - SubIdx.getName() + "_in_" + - SubRC.getName()); + getOrCreateSubClass(RC, &SubSetVec, + RC->getName() + "_with_" + SubIdx.getName() + "_in_" + + SubRC.getName()); } } } @@ -2431,8 +2442,7 @@ void CodeGenRegBank::computeInferredRegisterClasses() { /// return null. If the register is in multiple classes, and the classes have a /// superset-subset relationship and the same set of types, return the /// superclass. Otherwise return null. -const CodeGenRegisterClass* -CodeGenRegBank::getRegClassForRegister(Record *R) { +const CodeGenRegisterClass *CodeGenRegBank::getRegClassForRegister(Record *R) { const CodeGenRegister *Reg = getReg(R); const CodeGenRegisterClass *FoundRC = nullptr; for (const auto &RC : getRegClasses()) { @@ -2477,8 +2487,8 @@ CodeGenRegBank::getMinimalPhysRegClass(Record *RegRecord, const CodeGenRegister *Reg = getReg(RegRecord); const CodeGenRegisterClass *BestRC = nullptr; for (const auto &RC : getRegClasses()) { - if ((!VT || RC.hasType(*VT)) && - RC.contains(Reg) && (!BestRC || BestRC->hasSubClass(&RC))) + if ((!VT || RC.hasType(*VT)) && RC.contains(Reg) && + (!BestRC || BestRC->hasSubClass(&RC))) BestRC = &RC; } @@ -2486,8 +2496,8 @@ CodeGenRegBank::getMinimalPhysRegClass(Record *RegRecord, return BestRC; } -BitVector CodeGenRegBank::computeCoveredRegisters(ArrayRef Regs) { - SetVector Set; +BitVector CodeGenRegBank::computeCoveredRegisters(ArrayRef Regs) { + SetVector Set; // First add Regs with all sub-registers. for (unsigned i = 0, e = Regs.size(); i != e; ++i) { diff --git a/llvm/utils/TableGen/CodeGenRegisters.h b/llvm/utils/TableGen/CodeGenRegisters.h index 97f6081..cfc6d87 100644 --- a/llvm/utils/TableGen/CodeGenRegisters.h +++ b/llvm/utils/TableGen/CodeGenRegisters.h @@ -44,810 +44,798 @@ namespace llvm { - class CodeGenRegBank; +class CodeGenRegBank; - /// Used to encode a step in a register lane mask transformation. - /// Mask the bits specified in Mask, then rotate them Rol bits to the left - /// assuming a wraparound at 32bits. - struct MaskRolPair { - LaneBitmask Mask; - uint8_t RotateLeft; +/// Used to encode a step in a register lane mask transformation. +/// Mask the bits specified in Mask, then rotate them Rol bits to the left +/// assuming a wraparound at 32bits. +struct MaskRolPair { + LaneBitmask Mask; + uint8_t RotateLeft; - bool operator==(const MaskRolPair Other) const { - return Mask == Other.Mask && RotateLeft == Other.RotateLeft; - } - bool operator!=(const MaskRolPair Other) const { - return Mask != Other.Mask || RotateLeft != Other.RotateLeft; - } - }; - - /// CodeGenSubRegIndex - Represents a sub-register index. - class CodeGenSubRegIndex { - Record *const TheDef; - std::string Name; - std::string Namespace; - - public: - uint16_t Size; - uint16_t Offset; - const unsigned EnumValue; - mutable LaneBitmask LaneMask; - mutable SmallVector CompositionLaneMaskTransform; - - /// A list of subregister indexes concatenated resulting in this - /// subregister index. This is the reverse of CodeGenRegBank::ConcatIdx. - SmallVector ConcatenationOf; - - // Are all super-registers containing this SubRegIndex covered by their - // sub-registers? - bool AllSuperRegsCovered; - // A subregister index is "artificial" if every subregister obtained - // from applying this index is artificial. Artificial subregister - // indexes are not used to create new register classes. 
- bool Artificial; - - CodeGenSubRegIndex(Record *R, unsigned Enum); - CodeGenSubRegIndex(StringRef N, StringRef Nspace, unsigned Enum); - CodeGenSubRegIndex(CodeGenSubRegIndex&) = delete; - - const std::string &getName() const { return Name; } - const std::string &getNamespace() const { return Namespace; } - std::string getQualifiedName() const; - - // Map of composite subreg indices. - typedef std::map>> - CompMap; - - // Returns the subreg index that results from composing this with Idx. - // Returns NULL if this and Idx don't compose. - CodeGenSubRegIndex *compose(CodeGenSubRegIndex *Idx) const { - CompMap::const_iterator I = Composed.find(Idx); - return I == Composed.end() ? nullptr : I->second; - } + bool operator==(const MaskRolPair Other) const { + return Mask == Other.Mask && RotateLeft == Other.RotateLeft; + } + bool operator!=(const MaskRolPair Other) const { + return Mask != Other.Mask || RotateLeft != Other.RotateLeft; + } +}; + +/// CodeGenSubRegIndex - Represents a sub-register index. +class CodeGenSubRegIndex { + Record *const TheDef; + std::string Name; + std::string Namespace; + +public: + uint16_t Size; + uint16_t Offset; + const unsigned EnumValue; + mutable LaneBitmask LaneMask; + mutable SmallVector CompositionLaneMaskTransform; + + /// A list of subregister indexes concatenated resulting in this + /// subregister index. This is the reverse of CodeGenRegBank::ConcatIdx. + SmallVector ConcatenationOf; + + // Are all super-registers containing this SubRegIndex covered by their + // sub-registers? + bool AllSuperRegsCovered; + // A subregister index is "artificial" if every subregister obtained + // from applying this index is artificial. Artificial subregister + // indexes are not used to create new register classes. + bool Artificial; + + CodeGenSubRegIndex(Record *R, unsigned Enum); + CodeGenSubRegIndex(StringRef N, StringRef Nspace, unsigned Enum); + CodeGenSubRegIndex(CodeGenSubRegIndex &) = delete; + + const std::string &getName() const { return Name; } + const std::string &getNamespace() const { return Namespace; } + std::string getQualifiedName() const; + + // Map of composite subreg indices. + typedef std::map>> + CompMap; + + // Returns the subreg index that results from composing this with Idx. + // Returns NULL if this and Idx don't compose. + CodeGenSubRegIndex *compose(CodeGenSubRegIndex *Idx) const { + CompMap::const_iterator I = Composed.find(Idx); + return I == Composed.end() ? nullptr : I->second; + } - // Add a composite subreg index: this+A = B. - // Return a conflicting composite, or NULL - CodeGenSubRegIndex *addComposite(CodeGenSubRegIndex *A, - CodeGenSubRegIndex *B) { - assert(A && B); - std::pair Ins = + // Add a composite subreg index: this+A = B. + // Return a conflicting composite, or NULL + CodeGenSubRegIndex *addComposite(CodeGenSubRegIndex *A, + CodeGenSubRegIndex *B) { + assert(A && B); + std::pair Ins = Composed.insert(std::make_pair(A, B)); - // Synthetic subreg indices that aren't contiguous (for instance ARM - // register tuples) don't have a bit range, so it's OK to let - // B->Offset == -1. For the other cases, accumulate the offset and set - // the size here. Only do so if there is no offset yet though. - if ((Offset != (uint16_t)-1 && A->Offset != (uint16_t)-1) && - (B->Offset == (uint16_t)-1)) { - B->Offset = Offset + A->Offset; - B->Size = A->Size; - } - return (Ins.second || Ins.first->second == B) ? nullptr - : Ins.first->second; - } - - // Update the composite maps of components specified in 'ComposedOf'. 
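addComposite above records the fact this+A = B with a single map insert and reports a conflict only if (this, A) was already mapped to a different B; re-inserting the same composition is benign. That insert-then-check shape in isolation (ints standing in for sub-register indices, hypothetical names):

#include <map>

using CompMap = std::map<int, int>; // A -> B for a fixed 'this'

// Returns 0 on success or benign repeat, else the conflicting old B.
int addCompositeToy(CompMap &Composed, int A, int B) {
  auto Ins = Composed.insert({A, B});
  return (Ins.second || Ins.first->second == B) ? 0 : Ins.first->second;
}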
- void updateComponents(CodeGenRegBank&); - - // Return the map of composites. - const CompMap &getComposites() const { return Composed; } - - // Compute LaneMask from Composed. Return LaneMask. - LaneBitmask computeLaneMask() const; - - void setConcatenationOf(ArrayRef Parts); - - /// Replaces subregister indexes in the `ConcatenationOf` list with - /// list of subregisters they are composed of (if any). Do this recursively. - void computeConcatTransitiveClosure(); - - bool operator<(const CodeGenSubRegIndex &RHS) const { - return this->EnumValue < RHS.EnumValue; - } - - private: - CompMap Composed; - }; - - /// CodeGenRegister - Represents a register definition. - class CodeGenRegister { - public: - Record *TheDef; - unsigned EnumValue; - std::vector CostPerUse; - bool CoveredBySubRegs = true; - bool HasDisjunctSubRegs = false; - bool Artificial = true; - bool Constant = false; + // Synthetic subreg indices that aren't contiguous (for instance ARM + // register tuples) don't have a bit range, so it's OK to let + // B->Offset == -1. For the other cases, accumulate the offset and set + // the size here. Only do so if there is no offset yet though. + if ((Offset != (uint16_t)-1 && A->Offset != (uint16_t)-1) && + (B->Offset == (uint16_t)-1)) { + B->Offset = Offset + A->Offset; + B->Size = A->Size; + } + return (Ins.second || Ins.first->second == B) ? nullptr : Ins.first->second; + } - // Map SubRegIndex -> Register. - typedef std::map>> - SubRegMap; + // Update the composite maps of components specified in 'ComposedOf'. + void updateComponents(CodeGenRegBank &); - CodeGenRegister(Record *R, unsigned Enum); + // Return the map of composites. + const CompMap &getComposites() const { return Composed; } - StringRef getName() const; + // Compute LaneMask from Composed. Return LaneMask. + LaneBitmask computeLaneMask() const; - // Extract more information from TheDef. This is used to build an object - // graph after all CodeGenRegister objects have been created. - void buildObjectGraph(CodeGenRegBank&); + void setConcatenationOf(ArrayRef Parts); - // Lazily compute a map of all sub-registers. - // This includes unique entries for all sub-sub-registers. - const SubRegMap &computeSubRegs(CodeGenRegBank&); + /// Replaces subregister indexes in the `ConcatenationOf` list with + /// list of subregisters they are composed of (if any). Do this recursively. + void computeConcatTransitiveClosure(); - // Compute extra sub-registers by combining the existing sub-registers. - void computeSecondarySubRegs(CodeGenRegBank&); + bool operator<(const CodeGenSubRegIndex &RHS) const { + return this->EnumValue < RHS.EnumValue; + } - // Add this as a super-register to all sub-registers after the sub-register - // graph has been built. - void computeSuperRegs(CodeGenRegBank&); +private: + CompMap Composed; +}; - const SubRegMap &getSubRegs() const { - assert(SubRegsComplete && "Must precompute sub-registers"); - return SubRegs; - } +/// CodeGenRegister - Represents a register definition. +class CodeGenRegister { +public: + Record *TheDef; + unsigned EnumValue; + std::vector CostPerUse; + bool CoveredBySubRegs = true; + bool HasDisjunctSubRegs = false; + bool Artificial = true; + bool Constant = false; - // Add sub-registers to OSet following a pre-order defined by the .td file. - void addSubRegsPreOrder(SetVector &OSet, - CodeGenRegBank&) const; + // Map SubRegIndex -> Register. + typedef std::map>> + SubRegMap; - // Return the sub-register index naming Reg as a sub-register of this - // register. 
Returns NULL if Reg is not a sub-register. - CodeGenSubRegIndex *getSubRegIndex(const CodeGenRegister *Reg) const { - return SubReg2Idx.lookup(Reg); - } + CodeGenRegister(Record *R, unsigned Enum); - typedef std::vector SuperRegList; + StringRef getName() const; - // Get the list of super-registers in topological order, small to large. - // This is valid after computeSubRegs visits all registers during RegBank - // construction. - const SuperRegList &getSuperRegs() const { - assert(SubRegsComplete && "Must precompute sub-registers"); - return SuperRegs; - } + // Extract more information from TheDef. This is used to build an object + // graph after all CodeGenRegister objects have been created. + void buildObjectGraph(CodeGenRegBank &); - // Get the list of ad hoc aliases. The graph is symmetric, so the list - // contains all registers in 'Aliases', and all registers that mention this - // register in 'Aliases'. - ArrayRef getExplicitAliases() const { - return ExplicitAliases; - } + // Lazily compute a map of all sub-registers. + // This includes unique entries for all sub-sub-registers. + const SubRegMap &computeSubRegs(CodeGenRegBank &); - // Get the topological signature of this register. This is a small integer - // less than RegBank.getNumTopoSigs(). Registers with the same TopoSig have - // identical sub-register structure. That is, they support the same set of - // sub-register indices mapping to the same kind of sub-registers - // (TopoSig-wise). - unsigned getTopoSig() const { - assert(SuperRegsComplete && "TopoSigs haven't been computed yet."); - return TopoSig; - } + // Compute extra sub-registers by combining the existing sub-registers. + void computeSecondarySubRegs(CodeGenRegBank &); - // List of register units in ascending order. - typedef SparseBitVector<> RegUnitList; - typedef SmallVector RegUnitLaneMaskList; + // Add this as a super-register to all sub-registers after the sub-register + // graph has been built. + void computeSuperRegs(CodeGenRegBank &); - // How many entries in RegUnitList are native? - RegUnitList NativeRegUnits; + const SubRegMap &getSubRegs() const { + assert(SubRegsComplete && "Must precompute sub-registers"); + return SubRegs; + } - // Get the list of register units. - // This is only valid after computeSubRegs() completes. - const RegUnitList &getRegUnits() const { return RegUnits; } + // Add sub-registers to OSet following a pre-order defined by the .td file. + void addSubRegsPreOrder(SetVector &OSet, + CodeGenRegBank &) const; - ArrayRef getRegUnitLaneMasks() const { - return ArrayRef(RegUnitLaneMasks).slice(0, NativeRegUnits.count()); - } + // Return the sub-register index naming Reg as a sub-register of this + // register. Returns NULL if Reg is not a sub-register. + CodeGenSubRegIndex *getSubRegIndex(const CodeGenRegister *Reg) const { + return SubReg2Idx.lookup(Reg); + } - // Get the native register units. This is a prefix of getRegUnits(). - RegUnitList getNativeRegUnits() const { - return NativeRegUnits; - } + typedef std::vector SuperRegList; - void setRegUnitLaneMasks(const RegUnitLaneMaskList &LaneMasks) { - RegUnitLaneMasks = LaneMasks; - } + // Get the list of super-registers in topological order, small to large. + // This is valid after computeSubRegs visits all registers during RegBank + // construction. + const SuperRegList &getSuperRegs() const { + assert(SubRegsComplete && "Must precompute sub-registers"); + return SuperRegs; + } - // Inherit register units from subregisters. - // Return true if the RegUnits changed. 
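getTopoSig above promises a small dense integer per unique sub-register shape; CodeGenRegBank::getTopoSig (later in this header) allocates those IDs with a single map insert whose tentative value is the map's current size, so a repeated signature gets back its existing number. The idiom in isolation (std::map over a byte vector standing in for TopoSigId):

#include <cstdint>
#include <map>
#include <vector>

using SigId = std::vector<uint8_t>; // stand-in for TopoSigId

// Dense ID per unique signature: insert() returns the existing entry
// when Id is already present, so the size-based ID only sticks for
// first-time signatures.
unsigned getOrAssignTopoSig(std::map<SigId, unsigned> &Sigs, const SigId &Id) {
  return Sigs.insert({Id, (unsigned)Sigs.size()}).first->second;
}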
- bool inheritRegUnits(CodeGenRegBank &RegBank); + // Get the list of ad hoc aliases. The graph is symmetric, so the list + // contains all registers in 'Aliases', and all registers that mention this + // register in 'Aliases'. + ArrayRef getExplicitAliases() const { + return ExplicitAliases; + } - // Adopt a register unit for pressure tracking. - // A unit is adopted iff its unit number is >= NativeRegUnits.count(). - void adoptRegUnit(unsigned RUID) { RegUnits.set(RUID); } + // Get the topological signature of this register. This is a small integer + // less than RegBank.getNumTopoSigs(). Registers with the same TopoSig have + // identical sub-register structure. That is, they support the same set of + // sub-register indices mapping to the same kind of sub-registers + // (TopoSig-wise). + unsigned getTopoSig() const { + assert(SuperRegsComplete && "TopoSigs haven't been computed yet."); + return TopoSig; + } - // Get the sum of this register's register unit weights. - unsigned getWeight(const CodeGenRegBank &RegBank) const; + // List of register units in ascending order. + typedef SparseBitVector<> RegUnitList; + typedef SmallVector RegUnitLaneMaskList; - // Canonically ordered set. - typedef std::vector Vec; + // How many entries in RegUnitList are native? + RegUnitList NativeRegUnits; - private: - bool SubRegsComplete; - bool SuperRegsComplete; - unsigned TopoSig; + // Get the list of register units. + // This is only valid after computeSubRegs() completes. + const RegUnitList &getRegUnits() const { return RegUnits; } - // The sub-registers explicit in the .td file form a tree. - SmallVector ExplicitSubRegIndices; - SmallVector ExplicitSubRegs; + ArrayRef getRegUnitLaneMasks() const { + return ArrayRef(RegUnitLaneMasks).slice(0, NativeRegUnits.count()); + } - // Explicit ad hoc aliases, symmetrized to form an undirected graph. - SmallVector ExplicitAliases; + // Get the native register units. This is a prefix of getRegUnits(). + RegUnitList getNativeRegUnits() const { return NativeRegUnits; } - // Super-registers where this is the first explicit sub-register. - SuperRegList LeadingSuperRegs; + void setRegUnitLaneMasks(const RegUnitLaneMaskList &LaneMasks) { + RegUnitLaneMasks = LaneMasks; + } - SubRegMap SubRegs; - SuperRegList SuperRegs; - DenseMap SubReg2Idx; - RegUnitList RegUnits; - RegUnitLaneMaskList RegUnitLaneMasks; - }; + // Inherit register units from subregisters. + // Return true if the RegUnits changed. + bool inheritRegUnits(CodeGenRegBank &RegBank); + + // Adopt a register unit for pressure tracking. + // A unit is adopted iff its unit number is >= NativeRegUnits.count(). + void adoptRegUnit(unsigned RUID) { RegUnits.set(RUID); } + + // Get the sum of this register's register unit weights. + unsigned getWeight(const CodeGenRegBank &RegBank) const; + + // Canonically ordered set. + typedef std::vector Vec; + +private: + bool SubRegsComplete; + bool SuperRegsComplete; + unsigned TopoSig; + + // The sub-registers explicit in the .td file form a tree. + SmallVector ExplicitSubRegIndices; + SmallVector ExplicitSubRegs; + + // Explicit ad hoc aliases, symmetrized to form an undirected graph. + SmallVector ExplicitAliases; + + // Super-registers where this is the first explicit sub-register. 
+ SuperRegList LeadingSuperRegs; + + SubRegMap SubRegs; + SuperRegList SuperRegs; + DenseMap SubReg2Idx; + RegUnitList RegUnits; + RegUnitLaneMaskList RegUnitLaneMasks; +}; + +inline bool operator<(const CodeGenRegister &A, const CodeGenRegister &B) { + return A.EnumValue < B.EnumValue; +} + +inline bool operator==(const CodeGenRegister &A, const CodeGenRegister &B) { + return A.EnumValue == B.EnumValue; +} + +class CodeGenRegisterClass { + CodeGenRegister::Vec Members; + // Allocation orders. Order[0] always contains all registers in Members. + std::vector> Orders; + // Bit mask of sub-classes including this, indexed by their EnumValue. + BitVector SubClasses; + // List of super-classes, topologocally ordered to have the larger classes + // first. This is the same as sorting by EnumValue. + SmallVector SuperClasses; + Record *TheDef; + std::string Name; + + // For a synthesized class, inherit missing properties from the nearest + // super-class. + void inheritProperties(CodeGenRegBank &); + + // Map SubRegIndex -> sub-class. This is the largest sub-class where all + // registers have a SubRegIndex sub-register. + DenseMap + SubClassWithSubReg; + + // Map SubRegIndex -> set of super-reg classes. This is all register + // classes SuperRC such that: + // + // R:SubRegIndex in this RC for all R in SuperRC. + // + DenseMap> + SuperRegClasses; + + // Bit vector of TopoSigs for the registers in this class. This will be + // very sparse on regular architectures. + BitVector TopoSigs; + +public: + unsigned EnumValue; + StringRef Namespace; + SmallVector VTs; + RegSizeInfoByHwMode RSI; + int CopyCost; + bool Allocatable; + StringRef AltOrderSelect; + uint8_t AllocationPriority; + bool GlobalPriority; + uint8_t TSFlags; + /// Contains the combination of the lane masks of all subregisters. + LaneBitmask LaneMask; + /// True if there are at least 2 subregisters which do not interfere. + bool HasDisjunctSubRegs; + bool CoveredBySubRegs; + /// A register class is artificial if all its members are artificial. + bool Artificial; + /// Generate register pressure set for this register class and any class + /// synthesized from it. + bool GeneratePressureSet; + + // Return the Record that defined this class, or NULL if the class was + // created by TableGen. + Record *getDef() const { return TheDef; } + + std::string getNamespaceQualification() const; + const std::string &getName() const { return Name; } + std::string getQualifiedName() const; + std::string getIdName() const; + std::string getQualifiedIdName() const; + ArrayRef getValueTypes() const { return VTs; } + unsigned getNumValueTypes() const { return VTs.size(); } + bool hasType(const ValueTypeByHwMode &VT) const; + + const ValueTypeByHwMode &getValueTypeNum(unsigned VTNum) const { + if (VTNum < VTs.size()) + return VTs[VTNum]; + llvm_unreachable("VTNum greater than number of ValueTypes in RegClass!"); + } - inline bool operator<(const CodeGenRegister &A, const CodeGenRegister &B) { - return A.EnumValue < B.EnumValue; - } - - inline bool operator==(const CodeGenRegister &A, const CodeGenRegister &B) { - return A.EnumValue == B.EnumValue; - } - - class CodeGenRegisterClass { - CodeGenRegister::Vec Members; - // Allocation orders. Order[0] always contains all registers in Members. - std::vector> Orders; - // Bit mask of sub-classes including this, indexed by their EnumValue. - BitVector SubClasses; - // List of super-classes, topologocally ordered to have the larger classes - // first. This is the same as sorting by EnumValue. 
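The operator< and operator== overloads in this hunk order registers purely by EnumValue, which is what keeps CodeGenRegister::Vec a canonically sorted vector and makes member-set comparisons cheap. CodeGenRegisterClass::Key (reformatted earlier in the .cpp) extends the same idea to a composite key via std::tie. The lexicographic pattern in a standalone sketch (hypothetical struct, not the real Key):

#include <tuple>

struct DemoKey {
  int Members; // stand-in for the member-set pointer
  int Size;    // stand-in for RegSizeInfoByHwMode
  // Lexicographic: compare Members first, fall back to Size on ties.
  bool operator<(const DemoKey &B) const {
    return std::tie(Members, Size) < std::tie(B.Members, B.Size);
  }
};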
- SmallVector SuperClasses; - Record *TheDef; - std::string Name; - - // For a synthesized class, inherit missing properties from the nearest - // super-class. - void inheritProperties(CodeGenRegBank&); - - // Map SubRegIndex -> sub-class. This is the largest sub-class where all - // registers have a SubRegIndex sub-register. - DenseMap - SubClassWithSubReg; - - // Map SubRegIndex -> set of super-reg classes. This is all register - // classes SuperRC such that: - // - // R:SubRegIndex in this RC for all R in SuperRC. - // - DenseMap> - SuperRegClasses; - - // Bit vector of TopoSigs for the registers in this class. This will be - // very sparse on regular architectures. - BitVector TopoSigs; - - public: - unsigned EnumValue; - StringRef Namespace; - SmallVector VTs; - RegSizeInfoByHwMode RSI; - int CopyCost; - bool Allocatable; - StringRef AltOrderSelect; - uint8_t AllocationPriority; - bool GlobalPriority; - uint8_t TSFlags; - /// Contains the combination of the lane masks of all subregisters. - LaneBitmask LaneMask; - /// True if there are at least 2 subregisters which do not interfere. - bool HasDisjunctSubRegs; - bool CoveredBySubRegs; - /// A register class is artificial if all its members are artificial. - bool Artificial; - /// Generate register pressure set for this register class and any class - /// synthesized from it. - bool GeneratePressureSet; - - // Return the Record that defined this class, or NULL if the class was - // created by TableGen. - Record *getDef() const { return TheDef; } - - std::string getNamespaceQualification() const; - const std::string &getName() const { return Name; } - std::string getQualifiedName() const; - std::string getIdName() const; - std::string getQualifiedIdName() const; - ArrayRef getValueTypes() const { return VTs; } - unsigned getNumValueTypes() const { return VTs.size(); } - bool hasType(const ValueTypeByHwMode &VT) const; - - const ValueTypeByHwMode &getValueTypeNum(unsigned VTNum) const { - if (VTNum < VTs.size()) - return VTs[VTNum]; - llvm_unreachable("VTNum greater than number of ValueTypes in RegClass!"); - } + // Return true if this class contains the register. + bool contains(const CodeGenRegister *) const; - // Return true if this class contains the register. - bool contains(const CodeGenRegister*) const; - - // Returns true if RC is a subclass. - // RC is a sub-class of this class if it is a valid replacement for any - // instruction operand where a register of this classis required. It must - // satisfy these conditions: - // - // 1. All RC registers are also in this. - // 2. The RC spill size must not be smaller than our spill size. - // 3. RC spill alignment must be compatible with ours. - // - bool hasSubClass(const CodeGenRegisterClass *RC) const { - return SubClasses.test(RC->EnumValue); - } + // Returns true if RC is a subclass. + // RC is a sub-class of this class if it is a valid replacement for any + // instruction operand where a register of this classis required. It must + // satisfy these conditions: + // + // 1. All RC registers are also in this. + // 2. The RC spill size must not be smaller than our spill size. + // 3. RC spill alignment must be compatible with ours. + // + bool hasSubClass(const CodeGenRegisterClass *RC) const { + return SubClasses.test(RC->EnumValue); + } - // getSubClassWithSubReg - Returns the largest sub-class where all - // registers have a SubIdx sub-register. 
- CodeGenRegisterClass * - getSubClassWithSubReg(const CodeGenSubRegIndex *SubIdx) const { - return SubClassWithSubReg.lookup(SubIdx); - } + // getSubClassWithSubReg - Returns the largest sub-class where all + // registers have a SubIdx sub-register. + CodeGenRegisterClass * + getSubClassWithSubReg(const CodeGenSubRegIndex *SubIdx) const { + return SubClassWithSubReg.lookup(SubIdx); + } - /// Find largest subclass where all registers have SubIdx subregisters in - /// SubRegClass and the largest subregister class that contains those - /// subregisters without (as far as possible) also containing additional registers. - /// - /// This can be used to find a suitable pair of classes for subregister copies. - /// \return std::pair where SubClass is a SubClass is - /// a class where every register has SubIdx and SubRegClass is a class where - /// every register is covered by the SubIdx subregister of SubClass. - std::optional> - getMatchingSubClassWithSubRegs(CodeGenRegBank &RegBank, - const CodeGenSubRegIndex *SubIdx) const; - - void setSubClassWithSubReg(const CodeGenSubRegIndex *SubIdx, - CodeGenRegisterClass *SubRC) { - SubClassWithSubReg[SubIdx] = SubRC; - } + /// Find largest subclass where all registers have SubIdx subregisters in + /// SubRegClass and the largest subregister class that contains those + /// subregisters without (as far as possible) also containing additional + /// registers. + /// + /// This can be used to find a suitable pair of classes for subregister + /// copies. \return std::pair where SubClass is a + /// SubClass is a class where every register has SubIdx and SubRegClass is a + /// class where every register is covered by the SubIdx subregister of + /// SubClass. + std::optional> + getMatchingSubClassWithSubRegs(CodeGenRegBank &RegBank, + const CodeGenSubRegIndex *SubIdx) const; + + void setSubClassWithSubReg(const CodeGenSubRegIndex *SubIdx, + CodeGenRegisterClass *SubRC) { + SubClassWithSubReg[SubIdx] = SubRC; + } - // getSuperRegClasses - Returns a bit vector of all register classes - // containing only SubIdx super-registers of this class. - void getSuperRegClasses(const CodeGenSubRegIndex *SubIdx, - BitVector &Out) const; + // getSuperRegClasses - Returns a bit vector of all register classes + // containing only SubIdx super-registers of this class. + void getSuperRegClasses(const CodeGenSubRegIndex *SubIdx, + BitVector &Out) const; - // addSuperRegClass - Add a class containing only SubIdx super-registers. - void addSuperRegClass(CodeGenSubRegIndex *SubIdx, - CodeGenRegisterClass *SuperRC) { - SuperRegClasses[SubIdx].insert(SuperRC); - } + // addSuperRegClass - Add a class containing only SubIdx super-registers. + void addSuperRegClass(CodeGenSubRegIndex *SubIdx, + CodeGenRegisterClass *SuperRC) { + SuperRegClasses[SubIdx].insert(SuperRC); + } - // getSubClasses - Returns a constant BitVector of subclasses indexed by - // EnumValue. - // The SubClasses vector includes an entry for this class. - const BitVector &getSubClasses() const { return SubClasses; } + // getSubClasses - Returns a constant BitVector of subclasses indexed by + // EnumValue. + // The SubClasses vector includes an entry for this class. + const BitVector &getSubClasses() const { return SubClasses; } - // getSuperClasses - Returns a list of super classes ordered by EnumValue. - // The array does not include an entry for this class. - ArrayRef getSuperClasses() const { - return SuperClasses; - } + // getSuperClasses - Returns a list of super classes ordered by EnumValue. 
+ // The array does not include an entry for this class. + ArrayRef getSuperClasses() const { + return SuperClasses; + } - // Returns an ordered list of class members. - // The order of registers is the same as in the .td file. - // No = 0 is the default allocation order, No = 1 is the first alternative. - ArrayRef getOrder(unsigned No = 0) const { - return Orders[No]; - } + // Returns an ordered list of class members. + // The order of registers is the same as in the .td file. + // No = 0 is the default allocation order, No = 1 is the first alternative. + ArrayRef getOrder(unsigned No = 0) const { return Orders[No]; } - // Return the total number of allocation orders available. - unsigned getNumOrders() const { return Orders.size(); } + // Return the total number of allocation orders available. + unsigned getNumOrders() const { return Orders.size(); } - // Get the set of registers. This set contains the same registers as - // getOrder(0). - const CodeGenRegister::Vec &getMembers() const { return Members; } + // Get the set of registers. This set contains the same registers as + // getOrder(0). + const CodeGenRegister::Vec &getMembers() const { return Members; } - // Get a bit vector of TopoSigs present in this register class. - const BitVector &getTopoSigs() const { return TopoSigs; } + // Get a bit vector of TopoSigs present in this register class. + const BitVector &getTopoSigs() const { return TopoSigs; } - // Get a weight of this register class. - unsigned getWeight(const CodeGenRegBank&) const; + // Get a weight of this register class. + unsigned getWeight(const CodeGenRegBank &) const; - // Populate a unique sorted list of units from a register set. - void buildRegUnitSet(const CodeGenRegBank &RegBank, - std::vector &RegUnits) const; + // Populate a unique sorted list of units from a register set. + void buildRegUnitSet(const CodeGenRegBank &RegBank, + std::vector &RegUnits) const; - CodeGenRegisterClass(CodeGenRegBank&, Record *R); - CodeGenRegisterClass(CodeGenRegisterClass&) = delete; + CodeGenRegisterClass(CodeGenRegBank &, Record *R); + CodeGenRegisterClass(CodeGenRegisterClass &) = delete; - // A key representing the parts of a register class used for forming - // sub-classes. Note the ordering provided by this key is not the same as - // the topological order used for the EnumValues. - struct Key { - const CodeGenRegister::Vec *Members; - RegSizeInfoByHwMode RSI; + // A key representing the parts of a register class used for forming + // sub-classes. Note the ordering provided by this key is not the same as + // the topological order used for the EnumValues. + struct Key { + const CodeGenRegister::Vec *Members; + RegSizeInfoByHwMode RSI; - Key(const CodeGenRegister::Vec *M, const RegSizeInfoByHwMode &I) + Key(const CodeGenRegister::Vec *M, const RegSizeInfoByHwMode &I) : Members(M), RSI(I) {} - Key(const CodeGenRegisterClass &RC) + Key(const CodeGenRegisterClass &RC) : Members(&RC.getMembers()), RSI(RC.RSI) {} - // Lexicographical order of (Members, RegSizeInfoByHwMode). - bool operator<(const Key&) const; - }; - - // Create a non-user defined register class. - CodeGenRegisterClass(CodeGenRegBank&, StringRef Name, Key Props); - - // Called by CodeGenRegBank::CodeGenRegBank(). - static void computeSubClasses(CodeGenRegBank&); - - // Get ordering value among register base classes. 
- std::optional getBaseClassOrder() const { - if (TheDef && !TheDef->isValueUnset("BaseClassOrder")) - return TheDef->getValueAsInt("BaseClassOrder"); - return {}; - } + // Lexicographical order of (Members, RegSizeInfoByHwMode). + bool operator<(const Key &) const; }; - // Register categories are used when we need to deterine the category a - // register falls into (GPR, vector, fixed, etc.) without having to know - // specific information about the target architecture. - class CodeGenRegisterCategory { - Record *TheDef; - std::string Name; - std::list Classes; + // Create a non-user defined register class. + CodeGenRegisterClass(CodeGenRegBank &, StringRef Name, Key Props); - public: - CodeGenRegisterCategory(CodeGenRegBank &, Record *R); - CodeGenRegisterCategory(CodeGenRegisterCategory &) = delete; + // Called by CodeGenRegBank::CodeGenRegBank(). + static void computeSubClasses(CodeGenRegBank &); - // Return the Record that defined this class, or NULL if the class was - // created by TableGen. - Record *getDef() const { return TheDef; } - - std::string getName() const { return Name; } - std::list getClasses() const { return Classes; } - }; + // Get ordering value among register base classes. + std::optional getBaseClassOrder() const { + if (TheDef && !TheDef->isValueUnset("BaseClassOrder")) + return TheDef->getValueAsInt("BaseClassOrder"); + return {}; + } +}; + +// Register categories are used when we need to deterine the category a +// register falls into (GPR, vector, fixed, etc.) without having to know +// specific information about the target architecture. +class CodeGenRegisterCategory { + Record *TheDef; + std::string Name; + std::list Classes; + +public: + CodeGenRegisterCategory(CodeGenRegBank &, Record *R); + CodeGenRegisterCategory(CodeGenRegisterCategory &) = delete; + + // Return the Record that defined this class, or NULL if the class was + // created by TableGen. + Record *getDef() const { return TheDef; } + + std::string getName() const { return Name; } + std::list getClasses() const { return Classes; } +}; + +// Register units are used to model interference and register pressure. +// Every register is assigned one or more register units such that two +// registers overlap if and only if they have a register unit in common. +// +// Normally, one register unit is created per leaf register. Non-leaf +// registers inherit the units of their sub-registers. +struct RegUnit { + // Weight assigned to this RegUnit for estimating register pressure. + // This is useful when equalizing weights in register classes with mixed + // register topologies. + unsigned Weight; + + // Each native RegUnit corresponds to one or two root registers. The full + // set of registers containing this unit can be computed as the union of + // these two registers and their super-registers. + const CodeGenRegister *Roots[2]; + + // Index into RegClassUnitSets where we can find the list of UnitSets that + // contain this unit. + unsigned RegClassUnitSetsIdx; + // A register unit is artificial if at least one of its roots is + // artificial. + bool Artificial; + + RegUnit() : Weight(0), RegClassUnitSetsIdx(0), Artificial(false) { + Roots[0] = Roots[1] = nullptr; + } - // Register units are used to model interference and register pressure. - // Every register is assigned one or more register units such that two - // registers overlap if and only if they have a register unit in common. - // - // Normally, one register unit is created per leaf register. 
Non-leaf - // registers inherit the units of their sub-registers. - struct RegUnit { - // Weight assigned to this RegUnit for estimating register pressure. - // This is useful when equalizing weights in register classes with mixed - // register topologies. - unsigned Weight; - - // Each native RegUnit corresponds to one or two root registers. The full - // set of registers containing this unit can be computed as the union of - // these two registers and their super-registers. - const CodeGenRegister *Roots[2]; - - // Index into RegClassUnitSets where we can find the list of UnitSets that - // contain this unit. - unsigned RegClassUnitSetsIdx; - // A register unit is artificial if at least one of its roots is - // artificial. - bool Artificial; - - RegUnit() : Weight(0), RegClassUnitSetsIdx(0), Artificial(false) { - Roots[0] = Roots[1] = nullptr; - } + ArrayRef getRoots() const { + assert(!(Roots[1] && !Roots[0]) && "Invalid roots array"); + return ArrayRef(Roots, !!Roots[0] + !!Roots[1]); + } +}; - ArrayRef getRoots() const { - assert(!(Roots[1] && !Roots[0]) && "Invalid roots array"); - return ArrayRef(Roots, !!Roots[0] + !!Roots[1]); - } - }; +// Each RegUnitSet is a sorted vector with a name. +struct RegUnitSet { + typedef std::vector::const_iterator iterator; - // Each RegUnitSet is a sorted vector with a name. - struct RegUnitSet { - typedef std::vector::const_iterator iterator; + std::string Name; + std::vector Units; + unsigned Weight = 0; // Cache the sum of all unit weights. + unsigned Order = 0; // Cache the sort key. - std::string Name; - std::vector Units; - unsigned Weight = 0; // Cache the sum of all unit weights. - unsigned Order = 0; // Cache the sort key. + RegUnitSet() = default; +}; - RegUnitSet() = default; - }; +// Base vector for identifying TopoSigs. The contents uniquely identify a +// TopoSig, only computeSuperRegs needs to know how. +typedef SmallVector TopoSigId; - // Base vector for identifying TopoSigs. The contents uniquely identify a - // TopoSig, only computeSuperRegs needs to know how. - typedef SmallVector TopoSigId; +// CodeGenRegBank - Represent a target's registers and the relations between +// them. +class CodeGenRegBank { + SetTheory Sets; - // CodeGenRegBank - Represent a target's registers and the relations between - // them. - class CodeGenRegBank { - SetTheory Sets; + const CodeGenHwModes &CGH; - const CodeGenHwModes &CGH; + std::deque SubRegIndices; + DenseMap Def2SubRegIdx; - std::deque SubRegIndices; - DenseMap Def2SubRegIdx; + CodeGenSubRegIndex *createSubRegIndex(StringRef Name, StringRef NameSpace); - CodeGenSubRegIndex *createSubRegIndex(StringRef Name, StringRef NameSpace); + typedef std::map, CodeGenSubRegIndex *> + ConcatIdxMap; + ConcatIdxMap ConcatIdx; - typedef std::map, - CodeGenSubRegIndex*> ConcatIdxMap; - ConcatIdxMap ConcatIdx; + // Registers. + std::deque Registers; + StringMap RegistersByName; + DenseMap Def2Reg; + unsigned NumNativeRegUnits; - // Registers. - std::deque Registers; - StringMap RegistersByName; - DenseMap Def2Reg; - unsigned NumNativeRegUnits; + std::map TopoSigs; - std::map TopoSigs; + // Includes native (0..NumNativeRegUnits-1) and adopted register units. + SmallVector RegUnits; - // Includes native (0..NumNativeRegUnits-1) and adopted register units. - SmallVector RegUnits; + // Register classes. + std::list RegClasses; + DenseMap Def2RC; + typedef std::map RCKeyMap; + RCKeyMap Key2RC; - // Register classes. 
-    std::list<CodeGenRegisterClass> RegClasses;
-    DenseMap<const Record*, CodeGenRegisterClass*> Def2RC;
-    typedef std::map<CodeGenRegisterClass::Key, CodeGenRegisterClass*> RCKeyMap;
-    RCKeyMap Key2RC;
+  // Register categories.
+  std::list<CodeGenRegisterCategory> RegCategories;
+  DenseMap<const Record *, CodeGenRegisterCategory *> Def2RCat;
+  using RCatKeyMap =
+      std::map<std::list<CodeGenRegisterClass *>, CodeGenRegisterCategory *>;
+  RCatKeyMap Key2RCat;
-    // Register categories.
-    std::list<CodeGenRegisterCategory> RegCategories;
-    DenseMap<const Record*, CodeGenRegisterCategory*> Def2RCat;
-    using RCatKeyMap =
-        std::map<std::list<CodeGenRegisterClass*>, CodeGenRegisterCategory*>;
-    RCatKeyMap Key2RCat;
+  // Remember each unique set of register units. Initially, this contains a
+  // unique set for each register class. Similar sets are coalesced with
+  // pruneUnitSets and new supersets are inferred during computeRegUnitSets.
+  std::vector<RegUnitSet> RegUnitSets;
-    // Remember each unique set of register units. Initially, this contains a
-    // unique set for each register class. Similar sets are coalesced with
-    // pruneUnitSets and new supersets are inferred during computeRegUnitSets.
-    std::vector<RegUnitSet> RegUnitSets;
+  // Map RegisterClass index to the index of the RegUnitSet that contains the
+  // class's units and any inferred RegUnit supersets.
+  //
+  // NOTE: This could grow beyond the number of register classes when we map
+  // register units to lists of unit sets. If the list of unit sets does not
+  // already exist for a register class, we create a new entry in this vector.
+  std::vector<std::vector<unsigned>> RegClassUnitSets;
-    // Map RegisterClass index to the index of the RegUnitSet that contains the
-    // class's units and any inferred RegUnit supersets.
-    //
-    // NOTE: This could grow beyond the number of register classes when we map
-    // register units to lists of unit sets. If the list of unit sets does not
-    // already exist for a register class, we create a new entry in this vector.
-    std::vector<std::vector<unsigned>> RegClassUnitSets;
+  // Give each register unit set an order based on sorting criteria.
+  std::vector<unsigned> RegUnitSetOrder;
-    // Give each register unit set an order based on sorting criteria.
-    std::vector<unsigned> RegUnitSetOrder;
+  // Keep track of synthesized definitions generated in TupleExpander.
+  std::vector<std::unique_ptr<Record>> SynthDefs;
-    // Keep track of synthesized definitions generated in TupleExpander.
-    std::vector<std::unique_ptr<Record>> SynthDefs;
+  // Add RC to *2RC maps.
+  void addToMaps(CodeGenRegisterClass *);
-    // Add RC to *2RC maps.
-    void addToMaps(CodeGenRegisterClass*);
+  // Create a synthetic sub-class if it is missing.
+  CodeGenRegisterClass *getOrCreateSubClass(const CodeGenRegisterClass *RC,
+                                            const CodeGenRegister::Vec *Membs,
+                                            StringRef Name);
-    // Create a synthetic sub-class if it is missing.
-    CodeGenRegisterClass *getOrCreateSubClass(const CodeGenRegisterClass *RC,
-                                              const CodeGenRegister::Vec *Membs,
-                                              StringRef Name);
+  // Infer missing register classes.
+  void computeInferredRegisterClasses();
+  void inferCommonSubClass(CodeGenRegisterClass *RC);
+  void inferSubClassWithSubReg(CodeGenRegisterClass *RC);
-    // Infer missing register classes.
-    void computeInferredRegisterClasses();
-    void inferCommonSubClass(CodeGenRegisterClass *RC);
-    void inferSubClassWithSubReg(CodeGenRegisterClass *RC);
+  void inferMatchingSuperRegClass(CodeGenRegisterClass *RC) {
+    inferMatchingSuperRegClass(RC, RegClasses.begin());
+  }
-    void inferMatchingSuperRegClass(CodeGenRegisterClass *RC) {
-      inferMatchingSuperRegClass(RC, RegClasses.begin());
-    }
+  void inferMatchingSuperRegClass(
+      CodeGenRegisterClass *RC,
+      std::list<CodeGenRegisterClass>::iterator FirstSubRegRC);
-    void inferMatchingSuperRegClass(
-        CodeGenRegisterClass *RC,
-        std::list<CodeGenRegisterClass>::iterator FirstSubRegRC);
+  // Iteratively prune unit sets.
+  void pruneUnitSets();
-    // Iteratively prune unit sets.
- void pruneUnitSets(); + // Compute a weight for each register unit created during getSubRegs. + void computeRegUnitWeights(); - // Compute a weight for each register unit created during getSubRegs. - void computeRegUnitWeights(); + // Create a RegUnitSet for each RegClass and infer superclasses. + void computeRegUnitSets(); - // Create a RegUnitSet for each RegClass and infer superclasses. - void computeRegUnitSets(); + // Populate the Composite map from sub-register relationships. + void computeComposites(); - // Populate the Composite map from sub-register relationships. - void computeComposites(); + // Compute a lane mask for each sub-register index. + void computeSubRegLaneMasks(); - // Compute a lane mask for each sub-register index. - void computeSubRegLaneMasks(); + /// Computes a lane mask for each register unit enumerated by a physical + /// register. + void computeRegUnitLaneMasks(); - /// Computes a lane mask for each register unit enumerated by a physical - /// register. - void computeRegUnitLaneMasks(); +public: + CodeGenRegBank(RecordKeeper &, const CodeGenHwModes &); + CodeGenRegBank(CodeGenRegBank &) = delete; - public: - CodeGenRegBank(RecordKeeper&, const CodeGenHwModes&); - CodeGenRegBank(CodeGenRegBank&) = delete; + SetTheory &getSets() { return Sets; } - SetTheory &getSets() { return Sets; } + const CodeGenHwModes &getHwModes() const { return CGH; } - const CodeGenHwModes &getHwModes() const { return CGH; } + // Sub-register indices. The first NumNamedIndices are defined by the user + // in the .td files. The rest are synthesized such that all sub-registers + // have a unique name. + const std::deque &getSubRegIndices() const { + return SubRegIndices; + } - // Sub-register indices. The first NumNamedIndices are defined by the user - // in the .td files. The rest are synthesized such that all sub-registers - // have a unique name. - const std::deque &getSubRegIndices() const { - return SubRegIndices; - } + // Find a SubRegIndex from its Record def or add to the list if it does + // not exist there yet. + CodeGenSubRegIndex *getSubRegIdx(Record *); - // Find a SubRegIndex from its Record def or add to the list if it does - // not exist there yet. - CodeGenSubRegIndex *getSubRegIdx(Record*); + // Find a SubRegIndex from its Record def. + const CodeGenSubRegIndex *findSubRegIdx(const Record *Def) const; - // Find a SubRegIndex from its Record def. - const CodeGenSubRegIndex *findSubRegIdx(const Record* Def) const; + // Find or create a sub-register index representing the A+B composition. + CodeGenSubRegIndex *getCompositeSubRegIndex(CodeGenSubRegIndex *A, + CodeGenSubRegIndex *B); - // Find or create a sub-register index representing the A+B composition. - CodeGenSubRegIndex *getCompositeSubRegIndex(CodeGenSubRegIndex *A, - CodeGenSubRegIndex *B); + // Find or create a sub-register index representing the concatenation of + // non-overlapping sibling indices. + CodeGenSubRegIndex * + getConcatSubRegIndex(const SmallVector &); - // Find or create a sub-register index representing the concatenation of - // non-overlapping sibling indices. - CodeGenSubRegIndex * - getConcatSubRegIndex(const SmallVector&); + const std::deque &getRegisters() const { return Registers; } - const std::deque &getRegisters() const { - return Registers; - } + const StringMap &getRegistersByName() const { + return RegistersByName; + } - const StringMap &getRegistersByName() const { - return RegistersByName; - } + // Find a register from its Record def. 
+ CodeGenRegister *getReg(Record *); - // Find a register from its Record def. - CodeGenRegister *getReg(Record*); + // Get a Register's index into the Registers array. + unsigned getRegIndex(const CodeGenRegister *Reg) const { + return Reg->EnumValue - 1; + } - // Get a Register's index into the Registers array. - unsigned getRegIndex(const CodeGenRegister *Reg) const { - return Reg->EnumValue - 1; - } + // Return the number of allocated TopoSigs. The first TopoSig representing + // leaf registers is allocated number 0. + unsigned getNumTopoSigs() const { return TopoSigs.size(); } - // Return the number of allocated TopoSigs. The first TopoSig representing - // leaf registers is allocated number 0. - unsigned getNumTopoSigs() const { - return TopoSigs.size(); - } + // Find or create a TopoSig for the given TopoSigId. + // This function is only for use by CodeGenRegister::computeSuperRegs(). + // Others should simply use Reg->getTopoSig(). + unsigned getTopoSig(const TopoSigId &Id) { + return TopoSigs.insert(std::make_pair(Id, TopoSigs.size())).first->second; + } - // Find or create a TopoSig for the given TopoSigId. - // This function is only for use by CodeGenRegister::computeSuperRegs(). - // Others should simply use Reg->getTopoSig(). - unsigned getTopoSig(const TopoSigId &Id) { - return TopoSigs.insert(std::make_pair(Id, TopoSigs.size())).first->second; - } + // Create a native register unit that is associated with one or two root + // registers. + unsigned newRegUnit(CodeGenRegister *R0, CodeGenRegister *R1 = nullptr) { + RegUnits.resize(RegUnits.size() + 1); + RegUnit &RU = RegUnits.back(); + RU.Roots[0] = R0; + RU.Roots[1] = R1; + RU.Artificial = R0->Artificial; + if (R1) + RU.Artificial |= R1->Artificial; + return RegUnits.size() - 1; + } - // Create a native register unit that is associated with one or two root - // registers. - unsigned newRegUnit(CodeGenRegister *R0, CodeGenRegister *R1 = nullptr) { - RegUnits.resize(RegUnits.size() + 1); - RegUnit &RU = RegUnits.back(); - RU.Roots[0] = R0; - RU.Roots[1] = R1; - RU.Artificial = R0->Artificial; - if (R1) - RU.Artificial |= R1->Artificial; - return RegUnits.size() - 1; - } + // Create a new non-native register unit that can be adopted by a register + // to increase its pressure. Note that NumNativeRegUnits is not increased. + unsigned newRegUnit(unsigned Weight) { + RegUnits.resize(RegUnits.size() + 1); + RegUnits.back().Weight = Weight; + return RegUnits.size() - 1; + } - // Create a new non-native register unit that can be adopted by a register - // to increase its pressure. Note that NumNativeRegUnits is not increased. - unsigned newRegUnit(unsigned Weight) { - RegUnits.resize(RegUnits.size() + 1); - RegUnits.back().Weight = Weight; - return RegUnits.size() - 1; - } + // Native units are the singular unit of a leaf register. Register aliasing + // is completely characterized by native units. Adopted units exist to give + // register additional weight but don't affect aliasing. + bool isNativeUnit(unsigned RUID) const { return RUID < NumNativeRegUnits; } - // Native units are the singular unit of a leaf register. Register aliasing - // is completely characterized by native units. Adopted units exist to give - // register additional weight but don't affect aliasing. 
- bool isNativeUnit(unsigned RUID) const { - return RUID < NumNativeRegUnits; - } + unsigned getNumNativeRegUnits() const { return NumNativeRegUnits; } - unsigned getNumNativeRegUnits() const { - return NumNativeRegUnits; - } + RegUnit &getRegUnit(unsigned RUID) { return RegUnits[RUID]; } + const RegUnit &getRegUnit(unsigned RUID) const { return RegUnits[RUID]; } - RegUnit &getRegUnit(unsigned RUID) { return RegUnits[RUID]; } - const RegUnit &getRegUnit(unsigned RUID) const { return RegUnits[RUID]; } + std::list &getRegClasses() { return RegClasses; } - std::list &getRegClasses() { return RegClasses; } + const std::list &getRegClasses() const { + return RegClasses; + } - const std::list &getRegClasses() const { - return RegClasses; - } + std::list &getRegCategories() { + return RegCategories; + } - std::list &getRegCategories() { - return RegCategories; - } + const std::list &getRegCategories() const { + return RegCategories; + } - const std::list &getRegCategories() const { - return RegCategories; - } + // Find a register class from its def. + CodeGenRegisterClass *getRegClass(const Record *) const; + + /// getRegisterClassForRegister - Find the register class that contains the + /// specified physical register. If the register is not in a register + /// class, return null. If the register is in multiple classes, and the + /// classes have a superset-subset relationship and the same set of types, + /// return the superclass. Otherwise return null. + const CodeGenRegisterClass *getRegClassForRegister(Record *R); + + // Analog of TargetRegisterInfo::getMinimalPhysRegClass. Unlike + // getRegClassForRegister, this tries to find the smallest class containing + // the physical register. If \p VT is specified, it will only find classes + // with a matching type + const CodeGenRegisterClass * + getMinimalPhysRegClass(Record *RegRecord, ValueTypeByHwMode *VT = nullptr); + + // Get the sum of unit weights. + unsigned getRegUnitSetWeight(const std::vector &Units) const { + unsigned Weight = 0; + for (unsigned Unit : Units) + Weight += getRegUnit(Unit).Weight; + return Weight; + } - // Find a register class from its def. - CodeGenRegisterClass *getRegClass(const Record *) const; - - /// getRegisterClassForRegister - Find the register class that contains the - /// specified physical register. If the register is not in a register - /// class, return null. If the register is in multiple classes, and the - /// classes have a superset-subset relationship and the same set of types, - /// return the superclass. Otherwise return null. - const CodeGenRegisterClass* getRegClassForRegister(Record *R); - - // Analog of TargetRegisterInfo::getMinimalPhysRegClass. Unlike - // getRegClassForRegister, this tries to find the smallest class containing - // the physical register. If \p VT is specified, it will only find classes - // with a matching type - const CodeGenRegisterClass * - getMinimalPhysRegClass(Record *RegRecord, ValueTypeByHwMode *VT = nullptr); - - // Get the sum of unit weights. 
- unsigned getRegUnitSetWeight(const std::vector &Units) const { - unsigned Weight = 0; - for (unsigned Unit : Units) - Weight += getRegUnit(Unit).Weight; - return Weight; - } + unsigned getRegSetIDAt(unsigned Order) const { + return RegUnitSetOrder[Order]; + } - unsigned getRegSetIDAt(unsigned Order) const { - return RegUnitSetOrder[Order]; - } + const RegUnitSet &getRegSetAt(unsigned Order) const { + return RegUnitSets[RegUnitSetOrder[Order]]; + } - const RegUnitSet &getRegSetAt(unsigned Order) const { - return RegUnitSets[RegUnitSetOrder[Order]]; - } + // Increase a RegUnitWeight. + void increaseRegUnitWeight(unsigned RUID, unsigned Inc) { + getRegUnit(RUID).Weight += Inc; + } - // Increase a RegUnitWeight. - void increaseRegUnitWeight(unsigned RUID, unsigned Inc) { - getRegUnit(RUID).Weight += Inc; - } + // Get the number of register pressure dimensions. + unsigned getNumRegPressureSets() const { return RegUnitSets.size(); } - // Get the number of register pressure dimensions. - unsigned getNumRegPressureSets() const { return RegUnitSets.size(); } + // Get a set of register unit IDs for a given dimension of pressure. + const RegUnitSet &getRegPressureSet(unsigned Idx) const { + return RegUnitSets[Idx]; + } - // Get a set of register unit IDs for a given dimension of pressure. - const RegUnitSet &getRegPressureSet(unsigned Idx) const { - return RegUnitSets[Idx]; - } + // The number of pressure set lists may be larger than the number of + // register classes if some register units appeared in a list of sets that + // did not correspond to an existing register class. + unsigned getNumRegClassPressureSetLists() const { + return RegClassUnitSets.size(); + } - // The number of pressure set lists may be larger than the number of - // register classes if some register units appeared in a list of sets that - // did not correspond to an existing register class. - unsigned getNumRegClassPressureSetLists() const { - return RegClassUnitSets.size(); - } + // Get a list of pressure set IDs for a register class. Liveness of a + // register in this class impacts each pressure set in this list by the + // weight of the register. An exact solution requires all registers in a + // class to have the same class, but it is not strictly guaranteed. + ArrayRef<unsigned> getRCPressureSetIDs(unsigned RCIdx) const { + return RegClassUnitSets[RCIdx]; + } - // Get a list of pressure set IDs for a register class. Liveness of a - // register in this class impacts each pressure set in this list by the - // weight of the register. An exact solution requires all registers in a - // class to have the same class, but it is not strictly guaranteed. - ArrayRef<unsigned> getRCPressureSetIDs(unsigned RCIdx) const { - return RegClassUnitSets[RCIdx]; - } + // Compute derived records such as missing sub-register indices. + void computeDerivedInfo(); - // Compute derived records such as missing sub-register indices. - void computeDerivedInfo(); - - // Compute the set of registers completely covered by the registers in Regs. - // The returned BitVector will have a bit set for each register in Regs, - // all sub-registers, and all super-registers that are covered by the - // registers in Regs. - // - // This is used to compute the mask of call-preserved registers from a list - // of callee-saves. - BitVector computeCoveredRegisters(ArrayRef Regs); - - // Bit mask of lanes that cover their registers.
A sub-register index whose - // LaneMask is contained in CoveringLanes will be completely covered by - // another sub-register with the same or larger lane mask. - LaneBitmask CoveringLanes; - - // Helper function for printing debug information. Handles artificial - // (non-native) reg units. - void printRegUnitName(unsigned Unit) const; - }; + // Compute the set of registers completely covered by the registers in Regs. + // The returned BitVector will have a bit set for each register in Regs, + // all sub-registers, and all super-registers that are covered by the + // registers in Regs. + // + // This is used to compute the mask of call-preserved registers from a list + // of callee-saves. + BitVector computeCoveredRegisters(ArrayRef Regs); + + // Bit mask of lanes that cover their registers. A sub-register index whose + // LaneMask is contained in CoveringLanes will be completely covered by + // another sub-register with the same or larger lane mask. + LaneBitmask CoveringLanes; + + // Helper function for printing debug information. Handles artificial + // (non-native) reg units. + void printRegUnitName(unsigned Unit) const; +}; } // end namespace llvm diff --git a/llvm/utils/TableGen/CodeGenSchedule.cpp b/llvm/utils/TableGen/CodeGenSchedule.cpp index 54463da..9cebc42 100644 --- a/llvm/utils/TableGen/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/CodeGenSchedule.cpp @@ -51,7 +51,7 @@ struct InstrsOp : public SetTheory::Operator { // (instregex "OpcPat",...) Find all instructions matching an opcode pattern. struct InstRegexOp : public SetTheory::Operator { const CodeGenTarget &Target; - InstRegexOp(const CodeGenTarget &t): Target(t) {} + InstRegexOp(const CodeGenTarget &t) : Target(t) {} /// Remove any text inside of parentheses from S. static std::string removeParens(llvm::StringRef S) { @@ -182,8 +182,8 @@ struct InstRegexOp : public SetTheory::Operator { /// CodeGenModels ctor interprets machine model records and populates maps. CodeGenSchedModels::CodeGenSchedModels(RecordKeeper &RK, - const CodeGenTarget &TGT): - Records(RK), Target(TGT) { + const CodeGenTarget &TGT) + : Records(RK), Target(TGT) { Sets.addFieldExpander("InstRW", "Instrs"); @@ -298,9 +298,8 @@ static APInt constructOperandMask(ArrayRef Indices) { return OperandMask; } -static void -processSTIPredicate(STIPredicateFunction &Fn, - const ProcModelMapTy &ProcModelMap) { +static void processSTIPredicate(STIPredicateFunction &Fn, + const ProcModelMapTy &ProcModelMap) { DenseMap Opcode2Index; using OpcodeMapPair = std::pair; std::vector OpcodeMappings; @@ -380,30 +379,29 @@ processSTIPredicate(STIPredicateFunction &Fn, // Sort OpcodeMappings elements based on their CPU and predicate masks. // As a last resort, order elements by opcode identifier. 
- llvm::sort(OpcodeMappings, - [&](const OpcodeMapPair &Lhs, const OpcodeMapPair &Rhs) { - unsigned LhsIdx = Opcode2Index[Lhs.first]; - unsigned RhsIdx = Opcode2Index[Rhs.first]; - const std::pair<APInt, APInt> &LhsMasks = OpcodeMasks[LhsIdx]; - const std::pair<APInt, APInt> &RhsMasks = OpcodeMasks[RhsIdx]; - - auto PopulationCountAndLeftBit = - [](const APInt &Other) -> std::pair<int, int> { - return std::pair(Other.popcount(), - -Other.countl_zero()); - }; - auto lhsmask_first = PopulationCountAndLeftBit(LhsMasks.first); - auto rhsmask_first = PopulationCountAndLeftBit(RhsMasks.first); - if (lhsmask_first != rhsmask_first) - return lhsmask_first < rhsmask_first; - - auto lhsmask_second = PopulationCountAndLeftBit(LhsMasks.second); - auto rhsmask_second = PopulationCountAndLeftBit(RhsMasks.second); - if (lhsmask_second != rhsmask_second) - return lhsmask_second < rhsmask_second; - - return LhsIdx < RhsIdx; - }); + llvm::sort( + OpcodeMappings, [&](const OpcodeMapPair &Lhs, const OpcodeMapPair &Rhs) { + unsigned LhsIdx = Opcode2Index[Lhs.first]; + unsigned RhsIdx = Opcode2Index[Rhs.first]; + const std::pair<APInt, APInt> &LhsMasks = OpcodeMasks[LhsIdx]; + const std::pair<APInt, APInt> &RhsMasks = OpcodeMasks[RhsIdx]; + + auto PopulationCountAndLeftBit = + [](const APInt &Other) -> std::pair<int, int> { + return std::pair(Other.popcount(), -Other.countl_zero()); + }; + auto lhsmask_first = PopulationCountAndLeftBit(LhsMasks.first); + auto rhsmask_first = PopulationCountAndLeftBit(RhsMasks.first); + if (lhsmask_first != rhsmask_first) + return lhsmask_first < rhsmask_first; + + auto lhsmask_second = PopulationCountAndLeftBit(LhsMasks.second); + auto rhsmask_second = PopulationCountAndLeftBit(RhsMasks.second); + if (lhsmask_second != rhsmask_second) + return lhsmask_second < rhsmask_second; + + return LhsIdx < RhsIdx; + }); // Now construct opcode groups. Groups are used by the SubtargetEmitter when // expanding the body of a STIPredicate function. In particular, each opcode @@ -498,8 +496,7 @@ void CodeGenSchedModels::collectLoadStoreQueueInfo() { CodeGenProcModel &PM = getProcModel(Queue->getValueAsDef("SchedModel")); if (Queue->isSubClassOf("LoadQueue")) { if (PM.LoadQueue) { - PrintError(Queue->getLoc(), - "Expected a single LoadQueue definition"); + PrintError(Queue->getLoc(), "Expected a single LoadQueue definition"); PrintNote(PM.LoadQueue->getLoc(), "Previous definition of LoadQueue was here"); } @@ -509,8 +506,7 @@ if (Queue->isSubClassOf("StoreQueue")) { if (PM.StoreQueue) { - PrintError(Queue->getLoc(), - "Expected a single StoreQueue definition"); + PrintError(Queue->getLoc(), "Expected a single StoreQueue definition"); PrintNote(PM.StoreQueue->getLoc(), "Previous definition of StoreQueue was here"); } @@ -542,14 +538,15 @@ void CodeGenSchedModels::collectProcModels() { // Check for duplicated names. auto I = std::adjacent_find(ProcRecords.begin(), ProcRecords.end(), [](const Record *Rec1, const Record *Rec2) { - return Rec1->getValueAsString("Name") == Rec2->getValueAsString("Name"); - }); + return Rec1->getValueAsString("Name") == + Rec2->getValueAsString("Name"); + }); if (I != ProcRecords.end()) PrintFatalError((*I)->getLoc(), "Duplicate processor name " + - (*I)->getValueAsString("Name")); + (*I)->getValueAsString("Name")); // Reserve space because we can. Reallocation would be ok. - ProcModels.reserve(ProcRecords.size()+1); + ProcModels.reserve(ProcRecords.size() + 1); // Use idx=0 for NoModel/NoItineraries.
Record *NoModelDef = Records.getDef("NoSchedModel"); @@ -574,8 +571,7 @@ void CodeGenSchedModels::addProcModel(Record *ProcDef) { if (ModelKey->isSubClassOf("SchedMachineModel")) { Record *ItinsDef = ModelKey->getValueAsDef("Itineraries"); ProcModels.emplace_back(ProcModels.size(), Name, ModelKey, ItinsDef); - } - else { + } else { // An itinerary is defined without a machine model. Infer a new model. if (!ModelKey->getValueAsListOfDefs("IID").empty()) Name = Name + "Model"; @@ -587,7 +583,7 @@ void CodeGenSchedModels::addProcModel(Record *ProcDef) { // Recursively find all reachable SchedReadWrite records. static void scanSchedRW(Record *RWDef, RecVec &RWDefs, - SmallPtrSet &RWSet) { + SmallPtrSet &RWSet) { if (!RWSet.insert(RWDef).second) return; RWDefs.push_back(RWDef); @@ -596,8 +592,7 @@ static void scanSchedRW(Record *RWDef, RecVec &RWDefs, RecVec Seq = RWDef->getValueAsListOfDefs("Writes"); for (Record *WSRec : Seq) scanSchedRW(WSRec, RWDefs, RWSet); - } - else if (RWDef->isSubClassOf("SchedVariant")) { + } else if (RWDef->isSubClassOf("SchedVariant")) { // Visit each variant (guarded by a different predicate). RecVec Vars = RWDef->getValueAsListOfDefs("Variants"); for (Record *Variant : Vars) { @@ -616,7 +611,7 @@ void CodeGenSchedModels::collectSchedRW() { SchedWrites.resize(1); SchedReads.resize(1); - SmallPtrSet RWSet; + SmallPtrSet RWSet; // Find all SchedReadWrites referenced by instruction defs. RecVec SWDefs, SRDefs; @@ -673,8 +668,7 @@ void CodeGenSchedModels::collectSchedRW() { if (!AliasDef->isSubClassOf("SchedWrite")) PrintFatalError(ADef->getLoc(), "SchedWrite Alias must be SchedWrite"); scanSchedRW(AliasDef, SWDefs, RWSet); - } - else { + } else { assert(MatchDef->isSubClassOf("SchedRead") && "Unknown SchedReadWrite"); if (!AliasDef->isSubClassOf("SchedRead")) PrintFatalError(ADef->getLoc(), "SchedRead Alias must be SchedRead"); @@ -690,7 +684,7 @@ void CodeGenSchedModels::collectSchedRW() { } llvm::sort(SRDefs, LessRecord()); for (Record *SRDef : SRDefs) { - assert(!getSchedRWIdx(SRDef, /*IsRead-*/true) && "duplicate SchedWrite"); + assert(!getSchedRWIdx(SRDef, /*IsRead-*/ true) && "duplicate SchedWrite"); SchedReads.emplace_back(SchedReads.size(), SRDef); } // Initialize WriteSequence vectors. @@ -753,9 +747,9 @@ unsigned CodeGenSchedModels::getSchedRWIdx(const Record *Def, } bool CodeGenSchedModels::hasReadOfWrite(Record *WriteDef) const { - for (auto& ProcModel : ProcModels) { + for (auto &ProcModel : ProcModels) { const RecVec &RADefs = ProcModel.ReadAdvanceDefs; - for (auto& RADef : RADefs) { + for (auto &RADef : RADefs) { RecVec ValidWrites = RADef->getValueAsListOfDefs("ValidWrites"); if (is_contained(ValidWrites, WriteDef)) return true; @@ -764,8 +758,8 @@ bool CodeGenSchedModels::hasReadOfWrite(Record *WriteDef) const { return false; } -static void splitSchedReadWrites(const RecVec &RWDefs, - RecVec &WriteDefs, RecVec &ReadDefs) { +static void splitSchedReadWrites(const RecVec &RWDefs, RecVec &WriteDefs, + RecVec &ReadDefs) { for (Record *RWDef : RWDefs) { if (RWDef->isSubClassOf("SchedWrite")) WriteDefs.push_back(RWDef); @@ -777,8 +771,8 @@ static void splitSchedReadWrites(const RecVec &RWDefs, } // Split the SchedReadWrites defs and call findRWs for each list. 
-void CodeGenSchedModels::findRWs(const RecVec &RWDefs, - IdxVec &Writes, IdxVec &Reads) const { +void CodeGenSchedModels::findRWs(const RecVec &RWDefs, IdxVec &Writes, + IdxVec &Reads) const { RecVec WriteDefs; RecVec ReadDefs; splitSchedReadWrites(RWDefs, WriteDefs, ReadDefs); @@ -803,8 +797,7 @@ void CodeGenSchedModels::expandRWSequence(unsigned RWIdx, IdxVec &RWSeq, RWSeq.push_back(RWIdx); return; } - int Repeat = - SchedRW.TheDef ? SchedRW.TheDef->getValueAsInt("Repeat") : 1; + int Repeat = SchedRW.TheDef ? SchedRW.TheDef->getValueAsInt("Repeat") : 1; for (int i = 0; i < Repeat; ++i) { for (unsigned I : SchedRW.Sequence) { expandRWSequence(I, RWSeq, IsRead); @@ -815,8 +808,8 @@ void CodeGenSchedModels::expandRWSequence(unsigned RWIdx, IdxVec &RWSeq, // Expand a SchedWrite as a sequence following any aliases that coincide with // the given processor model. void CodeGenSchedModels::expandRWSeqForProc( - unsigned RWIdx, IdxVec &RWSeq, bool IsRead, - const CodeGenProcModel &ProcModel) const { + unsigned RWIdx, IdxVec &RWSeq, bool IsRead, + const CodeGenProcModel &ProcModel) const { const CodeGenSchedRW &SchedWrite = getSchedRW(RWIdx, IsRead); Record *AliasDef = nullptr; @@ -828,14 +821,16 @@ void CodeGenSchedModels::expandRWSeqForProc( continue; } if (AliasDef) - PrintFatalError(AliasRW.TheDef->getLoc(), "Multiple aliases " - "defined for processor " + ProcModel.ModelName + - " Ensure only one SchedAlias exists per RW."); + PrintFatalError(AliasRW.TheDef->getLoc(), + "Multiple aliases " + "defined for processor " + + ProcModel.ModelName + + " Ensure only one SchedAlias exists per RW."); AliasDef = AliasRW.TheDef; } if (AliasDef) { - expandRWSeqForProc(getSchedRWIdx(AliasDef, IsRead), - RWSeq, IsRead,ProcModel); + expandRWSeqForProc(getSchedRWIdx(AliasDef, IsRead), RWSeq, IsRead, + ProcModel); return; } if (!SchedWrite.IsSequence) { @@ -843,7 +838,7 @@ void CodeGenSchedModels::expandRWSeqForProc( return; } int Repeat = - SchedWrite.TheDef ? SchedWrite.TheDef->getValueAsInt("Repeat") : 1; + SchedWrite.TheDef ? SchedWrite.TheDef->getValueAsInt("Repeat") : 1; for (int I = 0, E = Repeat; I < E; ++I) { for (unsigned Idx : SchedWrite.Sequence) { expandRWSeqForProc(Idx, RWSeq, IsRead, ProcModel); @@ -888,8 +883,7 @@ void CodeGenSchedModels::collectSchedClasses() { // NoItinerary is always the first class at Idx=0 assert(SchedClasses.empty() && "Expected empty sched class"); - SchedClasses.emplace_back(0, "NoInstrModel", - Records.getDef("NoItinerary")); + SchedClasses.emplace_back(0, "NoInstrModel", Records.getDef("NoItinerary")); SchedClasses.back().ProcIndices.push_back(0); // Create a SchedClass for each unique combination of itinerary class and @@ -901,7 +895,7 @@ void CodeGenSchedModels::collectSchedClasses() { findRWs(Inst->TheDef->getValueAsListOfDefs("SchedRW"), Writes, Reads); // ProcIdx == 0 indicates the class applies to all processors. - unsigned SCIdx = addSchedClass(ItinDef, Writes, Reads, /*ProcIndices*/{0}); + unsigned SCIdx = addSchedClass(ItinDef, Writes, Reads, /*ProcIndices*/ {0}); InstrClassMap[Inst->TheDef] = SCIdx; } // Create classes for InstRW defs. 
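For context on the class bookkeeping in the hunks around here: collectSchedClasses() creates one scheduling class per unique combination of itinerary class and SchedRW lists, records each instruction's class index in InstrClassMap, and addSchedClass() reuses an existing class (merging its ProcIndices via std::set_union) when an equivalent key already exists. A minimal standalone sketch of that intern-or-reuse idiom; the names (SchedKey, getOrAddClass) and the deliberately simplified key types are hypothetical stand-ins, not the real TableGen structures:

#include <map>
#include <tuple>
#include <vector>

// Hypothetical, simplified stand-in for the (ItinClassDef, Writes, Reads) key.
struct SchedKey {
  int ItinClass;
  std::vector<unsigned> Writes;
  std::vector<unsigned> Reads;
  bool operator<(const SchedKey &O) const {
    return std::tie(ItinClass, Writes, Reads) <
           std::tie(O.ItinClass, O.Writes, O.Reads);
  }
};

// Return the existing dense index for Key, or append a new class for it.
unsigned getOrAddClass(std::map<SchedKey, unsigned> &Interned,
                       std::vector<SchedKey> &Classes, const SchedKey &Key) {
  auto [It, Inserted] = Interned.try_emplace(Key, Classes.size());
  if (Inserted)
    Classes.push_back(Key);
  return It->second;
}

The real code keys on Record pointers and scans SchedClasses linearly with isKeyEqual rather than keeping a side map; the sketch only illustrates the invariant that equal keys map to a single index.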
@@ -933,7 +927,8 @@ void CodeGenSchedModels::collectSchedClasses() { } CodeGenSchedClass &SC = getSchedClass(SCIdx); if (SC.ProcIndices[0] != 0) - PrintFatalError(Inst->TheDef->getLoc(), "Instruction's sched class " + PrintFatalError(Inst->TheDef->getLoc(), + "Instruction's sched class " "must not be subtarget specific."); IdxVec ProcIndices; @@ -962,8 +957,7 @@ void CodeGenSchedModels::collectSchedClasses() { << InstName); IdxVec Writes; IdxVec Reads; - findRWs(RWDef->getValueAsListOfDefs("OperandReadWrites"), - Writes, Reads); + findRWs(RWDef->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads); LLVM_DEBUG({ for (unsigned WIdx : Writes) dbgs() << " " << SchedWrites[WIdx].Name; @@ -1032,25 +1026,23 @@ unsigned CodeGenSchedModels::addSchedClass(Record *ItinClassDef, assert(!ProcIndices.empty() && "expect at least one ProcIdx"); auto IsKeyEqual = [=](const CodeGenSchedClass &SC) { - return SC.isKeyEqual(ItinClassDef, OperWrites, OperReads); - }; + return SC.isKeyEqual(ItinClassDef, OperWrites, OperReads); + }; auto I = find_if(make_range(schedClassBegin(), schedClassEnd()), IsKeyEqual); unsigned Idx = I == schedClassEnd() ? 0 : std::distance(schedClassBegin(), I); if (Idx || SchedClasses[0].isKeyEqual(ItinClassDef, OperWrites, OperReads)) { IdxVec PI; std::set_union(SchedClasses[Idx].ProcIndices.begin(), - SchedClasses[Idx].ProcIndices.end(), - ProcIndices.begin(), ProcIndices.end(), - std::back_inserter(PI)); + SchedClasses[Idx].ProcIndices.end(), ProcIndices.begin(), + ProcIndices.end(), std::back_inserter(PI)); SchedClasses[Idx].ProcIndices = std::move(PI); return Idx; } Idx = SchedClasses.size(); - SchedClasses.emplace_back(Idx, - createSchedClassName(ItinClassDef, OperWrites, - OperReads), - ItinClassDef); + SchedClasses.emplace_back( + Idx, createSchedClassName(ItinClassDef, OperWrites, OperReads), + ItinClassDef); CodeGenSchedClass &SC = SchedClasses.back(); SC.Writes = OperWrites; SC.Reads = OperReads; @@ -1083,17 +1075,16 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) { // the Instrs to it. for (auto &Entry : ClassInstrs) { unsigned OldSCIdx = Entry.first; - ArrayRef InstDefs = Entry.second; + ArrayRef InstDefs = Entry.second; // If the all instrs in the current class are accounted for, then leave // them mapped to their old class. if (OldSCIdx) { const RecVec &RWDefs = SchedClasses[OldSCIdx].InstRWs; if (!RWDefs.empty()) { const RecVec *OrigInstDefs = Sets.expand(RWDefs[0]); - unsigned OrigNumInstrs = - count_if(*OrigInstDefs, [&](Record *OIDef) { - return InstrClassMap[OIDef] == OldSCIdx; - }); + unsigned OrigNumInstrs = count_if(*OrigInstDefs, [&](Record *OIDef) { + return InstrClassMap[OIDef] == OldSCIdx; + }); if (OrigNumInstrs == InstDefs.size()) { assert(SchedClasses[OldSCIdx].ProcIndices[0] == 0 && "expected a generic SchedClass"); @@ -1148,8 +1139,7 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) { "\"."); PrintFatalNote(OldRWDef->getLoc(), "Previous match was here."); } - assert(OldRWDef != InstRWDef && - "SchedClass has duplicate InstRW def"); + assert(OldRWDef != InstRWDef && "SchedClass has duplicate InstRW def"); SC.InstRWs.push_back(OldRWDef); } } @@ -1162,7 +1152,8 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) { // True if collectProcItins found anything. 
bool CodeGenSchedModels::hasItineraries() const { - for (const CodeGenProcModel &PM : make_range(procModelBegin(),procModelEnd())) + for (const CodeGenProcModel &PM : + make_range(procModelBegin(), procModelEnd())) if (PM.hasItineraries()) return true; return false; @@ -1217,14 +1208,14 @@ void CodeGenSchedModels::collectProcItins() { void CodeGenSchedModels::collectProcItinRW() { RecVec ItinRWDefs = Records.getAllDerivedDefinitions("ItinRW"); llvm::sort(ItinRWDefs, LessRecord()); - for (Record *RWDef : ItinRWDefs) { + for (Record *RWDef : ItinRWDefs) { if (!RWDef->getValueInit("SchedModel")->isComplete()) PrintFatalError(RWDef->getLoc(), "SchedModel is undefined"); Record *ModelDef = RWDef->getValueAsDef("SchedModel"); ProcModelMapTy::const_iterator I = ProcModelMap.find(ModelDef); if (I == ProcModelMap.end()) { - PrintFatalError(RWDef->getLoc(), "Undefined SchedMachineModel " - + ModelDef->getName()); + PrintFatalError(RWDef->getLoc(), + "Undefined SchedMachineModel " + ModelDef->getName()); } ProcModels[I->second].ItinRWDefs.push_back(RWDef); } @@ -1254,10 +1245,10 @@ void CodeGenSchedModels::inferSchedClasses() { if (!SchedClasses[Idx].InstRWs.empty()) inferFromInstRWs(Idx); if (!SchedClasses[Idx].Writes.empty()) { - inferFromRW(SchedClasses[Idx].Writes, SchedClasses[Idx].Reads, - Idx, SchedClasses[Idx].ProcIndices); + inferFromRW(SchedClasses[Idx].Writes, SchedClasses[Idx].Reads, Idx, + SchedClasses[Idx].ProcIndices); } - assert(SchedClasses.size() < (NumInstrSchedClasses*6) && + assert(SchedClasses.size() < (NumInstrSchedClasses * 6) && "too many SchedVariants"); } } @@ -1274,9 +1265,9 @@ void CodeGenSchedModels::inferFromItinClass(Record *ItinClassDef, if (!llvm::is_contained(Matched, ItinClassDef)) continue; if (HasMatch) - PrintFatalError(Rec->getLoc(), "Duplicate itinerary class " - + ItinClassDef->getName() - + " in ItinResources for " + PM.ModelName); + PrintFatalError(Rec->getLoc(), + "Duplicate itinerary class " + ItinClassDef->getName() + + " in ItinResources for " + PM.ModelName); HasMatch = true; IdxVec Writes, Reads; findRWs(Rec->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads); @@ -1317,8 +1308,8 @@ struct TransVariant { unsigned ProcIdx; // Processor model index or zero for any. unsigned TransVecIdx; // Index into PredTransitions::TransVec. - TransVariant(Record *def, unsigned rwi, unsigned pi, unsigned ti): - VarOrSeqDef(def), RWIdx(rwi), ProcIdx(pi), TransVecIdx(ti) {} + TransVariant(Record *def, unsigned rwi, unsigned pi, unsigned ti) + : VarOrSeqDef(def), RWIdx(rwi), ProcIdx(pi), TransVecIdx(ti) {} }; // Associate a predicate with the SchedReadWrite that it guards. @@ -1328,15 +1319,16 @@ struct PredCheck { unsigned RWIdx; Record *Predicate; - PredCheck(bool r, unsigned w, Record *p): IsRead(r), RWIdx(w), Predicate(p) {} + PredCheck(bool r, unsigned w, Record *p) + : IsRead(r), RWIdx(w), Predicate(p) {} }; // A Predicate transition is a list of RW sequences guarded by a PredTerm. struct PredTransition { // A predicate term is a conjunction of PredChecks. 
SmallVector PredTerm; - SmallVector, 16> WriteSequences; - SmallVector, 16> ReadSequences; + SmallVector, 16> WriteSequences; + SmallVector, 16> ReadSequences; unsigned ProcIndex = 0; PredTransition() = default; @@ -1354,7 +1346,7 @@ class PredTransitions { public: std::vector TransVec; - PredTransitions(CodeGenSchedModels &sm): SchedModels(sm) {} + PredTransitions(CodeGenSchedModels &sm) : SchedModels(sm) {} bool substituteVariantOperand(const SmallVectorImpl &RWSeq, bool IsRead, unsigned StartIdx); @@ -1368,9 +1360,8 @@ public: private: bool mutuallyExclusive(Record *PredDef, ArrayRef Preds, ArrayRef Term); - void getIntersectingVariants( - const CodeGenSchedRW &SchedRW, unsigned TransIdx, - std::vector &IntersectingVariants); + void getIntersectingVariants(const CodeGenSchedRW &SchedRW, unsigned TransIdx, + std::vector &IntersectingVariants); void pushVariant(const TransVariant &VInfo, bool IsRead); }; @@ -1388,7 +1379,7 @@ private: bool PredTransitions::mutuallyExclusive(Record *PredDef, ArrayRef Preds, ArrayRef Term) { - for (const PredCheck &PC: Term) { + for (const PredCheck &PC : Term) { if (PC.Predicate == PredDef) return false; @@ -1446,8 +1437,8 @@ static std::vector getAllPredicates(ArrayRef Variants, // given SchedRW whose processor indices and predicates are not mutually // exclusive with the given transition. void PredTransitions::getIntersectingVariants( - const CodeGenSchedRW &SchedRW, unsigned TransIdx, - std::vector &IntersectingVariants) { + const CodeGenSchedRW &SchedRW, unsigned TransIdx, + std::vector &IntersectingVariants) { bool GenericRW = false; @@ -1489,7 +1480,7 @@ void PredTransitions::getIntersectingVariants( } const CodeGenSchedRW &AliasRW = - SchedModels.getSchedRW((*AI)->getValueAsDef("AliasRW")); + SchedModels.getSchedRW((*AI)->getValueAsDef("AliasRW")); if (AliasRW.HasVariants) { const RecVec VarDefs = AliasRW.TheDef->getValueAsListOfDefs("Variants"); @@ -1516,8 +1507,7 @@ void PredTransitions::getIntersectingVariants( // The first variant builds on the existing transition. Variant.TransVecIdx = TransIdx; IntersectingVariants.push_back(Variant); - } - else { + } else { // Push another copy of the current transition for more variants. Variant.TransVecIdx = TransVec.size(); IntersectingVariants.push_back(Variant); @@ -1525,15 +1515,15 @@ void PredTransitions::getIntersectingVariants( } } if (GenericRW && IntersectingVariants.empty()) { - PrintFatalError(SchedRW.TheDef->getLoc(), "No variant of this type has " + PrintFatalError(SchedRW.TheDef->getLoc(), + "No variant of this type has " "a matching predicate on any processor"); } } // Push the Reads/Writes selected by this variant onto the PredTransition // specified by VInfo. 
-void PredTransitions:: -pushVariant(const TransVariant &VInfo, bool IsRead) { +void PredTransitions::pushVariant(const TransVariant &VInfo, bool IsRead) { PredTransition &Trans = TransVec[VInfo.TransVecIdx]; // If this operand transition is reached through a processor-specific alias, @@ -1541,11 +1531,10 @@ pushVariant(const TransVariant &VInfo, bool IsRead) { IdxVec SelectedRWs; if (VInfo.VarOrSeqDef->isSubClassOf("SchedVar")) { Record *PredDef = VInfo.VarOrSeqDef->getValueAsDef("Predicate"); - Trans.PredTerm.emplace_back(IsRead, VInfo.RWIdx,PredDef); + Trans.PredTerm.emplace_back(IsRead, VInfo.RWIdx, PredDef); RecVec SelectedDefs = VInfo.VarOrSeqDef->getValueAsListOfDefs("Selected"); SchedModels.findRWs(SelectedDefs, SelectedRWs, IsRead); - } - else { + } else { assert(VInfo.VarOrSeqDef->isSubClassOf("WriteSequence") && "variant must be a SchedVariant or aliased WriteSequence"); SelectedRWs.push_back(SchedModels.getSchedRWIdx(VInfo.VarOrSeqDef, IsRead)); @@ -1553,10 +1542,10 @@ pushVariant(const TransVariant &VInfo, bool IsRead) { const CodeGenSchedRW &SchedRW = SchedModels.getSchedRW(VInfo.RWIdx, IsRead); - SmallVectorImpl> &RWSequences = IsRead - ? Trans.ReadSequences : Trans.WriteSequences; + SmallVectorImpl> &RWSequences = + IsRead ? Trans.ReadSequences : Trans.WriteSequences; if (SchedRW.IsVariadic) { - unsigned OperIdx = RWSequences.size()-1; + unsigned OperIdx = RWSequences.size() - 1; // Make N-1 copies of this transition's last sequence. RWSequences.reserve(RWSequences.size() + SelectedRWs.size() - 1); RWSequences.insert(RWSequences.end(), SelectedRWs.size() - 1, @@ -1565,8 +1554,8 @@ pushVariant(const TransVariant &VInfo, bool IsRead) { // sequence (split the current operand into N operands). // Note that write sequences should be expanded within this loop--the entire // sequence belongs to a single operand. - for (IdxIter RWI = SelectedRWs.begin(), RWE = SelectedRWs.end(); - RWI != RWE; ++RWI, ++OperIdx) { + for (IdxIter RWI = SelectedRWs.begin(), RWE = SelectedRWs.end(); RWI != RWE; + ++RWI, ++OperIdx) { IdxVec ExpandedRWs; if (IsRead) ExpandedRWs.push_back(*RWI); @@ -1575,8 +1564,7 @@ pushVariant(const TransVariant &VInfo, bool IsRead) { llvm::append_range(RWSequences[OperIdx], ExpandedRWs); } assert(OperIdx == RWSequences.size() && "missed a sequence"); - } - else { + } else { // Push this transition's expanded sequence onto this transition's last // sequence (add to the current operand's sequence). SmallVectorImpl &Seq = RWSequences.back(); @@ -1644,8 +1632,9 @@ bool PredTransitions::substituteVariants(const PredTransition &Trans) { // Visit each original write sequence. for (const auto &WriteSequence : Trans.WriteSequences) { // Push a new (empty) write sequence onto all partial Transitions. - for (std::vector::iterator I = - TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) { + for (std::vector::iterator I = TransVec.begin() + StartIdx, + E = TransVec.end(); + I != E; ++I) { I->WriteSequences.emplace_back(); } Subst |= @@ -1654,8 +1643,9 @@ bool PredTransitions::substituteVariants(const PredTransition &Trans) { // Visit each original read sequence. for (const auto &ReadSequence : Trans.ReadSequences) { // Push a new (empty) read sequence onto all partial Transitions. 
- for (std::vector::iterator I = - TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) { + for (std::vector::iterator I = TransVec.begin() + StartIdx, + E = TransVec.end(); + I != E; ++I) { I->ReadSequences.emplace_back(); } Subst |= substituteVariantOperand(ReadSequence, /*IsRead=*/true, StartIdx); @@ -1814,7 +1804,7 @@ bool CodeGenSchedModels::hasSuperGroup(RecVec &SubUnits, CodeGenProcModel &PM) { continue; RecVec SuperUnits = ProcResourceDef->getValueAsListOfDefs("Resources"); RecIter RI = SubUnits.begin(), RE = SubUnits.end(); - for ( ; RI != RE; ++RI) { + for (; RI != RE; ++RI) { if (!is_contained(SuperUnits, *RI)) { break; } @@ -1831,22 +1821,22 @@ void CodeGenSchedModels::verifyProcResourceGroups(CodeGenProcModel &PM) { if (!PM.ProcResourceDefs[i]->isSubClassOf("ProcResGroup")) continue; RecVec CheckUnits = - PM.ProcResourceDefs[i]->getValueAsListOfDefs("Resources"); - for (unsigned j = i+1; j < e; ++j) { + PM.ProcResourceDefs[i]->getValueAsListOfDefs("Resources"); + for (unsigned j = i + 1; j < e; ++j) { if (!PM.ProcResourceDefs[j]->isSubClassOf("ProcResGroup")) continue; RecVec OtherUnits = - PM.ProcResourceDefs[j]->getValueAsListOfDefs("Resources"); + PM.ProcResourceDefs[j]->getValueAsListOfDefs("Resources"); if (std::find_first_of(CheckUnits.begin(), CheckUnits.end(), - OtherUnits.begin(), OtherUnits.end()) - != CheckUnits.end()) { + OtherUnits.begin(), + OtherUnits.end()) != CheckUnits.end()) { // CheckUnits and OtherUnits overlap llvm::append_range(OtherUnits, CheckUnits); if (!hasSuperGroup(OtherUnits, PM)) { PrintFatalError((PM.ProcResourceDefs[i])->getLoc(), - "proc resource group overlaps with " - + PM.ProcResourceDefs[j]->getName() - + " but no supergroup contains both."); + "proc resource group overlaps with " + + PM.ProcResourceDefs[j]->getName() + + " but no supergroup contains both."); } } } @@ -1862,7 +1852,7 @@ void CodeGenSchedModels::collectRegisterFiles() { // For each register file definition, construct a CodeGenRegisterFile object // and add it to the appropriate scheduling model. 
CodeGenProcModel &PM = getProcModel(RF->getValueAsDef("SchedModel")); - PM.RegisterFiles.emplace_back(CodeGenRegisterFile(RF->getName(),RF)); + PM.RegisterFiles.emplace_back(CodeGenRegisterFile(RF->getName(), RF)); CodeGenRegisterFile &CGRF = PM.RegisterFiles.back(); CGRF.MaxMovesEliminatedPerCycle = RF->getValueAsInt("MaxMovesEliminatedPerCycle"); @@ -2013,7 +2003,7 @@ void CodeGenSchedModels::checkCompleteness() { PrintError(Inst->TheDef->getLoc(), "No schedule information for instruction '" + Inst->TheDef->getName() + "' in SchedMachineModel '" + - ProcModel.ModelDef->getName() + "'"); + ProcModel.ModelDef->getName() + "'"); Complete = false; } continue; @@ -2039,14 +2029,18 @@ } } if (!Complete) { - errs() << "\n\nIncomplete schedule models found.\n" - << "- Consider setting 'CompleteModel = 0' while developing new models.\n" - << "- Pseudo instructions can be marked with 'hasNoSchedulingInfo = 1'.\n" - << "- Instructions should usually have Sched<[...]> as a superclass, " "you may temporarily use an empty list.\n" - << "- Instructions related to unsupported features can be excluded with " "list<Predicate> UnsupportedFeatures = [HasA,..,HasY]; in the " "processor model.\n\n"; + errs() + << "\n\nIncomplete schedule models found.\n" + << "- Consider setting 'CompleteModel = 0' while developing new " + "models.\n" + << "- Pseudo instructions can be marked with 'hasNoSchedulingInfo = " + "1'.\n" + << "- Instructions should usually have Sched<[...]> as a superclass, " "you may temporarily use an empty list.\n" + << "- Instructions related to unsupported features can be excluded " "with " "list<Predicate> UnsupportedFeatures = [HasA,..,HasY]; in the " "processor model.\n\n"; PrintFatalError("Incomplete schedule model"); } } @@ -2057,15 +2051,15 @@ void CodeGenSchedModels::collectItinProcResources(Record *ItinClassDef) { const CodeGenProcModel &PM = ProcModels[PIdx]; // For all ItinRW entries.
bool HasMatch = false; - for (RecIter II = PM.ItinRWDefs.begin(), IE = PM.ItinRWDefs.end(); - II != IE; ++II) { + for (RecIter II = PM.ItinRWDefs.begin(), IE = PM.ItinRWDefs.end(); II != IE; + ++II) { RecVec Matched = (*II)->getValueAsListOfDefs("MatchedItinClasses"); if (!llvm::is_contained(Matched, ItinClassDef)) continue; if (HasMatch) - PrintFatalError((*II)->getLoc(), "Duplicate itinerary class " - + ItinClassDef->getName() - + " in ItinResources for " + PM.ModelName); + PrintFatalError((*II)->getLoc(), + "Duplicate itinerary class " + ItinClassDef->getName() + + " in ItinResources for " + PM.ModelName); HasMatch = true; IdxVec Writes, Reads; findRWs((*II)->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads); @@ -2081,8 +2075,7 @@ void CodeGenSchedModels::collectRWResources(unsigned RWIdx, bool IsRead, if (!IsRead && SchedRW.TheDef->isSubClassOf("SchedWriteRes")) { for (unsigned Idx : ProcIndices) addWriteRes(SchedRW.TheDef, Idx); - } - else if (IsRead && SchedRW.TheDef->isSubClassOf("SchedReadAdvance")) { + } else if (IsRead && SchedRW.TheDef->isSubClassOf("SchedReadAdvance")) { for (unsigned Idx : ProcIndices) addReadAdvance(SchedRW.TheDef, Idx); } @@ -2128,31 +2121,30 @@ Record *CodeGenSchedModels::findProcResUnits(Record *ProcResKind, assert(!ProcResGroups.empty()); for (Record *ProcResDef : ProcResourceDefs) { - if (ProcResDef->getValueAsDef("Kind") == ProcResKind - && ProcResDef->getValueAsDef("SchedModel") == PM.ModelDef) { + if (ProcResDef->getValueAsDef("Kind") == ProcResKind && + ProcResDef->getValueAsDef("SchedModel") == PM.ModelDef) { if (ProcUnitDef) { PrintFatalError(Loc, - "Multiple ProcessorResourceUnits associated with " - + ProcResKind->getName()); + "Multiple ProcessorResourceUnits associated with " + + ProcResKind->getName()); } ProcUnitDef = ProcResDef; } } for (Record *ProcResGroup : ProcResGroups) { - if (ProcResGroup == ProcResKind - && ProcResGroup->getValueAsDef("SchedModel") == PM.ModelDef) { + if (ProcResGroup == ProcResKind && + ProcResGroup->getValueAsDef("SchedModel") == PM.ModelDef) { if (ProcUnitDef) { PrintFatalError(Loc, - "Multiple ProcessorResourceUnits associated with " - + ProcResKind->getName()); + "Multiple ProcessorResourceUnits associated with " + + ProcResKind->getName()); } ProcUnitDef = ProcResGroup; } } if (!ProcUnitDef) { - PrintFatalError(Loc, - "No ProcessorResources associated with " - + ProcResKind->getName()); + PrintFatalError(Loc, "No ProcessorResources associated with " + + ProcResKind->getName()); } return ProcUnitDef; } @@ -2208,14 +2200,16 @@ unsigned CodeGenProcModel::getProcResourceIdx(Record *PRDef) const { RecIter PRPos = find(ProcResourceDefs, PRDef); if (PRPos == ProcResourceDefs.end()) PrintFatalError(PRDef->getLoc(), "ProcResource def is not included in " - "the ProcResources list for " + ModelName); + "the ProcResources list for " + + ModelName); // Idx=0 is reserved for invalid. 
return 1 + (PRPos - ProcResourceDefs.begin()); } bool CodeGenProcModel::isUnsupported(const CodeGenInstruction &Inst) const { for (const Record *TheDef : UnsupportedFeaturesDefs) { - for (const Record *PredDef : Inst.TheDef->getValueAsListOfDefs("Predicates")) { + for (const Record *PredDef : + Inst.TheDef->getValueAsListOfDefs("Predicates")) { if (TheDef->getName() == PredDef->getName()) return true; } @@ -2239,12 +2233,11 @@ void CodeGenSchedRW::dump() const { } } -void CodeGenSchedClass::dump(const CodeGenSchedModels* SchedModels) const { - dbgs() << "SCHEDCLASS " << Index << ":" << Name << '\n' - << " Writes: "; +void CodeGenSchedClass::dump(const CodeGenSchedModels *SchedModels) const { + dbgs() << "SCHEDCLASS " << Index << ":" << Name << '\n' << " Writes: "; for (unsigned i = 0, N = Writes.size(); i < N; ++i) { SchedModels->getSchedWrite(Writes[i]).dump(); - if (i < N-1) { + if (i < N - 1) { dbgs() << '\n'; dbgs().indent(10); } @@ -2252,12 +2245,13 @@ void CodeGenSchedClass::dump(const CodeGenSchedModels* SchedModels) const { dbgs() << "\n Reads: "; for (unsigned i = 0, N = Reads.size(); i < N; ++i) { SchedModels->getSchedRead(Reads[i]).dump(); - if (i < N-1) { + if (i < N - 1) { dbgs() << '\n'; dbgs().indent(10); } } - dbgs() << "\n ProcIdx: "; dumpIdxVec(ProcIndices); + dbgs() << "\n ProcIdx: "; + dumpIdxVec(ProcIndices); if (!Transitions.empty()) { dbgs() << "\n Transitions for Proc "; for (const CodeGenSchedTransition &Transition : Transitions) { diff --git a/llvm/utils/TableGen/CodeGenSchedule.h b/llvm/utils/TableGen/CodeGenSchedule.h index 76ef1e4..61980e7 100644 --- a/llvm/utils/TableGen/CodeGenSchedule.h +++ b/llvm/utils/TableGen/CodeGenSchedule.h @@ -33,8 +33,8 @@ class CodeGenTarget; class CodeGenSchedModels; class CodeGenInstruction; -using RecVec = std::vector; -using RecIter = std::vector::const_iterator; +using RecVec = std::vector; +using RecIter = std::vector::const_iterator; using IdxVec = std::vector; using IdxIter = std::vector::const_iterator; @@ -59,10 +59,10 @@ struct CodeGenSchedRW { RecVec Aliases; CodeGenSchedRW() - : Index(0), TheDef(nullptr), IsRead(false), IsAlias(false), - HasVariants(false), IsVariadic(false), IsSequence(false) {} + : Index(0), TheDef(nullptr), IsRead(false), IsAlias(false), + HasVariants(false), IsVariadic(false), IsSequence(false) {} CodeGenSchedRW(unsigned Idx, Record *Def) - : Index(Idx), TheDef(Def), IsAlias(false), IsVariadic(false) { + : Index(Idx), TheDef(Def), IsAlias(false), IsVariadic(false) { Name = std::string(Def->getName()); IsRead = Def->isSubClassOf("SchedRead"); HasVariants = Def->isSubClassOf("SchedVariant"); @@ -148,7 +148,7 @@ struct CodeGenSchedClass { DenseSet InstRWProcIndices; CodeGenSchedClass(unsigned Index, std::string Name, Record *ItinClassDef) - : Index(Index), Name(std::move(Name)), ItinClassDef(ItinClassDef) {} + : Index(Index), Name(std::move(Name)), ItinClassDef(ItinClassDef) {} bool isKeyEqual(Record *IC, ArrayRef W, ArrayRef R) const { @@ -173,7 +173,8 @@ struct CodeGenRegisterCost { Record *RCDef; unsigned Cost; bool AllowMoveElimination; - CodeGenRegisterCost(Record *RC, unsigned RegisterCost, bool AllowMoveElim = false) + CodeGenRegisterCost(Record *RC, unsigned RegisterCost, + bool AllowMoveElim = false) : RCDef(RC), Cost(RegisterCost), AllowMoveElimination(AllowMoveElim) {} CodeGenRegisterCost(const CodeGenRegisterCost &) = default; CodeGenRegisterCost &operator=(const CodeGenRegisterCost &) = delete; @@ -193,12 +194,12 @@ struct CodeGenRegisterFile { unsigned NumPhysRegs; std::vector Costs; - 
CodeGenRegisterFile(StringRef name, Record *def, unsigned MaxMoveElimPerCy = 0, + CodeGenRegisterFile(StringRef name, Record *def, + unsigned MaxMoveElimPerCy = 0, bool AllowZeroMoveElimOnly = false) : Name(name), RegisterFileDef(def), MaxMovesEliminatedPerCycle(MaxMoveElimPerCy), - AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly), - NumPhysRegs(0) {} + AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly), NumPhysRegs(0) {} bool hasDefaultCosts() const { return Costs.empty(); } }; @@ -255,10 +256,9 @@ struct CodeGenProcModel { Record *LoadQueue; Record *StoreQueue; - CodeGenProcModel(unsigned Idx, std::string Name, Record *MDef, - Record *IDef) : - Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef), - RetireControlUnit(nullptr), LoadQueue(nullptr), StoreQueue(nullptr) {} + CodeGenProcModel(unsigned Idx, std::string Name, Record *MDef, Record *IDef) + : Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef), + RetireControlUnit(nullptr), LoadQueue(nullptr), StoreQueue(nullptr) {} bool hasItineraries() const { return !ItinsDef->getValueAsListOfDefs("IID").empty(); @@ -443,14 +443,14 @@ class CodeGenSchedModels { // Map each instruction to its unique SchedClass index considering the // combination of it's itinerary class, SchedRW list, and InstRW records. - using InstClassMapTy = DenseMap; + using InstClassMapTy = DenseMap; InstClassMapTy InstrClassMap; std::vector STIPredicates; std::vector getAllProcIndices() const; public: - CodeGenSchedModels(RecordKeeper& RK, const CodeGenTarget &TGT); + CodeGenSchedModels(RecordKeeper &RK, const CodeGenTarget &TGT); // iterator access to the scheduling classes. using class_iterator = std::vector::iterator; @@ -460,10 +460,10 @@ public: class_iterator classes_end() { return SchedClasses.end(); } const_class_iterator classes_end() const { return SchedClasses.end(); } iterator_range classes() { - return make_range(classes_begin(), classes_end()); + return make_range(classes_begin(), classes_end()); } iterator_range classes() const { - return make_range(classes_begin(), classes_end()); + return make_range(classes_begin(), classes_end()); } iterator_range explicit_classes() { return make_range(classes_begin(), classes_begin() + NumInstrSchedClasses); @@ -476,8 +476,8 @@ public: Record *ModelDef = ProcDef->getValueAsDef("SchedModel"); Record *ItinsDef = ProcDef->getValueAsDef("ProcItin"); if (!ItinsDef->getValueAsListOfDefs("IID").empty()) { - assert(ModelDef->getValueAsBit("NoModel") - && "Itineraries must be defined within SchedMachineModel"); + assert(ModelDef->getValueAsBit("NoModel") && + "Itineraries must be defined within SchedMachineModel"); return ItinsDef; } return ModelDef; @@ -496,7 +496,7 @@ public: return ProcModels[I->second]; } const CodeGenProcModel &getProcModel(Record *ModelDef) const { - return const_cast(this)->getProcModel(ModelDef); + return const_cast(this)->getProcModel(ModelDef); } // Iterate over the unique processor models. @@ -527,11 +527,11 @@ public: CodeGenSchedRW &getSchedRW(Record *Def) { bool IsRead = Def->isSubClassOf("SchedRead"); unsigned Idx = getSchedRWIdx(Def, IsRead); - return const_cast( - IsRead ? getSchedRead(Idx) : getSchedWrite(Idx)); + return const_cast(IsRead ? 
getSchedRead(Idx) + : getSchedWrite(Idx)); } const CodeGenSchedRW &getSchedRW(Record *Def) const { - return const_cast<CodeGenSchedModels&>(*this).getSchedRW(Def); + return const_cast<CodeGenSchedModels &>(*this).getSchedRW(Def); } unsigned getSchedRWIdx(const Record *Def, bool IsRead) const; @@ -579,6 +579,7 @@ ArrayRef<STIPredicateFunction> getSTIPredicates() const { return STIPredicates; } + private: void collectProcModels(); diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp index ceaa51b1..8e2957e 100644 --- a/llvm/utils/TableGen/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/CodeGenTarget.cpp @@ -49,10 +49,14 @@ MVT::SimpleValueType llvm::getValueType(const Record *Rec) { StringRef llvm::getName(MVT::SimpleValueType T) { switch (T) { - case MVT::Other: return "UNKNOWN"; - case MVT::iPTR: return "TLI.getPointerTy()"; - case MVT::iPTRAny: return "TLI.getPointerTy()"; - default: return getEnumName(T); + case MVT::Other: + return "UNKNOWN"; + case MVT::iPTR: + return "TLI.getPointerTy()"; + case MVT::iPTRAny: + return "TLI.getPointerTy()"; + default: + return getEnumName(T); } } @@ -280,12 +284,11 @@ std::string llvm::getQualifiedName(const Record *R) { return Namespace + "::" + R->getName().str(); } - /// getTarget - Return the current instance of the Target class. /// CodeGenTarget::CodeGenTarget(RecordKeeper &records) - : Records(records), CGH(records) { - std::vector<Record*> Targets = Records.getAllDerivedDefinitions("Target"); + : Records(records), CGH(records) { + std::vector<Record *> Targets = Records.getAllDerivedDefinitions("Target"); if (Targets.size() == 0) PrintFatalError("No 'Target' subclasses defined!"); if (Targets.size() != 1) @@ -294,8 +297,7 @@ MacroFusions = Records.getAllDerivedDefinitions("Fusion"); } -CodeGenTarget::~CodeGenTarget() { -} +CodeGenTarget::~CodeGenTarget() {} StringRef CodeGenTarget::getName() const { return TargetRec->getName(); } @@ -331,7 +333,7 @@ bool CodeGenTarget::getAllowRegisterRenaming() const { /// getAsmParser - Return the AssemblyParser definition for this target. /// Record *CodeGenTarget::getAsmParser() const { - std::vector<Record*> LI = TargetRec->getValueAsListOfDefs("AssemblyParsers"); + std::vector<Record *> LI = TargetRec->getValueAsListOfDefs("AssemblyParsers"); if (AsmParserNum >= LI.size()) PrintFatalError("Target does not have an AsmParser #" + Twine(AsmParserNum) + "!"); @@ -342,8 +344,8 @@ /// this target. /// Record *CodeGenTarget::getAsmParserVariant(unsigned i) const { - std::vector<Record*> LI = - TargetRec->getValueAsListOfDefs("AssemblyParserVariants"); + std::vector<Record *> LI = + TargetRec->getValueAsListOfDefs("AssemblyParserVariants"); if (i >= LI.size()) PrintFatalError("Target does not have an AsmParserVariant #" + Twine(i) + "!"); @@ -354,15 +356,15 @@ /// available for this target. /// unsigned CodeGenTarget::getAsmParserVariantCount() const { - std::vector<Record*> LI = - TargetRec->getValueAsListOfDefs("AssemblyParserVariants"); + std::vector<Record *> LI = + TargetRec->getValueAsListOfDefs("AssemblyParserVariants"); return LI.size(); } /// getAsmWriter - Return the AssemblyWriter definition for this target.
/// Record *CodeGenTarget::getAsmWriter() const { - std::vector LI = TargetRec->getValueAsListOfDefs("AssemblyWriters"); + std::vector LI = TargetRec->getValueAsListOfDefs("AssemblyWriters"); if (AsmWriterNum >= LI.size()) PrintFatalError("Target does not have an AsmWriter #" + Twine(AsmWriterNum) + "!"); @@ -437,8 +439,7 @@ const CodeGenRegisterClass &CodeGenTarget::getRegisterClass(Record *R) const { return *getRegBank().getRegClass(R); } -std::vector CodeGenTarget::getRegisterVTs(Record *R) - const { +std::vector CodeGenTarget::getRegisterVTs(Record *R) const { const CodeGenRegister *Reg = getRegBank().getReg(R); std::vector Result; for (const auto &RC : getRegBank().getRegClasses()) { @@ -454,16 +455,15 @@ std::vector CodeGenTarget::getRegisterVTs(Record *R) return Result; } - void CodeGenTarget::ReadLegalValueTypes() const { for (const auto &RC : getRegBank().getRegClasses()) llvm::append_range(LegalValueTypes, RC.VTs); // Remove duplicates. llvm::sort(LegalValueTypes); - LegalValueTypes.erase(std::unique(LegalValueTypes.begin(), - LegalValueTypes.end()), - LegalValueTypes.end()); + LegalValueTypes.erase( + std::unique(LegalValueTypes.begin(), LegalValueTypes.end()), + LegalValueTypes.end()); } CodeGenSchedModels &CodeGenTarget::getSchedModels() const { @@ -473,7 +473,7 @@ CodeGenSchedModels &CodeGenTarget::getSchedModels() const { } void CodeGenTarget::ReadInstructions() const { - std::vector Insts = Records.getAllDerivedDefinitions("Instruction"); + std::vector Insts = Records.getAllDerivedDefinitions("Instruction"); if (Insts.size() <= 2) PrintFatalError("No 'Instruction' subclasses defined!"); @@ -482,11 +482,10 @@ void CodeGenTarget::ReadInstructions() const { Instructions[Insts[i]] = std::make_unique(Insts[i]); } -static const CodeGenInstruction * -GetInstByName(const char *Name, - const DenseMap> &Insts, - RecordKeeper &Records) { +static const CodeGenInstruction *GetInstByName( + const char *Name, + const DenseMap> &Insts, + RecordKeeper &Records) { const Record *Rec = Records.getDef(Name); const auto I = Insts.find(Rec); @@ -545,7 +544,6 @@ void CodeGenTarget::ComputeInstrsByEnum() const { Inst->EnumVal = Num++; } - /// isLittleEndianEncoding - Return whether this target encodes its instruction /// in little-endian format, i.e. bits laid out in the order [0..n] /// @@ -576,7 +574,7 @@ void CodeGenTarget::reverseBitsForLittleEndianEncoding() { unsigned bitSwapIdx = numBits - bit - 1; Init *OrigBit = BI->getBit(bit); Init *BitSwap = BI->getBit(bitSwapIdx); - NewBits[bit] = BitSwap; + NewBits[bit] = BitSwap; NewBits[bitSwapIdx] = OrigBit; } if (numBits % 2) { @@ -605,10 +603,10 @@ bool CodeGenTarget::guessInstructionProperties() const { // ComplexPattern implementation // ComplexPattern::ComplexPattern(Record *R) { - Ty = R->getValueAsDef("Ty"); + Ty = R->getValueAsDef("Ty"); NumOperands = R->getValueAsInt("NumOperands"); SelectFunc = std::string(R->getValueAsString("SelectFunc")); - RootNodes = R->getValueAsListOfDefs("RootNodes"); + RootNodes = R->getValueAsListOfDefs("RootNodes"); // FIXME: This is a hack to statically increase the priority of patterns which // maps a sub-dag to a complex pattern. e.g. favors LEA over ADD. To get best @@ -623,7 +621,7 @@ ComplexPattern::ComplexPattern(Record *R) { // FIXME: Why is this different from parseSDPatternOperatorProperties? // Parse the properties. 
Properties = 0; - std::vector PropList = R->getValueAsListOfDefs("Properties"); + std::vector PropList = R->getValueAsListOfDefs("Properties"); for (unsigned i = 0, e = PropList.size(); i != e; ++i) if (PropList[i]->getName() == "SDNPHasChain") { Properties |= 1 << SDNPHasChain; diff --git a/llvm/utils/TableGen/CodeGenTarget.h b/llvm/utils/TableGen/CodeGenTarget.h index 29f1024..2ae3a3a 100644 --- a/llvm/utils/TableGen/CodeGenTarget.h +++ b/llvm/utils/TableGen/CodeGenTarget.h @@ -58,10 +58,10 @@ class CodeGenTarget { RecordKeeper &Records; Record *TargetRec; - mutable DenseMap> Instructions; + mutable DenseMap> + Instructions; mutable std::unique_ptr RegBank; - mutable std::vector RegAltNameIndices; + mutable std::vector RegAltNameIndices; mutable SmallVector LegalValueTypes; CodeGenHwModes CGH; std::vector MacroFusions; @@ -75,6 +75,7 @@ class CodeGenTarget { mutable StringRef InstNamespace; mutable std::vector InstrsByEnum; mutable unsigned NumPseudoInstructions = 0; + public: CodeGenTarget(RecordKeeper &Records); ~CodeGenTarget(); @@ -130,8 +131,9 @@ public: /// return it. const CodeGenRegister *getRegisterByName(StringRef Name) const; - const std::vector &getRegAltNameIndices() const { - if (RegAltNameIndices.empty()) ReadRegAltNameIndices(); + const std::vector &getRegAltNameIndices() const { + if (RegAltNameIndices.empty()) + ReadRegAltNameIndices(); return RegAltNameIndices; } @@ -156,15 +158,17 @@ public: const std::vector getMacroFusions() const { return MacroFusions; } private: - DenseMap> & + DenseMap> & getInstructions() const { - if (Instructions.empty()) ReadInstructions(); + if (Instructions.empty()) + ReadInstructions(); return Instructions; } -public: +public: CodeGenInstruction &getInstruction(const Record *InstRec) const { - if (Instructions.empty()) ReadInstructions(); + if (Instructions.empty()) + ReadInstructions(); auto I = Instructions.find(InstRec); assert(I != Instructions.end() && "Not an instruction"); return *I->second; @@ -200,10 +204,11 @@ public: } typedef ArrayRef::const_iterator inst_iterator; - inst_iterator inst_begin() const{return getInstructionsByEnumValue().begin();} + inst_iterator inst_begin() const { + return getInstructionsByEnumValue().begin(); + } inst_iterator inst_end() const { return getInstructionsByEnumValue().end(); } - /// isLittleEndianEncoding - are instruction bit patterns defined as [0..n]? /// bool isLittleEndianEncoding() const; @@ -226,22 +231,21 @@ class ComplexPattern { Record *Ty; unsigned NumOperands; std::string SelectFunc; - std::vector RootNodes; + std::vector RootNodes; unsigned Properties; // Node properties unsigned Complexity; + public: ComplexPattern(Record *R); Record *getValueType() const { return Ty; } unsigned getNumOperands() const { return NumOperands; } const std::string &getSelectFunc() const { return SelectFunc; } - const std::vector &getRootNodes() const { - return RootNodes; - } + const std::vector &getRootNodes() const { return RootNodes; } bool hasProperty(enum SDNP Prop) const { return Properties & (1 << Prop); } unsigned getComplexity() const { return Complexity; } }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/llvm/utils/TableGen/DAGISelEmitter.cpp b/llvm/utils/TableGen/DAGISelEmitter.cpp index eaf7f7f..32b2746 100644 --- a/llvm/utils/TableGen/DAGISelEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelEmitter.cpp @@ -27,6 +27,7 @@ namespace { class DAGISelEmitter { RecordKeeper &Records; // Just so we can get at the timing functions. 
CodeGenDAGPatterns CGP; + public: explicit DAGISelEmitter(RecordKeeper &R) : Records(R), CGP(R) {} void run(raw_ostream &OS); @@ -42,7 +43,8 @@ public: /// latencies in this calculation. static unsigned getResultPatternCost(TreePatternNode *P, CodeGenDAGPatterns &CGP) { - if (P->isLeaf()) return 0; + if (P->isLeaf()) + return 0; unsigned Cost = 0; Record *Op = P->getOperator(); @@ -61,7 +63,8 @@ static unsigned getResultPatternCost(TreePatternNode *P, /// pattern. static unsigned getResultPatternSize(TreePatternNode *P, CodeGenDAGPatterns &CGP) { - if (P->isLeaf()) return 0; + if (P->isLeaf()) + return 0; unsigned Cost = 0; Record *Op = P->getOperator(); @@ -98,19 +101,25 @@ struct PatternSortingPredicate { // input over nodes that cover fewer. int LHSSize = LHS->getPatternComplexity(CGP); int RHSSize = RHS->getPatternComplexity(CGP); - if (LHSSize > RHSSize) return true; // LHS -> bigger -> less cost - if (LHSSize < RHSSize) return false; + if (LHSSize > RHSSize) + return true; // LHS -> bigger -> less cost + if (LHSSize < RHSSize) + return false; // If the patterns have equal complexity, compare generated instruction cost unsigned LHSCost = getResultPatternCost(LHS->getDstPattern(), CGP); unsigned RHSCost = getResultPatternCost(RHS->getDstPattern(), CGP); - if (LHSCost < RHSCost) return true; - if (LHSCost > RHSCost) return false; + if (LHSCost < RHSCost) + return true; + if (LHSCost > RHSCost) + return false; unsigned LHSPatSize = getResultPatternSize(LHS->getDstPattern(), CGP); unsigned RHSPatSize = getResultPatternSize(RHS->getDstPattern(), CGP); - if (LHSPatSize < RHSPatSize) return true; - if (LHSPatSize > RHSPatSize) return false; + if (LHSPatSize < RHSPatSize) + return true; + if (LHSPatSize > RHSPatSize) + return false; // Sort based on the UID of the pattern, to reflect source order. // Note that this is not guaranteed to be unique, since a single source @@ -122,11 +131,11 @@ struct PatternSortingPredicate { }; } // End anonymous namespace - void DAGISelEmitter::run(raw_ostream &OS) { Records.startTimer("Parse patterns"); emitSourceFileHeader("DAG Instruction Selector for the " + - CGP.getTargetInfo().getName().str() + " target", OS); + CGP.getTargetInfo().getName().str() + " target", + OS); OS << "// *** NOTE: This file is #included into the middle of the target\n" << "// *** instruction selector class. These functions are really " @@ -155,7 +164,7 @@ void DAGISelEmitter::run(raw_ostream &OS) { // Add all the patterns to a temporary list so we can sort them. 
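
PatternSortingPredicate above is a strict weak ordering built from a cascade of tie-breakers: higher pattern complexity first, then cheaper generated code, then smaller generated code, then source order. A compact sketch of the same comparator shape, with a hypothetical Pat struct in place of PatternToMatch:

  // Cascading tie-breakers, as in PatternSortingPredicate.
  struct Pat {
    int Complexity, Cost, Size;
    unsigned ID;
  };

  bool lessCost(const Pat &L, const Pat &R) {
    if (L.Complexity != R.Complexity)
      return L.Complexity > R.Complexity; // bigger pattern -> tried first
    if (L.Cost != R.Cost)
      return L.Cost < R.Cost;             // cheaper result -> preferred
    if (L.Size != R.Size)
      return L.Size < R.Size;             // smaller result -> preferred
    return L.ID < R.ID;                   // fallback: source order
  }
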
Records.startTimer("Sort patterns"); - std::vector Patterns; + std::vector Patterns; for (const PatternToMatch &PTM : CGP.ptms()) Patterns.push_back(&PTM); @@ -167,7 +176,7 @@ void DAGISelEmitter::run(raw_ostream &OS) { Records.startTimer("Convert to matchers"); SmallVector PatternMatchers; for (const PatternToMatch *PTM : Patterns) { - for (unsigned Variant = 0; ; ++Variant) { + for (unsigned Variant = 0;; ++Variant) { if (Matcher *M = ConvertPatternToMatcher(*PTM, Variant, CGP)) PatternMatchers.push_back(M); else @@ -181,7 +190,7 @@ void DAGISelEmitter::run(raw_ostream &OS) { Records.startTimer("Optimize matchers"); OptimizeMatcher(TheMatcher, CGP); - //Matcher->dump(); + // Matcher->dump(); Records.startTimer("Emit matcher table"); EmitMatcherTable(TheMatcher.get(), CGP, OS); diff --git a/llvm/utils/TableGen/DAGISelMatcher.cpp b/llvm/utils/TableGen/DAGISelMatcher.cpp index 1a5c728..5461481 100644 --- a/llvm/utils/TableGen/DAGISelMatcher.cpp +++ b/llvm/utils/TableGen/DAGISelMatcher.cpp @@ -15,11 +15,9 @@ #include "llvm/TableGen/Record.h" using namespace llvm; -void Matcher::anchor() { } +void Matcher::anchor() {} -void Matcher::dump() const { - print(errs(), 0); -} +void Matcher::dump() const { print(errs(), 0); } void Matcher::print(raw_ostream &OS, unsigned indent) const { printImpl(OS, indent); @@ -27,9 +25,7 @@ void Matcher::print(raw_ostream &OS, unsigned indent) const { return Next->print(OS, indent); } -void Matcher::printOne(raw_ostream &OS) const { - printImpl(OS, 0); -} +void Matcher::printOne(raw_ostream &OS) const { printImpl(OS, 0); } /// unlinkNode - Unlink the specified node from this chain. If Other == this, /// we unlink the next pointer and return it. Otherwise we unlink Other from @@ -43,7 +39,8 @@ Matcher *Matcher::unlinkNode(Matcher *Other) { for (; Cur && Cur->getNext() != Other; Cur = Cur->getNext()) /*empty*/; - if (!Cur) return nullptr; + if (!Cur) + return nullptr; Cur->takeNext(); Cur->setNext(Other->takeNext()); return this; @@ -55,7 +52,8 @@ Matcher *Matcher::unlinkNode(Matcher *Other) { bool Matcher::canMoveBefore(const Matcher *Other) const { for (;; Other = Other->getNext()) { assert(Other && "Other didn't come before 'this'?"); - if (this == Other) return true; + if (this == Other) + return true; // We have to be able to move this node across the Other node. if (!canMoveBeforeNode(Other)) @@ -78,7 +76,6 @@ bool Matcher::canMoveBeforeNode(const Matcher *Other) const { return false; } - ScopeMatcher::~ScopeMatcher() { for (Matcher *C : Children) delete C; @@ -96,8 +93,8 @@ SwitchTypeMatcher::~SwitchTypeMatcher() { CheckPredicateMatcher::CheckPredicateMatcher( const TreePredicateFn &pred, const SmallVectorImpl &Ops) - : Matcher(CheckPredicate), Pred(pred.getOrigPatFragRecord()), - Operands(Ops.begin(), Ops.end()) {} + : Matcher(CheckPredicate), Pred(pred.getOrigPatFragRecord()), + Operands(Ops.begin(), Ops.end()) {} TreePredicateFn CheckPredicateMatcher::getPredicate() const { return TreePredicateFn(Pred); @@ -112,16 +109,15 @@ unsigned CheckPredicateMatcher::getOperandNo(unsigned i) const { return Operands[i]; } - // printImpl methods. 
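
The variant loop in DAGISelEmitter::run above is open-ended on purpose: ConvertPatternToMatcher is asked for variant 0, 1, 2, ... until it returns null, and every non-null result is kept. A sketch of that control flow, with makeVariant as a hypothetical stand-in for ConvertPatternToMatcher:

  // Enumerate variants until the factory runs dry.
  #include <memory>
  #include <vector>

  std::unique_ptr<int> makeVariant(unsigned V) {
    return V < 3 ? std::make_unique<int>(V) : nullptr; // say, 3 variants
  }

  int main() {
    std::vector<std::unique_ptr<int>> Matchers;
    for (unsigned Variant = 0;; ++Variant) {
      if (auto M = makeVariant(Variant))
        Matchers.push_back(std::move(M)); // keep every variant produced
      else
        break; // no more variants of this pattern
    }
  }
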
void ScopeMatcher::printImpl(raw_ostream &OS, unsigned indent) const { OS.indent(indent) << "Scope\n"; for (const Matcher *C : Children) { if (!C) - OS.indent(indent+1) << "NULL POINTER\n"; + OS.indent(indent + 1) << "NULL POINTER\n"; else - C->print(OS, indent+2); + C->print(OS, indent + 2); } } @@ -137,7 +133,8 @@ void RecordMemRefMatcher::printImpl(raw_ostream &OS, unsigned indent) const { OS.indent(indent) << "RecordMemRef\n"; } -void CaptureGlueInputMatcher::printImpl(raw_ostream &OS, unsigned indent) const{ +void CaptureGlueInputMatcher::printImpl(raw_ostream &OS, + unsigned indent) const { OS.indent(indent) << "CaptureGlueInput\n"; } @@ -161,8 +158,8 @@ void CheckChildSameMatcher::printImpl(raw_ostream &OS, unsigned indent) const { OS.indent(indent) << "CheckChild" << ChildNo << "Same\n"; } -void CheckPatternPredicateMatcher:: -printImpl(raw_ostream &OS, unsigned indent) const { +void CheckPatternPredicateMatcher::printImpl(raw_ostream &OS, + unsigned indent) const { OS.indent(indent) << "CheckPatternPredicate " << Predicate << '\n'; } @@ -178,32 +175,30 @@ void SwitchOpcodeMatcher::printImpl(raw_ostream &OS, unsigned indent) const { OS.indent(indent) << "SwitchOpcode: {\n"; for (const auto &C : Cases) { OS.indent(indent) << "case " << C.first->getEnumName() << ":\n"; - C.second->print(OS, indent+2); + C.second->print(OS, indent + 2); } OS.indent(indent) << "}\n"; } - void CheckTypeMatcher::printImpl(raw_ostream &OS, unsigned indent) const { - OS.indent(indent) << "CheckType " << getEnumName(Type) << ", ResNo=" - << ResNo << '\n'; + OS.indent(indent) << "CheckType " << getEnumName(Type) << ", ResNo=" << ResNo + << '\n'; } void SwitchTypeMatcher::printImpl(raw_ostream &OS, unsigned indent) const { OS.indent(indent) << "SwitchType: {\n"; for (const auto &C : Cases) { OS.indent(indent) << "case " << getEnumName(C.first) << ":\n"; - C.second->print(OS, indent+2); + C.second->print(OS, indent + 2); } OS.indent(indent) << "}\n"; } void CheckChildTypeMatcher::printImpl(raw_ostream &OS, unsigned indent) const { - OS.indent(indent) << "CheckChildType " << ChildNo << " " - << getEnumName(Type) << '\n'; + OS.indent(indent) << "CheckChildType " << ChildNo << " " << getEnumName(Type) + << '\n'; } - void CheckIntegerMatcher::printImpl(raw_ostream &OS, unsigned indent) const { OS.indent(indent) << "CheckInteger " << Value << '\n'; } @@ -258,8 +253,8 @@ void EmitIntegerMatcher::printImpl(raw_ostream &OS, unsigned indent) const { << '\n'; } -void EmitStringIntegerMatcher:: -printImpl(raw_ostream &OS, unsigned indent) const { +void EmitStringIntegerMatcher::printImpl(raw_ostream &OS, + unsigned indent) const { OS.indent(indent) << "EmitStringInteger " << Val << " VT=" << getEnumName(VT) << '\n'; } @@ -273,13 +268,13 @@ void EmitRegisterMatcher::printImpl(raw_ostream &OS, unsigned indent) const { OS << " VT=" << getEnumName(VT) << '\n'; } -void EmitConvertToTargetMatcher:: -printImpl(raw_ostream &OS, unsigned indent) const { +void EmitConvertToTargetMatcher::printImpl(raw_ostream &OS, + unsigned indent) const { OS.indent(indent) << "EmitConvertToTarget " << Slot << '\n'; } -void EmitMergeInputChainsMatcher:: -printImpl(raw_ostream &OS, unsigned indent) const { +void EmitMergeInputChainsMatcher::printImpl(raw_ostream &OS, + unsigned indent) const { OS.indent(indent) << "EmitMergeInputChains \n"; } @@ -289,10 +284,9 @@ void EmitCopyToRegMatcher::printImpl(raw_ostream &OS, unsigned indent) const { void EmitNodeXFormMatcher::printImpl(raw_ostream &OS, unsigned indent) const { OS.indent(indent) << 
"EmitNodeXForm " << NodeXForm->getName() - << " Slot=" << Slot << '\n'; + << " Slot=" << Slot << '\n'; } - void EmitNodeMatcherCommon::printImpl(raw_ostream &OS, unsigned indent) const { OS.indent(indent); OS << (isa(this) ? "MorphNodeTo: " : "EmitNode: ") @@ -316,7 +310,7 @@ bool CheckOpcodeMatcher::isEqualImpl(const Matcher *M) const { // Note: pointer equality isn't enough here, we have to check the enum names // to ensure that the nodes are for the same opcode. return cast(M)->Opcode.getEnumName() == - Opcode.getEnumName(); + Opcode.getEnumName(); } bool EmitNodeMatcherCommon::isEqualImpl(const Matcher *m) const { @@ -327,9 +321,9 @@ bool EmitNodeMatcherCommon::isEqualImpl(const Matcher *m) const { M->NumFixedArityOperands == NumFixedArityOperands; } -void EmitNodeMatcher::anchor() { } +void EmitNodeMatcher::anchor() {} -void MorphNodeToMatcher::anchor() { } +void MorphNodeToMatcher::anchor() {} // isContradictoryImpl Implementations. @@ -337,7 +331,8 @@ static bool TypesAreContradictory(MVT::SimpleValueType T1, MVT::SimpleValueType T2) { // If the two types are the same, then they are the same, so they don't // contradict. - if (T1 == T2) return false; + if (T1 == T2) + return false; // If either type is about iPtr, then they don't conflict unless the other // one is not a scalar integer type. @@ -400,7 +395,8 @@ bool CheckIntegerMatcher::isContradictoryImpl(const Matcher *M) const { } bool CheckChildIntegerMatcher::isContradictoryImpl(const Matcher *M) const { - if (const CheckChildIntegerMatcher *CCIM = dyn_cast(M)) { + if (const CheckChildIntegerMatcher *CCIM = + dyn_cast(M)) { // If the two checks are about different nodes, we don't know if they // conflict! if (CCIM->getChildNo() != getChildNo()) diff --git a/llvm/utils/TableGen/DAGISelMatcher.h b/llvm/utils/TableGen/DAGISelMatcher.h index 6615a15..d4fe513 100644 --- a/llvm/utils/TableGen/DAGISelMatcher.h +++ b/llvm/utils/TableGen/DAGISelMatcher.h @@ -21,185 +21,186 @@ #include namespace llvm { - class CodeGenRegister; - class CodeGenDAGPatterns; - class CodeGenInstruction; - class Matcher; - class PatternToMatch; - class raw_ostream; - class ComplexPattern; - class Record; - class SDNodeInfo; - class TreePredicateFn; - class TreePattern; - - Matcher *ConvertPatternToMatcher(const PatternToMatch &Pattern, - unsigned Variant, - const CodeGenDAGPatterns &CGP); - void OptimizeMatcher(std::unique_ptr &Matcher, - const CodeGenDAGPatterns &CGP); - void EmitMatcherTable(Matcher *Matcher, const CodeGenDAGPatterns &CGP, - raw_ostream &OS); - - /// Matcher - Base class for all the DAG ISel Matcher representation - /// nodes. - class Matcher { - // The next matcher node that is executed after this one. Null if this is - // the last stage of a match. - std::unique_ptr Next; - size_t Size = 0; // Size in bytes of matcher and all its children (if any). - virtual void anchor(); - - public: - enum KindTy { - // Matcher state manipulation. - Scope, // Push a checking scope. - RecordNode, // Record the current node. - RecordChild, // Record a child of the current node. - RecordMemRef, // Record the memref in the current node. - CaptureGlueInput, // If the current node has an input glue, save it. - MoveChild, // Move current node to specified child. - MoveSibling, // Move current node to specified sibling. - MoveParent, // Move current node to parent. - - // Predicate checking. - CheckSame, // Fail if not same as prev match. - CheckChildSame, // Fail if child not same as prev match. 
- CheckPatternPredicate, - CheckPredicate, // Fail if node predicate fails. - CheckOpcode, // Fail if not opcode. - SwitchOpcode, // Dispatch based on opcode. - CheckType, // Fail if not correct type. - SwitchType, // Dispatch based on type. - CheckChildType, // Fail if child has wrong type. - CheckInteger, // Fail if wrong val. - CheckChildInteger, // Fail if child is wrong val. - CheckCondCode, // Fail if not condcode. - CheckChild2CondCode, // Fail if child is wrong condcode. - CheckValueType, - CheckComplexPat, - CheckAndImm, - CheckOrImm, - CheckImmAllOnesV, - CheckImmAllZerosV, - CheckFoldableChainNode, - - // Node creation/emisssion. - EmitInteger, // Create a TargetConstant - EmitStringInteger, // Create a TargetConstant from a string. - EmitRegister, // Create a register. - EmitConvertToTarget, // Convert a imm/fpimm to target imm/fpimm - EmitMergeInputChains, // Merge together a chains for an input. - EmitCopyToReg, // Emit a copytoreg into a physreg. - EmitNode, // Create a DAG node - EmitNodeXForm, // Run a SDNodeXForm - CompleteMatch, // Finish a match and update the results. - MorphNodeTo, // Build a node, finish a match and update results. - - // Highest enum value; watch out when adding more. - HighestKind = MorphNodeTo - }; - const KindTy Kind; - - protected: - Matcher(KindTy K) : Kind(K) {} - - public: - virtual ~Matcher() {} - - unsigned getSize() const { return Size; } - void setSize(unsigned sz) { Size = sz; } - KindTy getKind() const { return Kind; } - - Matcher *getNext() { return Next.get(); } - const Matcher *getNext() const { return Next.get(); } - void setNext(Matcher *C) { Next.reset(C); } - Matcher *takeNext() { return Next.release(); } - - std::unique_ptr &getNextPtr() { return Next; } - - bool isEqual(const Matcher *M) const { - if (getKind() != M->getKind()) - return false; - return isEqualImpl(M); - } +class CodeGenRegister; +class CodeGenDAGPatterns; +class CodeGenInstruction; +class Matcher; +class PatternToMatch; +class raw_ostream; +class ComplexPattern; +class Record; +class SDNodeInfo; +class TreePredicateFn; +class TreePattern; + +Matcher *ConvertPatternToMatcher(const PatternToMatch &Pattern, + unsigned Variant, + const CodeGenDAGPatterns &CGP); +void OptimizeMatcher(std::unique_ptr &Matcher, + const CodeGenDAGPatterns &CGP); +void EmitMatcherTable(Matcher *Matcher, const CodeGenDAGPatterns &CGP, + raw_ostream &OS); + +/// Matcher - Base class for all the DAG ISel Matcher representation +/// nodes. +class Matcher { + // The next matcher node that is executed after this one. Null if this is + // the last stage of a match. + std::unique_ptr Next; + size_t Size = 0; // Size in bytes of matcher and all its children (if any). + virtual void anchor(); - /// isSimplePredicateNode - Return true if this is a simple predicate that - /// operates on the node or its children without potential side effects or a - /// change of the current node. - bool isSimplePredicateNode() const { - switch (getKind()) { - default: - return false; - case CheckSame: - case CheckChildSame: - case CheckPatternPredicate: - case CheckPredicate: - case CheckOpcode: - case CheckType: - case CheckChildType: - case CheckInteger: - case CheckChildInteger: - case CheckCondCode: - case CheckChild2CondCode: - case CheckValueType: - case CheckAndImm: - case CheckOrImm: - case CheckImmAllOnesV: - case CheckImmAllZerosV: - case CheckFoldableChainNode: - return true; - } - } +public: + enum KindTy { + // Matcher state manipulation. + Scope, // Push a checking scope. 
+ RecordNode, // Record the current node. + RecordChild, // Record a child of the current node. + RecordMemRef, // Record the memref in the current node. + CaptureGlueInput, // If the current node has an input glue, save it. + MoveChild, // Move current node to specified child. + MoveSibling, // Move current node to specified sibling. + MoveParent, // Move current node to parent. + + // Predicate checking. + CheckSame, // Fail if not same as prev match. + CheckChildSame, // Fail if child not same as prev match. + CheckPatternPredicate, + CheckPredicate, // Fail if node predicate fails. + CheckOpcode, // Fail if not opcode. + SwitchOpcode, // Dispatch based on opcode. + CheckType, // Fail if not correct type. + SwitchType, // Dispatch based on type. + CheckChildType, // Fail if child has wrong type. + CheckInteger, // Fail if wrong val. + CheckChildInteger, // Fail if child is wrong val. + CheckCondCode, // Fail if not condcode. + CheckChild2CondCode, // Fail if child is wrong condcode. + CheckValueType, + CheckComplexPat, + CheckAndImm, + CheckOrImm, + CheckImmAllOnesV, + CheckImmAllZerosV, + CheckFoldableChainNode, + + // Node creation/emisssion. + EmitInteger, // Create a TargetConstant + EmitStringInteger, // Create a TargetConstant from a string. + EmitRegister, // Create a register. + EmitConvertToTarget, // Convert a imm/fpimm to target imm/fpimm + EmitMergeInputChains, // Merge together a chains for an input. + EmitCopyToReg, // Emit a copytoreg into a physreg. + EmitNode, // Create a DAG node + EmitNodeXForm, // Run a SDNodeXForm + CompleteMatch, // Finish a match and update the results. + MorphNodeTo, // Build a node, finish a match and update results. + + // Highest enum value; watch out when adding more. + HighestKind = MorphNodeTo + }; + const KindTy Kind; - /// isSimplePredicateOrRecordNode - Return true if this is a record node or - /// a simple predicate. - bool isSimplePredicateOrRecordNode() const { - return isSimplePredicateNode() || getKind() == RecordNode || - getKind() == RecordChild; - } +protected: + Matcher(KindTy K) : Kind(K) {} - /// unlinkNode - Unlink the specified node from this chain. If Other == - /// this, we unlink the next pointer and return it. Otherwise we unlink - /// Other from the list and return this. - Matcher *unlinkNode(Matcher *Other); - - /// canMoveBefore - Return true if this matcher is the same as Other, or if - /// we can move this matcher past all of the nodes in-between Other and this - /// node. Other must be equal to or before this. - bool canMoveBefore(const Matcher *Other) const; - - /// canMoveBeforeNode - Return true if it is safe to move the current - /// matcher across the specified one. - bool canMoveBeforeNode(const Matcher *Other) const; - - /// isContradictory - Return true of these two matchers could never match on - /// the same node. - bool isContradictory(const Matcher *Other) const { - // Since this predicate is reflexive, we canonicalize the ordering so that - // we always match a node against nodes with kinds that are greater or - // equal to them. For example, we'll pass in a CheckType node as an - // argument to the CheckOpcode method, not the other way around. 
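
The KindTy enumeration above, together with the per-subclass classof shown throughout this header, is the usual LLVM-style RTTI arrangement: the base class stores a kind tag, and isa/dyn_cast consult classof instead of C++ RTTI. A plain-C++ sketch of the convention, without the LLVM casting templates:

  // Kind tag plus classof, as used by the Matcher hierarchy.
  struct Node {
    enum KindTy { Scope, RecordNode } Kind;
    Node(KindTy K) : Kind(K) {}
    KindTy getKind() const { return Kind; }
  };

  struct ScopeNode : Node {
    ScopeNode() : Node(Scope) {}
    // dyn_cast<ScopeNode>(N) succeeds exactly when classof(N) is true.
    static bool classof(const Node *N) { return N->getKind() == Scope; }
  };

  // Hand-rolled dyn_cast for the sketch: null unless classof agrees.
  const ScopeNode *dynCastScope(const Node *N) {
    return ScopeNode::classof(N) ? static_cast<const ScopeNode *>(N) : nullptr;
  }
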
- if (getKind() < Other->getKind()) - return isContradictoryImpl(Other); - return Other->isContradictoryImpl(this); +public: + virtual ~Matcher() {} + + unsigned getSize() const { return Size; } + void setSize(unsigned sz) { Size = sz; } + KindTy getKind() const { return Kind; } + + Matcher *getNext() { return Next.get(); } + const Matcher *getNext() const { return Next.get(); } + void setNext(Matcher *C) { Next.reset(C); } + Matcher *takeNext() { return Next.release(); } + + std::unique_ptr &getNextPtr() { return Next; } + + bool isEqual(const Matcher *M) const { + if (getKind() != M->getKind()) + return false; + return isEqualImpl(M); + } + + /// isSimplePredicateNode - Return true if this is a simple predicate that + /// operates on the node or its children without potential side effects or a + /// change of the current node. + bool isSimplePredicateNode() const { + switch (getKind()) { + default: + return false; + case CheckSame: + case CheckChildSame: + case CheckPatternPredicate: + case CheckPredicate: + case CheckOpcode: + case CheckType: + case CheckChildType: + case CheckInteger: + case CheckChildInteger: + case CheckCondCode: + case CheckChild2CondCode: + case CheckValueType: + case CheckAndImm: + case CheckOrImm: + case CheckImmAllOnesV: + case CheckImmAllZerosV: + case CheckFoldableChainNode: + return true; } + } - void print(raw_ostream &OS, unsigned indent = 0) const; - void printOne(raw_ostream &OS) const; - void dump() const; + /// isSimplePredicateOrRecordNode - Return true if this is a record node or + /// a simple predicate. + bool isSimplePredicateOrRecordNode() const { + return isSimplePredicateNode() || getKind() == RecordNode || + getKind() == RecordChild; + } - protected: - virtual void printImpl(raw_ostream &OS, unsigned indent) const = 0; - virtual bool isEqualImpl(const Matcher *M) const = 0; - virtual bool isContradictoryImpl(const Matcher *M) const { return false; } - }; + /// unlinkNode - Unlink the specified node from this chain. If Other == + /// this, we unlink the next pointer and return it. Otherwise we unlink + /// Other from the list and return this. + Matcher *unlinkNode(Matcher *Other); + + /// canMoveBefore - Return true if this matcher is the same as Other, or if + /// we can move this matcher past all of the nodes in-between Other and this + /// node. Other must be equal to or before this. + bool canMoveBefore(const Matcher *Other) const; + + /// canMoveBeforeNode - Return true if it is safe to move the current + /// matcher across the specified one. + bool canMoveBeforeNode(const Matcher *Other) const; + + /// isContradictory - Return true of these two matchers could never match on + /// the same node. + bool isContradictory(const Matcher *Other) const { + // Since this predicate is reflexive, we canonicalize the ordering so that + // we always match a node against nodes with kinds that are greater or + // equal to them. For example, we'll pass in a CheckType node as an + // argument to the CheckOpcode method, not the other way around. 
+ if (getKind() < Other->getKind()) + return isContradictoryImpl(Other); + return Other->isContradictoryImpl(this); + } + + void print(raw_ostream &OS, unsigned indent = 0) const; + void printOne(raw_ostream &OS) const; + void dump() const; + +protected: + virtual void printImpl(raw_ostream &OS, unsigned indent) const = 0; + virtual bool isEqualImpl(const Matcher *M) const = 0; + virtual bool isContradictoryImpl(const Matcher *M) const { return false; } +}; /// ScopeMatcher - This attempts to match each of its children to find the first /// one that successfully matches. If one child fails, it tries the next child. /// If none of the children match then this check fails. It never has a 'next'. class ScopeMatcher : public Matcher { - SmallVector Children; + SmallVector Children; + public: ScopeMatcher(SmallVectorImpl &&children) : Matcher(Scope), Children(std::move(children)) {} @@ -230,9 +231,7 @@ public: Children.resize(NC); } - static bool classof(const Matcher *N) { - return N->getKind() == Scope; - } + static bool classof(const Matcher *N) { return N->getKind() == Scope; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -248,16 +247,15 @@ class RecordMatcher : public Matcher { /// ResultNo - The slot number in the RecordedNodes vector that this will be, /// just printed as a comment. unsigned ResultNo; + public: RecordMatcher(const std::string &whatfor, unsigned resultNo) - : Matcher(RecordNode), WhatFor(whatfor), ResultNo(resultNo) {} + : Matcher(RecordNode), WhatFor(whatfor), ResultNo(resultNo) {} const std::string &getWhatFor() const { return WhatFor; } unsigned getResultNo() const { return ResultNo; } - static bool classof(const Matcher *N) { - return N->getKind() == RecordNode; - } + static bool classof(const Matcher *N) { return N->getKind() == RecordNode; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -277,19 +275,18 @@ class RecordChildMatcher : public Matcher { /// ResultNo - The slot number in the RecordedNodes vector that this will be, /// just printed as a comment. unsigned ResultNo; + public: RecordChildMatcher(unsigned childno, const std::string &whatfor, unsigned resultNo) - : Matcher(RecordChild), ChildNo(childno), WhatFor(whatfor), - ResultNo(resultNo) {} + : Matcher(RecordChild), ChildNo(childno), WhatFor(whatfor), + ResultNo(resultNo) {} unsigned getChildNo() const { return ChildNo; } const std::string &getWhatFor() const { return WhatFor; } unsigned getResultNo() const { return ResultNo; } - static bool classof(const Matcher *N) { - return N->getKind() == RecordChild; - } + static bool classof(const Matcher *N) { return N->getKind() == RecordChild; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -303,16 +300,13 @@ class RecordMemRefMatcher : public Matcher { public: RecordMemRefMatcher() : Matcher(RecordMemRef) {} - static bool classof(const Matcher *N) { - return N->getKind() == RecordMemRef; - } + static bool classof(const Matcher *N) { return N->getKind() == RecordMemRef; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; bool isEqualImpl(const Matcher *M) const override { return true; } }; - /// CaptureGlueInputMatcher - If the current record has a glue input, record /// it so that it is used as an input to the generated code. class CaptureGlueInputMatcher : public Matcher { @@ -332,14 +326,13 @@ private: /// specified child node. 
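
isContradictory above implements a symmetric relation with a single virtual hook by canonicalizing the argument order: whichever matcher has the smaller kind drives the isContradictoryImpl call, so each kind pair only has to be handled once. A stripped-down sketch:

  // Canonicalized double dispatch, as in Matcher::isContradictory.
  struct M {
    int Kind;
    explicit M(int K) : Kind(K) {}
    bool isContradictory(const M *Other) const {
      if (Kind < Other->Kind)
        return isContradictoryImpl(Other);    // smaller kind drives the call
      return Other->isContradictoryImpl(this); // flip to keep kinds ordered
    }
    virtual bool isContradictoryImpl(const M *) const { return false; }
    virtual ~M() = default;
  };
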
class MoveChildMatcher : public Matcher { unsigned ChildNo; + public: MoveChildMatcher(unsigned childNo) : Matcher(MoveChild), ChildNo(childNo) {} unsigned getChildNo() const { return ChildNo; } - static bool classof(const Matcher *N) { - return N->getKind() == MoveChild; - } + static bool classof(const Matcher *N) { return N->getKind() == MoveChild; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -374,9 +367,7 @@ class MoveParentMatcher : public Matcher { public: MoveParentMatcher() : Matcher(MoveParent) {} - static bool classof(const Matcher *N) { - return N->getKind() == MoveParent; - } + static bool classof(const Matcher *N) { return N->getKind() == MoveParent; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -388,15 +379,14 @@ private: /// when patterns have the same name in them, like '(mul GPR:$in, GPR:$in)'. class CheckSameMatcher : public Matcher { unsigned MatchNumber; + public: CheckSameMatcher(unsigned matchnumber) - : Matcher(CheckSame), MatchNumber(matchnumber) {} + : Matcher(CheckSame), MatchNumber(matchnumber) {} unsigned getMatchNumber() const { return MatchNumber; } - static bool classof(const Matcher *N) { - return N->getKind() == CheckSame; - } + static bool classof(const Matcher *N) { return N->getKind() == CheckSame; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -411,9 +401,10 @@ private: class CheckChildSameMatcher : public Matcher { unsigned ChildNo; unsigned MatchNumber; + public: CheckChildSameMatcher(unsigned childno, unsigned matchnumber) - : Matcher(CheckChildSame), ChildNo(childno), MatchNumber(matchnumber) {} + : Matcher(CheckChildSame), ChildNo(childno), MatchNumber(matchnumber) {} unsigned getChildNo() const { return ChildNo; } unsigned getMatchNumber() const { return MatchNumber; } @@ -435,9 +426,10 @@ private: /// not take a node as input. This is used for subtarget feature checks etc. class CheckPatternPredicateMatcher : public Matcher { std::string Predicate; + public: CheckPatternPredicateMatcher(StringRef predicate) - : Matcher(CheckPatternPredicate), Predicate(predicate) {} + : Matcher(CheckPatternPredicate), Predicate(predicate) {} StringRef getPredicate() const { return Predicate; } @@ -457,6 +449,7 @@ private: class CheckPredicateMatcher : public Matcher { TreePattern *Pred; const SmallVector Operands; + public: CheckPredicateMatcher(const TreePredicateFn &pred, const SmallVectorImpl &Operands); @@ -476,20 +469,18 @@ private: } }; - /// CheckOpcodeMatcher - This checks to see if the current node has the /// specified opcode, if not it fails to match. class CheckOpcodeMatcher : public Matcher { const SDNodeInfo &Opcode; + public: CheckOpcodeMatcher(const SDNodeInfo &opcode) - : Matcher(CheckOpcode), Opcode(opcode) {} + : Matcher(CheckOpcode), Opcode(opcode) {} const SDNodeInfo &getOpcode() const { return Opcode; } - static bool classof(const Matcher *N) { - return N->getKind() == CheckOpcode; - } + static bool classof(const Matcher *N) { return N->getKind() == CheckOpcode; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -502,16 +493,15 @@ private: /// then the match fails. This is semantically equivalent to a Scope node where /// every child does a CheckOpcode, but is much faster. 
class SwitchOpcodeMatcher : public Matcher { - SmallVector, 8> Cases; + SmallVector, 8> Cases; + public: SwitchOpcodeMatcher( SmallVectorImpl> &&cases) : Matcher(SwitchOpcode), Cases(std::move(cases)) {} ~SwitchOpcodeMatcher() override; - static bool classof(const Matcher *N) { - return N->getKind() == SwitchOpcode; - } + static bool classof(const Matcher *N) { return N->getKind() == SwitchOpcode; } unsigned getNumCases() const { return Cases.size(); } @@ -529,16 +519,15 @@ private: class CheckTypeMatcher : public Matcher { MVT::SimpleValueType Type; unsigned ResNo; + public: CheckTypeMatcher(MVT::SimpleValueType type, unsigned resno) - : Matcher(CheckType), Type(type), ResNo(resno) {} + : Matcher(CheckType), Type(type), ResNo(resno) {} MVT::SimpleValueType getType() const { return Type; } unsigned getResNo() const { return ResNo; } - static bool classof(const Matcher *N) { - return N->getKind() == CheckType; - } + static bool classof(const Matcher *N) { return N->getKind() == CheckType; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -553,16 +542,15 @@ private: /// then the match fails. This is semantically equivalent to a Scope node where /// every child does a CheckType, but is much faster. class SwitchTypeMatcher : public Matcher { - SmallVector, 8> Cases; + SmallVector, 8> Cases; + public: SwitchTypeMatcher( SmallVectorImpl> &&cases) : Matcher(SwitchType), Cases(std::move(cases)) {} ~SwitchTypeMatcher() override; - static bool classof(const Matcher *N) { - return N->getKind() == SwitchType; - } + static bool classof(const Matcher *N) { return N->getKind() == SwitchType; } unsigned getNumCases() const { return Cases.size(); } @@ -575,15 +563,15 @@ private: bool isEqualImpl(const Matcher *M) const override { return false; } }; - /// CheckChildTypeMatcher - This checks to see if a child node has the /// specified type, if not it fails to match. class CheckChildTypeMatcher : public Matcher { unsigned ChildNo; MVT::SimpleValueType Type; + public: CheckChildTypeMatcher(unsigned childno, MVT::SimpleValueType type) - : Matcher(CheckChildType), ChildNo(childno), Type(type) {} + : Matcher(CheckChildType), ChildNo(childno), Type(type) {} unsigned getChildNo() const { return ChildNo; } MVT::SimpleValueType getType() const { return Type; } @@ -601,20 +589,17 @@ private: bool isContradictoryImpl(const Matcher *M) const override; }; - /// CheckIntegerMatcher - This checks to see if the current node is a /// ConstantSDNode with the specified integer value, if not it fails to match. 
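
As the comments above note, SwitchOpcodeMatcher and SwitchTypeMatcher are semantically a Scope whose children each begin with CheckOpcode or CheckType, but they dispatch on a key instead of trying children in turn. A sketch of the difference, with integer keys standing in for opcodes:

  // Scope-style vs switch-style dispatch.
  #include <map>

  // Scope-style: try each case in order; O(n) failed checks in the worst case.
  bool runScope(int Key, const std::map<int, bool> &Cases) {
    for (const auto &KV : Cases)
      if (KV.first == Key) // each child re-checks the opcode
        return KV.second;
    return false;
  }

  // Switch-style: a single keyed lookup picks the case directly.
  bool runSwitch(int Key, const std::map<int, bool> &Cases) {
    auto It = Cases.find(Key);
    return It != Cases.end() && It->second;
  }
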
class CheckIntegerMatcher : public Matcher { int64_t Value; + public: - CheckIntegerMatcher(int64_t value) - : Matcher(CheckInteger), Value(value) {} + CheckIntegerMatcher(int64_t value) : Matcher(CheckInteger), Value(value) {} int64_t getValue() const { return Value; } - static bool classof(const Matcher *N) { - return N->getKind() == CheckInteger; - } + static bool classof(const Matcher *N) { return N->getKind() == CheckInteger; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -629,9 +614,10 @@ private: class CheckChildIntegerMatcher : public Matcher { unsigned ChildNo; int64_t Value; + public: CheckChildIntegerMatcher(unsigned childno, int64_t value) - : Matcher(CheckChildInteger), ChildNo(childno), Value(value) {} + : Matcher(CheckChildInteger), ChildNo(childno), Value(value) {} unsigned getChildNo() const { return ChildNo; } int64_t getValue() const { return Value; } @@ -653,9 +639,10 @@ private: /// CondCodeSDNode with the specified condition, if not it fails to match. class CheckCondCodeMatcher : public Matcher { StringRef CondCodeName; + public: CheckCondCodeMatcher(StringRef condcodename) - : Matcher(CheckCondCode), CondCodeName(condcodename) {} + : Matcher(CheckCondCode), CondCodeName(condcodename) {} StringRef getCondCodeName() const { return CondCodeName; } @@ -675,9 +662,10 @@ private: /// CondCodeSDNode with the specified condition, if not it fails to match. class CheckChild2CondCodeMatcher : public Matcher { StringRef CondCodeName; + public: CheckChild2CondCodeMatcher(StringRef condcodename) - : Matcher(CheckChild2CondCode), CondCodeName(condcodename) {} + : Matcher(CheckChild2CondCode), CondCodeName(condcodename) {} StringRef getCondCodeName() const { return CondCodeName; } @@ -697,9 +685,10 @@ private: /// VTSDNode with the specified type, if not it fails to match. class CheckValueTypeMatcher : public Matcher { StringRef TypeName; + public: CheckValueTypeMatcher(StringRef type_name) - : Matcher(CheckValueType), TypeName(type_name) {} + : Matcher(CheckValueType), TypeName(type_name) {} StringRef getTypeName() const { return TypeName; } @@ -715,8 +704,6 @@ private: bool isContradictoryImpl(const Matcher *M) const override; }; - - /// CheckComplexPatMatcher - This node runs the specified ComplexPattern on /// the current node. class CheckComplexPatMatcher : public Matcher { @@ -732,11 +719,12 @@ class CheckComplexPatMatcher : public Matcher { /// FirstResult - This is the first slot in the RecordedNodes list that the /// result of the match populates. unsigned FirstResult; + public: CheckComplexPatMatcher(const ComplexPattern &pattern, unsigned matchnumber, const std::string &name, unsigned firstresult) - : Matcher(CheckComplexPat), Pattern(pattern), MatchNumber(matchnumber), - Name(name), FirstResult(firstresult) {} + : Matcher(CheckComplexPat), Pattern(pattern), MatchNumber(matchnumber), + Name(name), FirstResult(firstresult) {} const ComplexPattern &getPattern() const { return Pattern; } unsigned getMatchNumber() const { return MatchNumber; } @@ -760,15 +748,13 @@ private: /// with something equivalent to the specified immediate. 
class CheckAndImmMatcher : public Matcher { int64_t Value; + public: - CheckAndImmMatcher(int64_t value) - : Matcher(CheckAndImm), Value(value) {} + CheckAndImmMatcher(int64_t value) : Matcher(CheckAndImm), Value(value) {} int64_t getValue() const { return Value; } - static bool classof(const Matcher *N) { - return N->getKind() == CheckAndImm; - } + static bool classof(const Matcher *N) { return N->getKind() == CheckAndImm; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -781,15 +767,13 @@ private: /// with something equivalent to the specified immediate. class CheckOrImmMatcher : public Matcher { int64_t Value; + public: - CheckOrImmMatcher(int64_t value) - : Matcher(CheckOrImm), Value(value) {} + CheckOrImmMatcher(int64_t value) : Matcher(CheckOrImm), Value(value) {} int64_t getValue() const { return Value; } - static bool classof(const Matcher *N) { - return N->getKind() == CheckOrImm; - } + static bool classof(const Matcher *N) { return N->getKind() == CheckOrImm; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -834,8 +818,7 @@ private: /// (which defines a chain operand) is safe to fold into a larger pattern. class CheckFoldableChainNodeMatcher : public Matcher { public: - CheckFoldableChainNodeMatcher() - : Matcher(CheckFoldableChainNode) {} + CheckFoldableChainNodeMatcher() : Matcher(CheckFoldableChainNode) {} static bool classof(const Matcher *N) { return N->getKind() == CheckFoldableChainNode; @@ -850,16 +833,15 @@ private: class EmitIntegerMatcher : public Matcher { int64_t Val; MVT::SimpleValueType VT; + public: EmitIntegerMatcher(int64_t val, MVT::SimpleValueType vt) - : Matcher(EmitInteger), Val(val), VT(vt) {} + : Matcher(EmitInteger), Val(val), VT(vt) {} int64_t getValue() const { return Val; } MVT::SimpleValueType getVT() const { return VT; } - static bool classof(const Matcher *N) { - return N->getKind() == EmitInteger; - } + static bool classof(const Matcher *N) { return N->getKind() == EmitInteger; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -874,9 +856,10 @@ private: class EmitStringIntegerMatcher : public Matcher { std::string Val; MVT::SimpleValueType VT; + public: EmitStringIntegerMatcher(const std::string &val, MVT::SimpleValueType vt) - : Matcher(EmitStringInteger), Val(val), VT(vt) {} + : Matcher(EmitStringInteger), Val(val), VT(vt) {} const std::string &getValue() const { return Val; } MVT::SimpleValueType getVT() const { return VT; } @@ -899,16 +882,15 @@ class EmitRegisterMatcher : public Matcher { /// this is a reference to zero_reg. const CodeGenRegister *Reg; MVT::SimpleValueType VT; + public: EmitRegisterMatcher(const CodeGenRegister *reg, MVT::SimpleValueType vt) - : Matcher(EmitRegister), Reg(reg), VT(vt) {} + : Matcher(EmitRegister), Reg(reg), VT(vt) {} const CodeGenRegister *getReg() const { return Reg; } MVT::SimpleValueType getVT() const { return VT; } - static bool classof(const Matcher *N) { - return N->getKind() == EmitRegister; - } + static bool classof(const Matcher *N) { return N->getKind() == EmitRegister; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -923,9 +905,10 @@ private: /// ISD::TargetConstant, likewise for ConstantFP. 
class EmitConvertToTargetMatcher : public Matcher { unsigned Slot; + public: EmitConvertToTargetMatcher(unsigned slot) - : Matcher(EmitConvertToTarget), Slot(slot) {} + : Matcher(EmitConvertToTarget), Slot(slot) {} unsigned getSlot() const { return Slot; } @@ -946,9 +929,10 @@ private: /// chains of these nodes if they are not themselves a node in the pattern. class EmitMergeInputChainsMatcher : public Matcher { SmallVector ChainNodes; + public: EmitMergeInputChainsMatcher(ArrayRef nodes) - : Matcher(EmitMergeInputChains), ChainNodes(nodes.begin(), nodes.end()) {} + : Matcher(EmitMergeInputChains), ChainNodes(nodes.begin(), nodes.end()) {} unsigned getNumNodes() const { return ChainNodes.size(); } @@ -976,9 +960,8 @@ class EmitCopyToRegMatcher : public Matcher { const CodeGenRegister *DestPhysReg; public: - EmitCopyToRegMatcher(unsigned srcSlot, - const CodeGenRegister *destPhysReg) - : Matcher(EmitCopyToReg), SrcSlot(srcSlot), DestPhysReg(destPhysReg) {} + EmitCopyToRegMatcher(unsigned srcSlot, const CodeGenRegister *destPhysReg) + : Matcher(EmitCopyToReg), SrcSlot(srcSlot), DestPhysReg(destPhysReg) {} unsigned getSrcSlot() const { return SrcSlot; } const CodeGenRegister *getDestPhysReg() const { return DestPhysReg; } @@ -995,16 +978,15 @@ private: } }; - - /// EmitNodeXFormMatcher - Emit an operation that runs an SDNodeXForm on a /// recorded node and records the result. class EmitNodeXFormMatcher : public Matcher { unsigned Slot; Record *NodeXForm; + public: EmitNodeXFormMatcher(unsigned slot, Record *nodeXForm) - : Matcher(EmitNodeXForm), Slot(slot), NodeXForm(nodeXForm) {} + : Matcher(EmitNodeXForm), Slot(slot), NodeXForm(nodeXForm) {} unsigned getSlot() const { return Slot; } Record *getNodeXForm() const { return NodeXForm; } @@ -1033,6 +1015,7 @@ class EmitNodeMatcherCommon : public Matcher { /// If this is a varidic node, this is set to the number of fixed arity /// operands in the root of the pattern. The rest are appended to this node. 
int NumFixedArityOperands; + public: EmitNodeMatcherCommon(const CodeGenInstruction &cgi, ArrayRef vts, @@ -1061,7 +1044,6 @@ public: const SmallVectorImpl &getVTList() const { return VTs; } const SmallVectorImpl &getOperandList() const { return Operands; } - bool hasChain() const { return HasChain; } bool hasInGlue() const { return HasInGlue; } bool hasOutGlue() const { return HasOutGlue; } @@ -1081,6 +1063,7 @@ private: class EmitNodeMatcher : public EmitNodeMatcherCommon { void anchor() override; unsigned FirstResultSlot; + public: EmitNodeMatcher(const CodeGenInstruction &cgi, ArrayRef vts, @@ -1094,15 +1077,13 @@ public: unsigned getFirstResultSlot() const { return FirstResultSlot; } - static bool classof(const Matcher *N) { - return N->getKind() == EmitNode; - } - + static bool classof(const Matcher *N) { return N->getKind() == EmitNode; } }; class MorphNodeToMatcher : public EmitNodeMatcherCommon { void anchor() override; const PatternToMatch &Pattern; + public: MorphNodeToMatcher(const CodeGenInstruction &cgi, ArrayRef vts, @@ -1116,9 +1097,7 @@ public: const PatternToMatch &getPattern() const { return Pattern; } - static bool classof(const Matcher *N) { - return N->getKind() == MorphNodeTo; - } + static bool classof(const Matcher *N) { return N->getKind() == MorphNodeTo; } }; /// CompleteMatchMatcher - Complete a match by replacing the results of the @@ -1127,11 +1106,12 @@ public: class CompleteMatchMatcher : public Matcher { SmallVector Results; const PatternToMatch &Pattern; + public: CompleteMatchMatcher(ArrayRef results, const PatternToMatch &pattern) - : Matcher(CompleteMatch), Results(results.begin(), results.end()), - Pattern(pattern) {} + : Matcher(CompleteMatch), Results(results.begin(), results.end()), + Pattern(pattern) {} unsigned getNumResults() const { return Results.size(); } unsigned getResult(unsigned R) const { return Results[R]; } @@ -1145,7 +1125,7 @@ private: void printImpl(raw_ostream &OS, unsigned indent) const override; bool isEqualImpl(const Matcher *M) const override { return cast(M)->Results == Results && - &cast(M)->Pattern == &Pattern; + &cast(M)->Pattern == &Pattern; } }; diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp index 50156d3..8d002e5 100644 --- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp @@ -50,7 +50,7 @@ namespace { class MatcherTableEmitter { const CodeGenDAGPatterns &CGP; - SmallVector OpcodeCounts; + SmallVector OpcodeCounts; std::vector NodePredicates; std::vector NodePredicatesWithOperands; @@ -62,14 +62,13 @@ class MatcherTableEmitter { std::vector PatternPredicates; - std::vector ComplexPatterns; + std::vector ComplexPatterns; - - DenseMap NodeXFormMap; - std::vector NodeXForms; + DenseMap NodeXFormMap; + std::vector NodeXForms; std::vector VecIncludeStrings; - MapVector > VecPatterns; + MapVector> VecPatterns; unsigned getPatternIdxFromTable(std::string &&P, std::string &&include_loc) { const auto It = VecPatterns.find(P); @@ -184,8 +183,8 @@ private: unsigned SizeMatcher(Matcher *N, raw_ostream &OS); - unsigned EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, - raw_ostream &OS); + unsigned EmitMatcher(const Matcher *N, const unsigned Indent, + unsigned CurrentIdx, raw_ostream &OS); unsigned getNodePredicate(TreePredicateFn Pred) { // We use the first predicate. 
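
getPatternIdxFromTable above, and getNodeXFormID just below it, follow the same interning pattern: look the key up, append it on first use, and return a stable dense index into the table. A sketch with std::map and std::string standing in for the MapVector/DenseMap keys:

  // Interning: stable dense indices, assigned on first occurrence.
  #include <map>
  #include <string>
  #include <vector>

  unsigned internIndex(const std::string &Key,
                       std::map<std::string, unsigned> &Map,
                       std::vector<std::string> &Table) {
    auto It = Map.find(Key);
    if (It != Map.end())
      return It->second;         // already assigned
    Table.push_back(Key);        // first occurrence: append...
    Map[Key] = Table.size() - 1; // ...and remember its index
    return Table.size() - 1;
  }
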
@@ -210,9 +209,8 @@ private: NodeXForms.push_back(Rec); Entry = NodeXForms.size(); } - return Entry-1; + return Entry - 1; } - }; } // end anonymous namespace. @@ -224,14 +222,15 @@ static std::string GetPatFromTreePatternNode(const TreePatternNode *N) { } static unsigned GetVBRSize(unsigned Val) { - if (Val <= 127) return 1; + if (Val <= 127) + return 1; unsigned NumBytes = 0; while (Val >= 128) { Val >>= 7; ++NumBytes; } - return NumBytes+1; + return NumBytes + 1; } /// EmitVBRValue - Emit the specified value as a VBR, returning the number of @@ -245,7 +244,7 @@ static unsigned EmitVBRValue(uint64_t Val, raw_ostream &OS) { uint64_t InVal = Val; unsigned NumBytes = 0; while (Val >= 128) { - OS << (Val&127) << "|128,"; + OS << (Val & 127) << "|128,"; Val >>= 7; ++NumBytes; } @@ -253,7 +252,7 @@ static unsigned EmitVBRValue(uint64_t Val, raw_ostream &OS) { if (!OmitComments) OS << "/*" << InVal << "*/"; OS << ", "; - return NumBytes+1; + return NumBytes + 1; } /// Emit the specified signed value as a VBR. To improve compression we encode @@ -290,8 +289,7 @@ static std::string getIncludePath(const Record *R) { /// This function traverses the matcher tree and sizes all the nodes /// that are children of the three kinds of nodes that have them. -unsigned MatcherTableEmitter:: -SizeMatcherList(Matcher *N, raw_ostream &OS) { +unsigned MatcherTableEmitter::SizeMatcherList(Matcher *N, raw_ostream &OS) { unsigned Size = 0; while (N) { Size += SizeMatcher(N, OS); @@ -303,8 +301,7 @@ SizeMatcherList(Matcher *N, raw_ostream &OS) { /// This function sizes the children of the three kinds of nodes that /// have them. It does so by using special cases for those three /// nodes, but sharing the code in EmitMatcher() for the other kinds. -unsigned MatcherTableEmitter:: -SizeMatcher(Matcher *N, raw_ostream &OS) { +unsigned MatcherTableEmitter::SizeMatcher(Matcher *N, raw_ostream &OS) { unsigned Idx = 0; ++OpcodeCounts[N->getKind()]; @@ -389,7 +386,7 @@ void MatcherTableEmitter::EmitPatternMatchTable(raw_ostream &OS) { "The sizes of Pattern and include vectors should be the same"); BeginEmitFunction(OS, "StringRef", "getPatternForIndex(unsigned Index)", - true/*AddOverride*/); + true /*AddOverride*/); OS << "{\n"; OS << "static const char *PATTERN_MATCH_TABLE[] = {\n"; @@ -403,7 +400,7 @@ void MatcherTableEmitter::EmitPatternMatchTable(raw_ostream &OS) { EndEmitFunction(OS); BeginEmitFunction(OS, "StringRef", "getIncludePathForIndex(unsigned Index)", - true/*AddOverride*/); + true /*AddOverride*/); OS << "{\n"; OS << "static const char *INCLUDE_PATH_TABLE[] = {\n"; @@ -419,9 +416,10 @@ void MatcherTableEmitter::EmitPatternMatchTable(raw_ostream &OS) { /// EmitMatcher - Emit bytes for the specified matcher and return /// the number of bytes emitted. 
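
GetVBRSize and EmitVBRValue above implement a variable-byte (VBR) integer encoding: seven payload bits per byte, with the high bit marking continuation, so values up to 127 cost a single byte. A sketch of both directions; the decoder below is illustrative, the real decoding happens in the generated matcher's interpreter:

  // VBR encode/decode: 7 payload bits per byte, high bit = "more follows".
  #include <cassert>
  #include <cstdint>
  #include <vector>

  std::vector<uint8_t> encodeVBR(uint64_t Val) {
    std::vector<uint8_t> Bytes;
    while (Val >= 128) {
      Bytes.push_back((Val & 127) | 128); // 7 bits, continuation bit set
      Val >>= 7;
    }
    Bytes.push_back(Val); // final byte, high bit clear
    return Bytes;
  }

  uint64_t decodeVBR(const std::vector<uint8_t> &Bytes) {
    uint64_t Val = 0;
    unsigned Shift = 0;
    for (uint8_t B : Bytes) {
      Val |= uint64_t(B & 127) << Shift;
      Shift += 7;
      if (!(B & 128)) // last byte reached
        break;
    }
    return Val;
  }

  int main() {
    assert(decodeVBR(encodeVBR(300)) == 300); // 300 -> {0xAC, 0x02}
  }
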
-unsigned MatcherTableEmitter:: -EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, - raw_ostream &OS) { +unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N, + const unsigned Indent, + unsigned CurrentIdx, + raw_ostream &OS) { OS.indent(Indent); switch (N->getKind()) { @@ -434,7 +432,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, if (i == 0) { OS << "OPC_Scope, "; ++CurrentIdx; - } else { + } else { if (!OmitComments) { OS << "/*" << format_decimal(CurrentIdx, IndexWidth) << "*/"; OS.indent(Indent) << "/*Scope*/ "; @@ -451,7 +449,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, } OS << '\n'; - ChildSize = EmitMatcherList(SM->getChild(i), Indent+1, + ChildSize = EmitMatcherList(SM->getChild(i), Indent + 1, CurrentIdx + VBRSize, OS); assert(ChildSize == SM->getChild(i)->getSize() && "Emitted child size does not match calculated size"); @@ -471,18 +469,15 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, case Matcher::RecordNode: OS << "OPC_RecordNode,"; if (!OmitComments) - OS << " // #" - << cast(N)->getResultNo() << " = " + OS << " // #" << cast(N)->getResultNo() << " = " << cast(N)->getWhatFor(); OS << '\n'; return 1; case Matcher::RecordChild: - OS << "OPC_RecordChild" << cast(N)->getChildNo() - << ','; + OS << "OPC_RecordChild" << cast(N)->getChildNo() << ','; if (!OmitComments) - OS << " // #" - << cast(N)->getResultNo() << " = " + OS << " // #" << cast(N)->getResultNo() << " = " << cast(N)->getWhatFor(); OS << '\n'; return 1; @@ -522,14 +517,13 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, return 1; case Matcher::CheckSame: - OS << "OPC_CheckSame, " - << cast(N)->getMatchNumber() << ",\n"; + OS << "OPC_CheckSame, " << cast(N)->getMatchNumber() + << ",\n"; return 2; case Matcher::CheckChildSame: - OS << "OPC_CheckChild" - << cast(N)->getChildNo() << "Same, " - << cast(N)->getMatchNumber() << ",\n"; + OS << "OPC_CheckChild" << cast(N)->getChildNo() + << "Same, " << cast(N)->getMatchNumber() << ",\n"; return 2; case Matcher::CheckPatternPredicate: { @@ -602,10 +596,10 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, unsigned IdxSize; if (const SwitchOpcodeMatcher *SOM = dyn_cast(N)) { Child = SOM->getCaseMatcher(i); - IdxSize = 2; // size of opcode in table is 2 bytes. + IdxSize = 2; // size of opcode in table is 2 bytes. } else { Child = cast(N)->getCaseMatcher(i); - IdxSize = 1; // size of type in table is 1 byte. + IdxSize = 1; // size of type in table is 1 byte. } if (i != 0) { @@ -613,8 +607,8 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, OS << "/*" << format_decimal(CurrentIdx, IndexWidth) << "*/"; OS.indent(Indent); if (!OmitComments) - OS << (isa(N) ? - "/*SwitchOpcode*/ " : "/*SwitchType*/ "); + OS << (isa(N) ? 
"/*SwitchOpcode*/ " + : "/*SwitchType*/ "); } unsigned ChildSize = Child->getSize(); @@ -627,7 +621,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, OS << "// ->" << CurrentIdx + ChildSize; OS << '\n'; - ChildSize = EmitMatcherList(Child, Indent+1, CurrentIdx, OS); + ChildSize = EmitMatcherList(Child, Indent + 1, CurrentIdx, OS); assert(ChildSize == Child->getSize() && "Emitted child size does not match calculated size"); CurrentIdx += ChildSize; @@ -638,8 +632,8 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, OS << "/*" << format_decimal(CurrentIdx, IndexWidth) << "*/"; OS.indent(Indent) << "0,"; if (!OmitComments) - OS << (isa(N) ? - " // EndSwitchOpcode" : " // EndSwitchType"); + OS << (isa(N) ? " // EndSwitchOpcode" + : " // EndSwitchType"); OS << '\n'; return CurrentIdx - StartIdx + 1; @@ -722,7 +716,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, OS << " // " << Pattern.getSelectFunc(); OS << ":$" << CCPM->getName(); for (unsigned i = 0, e = Pattern.getNumOperands(); i != e; ++i) - OS << " #" << CCPM->getFirstResult()+i; + OS << " #" << CCPM->getFirstResult() + i; if (Pattern.hasProperty(SDNPHasChain)) OS << " + chain result"; @@ -733,14 +727,16 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, case Matcher::CheckAndImm: { OS << "OPC_CheckAndImm, "; - unsigned Bytes=1+EmitVBRValue(cast(N)->getValue(), OS); + unsigned Bytes = + 1 + EmitVBRValue(cast(N)->getValue(), OS); OS << '\n'; return Bytes; } case Matcher::CheckOrImm: { OS << "OPC_CheckOrImm, "; - unsigned Bytes = 1+EmitVBRValue(cast(N)->getValue(), OS); + unsigned Bytes = + 1 + EmitVBRValue(cast(N)->getValue(), OS); OS << '\n'; return Bytes; } @@ -843,7 +839,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, case Matcher::EmitMergeInputChains: { const EmitMergeInputChainsMatcher *MN = - cast(N); + cast(N); // Handle the specialized forms OPC_EmitMergeInputChains1_0, 1_1, and 1_2. 
if (MN->getNumNodes() == 1 && MN->getNode(0) < 3) { @@ -855,7 +851,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, for (unsigned i = 0, e = MN->getNumNodes(); i != e; ++i) OS << MN->getNode(i) << ", "; OS << '\n'; - return 2+MN->getNumNodes(); + return 2 + MN->getNumNodes(); } case Matcher::EmitCopyToReg: { const auto *C2RMatcher = cast(N); @@ -884,8 +880,8 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, OS << "OPC_EmitNodeXForm, " << getNodeXFormID(XF->getNodeXForm()) << ", " << XF->getSlot() << ','; if (!OmitComments) - OS << " // "<getNodeXForm()->getName(); - OS <<'\n'; + OS << " // " << XF->getNodeXForm()->getName(); + OS << '\n'; return 3; } @@ -955,7 +951,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, } OS << ",\n"; - OS.indent(FullIndexWidth + Indent+4); + OS.indent(FullIndexWidth + Indent + 4); if (!CompressVTs) { OS << EN->getNumVTs(); if (!OmitComments) @@ -980,17 +976,18 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, OS << " // Results ="; unsigned First = E->getFirstResultSlot(); for (unsigned i = 0; i != NumResults; ++i) - OS << " #" << First+i; + OS << " #" << First + i; } } OS << '\n'; if (const MorphNodeToMatcher *SNT = dyn_cast(N)) { - OS.indent(FullIndexWidth + Indent) << "// Src: " - << *SNT->getPattern().getSrcPattern() << " - Complexity = " - << SNT->getPattern().getPatternComplexity(CGP) << '\n'; - OS.indent(FullIndexWidth + Indent) << "// Dst: " - << *SNT->getPattern().getDstPattern() << '\n'; + OS.indent(FullIndexWidth + Indent) + << "// Src: " << *SNT->getPattern().getSrcPattern() + << " - Complexity = " << SNT->getPattern().getPatternComplexity(CGP) + << '\n'; + OS.indent(FullIndexWidth + Indent) + << "// Dst: " << *SNT->getPattern().getDstPattern() << '\n'; } } else OS << '\n'; @@ -1021,11 +1018,12 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, NumResultBytes += EmitVBRValue(CM->getResult(i), OS); OS << '\n'; if (!OmitComments) { - OS.indent(FullIndexWidth + Indent) << " // Src: " - << *CM->getPattern().getSrcPattern() << " - Complexity = " - << CM->getPattern().getPatternComplexity(CGP) << '\n'; - OS.indent(FullIndexWidth + Indent) << " // Dst: " - << *CM->getPattern().getDstPattern(); + OS.indent(FullIndexWidth + Indent) + << " // Src: " << *CM->getPattern().getSrcPattern() + << " - Complexity = " << CM->getPattern().getPatternComplexity(CGP) + << '\n'; + OS.indent(FullIndexWidth + Indent) + << " // Dst: " << *CM->getPattern().getDstPattern(); } OS << '\n'; return 2 + NumResultBytes + NumCoveredBytes; @@ -1036,9 +1034,10 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, /// This function traverses the matcher tree and emits all the nodes. /// The nodes have already been sized. 
-unsigned MatcherTableEmitter:: -EmitMatcherList(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, - raw_ostream &OS) { +unsigned MatcherTableEmitter::EmitMatcherList(const Matcher *N, + const unsigned Indent, + unsigned CurrentIdx, + raw_ostream &OS) { unsigned Size = 0; while (N) { if (!OmitComments) @@ -1059,7 +1058,7 @@ void MatcherTableEmitter::EmitNodePredicatesFunction( if (Preds.empty()) return; - BeginEmitFunction(OS, "bool", Decl, true/*AddOverride*/); + BeginEmitFunction(OS, "bool", Decl, true /*AddOverride*/); OS << "{\n"; OS << " switch (PredNo) {\n"; OS << " default: llvm_unreachable(\"Invalid predicate in table?\");\n"; @@ -1083,12 +1082,13 @@ void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) { // Emit pattern predicates. if (!PatternPredicates.empty()) { BeginEmitFunction(OS, "bool", - "CheckPatternPredicate(unsigned PredNo) const", true/*AddOverride*/); + "CheckPatternPredicate(unsigned PredNo) const", + true /*AddOverride*/); OS << "{\n"; OS << " switch (PredNo) {\n"; OS << " default: llvm_unreachable(\"Invalid predicate in table?\");\n"; for (unsigned i = 0, e = PatternPredicates.size(); i != e; ++i) - OS << " case " << i << ": return " << PatternPredicates[i] << ";\n"; + OS << " case " << i << ": return " << PatternPredicates[i] << ";\n"; OS << " }\n"; OS << "}\n"; EndEmitFunction(OS); @@ -1107,11 +1107,12 @@ void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) { // Emit CompletePattern matchers. // FIXME: This should be const. if (!ComplexPatterns.empty()) { - BeginEmitFunction(OS, "bool", - "CheckComplexPattern(SDNode *Root, SDNode *Parent,\n" - " SDValue N, unsigned PatternNo,\n" - " SmallVectorImpl> &Result)", - true/*AddOverride*/); + BeginEmitFunction( + OS, "bool", + "CheckComplexPattern(SDNode *Root, SDNode *Parent,\n" + " SDValue N, unsigned PatternNo,\n" + " SmallVectorImpl> &Result)", + true /*AddOverride*/); OS << "{\n"; OS << " unsigned NextRes = Result.size();\n"; OS << " switch (PatternNo) {\n"; @@ -1121,7 +1122,7 @@ void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) { unsigned NumOps = P.getNumOperands(); if (P.hasProperty(SDNPHasChain)) - ++NumOps; // Get the chained node too. + ++NumOps; // Get the chained node too. OS << " case " << i << ":\n"; if (InstrumentCoverage) @@ -1160,12 +1161,12 @@ void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) { EndEmitFunction(OS); } - // Emit SDNodeXForm handlers. // FIXME: This should be const. if (!NodeXForms.empty()) { BeginEmitFunction(OS, "SDValue", - "RunSDNodeXForm(SDValue V, unsigned XFormNo)", true/*AddOverride*/); + "RunSDNodeXForm(SDValue V, unsigned XFormNo)", + true /*AddOverride*/); OS << "{\n"; OS << " switch (XFormNo) {\n"; OS << " default: llvm_unreachable(\"Invalid xform # in table?\");\n"; @@ -1173,7 +1174,7 @@ void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) { // FIXME: The node xform could take SDValue's instead of SDNode*'s. 
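
EmitPredicateFunctions above turns each collected table into a generated C++ function whose body is a switch over the table index, exactly the shape visible in the emitted strings. A toy generator in the same spirit; the function shape mirrors CheckPatternPredicate, and the sample predicate string is illustrative, not LLVM API:

  // Table-to-switch code generation, as in EmitPredicateFunctions.
  #include <iostream>
  #include <string>
  #include <vector>

  void emitPredicateSwitch(const std::vector<std::string> &Preds,
                           std::ostream &OS) {
    OS << "bool CheckPatternPredicate(unsigned PredNo) const {\n";
    OS << "  switch (PredNo) {\n";
    OS << "  default: llvm_unreachable(\"Invalid predicate in table?\");\n";
    for (unsigned i = 0, e = Preds.size(); i != e; ++i)
      OS << "  case " << i << ": return " << Preds[i] << ";\n";
    OS << "  }\n}\n";
  }

  int main() {
    std::vector<std::string> Preds = {"Subtarget->hasSSE2()"}; // illustrative
    emitPredicateSwitch(Preds, std::cout);
  }
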
for (unsigned i = 0, e = NodeXForms.size(); i != e; ++i) { const CodeGenDAGPatterns::NodeXForm &Entry = - CGP.getSDNodeTransform(NodeXForms[i]); + CGP.getSDNodeTransform(NodeXForms[i]); Record *SDNode = Entry.first; const std::string &Code = Entry.second; @@ -1281,8 +1282,7 @@ static StringRef getOpcodeString(Matcher::KindTy Kind) { llvm_unreachable("Unhandled opcode?"); } -void MatcherTableEmitter::EmitHistogram(const Matcher *M, - raw_ostream &OS) { +void MatcherTableEmitter::EmitHistogram(const Matcher *M, raw_ostream &OS) { if (OmitComments) return; @@ -1295,9 +1295,7 @@ void MatcherTableEmitter::EmitHistogram(const Matcher *M, OS << '\n'; } - -void llvm::EmitMatcherTable(Matcher *TheMatcher, - const CodeGenDAGPatterns &CGP, +void llvm::EmitMatcherTable(Matcher *TheMatcher, const CodeGenDAGPatterns &CGP, raw_ostream &OS) { OS << "#if defined(GET_DAGISEL_DECL) && defined(GET_DAGISEL_BODY)\n"; OS << "#error GET_DAGISEL_DECL and GET_DAGISEL_BODY cannot be both defined, "; @@ -1328,7 +1326,7 @@ void llvm::EmitMatcherTable(Matcher *TheMatcher, OS << "#define DAGISEL_CLASS_COLONCOLON\n"; OS << "#endif\n\n"; - BeginEmitFunction(OS, "void", "SelectCode(SDNode *N)", false/*AddOverride*/); + BeginEmitFunction(OS, "void", "SelectCode(SDNode *N)", false /*AddOverride*/); MatcherTableEmitter MatcherEmitter(TheMatcher, CGP); // First we size all the children of the three kinds of matchers that have @@ -1348,7 +1346,8 @@ void llvm::EmitMatcherTable(Matcher *TheMatcher, OS << " #define TARGET_VAL(X) X & 255, unsigned(X) >> 8\n"; OS << " static const unsigned char MatcherTable[] = {\n"; TotalSize = MatcherEmitter.EmitMatcherList(TheMatcher, 1, 0, OS); - OS << " 0\n }; // Total Array size is " << (TotalSize+1) << " bytes\n\n"; + OS << " 0\n }; // Total Array size is " << (TotalSize + 1) + << " bytes\n\n"; MatcherEmitter.EmitHistogram(TheMatcher, OS); diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp index 3526e97..8ca7aae 100644 --- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp @@ -20,7 +20,6 @@ #include <utility> using namespace llvm; - /// getRegisterValueType - Look up and return the ValueType of the specified /// register. If the register is a member of multiple register classes, they /// must all have the same type. @@ -52,96 +51,97 @@ static MVT::SimpleValueType getRegisterValueType(Record *R, return VT; } - namespace { - class MatcherGen { - const PatternToMatch &Pattern; - const CodeGenDAGPatterns &CGP; - - /// PatWithNoTypes - This is a clone of Pattern.getSrcPattern() that starts - /// out with all of the types removed. This allows us to insert type checks - /// as we scan the tree. - TreePatternNodePtr PatWithNoTypes; - - /// VariableMap - A map from variable names ('$dst') to the recorded operand - /// number that they were captured as. These are biased by 1 to make - /// insertion easier. - StringMap<unsigned> VariableMap; - - /// This maintains the recorded operand number that OPC_CheckComplexPattern - /// drops each sub-operand into. We don't want to insert these into - /// VariableMap because that leads to identity checking if they are - /// encountered multiple times. Biased by 1 like VariableMap for - /// consistency. - StringMap<unsigned> NamedComplexPatternOperands; - - /// NextRecordedOperandNo - As we emit opcodes to record matched values in - /// the RecordedNodes array, this keeps track of which slot will be next to - /// record into.
- unsigned NextRecordedOperandNo; - - /// MatchedChainNodes - This maintains the position in the recorded nodes - /// array of all of the recorded input nodes that have chains. - SmallVector<unsigned, 3> MatchedChainNodes; - - /// MatchedComplexPatterns - This maintains a list of all of the - /// ComplexPatterns that we need to check. The second element of each pair - /// is the recorded operand number of the input node. - SmallVector<std::pair<const TreePatternNode *, unsigned>, 2> MatchedComplexPatterns; - - /// PhysRegInputs - List list has an entry for each explicitly specified - /// physreg input to the pattern. The first elt is the Register node, the - /// second is the recorded slot number the input pattern match saved it in. - SmallVector<std::pair<Record *, unsigned>, 2> PhysRegInputs; - - /// Matcher - This is the top level of the generated matcher, the result. - Matcher *TheMatcher; - - /// CurPredicate - As we emit matcher nodes, this points to the latest check - /// which should have future checks stuck into its Next position. - Matcher *CurPredicate; - public: - MatcherGen(const PatternToMatch &pattern, const CodeGenDAGPatterns &cgp); - - bool EmitMatcherCode(unsigned Variant); - void EmitResultCode(); - - Matcher *GetMatcher() const { return TheMatcher; } - private: - void AddMatcher(Matcher *NewNode); - void InferPossibleTypes(); - - // Matcher Generation. - void EmitMatchCode(const TreePatternNode *N, TreePatternNode *NodeNoTypes); - void EmitLeafMatchCode(const TreePatternNode *N); - void EmitOperatorMatchCode(const TreePatternNode *N, - TreePatternNode *NodeNoTypes); - - /// If this is the first time a node with unique identifier Name has been - /// seen, record it. Otherwise, emit a check to make sure this is the same - /// node. Returns true if this is the first encounter. - bool recordUniqueNode(ArrayRef<std::string> Names); - - // Result Code Generation. - unsigned getNamedArgumentSlot(StringRef Name) { - unsigned VarMapEntry = VariableMap[Name]; - assert(VarMapEntry != 0 && - "Variable referenced but not defined and not caught earlier!"); - return VarMapEntry-1; - } +class MatcherGen { + const PatternToMatch &Pattern; + const CodeGenDAGPatterns &CGP; + + /// PatWithNoTypes - This is a clone of Pattern.getSrcPattern() that starts + /// out with all of the types removed. This allows us to insert type checks + /// as we scan the tree. + TreePatternNodePtr PatWithNoTypes; + + /// VariableMap - A map from variable names ('$dst') to the recorded operand + /// number that they were captured as. These are biased by 1 to make + /// insertion easier. + StringMap<unsigned> VariableMap; + + /// This maintains the recorded operand number that OPC_CheckComplexPattern + /// drops each sub-operand into. We don't want to insert these into + /// VariableMap because that leads to identity checking if they are + /// encountered multiple times. Biased by 1 like VariableMap for + /// consistency. + StringMap<unsigned> NamedComplexPatternOperands; + + /// NextRecordedOperandNo - As we emit opcodes to record matched values in + /// the RecordedNodes array, this keeps track of which slot will be next to + /// record into. + unsigned NextRecordedOperandNo; + + /// MatchedChainNodes - This maintains the position in the recorded nodes + /// array of all of the recorded input nodes that have chains. + SmallVector<unsigned, 3> MatchedChainNodes; + + /// MatchedComplexPatterns - This maintains a list of all of the + /// ComplexPatterns that we need to check. The second element of each pair + /// is the recorded operand number of the input node.
+ SmallVector<std::pair<const TreePatternNode *, unsigned>, 2> + MatchedComplexPatterns; + + /// PhysRegInputs - List list has an entry for each explicitly specified + /// physreg input to the pattern. The first elt is the Register node, the + /// second is the recorded slot number the input pattern match saved it in. + SmallVector<std::pair<Record *, unsigned>, 2> PhysRegInputs; + + /// Matcher - This is the top level of the generated matcher, the result. + Matcher *TheMatcher; + + /// CurPredicate - As we emit matcher nodes, this points to the latest check + /// which should have future checks stuck into its Next position. + Matcher *CurPredicate; + +public: + MatcherGen(const PatternToMatch &pattern, const CodeGenDAGPatterns &cgp); + + bool EmitMatcherCode(unsigned Variant); + void EmitResultCode(); + + Matcher *GetMatcher() const { return TheMatcher; } + +private: + void AddMatcher(Matcher *NewNode); + void InferPossibleTypes(); + + // Matcher Generation. + void EmitMatchCode(const TreePatternNode *N, TreePatternNode *NodeNoTypes); + void EmitLeafMatchCode(const TreePatternNode *N); + void EmitOperatorMatchCode(const TreePatternNode *N, + TreePatternNode *NodeNoTypes); + + /// If this is the first time a node with unique identifier Name has been + /// seen, record it. Otherwise, emit a check to make sure this is the same + /// node. Returns true if this is the first encounter. + bool recordUniqueNode(ArrayRef<std::string> Names); + + // Result Code Generation. + unsigned getNamedArgumentSlot(StringRef Name) { + unsigned VarMapEntry = VariableMap[Name]; + assert(VarMapEntry != 0 && + "Variable referenced but not defined and not caught earlier!"); + return VarMapEntry - 1; + } - void EmitResultOperand(const TreePatternNode *N, - SmallVectorImpl<unsigned> &ResultOps); - void EmitResultOfNamedOperand(const TreePatternNode *N, - SmallVectorImpl<unsigned> &ResultOps); - void EmitResultLeafAsOperand(const TreePatternNode *N, - SmallVectorImpl<unsigned> &ResultOps); - void EmitResultInstructionAsOperand(const TreePatternNode *N, - SmallVectorImpl<unsigned> &ResultOps); - void EmitResultSDNodeXFormAsOperand(const TreePatternNode *N, - SmallVectorImpl<unsigned> &ResultOps); - }; + void EmitResultOperand(const TreePatternNode *N, + SmallVectorImpl<unsigned> &ResultOps); + void EmitResultOfNamedOperand(const TreePatternNode *N, + SmallVectorImpl<unsigned> &ResultOps); + void EmitResultLeafAsOperand(const TreePatternNode *N, + SmallVectorImpl<unsigned> &ResultOps); + void EmitResultInstructionAsOperand(const TreePatternNode *N, + SmallVectorImpl<unsigned> &ResultOps); + void EmitResultSDNodeXFormAsOperand(const TreePatternNode *N, + SmallVectorImpl<unsigned> &ResultOps); +}; } // end anonymous namespace @@ -180,11 +180,10 @@ void MatcherGen::InferPossibleTypes() { bool MadeChange = true; while (MadeChange) - MadeChange = PatWithNoTypes->ApplyTypeConstraints(TP, - true/*Ignore reg constraints*/); + MadeChange = PatWithNoTypes->ApplyTypeConstraints( + TP, true /*Ignore reg constraints*/); } - /// AddMatcher - Add a matcher node to the current graph we're building. void MatcherGen::AddMatcher(Matcher *NewNode) { if (CurPredicate) @@ -194,7 +193,6 @@ void MatcherGen::AddMatcher(Matcher *NewNode) { CurPredicate = NewNode; } - //===----------------------------------------------------------------------===// // Pattern Match Generation //===----------------------------------------------------------------------===// @@ -240,7 +238,7 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) { return AddMatcher(new CheckValueTypeMatcher(LeafRec->getName())); } - if (// Handle register references. Nothing to do here, they always match. + if ( // Handle register references.
Nothing to do here, they always match. LeafRec->isSubClassOf("RegisterClass") || LeafRec->isSubClassOf("RegisterOperand") || LeafRec->isSubClassOf("PointerLikeRegClass") || @@ -252,7 +250,7 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) { // If we have a physreg reference like (mul gpr:$src, EAX) then we need to // record the register if (LeafRec->isSubClassOf("Register")) { - AddMatcher(new RecordMatcher("physreg input "+LeafRec->getName().str(), + AddMatcher(new RecordMatcher("physreg input " + LeafRec->getName().str(), NextRecordedOperandNo)); PhysRegInputs.push_back(std::make_pair(LeafRec, NextRecordedOperandNo++)); return; @@ -376,7 +374,7 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, if (N->NodeHasProperty(SDNPHasChain, CGP)) { // Record the node and remember it in our chained nodes list. AddMatcher(new RecordMatcher("'" + N->getOperator()->getName().str() + - "' chained node", + "' chained node", NextRecordedOperandNo)); // Remember all of the input chains our pattern will match. MatchedChainNodes.push_back(NextRecordedOperandNo++); @@ -407,7 +405,7 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, // this to be folded. // const TreePatternNode *Root = Pattern.getSrcPattern(); - if (N != Root) { // Not the root of the pattern. + if (N != Root) { // Not the root of the pattern. // If there is a node between the root and this node, then we definitely // need to emit the check. bool NeedCheck = !Root->hasChild(N); @@ -419,13 +417,11 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, if (!NeedCheck) { const SDNodeInfo &PInfo = CGP.getSDNodeInfo(Root->getOperator()); NeedCheck = - Root->getOperator() == CGP.get_intrinsic_void_sdnode() || - Root->getOperator() == CGP.get_intrinsic_w_chain_sdnode() || - Root->getOperator() == CGP.get_intrinsic_wo_chain_sdnode() || - PInfo.getNumOperands() > 1 || - PInfo.hasProperty(SDNPHasChain) || - PInfo.hasProperty(SDNPInGlue) || - PInfo.hasProperty(SDNPOptInGlue); + Root->getOperator() == CGP.get_intrinsic_void_sdnode() || + Root->getOperator() == CGP.get_intrinsic_w_chain_sdnode() || + Root->getOperator() == CGP.get_intrinsic_wo_chain_sdnode() || + PInfo.getNumOperands() > 1 || PInfo.hasProperty(SDNPHasChain) || + PInfo.hasProperty(SDNPInGlue) || PInfo.hasProperty(SDNPOptInGlue); } if (NeedCheck) @@ -434,13 +430,12 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, } // If this node has an output glue and isn't the root, remember it. - if (N->NodeHasProperty(SDNPOutGlue, CGP) && - N != Pattern.getSrcPattern()) { + if (N->NodeHasProperty(SDNPOutGlue, CGP) && N != Pattern.getSrcPattern()) { // TODO: This redundantly records nodes with both glues and chains. // Record the node and remember it in our chained nodes list. AddMatcher(new RecordMatcher("'" + N->getOperator()->getName().str() + - "' glue output node", + "' glue output node", NextRecordedOperandNo)); } @@ -485,7 +480,7 @@ bool MatcherGen::recordUniqueNode(ArrayRef Names) { // we already have checked that the first reference is valid, we don't // have to recursively match it, just check that it's the same as the // previously named thing. 
- AddMatcher(new CheckSameMatcher(Entry - 1)); } for (const std::string &Name : Names) @@ -502,7 +497,8 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N, SmallVector<unsigned, 4> ResultsToTypeCheck; for (unsigned i = 0, e = NodeNoTypes->getNumTypes(); i != e; ++i) { - if (NodeNoTypes->getExtType(i) == N->getExtType(i)) continue; + if (NodeNoTypes->getExtType(i) == N->getExtType(i)) + continue; NodeNoTypes->setType(i, N->getExtType(i)); InferPossibleTypes(); ResultsToTypeCheck.push_back(i); @@ -515,7 +511,8 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N, Names.push_back(N->getName()); for (const ScopedName &Name : N->getNamesAsPredicateArg()) { - Names.push_back(("pred:" + Twine(Name.getScope()) + ":" + Name.getIdentifier()).str()); + Names.push_back( + ("pred:" + Twine(Name.getScope()) + ":" + Name.getIdentifier()).str()); } if (!Names.empty()) { @@ -557,14 +554,17 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) { // Depending on which variant we're generating code for, emit the root opcode // check. if (const ComplexPattern *CP = - Pattern.getSrcPattern()->getComplexPatternInfo(CGP)) { - const std::vector<Record *> &OpNodes = CP->getRootNodes(); - assert(!OpNodes.empty() &&"Complex Pattern must specify what it can match"); - if (Variant >= OpNodes.size()) return true; + Pattern.getSrcPattern()->getComplexPatternInfo(CGP)) { + const std::vector<Record *> &OpNodes = CP->getRootNodes(); + assert(!OpNodes.empty() && + "Complex Pattern must specify what it can match"); + if (Variant >= OpNodes.size()) + return true; AddMatcher(new CheckOpcodeMatcher(CGP.getSDNodeInfo(OpNodes[Variant]))); } else { - if (Variant != 0) return true; + if (Variant != 0) + return true; } // Emit the matcher for the pattern structure and types. @@ -616,7 +616,7 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) { // It is the last operand recorded.
assert(NextRecordedOperandNo > 1 && "Should have recorded input/result chains at least!"); - MatchedChainNodes.push_back(NextRecordedOperandNo-1); + MatchedChainNodes.push_back(NextRecordedOperandNo - 1); } // TODO: Complex patterns can't have output glues, if they did, we'd want @@ -626,13 +626,12 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) { return false; } - //===----------------------------------------------------------------------===// // Node Result Generation //===----------------------------------------------------------------------===// -void MatcherGen::EmitResultOfNamedOperand(const TreePatternNode *N, - SmallVectorImpl<unsigned> &ResultOps){ +void MatcherGen::EmitResultOfNamedOperand( + const TreePatternNode *N, SmallVectorImpl<unsigned> &ResultOps) { assert(!N->getName().empty() && "Operand not named!"); if (unsigned SlotNo = NamedComplexPatternOperands[N->getName()]) { @@ -676,8 +675,7 @@ void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode *N, if (DefInit *DI = dyn_cast<DefInit>(N->getLeafValue())) { Record *Def = DI->getDef(); if (Def->isSubClassOf("Register")) { - const CodeGenRegister *Reg = - CGP.getTargetInfo().getRegBank().getReg(Def); + const CodeGenRegister *Reg = CGP.getTargetInfo().getRegBank().getReg(Def); AddMatcher(new EmitRegisterMatcher(Reg, N->getSimpleType(0))); ResultOps.push_back(NextRecordedOperandNo++); return; @@ -746,18 +744,16 @@ void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode *N, N->dump(); } -static bool -mayInstNodeLoadOrStore(const TreePatternNode *N, - const CodeGenDAGPatterns &CGP) { +static bool mayInstNodeLoadOrStore(const TreePatternNode *N, + const CodeGenDAGPatterns &CGP) { Record *Op = N->getOperator(); const CodeGenTarget &CGT = CGP.getTargetInfo(); CodeGenInstruction &II = CGT.getInstruction(Op); return II.mayLoad || II.mayStore; } -static unsigned -numNodesThatMayLoadOrStore(const TreePatternNode *N, - const CodeGenDAGPatterns &CGP) { +static unsigned numNodesThatMayLoadOrStore(const TreePatternNode *N, + const CodeGenDAGPatterns &CGP) { if (N->isLeaf()) return 0; @@ -775,9 +771,8 @@ numNodesThatMayLoadOrStore(const TreePatternNode *N, return Count; } -void MatcherGen:: -EmitResultInstructionAsOperand(const TreePatternNode *N, - SmallVectorImpl<unsigned> &OutputOps) { +void MatcherGen::EmitResultInstructionAsOperand( + const TreePatternNode *N, SmallVectorImpl<unsigned> &OutputOps) { Record *Op = N->getOperator(); const CodeGenTarget &CGT = CGP.getTargetInfo(); CodeGenInstruction &II = CGT.getInstruction(Op); @@ -823,11 +818,11 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, // filled in with their defaults unconditionally. unsigned NonOverridableOperands = NumFixedOperands; while (NonOverridableOperands > NumResults && - CGP.operandHasDefault(II.Operands[NonOverridableOperands-1].Rec)) + CGP.operandHasDefault(II.Operands[NonOverridableOperands - 1].Rec)) --NonOverridableOperands; - for (unsigned InstOpNo = NumResults, e = NumFixedOperands; - InstOpNo != e; ++InstOpNo) { + for (unsigned InstOpNo = NumResults, e = NumFixedOperands; InstOpNo != e; + ++InstOpNo) { // Determine what to emit for this operand. Record *OperandNode = II.Operands[InstOpNo].Rec; if (CGP.operandHasDefault(OperandNode) && @@ -835,8 +830,7 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, // This is a predicate or optional def operand which the pattern has not // overridden, or which we aren't letting it override; emit the 'default // ops' operands.
- const DAGDefaultOperand &DefaultOp - = CGP.getDefaultOperand(OperandNode); + const DAGDefaultOperand &DefaultOp = CGP.getDefaultOperand(OperandNode); for (unsigned i = 0, e = DefaultOp.DefaultOps.size(); i != e; ++i) EmitResultOperand(DefaultOp.DefaultOps[i].get(), InstOps); continue; @@ -865,7 +859,7 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, // If the operand is an instruction and it produced multiple results, just // take the first one. if (!Child->isLeaf() && Child->getOperator()->isSubClassOf("Instruction")) - InstOps.resize(BeforeAddingNumOps+1); + InstOps.resize(BeforeAddingNumOps + 1); ++ChildNo; } @@ -889,9 +883,8 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, // occur in patterns like (mul:i8 AL:i8, GR8:i8:$src). for (unsigned i = 0, e = PhysRegInputs.size(); i != e; ++i) { const CodeGenRegister *Reg = - CGP.getTargetInfo().getRegBank().getReg(PhysRegInputs[i].first); - AddMatcher(new EmitCopyToRegMatcher(PhysRegInputs[i].second, - Reg)); + CGP.getTargetInfo().getRegBank().getReg(PhysRegInputs[i].first); + AddMatcher(new EmitCopyToRegMatcher(PhysRegInputs[i].second, Reg)); } // Even if the node has no other glue inputs, the resultant node must be @@ -919,7 +912,8 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, HandledReg = II.ImplicitDefs[0]; for (Record *Reg : Pattern.getDstRegs()) { - if (!Reg->isSubClassOf("Register") || Reg == HandledReg) continue; + if (!Reg->isSubClassOf("Register") || Reg == HandledReg) + continue; ResultVTs.push_back(getRegisterValueType(Reg, CGT)); } } @@ -928,8 +922,7 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, // a node that is variadic, mark the generated node as variadic so that it // gets the excess operands from the input DAG. int NumFixedArityOperands = -1; - if (isRoot && - Pattern.getSrcPattern()->NodeHasProperty(SDNPVariadic, CGP)) + if (isRoot && Pattern.getSrcPattern()->NodeHasProperty(SDNPVariadic, CGP)) NumFixedArityOperands = Pattern.getSrcPattern()->getNumChildren(); // If this is the root node and multiple matched nodes in the input pattern @@ -940,17 +933,17 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, // FIXME3: This is actively incorrect for result patterns with multiple // memory-referencing instructions. bool PatternHasMemOperands = - Pattern.getSrcPattern()->TreeHasProperty(SDNPMemOperand, CGP); + Pattern.getSrcPattern()->TreeHasProperty(SDNPMemOperand, CGP); bool NodeHasMemRefs = false; if (PatternHasMemOperands) { unsigned NumNodesThatLoadOrStore = - numNodesThatMayLoadOrStore(Pattern.getDstPattern(), CGP); - bool NodeIsUniqueLoadOrStore = mayInstNodeLoadOrStore(N, CGP) && - NumNodesThatLoadOrStore == 1; + numNodesThatMayLoadOrStore(Pattern.getDstPattern(), CGP); + bool NodeIsUniqueLoadOrStore = + mayInstNodeLoadOrStore(N, CGP) && NumNodesThatLoadOrStore == 1; NodeHasMemRefs = - NodeIsUniqueLoadOrStore || (isRoot && (mayInstNodeLoadOrStore(N, CGP) || - NumNodesThatLoadOrStore != 1)); + NodeIsUniqueLoadOrStore || (isRoot && (mayInstNodeLoadOrStore(N, CGP) || + NumNodesThatLoadOrStore != 1)); } // Determine whether we need to attach a chain to this node. @@ -982,14 +975,14 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, // The non-chain and non-glue results of the newly emitted node get recorded. 
for (unsigned i = 0, e = ResultVTs.size(); i != e; ++i) { - if (ResultVTs[i] == MVT::Other || ResultVTs[i] == MVT::Glue) break; + if (ResultVTs[i] == MVT::Other || ResultVTs[i] == MVT::Glue) + break; OutputOps.push_back(NextRecordedOperandNo++); } } -void MatcherGen:: -EmitResultSDNodeXFormAsOperand(const TreePatternNode *N, - SmallVectorImpl<unsigned> &ResultOps) { +void MatcherGen::EmitResultSDNodeXFormAsOperand( + const TreePatternNode *N, SmallVectorImpl<unsigned> &ResultOps) { assert(N->getOperator()->isSubClassOf("SDNodeXForm") && "Not SDNodeXForm?"); // Emit the operand. @@ -1051,7 +1044,8 @@ void MatcherGen::EmitResultCode() { // don't re-add it. Record *HandledReg = nullptr; const TreePatternNode *DstPat = Pattern.getDstPattern(); - if (!DstPat->isLeaf() &&DstPat->getOperator()->isSubClassOf("Instruction")){ + if (!DstPat->isLeaf() && + DstPat->getOperator()->isSubClassOf("Instruction")) { const CodeGenTarget &CGT = CGP.getTargetInfo(); CodeGenInstruction &II = CGT.getInstruction(DstPat->getOperator()); @@ -1060,7 +1054,8 @@ void MatcherGen::EmitResultCode() { for (Record *Reg : Pattern.getDstRegs()) { - if (!Reg->isSubClassOf("Register") || Reg == HandledReg) continue; + if (!Reg->isSubClassOf("Register") || Reg == HandledReg) + continue; ++NumSrcResults; } } @@ -1077,7 +1072,6 @@ void MatcherGen::EmitResultCode() { AddMatcher(new CompleteMatchMatcher(Results, Pattern)); } - /// ConvertPatternToMatcher - Create the matcher for the specified pattern with /// the specified variant. If the variant number is invalid, this returns null. Matcher *llvm::ConvertPatternToMatcher(const PatternToMatch &Pattern, diff --git a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp index c4c25dc..b137492 100644 --- a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp @@ -311,10 +311,9 @@ static void FactorNodes(std::unique_ptr<Matcher> &InputMatcherPtr) { // Don't print if it's obvious nothing extra could be merged anyway.
std::next(J) != E) { LLVM_DEBUG(errs() << "Couldn't merge this:\n"; Optn->print(errs(), 4); - errs() << "into this:\n"; - (*J)->print(errs(), 4); + errs() << "into this:\n"; (*J)->print(errs(), 4); (*std::next(J))->printOne(errs()); - if (std::next(J, 2) != E) (*std::next(J, 2))->printOne(errs()); + if (std::next(J, 2) != E)(*std::next(J, 2))->printOne(errs()); errs() << "\n"); } diff --git a/llvm/utils/TableGen/DFAEmitter.cpp b/llvm/utils/TableGen/DFAEmitter.cpp index 54ad81c..0d22ad2 100644 --- a/llvm/utils/TableGen/DFAEmitter.cpp +++ b/llvm/utils/TableGen/DFAEmitter.cpp @@ -147,8 +147,8 @@ void DfaEmitter::emit(StringRef Name, raw_ostream &OS) { OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n"; OS << "// The initial state is 1, not zero.\n"; - OS << "const std::array<" << Name << "Transition, " - << DfaTransitions.size() << "> " << Name << "Transitions = {{\n"; + OS << "const std::array<" << Name << "Transition, " << DfaTransitions.size() + << "> " << Name << "Transitions = {{\n"; for (auto &KV : DfaTransitions) { dfa_state_type From = KV.first.first; dfa_state_type To = KV.second.first; @@ -284,7 +284,7 @@ void Automaton::emit(raw_ostream &OS) { } LLVM_DEBUG(dbgs() << " NFA automaton has " << SeenStates.size() << " states with " << NumTransitions << " transitions.\n"); - (void) NumTransitions; + (void)NumTransitions; const auto &ActionTypes = Transitions.back().getTypes(); OS << "// The type of an action in the " << Name << " automaton.\n"; @@ -346,9 +346,7 @@ bool Transition::canTransitionFrom(uint64_t State) { return false; } -uint64_t Transition::transitionFrom(uint64_t State) { - return State | NewState; -} +uint64_t Transition::transitionFrom(uint64_t State) { return State | NewState; } void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; } diff --git a/llvm/utils/TableGen/DFAPacketizerEmitter.cpp b/llvm/utils/TableGen/DFAPacketizerEmitter.cpp index 64c7884..26ea184 100644 --- a/llvm/utils/TableGen/DFAPacketizerEmitter.cpp +++ b/llvm/utils/TableGen/DFAPacketizerEmitter.cpp @@ -72,8 +72,7 @@ public: DFAPacketizerEmitter(RecordKeeper &R); // Construct a map of function unit names to bits. - int collectAllFuncUnits( - ArrayRef ProcModels); + int collectAllFuncUnits(ArrayRef ProcModels); // Construct a map from a combo function unit bit to the bits of all included // functional units. 
@@ -129,7 +128,8 @@ int DFAPacketizerEmitter::collectAllFuncUnits( return totalFUs; } -int DFAPacketizerEmitter::collectAllComboFuncs(ArrayRef ComboFuncList) { +int DFAPacketizerEmitter::collectAllComboFuncs( + ArrayRef ComboFuncList) { LLVM_DEBUG(dbgs() << "-------------------------------------------------------" "----------------------\n"); LLVM_DEBUG(dbgs() << "collectAllComboFuncs"); diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp index cb9f9c6..25e818a 100644 --- a/llvm/utils/TableGen/DXILEmitter.cpp +++ b/llvm/utils/TableGen/DXILEmitter.cpp @@ -42,22 +42,23 @@ struct DXILParameter { }; struct DXILOperationDesc { - StringRef OpName; // name of DXIL operation - int OpCode; // ID of DXIL operation - StringRef OpClass; // name of the opcode class - StringRef Category; // classification for this instruction - StringRef Doc; // the documentation description of this instruction + StringRef OpName; // name of DXIL operation + int OpCode; // ID of DXIL operation + StringRef OpClass; // name of the opcode class + StringRef Category; // classification for this instruction + StringRef Doc; // the documentation description of this instruction SmallVector Params; // the operands that this instruction takes - StringRef OverloadTypes; // overload types if applicable - StringRef FnAttr; // attribute shorthands: rn=does not access - // memory,ro=only reads from memory - StringRef Intrinsic; // The llvm intrinsic map to OpName. Default is "" which - // means no map exist - bool IsDeriv = false; // whether this is some kind of derivative + StringRef OverloadTypes; // overload types if applicable + StringRef FnAttr; // attribute shorthands: rn=does not access + // memory,ro=only reads from memory + StringRef Intrinsic; // The llvm intrinsic map to OpName. 
Default is "" which + // means no map exist + bool IsDeriv = false; // whether this is some kind of derivative bool IsGradient = false; // whether this requires a gradient calculation bool IsFeedback = false; // whether this is a sampler feedback op - bool IsWave = false; // whether this requires in-wave, cross-lane functionality + bool IsWave = + false; // whether this requires in-wave, cross-lane functionality bool RequiresUniformInputs = false; // whether this operation requires that // all of its inputs are uniform across // the wave diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index 591ee5c..2f28ccb 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -53,7 +53,8 @@ using namespace llvm; namespace { STATISTIC(NumEncodings, "Number of encodings considered"); -STATISTIC(NumEncodingsLackingDisasm, "Number of encodings without disassembler info"); +STATISTIC(NumEncodingsLackingDisasm, + "Number of encodings without disassembler info"); STATISTIC(NumInstructions, "Number of instructions considered"); STATISTIC(NumEncodingsSupported, "Number of encodings supported"); STATISTIC(NumEncodingsOmitted, "Number of encodings omitted"); @@ -61,7 +62,7 @@ STATISTIC(NumEncodingsOmitted, "Number of encodings omitted"); struct EncodingField { unsigned Base, Width, Offset; EncodingField(unsigned B, unsigned W, unsigned O) - : Base(B), Width(W), Offset(O) { } + : Base(B), Width(W), Offset(O) {} }; struct OperandInfo { @@ -82,7 +83,7 @@ struct OperandInfo { typedef std::vector<EncodingField>::const_iterator const_iterator; const_iterator begin() const { return Fields.begin(); } - const_iterator end() const { return Fields.end(); } + const_iterator end() const { return Fields.end(); } }; typedef std::vector<uint8_t> DecoderTable; @@ -141,8 +142,7 @@ public: void emitPredicateFunction(formatted_raw_ostream &OS, PredicateSet &Predicates, unsigned Indentation) const; - void emitDecoderFunction(formatted_raw_ostream &OS, - DecoderSet &Decoders, + void emitDecoderFunction(formatted_raw_ostream &OS, DecoderSet &Decoders, unsigned Indentation) const; // run - Output the code emitter @@ -173,9 +173,7 @@ static bool ValueSet(bit_value_t V) { return (V == BIT_TRUE || V == BIT_FALSE); } -static bool ValueNotSet(bit_value_t V) { - return (V == BIT_UNSET); -} +static bool ValueNotSet(bit_value_t V) { return (V == BIT_UNSET); } static int Value(bit_value_t V) { return ValueNotSet(V) ? -1 : (V == BIT_FALSE ? 0 : 1); @@ -280,14 +278,14 @@ class FilterChooser; /// version and return the Opcode since the two have the same Asm format string. class Filter { protected: - const FilterChooser *Owner;// points to the FilterChooser who owns this filter + const FilterChooser + *Owner; // points to the FilterChooser who owns this filter unsigned StartBit; // the starting bit position - unsigned NumBits; // number of bits to filter - bool Mixed; // a mixed region contains both set and unset bits + unsigned NumBits; // number of bits to filter + bool Mixed; // a mixed region contains both set and unset bits // Map of well-known segment value to the set of uid's with that value. - std::map<uint64_t, std::vector<unsigned>> - FilteredInstructions; + std::map<uint64_t, std::vector<unsigned>> FilteredInstructions; // Set of uid's with non-constant segment values. std::vector<unsigned> VariableInstructions; @@ -471,7 +469,7 @@ protected: /// dumpFilterArray - dumpFilterArray prints out debugging info for the given /// filter array as a series of chars.
void dumpFilterArray(raw_ostream &o, - const std::vector & filter) const; + const std::vector &filter) const; /// dumpStack - dumpStack traverses the filter chooser chain and calls /// dumpFilterArray on each filter chooser up to the top level one. @@ -504,11 +502,9 @@ protected: bool doesOpcodeNeedPredicate(unsigned Opc) const; unsigned getPredicateIndex(DecoderTableInfo &TableInfo, StringRef P) const; - void emitPredicateTableEntry(DecoderTableInfo &TableInfo, - unsigned Opc) const; + void emitPredicateTableEntry(DecoderTableInfo &TableInfo, unsigned Opc) const; - void emitSoftFailTableEntry(DecoderTableInfo &TableInfo, - unsigned Opc) const; + void emitSoftFailTableEntry(DecoderTableInfo &TableInfo, unsigned Opc) const; // Emits table entries to decode the singleton. void emitSingletonTableEntry(DecoderTableInfo &TableInfo, @@ -560,16 +556,15 @@ public: /////////////////////////// Filter::Filter(Filter &&f) - : Owner(f.Owner), StartBit(f.StartBit), NumBits(f.NumBits), Mixed(f.Mixed), - FilteredInstructions(std::move(f.FilteredInstructions)), - VariableInstructions(std::move(f.VariableInstructions)), - FilterChooserMap(std::move(f.FilterChooserMap)), NumFiltered(f.NumFiltered), - LastOpcFiltered(f.LastOpcFiltered) { -} + : Owner(f.Owner), StartBit(f.StartBit), NumBits(f.NumBits), Mixed(f.Mixed), + FilteredInstructions(std::move(f.FilteredInstructions)), + VariableInstructions(std::move(f.VariableInstructions)), + FilterChooserMap(std::move(f.FilterChooserMap)), + NumFiltered(f.NumFiltered), LastOpcFiltered(f.LastOpcFiltered) {} Filter::Filter(FilterChooser &owner, unsigned startBit, unsigned numBits, bool mixed) - : Owner(&owner), StartBit(startBit), NumBits(numBits), Mixed(mixed) { + : Owner(&owner), StartBit(startBit), NumBits(numBits), Mixed(mixed) { assert(StartBit + NumBits - 1 < Owner->BitWidth); NumFiltered = 0; @@ -598,8 +593,8 @@ Filter::Filter(FilterChooser &owner, unsigned startBit, unsigned numBits, } } - assert((FilteredInstructions.size() + VariableInstructions.size() > 0) - && "Filter returns no instruction categories"); + assert((FilteredInstructions.size() + VariableInstructions.size() > 0) && + "Filter returns no instruction categories"); } // Divides the decoding task into sub tasks and delegates them to the @@ -619,9 +614,11 @@ void Filter::recurse() { // Delegates to an inferior filter chooser for further processing on this // group of instructions whose segment values are variable. - FilterChooserMap.insert(std::make_pair(NO_FIXED_SEGMENTS_SENTINEL, + FilterChooserMap.insert(std::make_pair( + NO_FIXED_SEGMENTS_SENTINEL, std::make_unique(Owner->AllInstructions, - VariableInstructions, Owner->Operands, BitValueArray, *Owner))); + VariableInstructions, Owner->Operands, + BitValueArray, *Owner))); } // No need to recurse for a singleton filtered instruction. @@ -646,8 +643,8 @@ void Filter::recurse() { // category of instructions. FilterChooserMap.insert(std::make_pair( Inst.first, std::make_unique( - Owner->AllInstructions, Inst.second, - Owner->Operands, BitValueArray, *Owner))); + Owner->AllInstructions, Inst.second, Owner->Operands, + BitValueArray, *Owner))); } } @@ -655,8 +652,7 @@ static void resolveTableFixups(DecoderTable &Table, const FixupList &Fixups, uint32_t DestIdx) { // Any NumToSkip fixups in the current scope can resolve to the // current location. 
- for (FixupList::const_reverse_iterator I = Fixups.rbegin(), - E = Fixups.rend(); + for (FixupList::const_reverse_iterator I = Fixups.rbegin(), E = Fixups.rend(); I != E; ++I) { // Calculate the distance from the byte following the fixup entry byte // to the destination. The Target is calculated from after the 16-bit @@ -705,7 +701,7 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const { // Resolve any NumToSkip fixups in the current scope. resolveTableFixups(Table, CurScope, Table.size()); CurScope.clear(); - PrevFilter = 0; // Don't re-process the filter's fallthrough. + PrevFilter = 0; // Don't re-process the filter's fallthrough. } else { Table.push_back(MCD::OPC_FilterValue); // Encode and emit the value to filter against. @@ -731,7 +727,8 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const { // two as to account for the width of the NumToSkip field itself. if (PrevFilter) { uint32_t NumToSkip = Table.size() - PrevFilter - 3; - assert(NumToSkip < (1u << 24) && "disassembler decoding table too large!"); + assert(NumToSkip < (1u << 24) && + "disassembler decoding table too large!"); Table[PrevFilter] = (uint8_t)NumToSkip; Table[PrevFilter + 1] = (uint8_t)(NumToSkip >> 8); Table[PrevFilter + 2] = (uint8_t)(NumToSkip >> 16); @@ -771,7 +768,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, unsigned Indentation, unsigned BitWidth, StringRef Namespace) const { OS.indent(Indentation) << "static const uint8_t DecoderTable" << Namespace - << BitWidth << "[] = {\n"; + << BitWidth << "[] = {\n"; Indentation += 2; @@ -807,7 +804,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, DecoderTable::const_iterator I = Table.begin(); DecoderTable::const_iterator E = Table.end(); while (I != E) { - assert (I < E && "incomplete decode table entry!"); + assert(I < E && "incomplete decode table entry!"); uint64_t Pos = I - Table.begin(); OS << "/* " << Pos << " */"; @@ -884,8 +881,8 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, Table.data() + Table.size(), &ErrMsg); assert(ErrMsg == nullptr && "ULEB128 value too large!"); - OS.indent(Indentation) << "MCD::OPC_" << (IsTry ? "Try" : "") - << "Decode, "; + OS.indent(Indentation) + << "MCD::OPC_" << (IsTry ? "Try" : "") << "Decode, "; I += emitULEB128(I, OS); // Decoder index. @@ -967,15 +964,16 @@ void DecoderEmitter::emitPredicateFunction(formatted_raw_ostream &OS, // The predicate function is just a big switch statement based on the // input predicate index. OS.indent(Indentation) << "static bool checkDecoderPredicate(unsigned Idx, " - << "const FeatureBitset &Bits) {\n"; + << "const FeatureBitset &Bits) {\n"; Indentation += 2; if (!Predicates.empty()) { OS.indent(Indentation) << "switch (Idx) {\n"; - OS.indent(Indentation) << "default: llvm_unreachable(\"Invalid index!\");\n"; + OS.indent(Indentation) + << "default: llvm_unreachable(\"Invalid index!\");\n"; unsigned Index = 0; for (const auto &Predicate : Predicates) { OS.indent(Indentation) << "case " << Index++ << ":\n"; - OS.indent(Indentation+2) << "return (" << Predicate << ");\n"; + OS.indent(Indentation + 2) << "return (" << Predicate << ");\n"; } OS.indent(Indentation) << "}\n"; } else { @@ -993,7 +991,7 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS, // input decoder index. 
OS.indent(Indentation) << "template <typename InsnType>\n"; OS.indent(Indentation) << "static DecodeStatus decodeToMCInst(DecodeStatus S," - << " unsigned Idx, InsnType insn, MCInst &MI,\n"; + << " unsigned Idx, InsnType insn, MCInst &MI,\n"; OS.indent(Indentation) << " uint64_t " << "Address, const MCDisassembler *Decoder, bool &DecodeComplete) {\n"; @@ -1012,7 +1010,7 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS, for (const auto &Decoder : Decoders) { OS.indent(Indentation) << "case " << Index++ << ":\n"; OS << Decoder; - OS.indent(Indentation+2) << "return S;\n"; + OS.indent(Indentation + 2) << "return S;\n"; OS.indent(Indentation) << "}\n"; Indentation -= 2; @@ -1041,8 +1039,8 @@ bool FilterChooser::fieldFromInsn(uint64_t &Field, insn_t &Insn, /// dumpFilterArray - dumpFilterArray prints out debugging info for the given /// filter array as a series of chars. -void FilterChooser::dumpFilterArray(raw_ostream &o, - const std::vector<bit_value_t> &filter) const { +void FilterChooser::dumpFilterArray( + raw_ostream &o, const std::vector<bit_value_t> &filter) const { for (unsigned bitIndex = BitWidth; bitIndex > 0; bitIndex--) { switch (filter[bitIndex - 1]) { case BIT_UNFILTERED: @@ -1096,7 +1094,8 @@ unsigned FilterChooser::getIslands(std::vector<unsigned> &StartBits, int64_t Val = Value(Insn[i]); bool Filtered = PositionFiltered(i); switch (State) { - default: llvm_unreachable("Unreachable code!"); + default: + llvm_unreachable("Unreachable code!"); case 0: case 1: if (Filtered || Val == -1) @@ -1197,8 +1196,7 @@ void FilterChooser::emitDecoder(raw_ostream &OS, unsigned Indentation, } } -unsigned FilterChooser::getDecoderIndex(DecoderSet &Decoders, - unsigned Opc, +unsigned FilterChooser::getDecoderIndex(DecoderSet &Decoders, unsigned Opc, bool &HasCompleteDecoder) const { // Build up the predicate string. SmallString<256> Decoder; @@ -1343,7 +1341,8 @@ void FilterChooser::emitSoftFailTableEntry(DecoderTableInfo &TableInfo, const RecordVal *RV = AllInstructions[Opc].EncodingDef->getValue("SoftFail"); BitsInit *SFBits = RV ? dyn_cast<BitsInit>(RV->getValue()) : nullptr; - if (!SFBits) return; + if (!SFBits) + return; BitsInit *InstBits = AllInstructions[Opc].EncodingDef->getValueAsBitsInit("Inst"); @@ -1353,7 +1352,8 @@ void FilterChooser::emitSoftFailTableEntry(DecoderTableInfo &TableInfo, bit_value_t B = bitFromBits(*SFBits, i); bit_value_t IB = bitFromBits(*InstBits, i); - if (B != BIT_TRUE) continue; + if (B != BIT_TRUE) + continue; switch (IB) { case BIT_FALSE: @@ -1458,12 +1458,12 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, // decoder method indicates that additional processing should be done to see // if there is any other instruction that also matches the bitpattern and // can decode it. - TableInfo.Table.push_back(HasCompleteDecoder ? MCD::OPC_Decode : - MCD::OPC_TryDecode); + TableInfo.Table.push_back(HasCompleteDecoder ? MCD::OPC_Decode + : MCD::OPC_TryDecode); NumEncodingsSupported++; uint8_t Buffer[16], *p; encodeULEB128(Opc.Opcode, Buffer); - for (p = Buffer; *p >= 128 ; ++p) + for (p = Buffer; *p >= 128; ++p) TableInfo.Table.push_back(*p); TableInfo.Table.push_back(*p); @@ -1825,8 +1825,8 @@ static std::string findOperandDecoderMethod(Record *Record) { std::string Decoder; RecordVal *DecoderString = Record->getValue("DecoderMethod"); - StringInit *String = DecoderString ? - dyn_cast<StringInit>(DecoderString->getValue()) : nullptr; + StringInit *String = + DecoderString ?
dyn_cast<StringInit>(DecoderString->getValue()) : nullptr; if (String) { Decoder = std::string(String->getValue()); if (!Decoder.empty()) @@ -1840,7 +1840,7 @@ static std::string findOperandDecoderMethod(Record *Record) { Decoder = "Decode" + Record->getName().str() + "RegisterClass"; } else if (Record->isSubClassOf("PointerLikeRegClass")) { Decoder = "DecodePointerLikeRegClass" + - utostr(Record->getValueAsInt("RegClassKind")); + utostr(Record->getValueAsInt("RegClassKind")); } return Decoder; @@ -1986,7 +1986,8 @@ populateInstruction(CodeGenTarget &Target, const Record &EncodingDef, // of trying to auto-generate the decoder. StringRef InstDecoder = EncodingDef.getValueAsString("DecoderMethod"); if (InstDecoder != "") { - bool HasCompleteInstDecoder = EncodingDef.getValueAsBit("hasCompleteDecoder"); + bool HasCompleteInstDecoder = + EncodingDef.getValueAsBit("hasCompleteDecoder"); InsnOperands.push_back( OperandInfo(std::string(InstDecoder), HasCompleteInstDecoder)); Operands[Opc] = InsnOperands; @@ -2000,9 +2001,9 @@ populateInstruction(CodeGenTarget &Target, const Record &EncodingDef, // Gather the outputs/inputs of the instruction, so we can find their // positions in the encoding. This assumes for now that they appear in the // MCInst in the order that they're listed. - std::vector<std::pair<Init *, StringRef>> InOutOperands; - DagInit *Out = Def.getValueAsDag("OutOperandList"); - DagInit *In = Def.getValueAsDag("InOperandList"); + std::vector<std::pair<Init *, StringRef>> InOutOperands; + DagInit *Out = Def.getValueAsDag("OutOperandList"); + DagInit *In = Def.getValueAsDag("InOperandList"); for (unsigned i = 0; i < Out->getNumArgs(); ++i) InOutOperands.push_back( std::make_pair(Out->getArg(i), Out->getArgNameStr(i))); @@ -2042,7 +2043,8 @@ populateInstruction(CodeGenTarget &Target, const Record &EncodingDef, Init *OpInit = Op.first; StringRef OpName = Op.second; - // We're ready to find the instruction encoding locations for this operand. + // We're ready to find the instruction encoding locations for this + // operand. // First, find the operand type ("OpInit"), and sub-op names // ("SubArgDag") if present. @@ -2056,7 +2058,8 @@ populateInstruction(CodeGenTarget &Target, const Record &EncodingDef, ? OpTypeRec->getValueAsDag("MIOperandInfo") : nullptr; - // Lookup the decoder method and construct a new OperandInfo to hold our result. + // Lookup the decoder method and construct a new OperandInfo to hold our + // result. OperandInfo OpInfo = getOpInfo(OpTypeRec); // If we have named sub-operands... @@ -2490,7 +2493,8 @@ void DecoderEmitter::run(raw_ostream &o) { NumberedEncodings.emplace_back(NumberedInstruction->TheDef, NumberedInstruction, HwModeName); } - for (const auto &NumberedAlias : RK.getAllDerivedDefinitions("AdditionalEncoding")) + for (const auto &NumberedAlias : + RK.getAllDerivedDefinitions("AdditionalEncoding")) NumberedEncodings.emplace_back( NumberedAlias, &Target.getInstruction(NumberedAlias->getValueAsDef("AliasOf"))); @@ -2551,8 +2555,8 @@ void DecoderEmitter::run(raw_ostream &o) { DecoderTableInfo TableInfo; for (const auto &Opc : OpcMap) { // Emit the decoder for this namespace+width combination. - ArrayRef<EncodingAndInst> NumberedEncodingsRef( - NumberedEncodings.data(), NumberedEncodings.size()); + ArrayRef<EncodingAndInst> NumberedEncodingsRef(NumberedEncodings.data(), + NumberedEncodings.size()); FilterChooser FC(NumberedEncodingsRef, Opc.second, Operands, IsVarLenInst ?
MaxInstLen : 8 * Opc.first.second, this); diff --git a/llvm/utils/TableGen/DisassemblerEmitter.cpp b/llvm/utils/TableGen/DisassemblerEmitter.cpp index 92f3721..ae6a8ef 100644 --- a/llvm/utils/TableGen/DisassemblerEmitter.cpp +++ b/llvm/utils/TableGen/DisassemblerEmitter.cpp @@ -102,8 +102,8 @@ static void EmitDisassembler(RecordKeeper &Records, raw_ostream &OS) { if (Target.getName() == "X86") { DisassemblerTables Tables; - ArrayRef<const CodeGenInstruction *> numberedInstructions = - Target.getInstructionsByEnumValue(); + ArrayRef<const CodeGenInstruction *> numberedInstructions = + Target.getInstructionsByEnumValue(); for (unsigned i = 0, e = numberedInstructions.size(); i != e; ++i) RecognizableInstr::processInstr(Tables, *numberedInstructions[i], i); diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp index b773a6b..dff6503 100644 --- a/llvm/utils/TableGen/FastISelEmitter.cpp +++ b/llvm/utils/TableGen/FastISelEmitter.cpp @@ -1,4 +1,4 @@ -///===- FastISelEmitter.cpp - Generate an instruction selector -------------===// +///===- FastISelEmitter.cpp - Generate an instruction selector ------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -30,7 +30,6 @@ #include <utility> using namespace llvm; - /// InstructionMemo - This class holds additional information about an /// instruction needed to emit code for it. /// @@ -61,15 +60,15 @@ namespace { class ImmPredicateSet { DenseMap<TreePattern *, unsigned> ImmIDs; std::vector<TreePredicateFn> PredsByName; -public: +public: unsigned getIDFor(TreePredicateFn Pred) { unsigned &Entry = ImmIDs[Pred.getOrigPatFragRecord()]; if (Entry == 0) { PredsByName.push_back(Pred); Entry = PredsByName.size(); } - return Entry-1; + return Entry - 1; } const TreePredicateFn &getPredicate(unsigned i) { @@ -80,7 +79,6 @@ public: typedef std::vector<TreePredicateFn>::const_iterator iterator; iterator begin() const { return PredsByName.begin(); } iterator end() const { return PredsByName.end(); } - }; } // End anonymous namespace @@ -92,26 +90,39 @@ struct OperandsSignature { class OpKind { enum { OK_Reg, OK_FP, OK_Imm, OK_Invalid = -1 }; char Repr; - public: + public: OpKind() : Repr(OK_Invalid) {} bool operator<(OpKind RHS) const { return Repr < RHS.Repr; } bool operator==(OpKind RHS) const { return Repr == RHS.Repr; } - static OpKind getReg() { OpKind K; K.Repr = OK_Reg; return K; } - static OpKind getFP() { OpKind K; K.Repr = OK_FP; return K; } + static OpKind getReg() { + OpKind K; + K.Repr = OK_Reg; + return K; + } + static OpKind getFP() { + OpKind K; + K.Repr = OK_FP; + return K; + } static OpKind getImm(unsigned V) { - assert((unsigned)OK_Imm+V < 128 && + assert((unsigned)OK_Imm + V < 128 && "Too many integer predicates for the 'Repr' char"); - OpKind K; K.Repr = OK_Imm+V; return K; + OpKind K; + K.Repr = OK_Imm + V; + return K; } bool isReg() const { return Repr == OK_Reg; } - bool isFP() const { return Repr == OK_FP; } + bool isFP() const { return Repr == OK_FP; } bool isImm() const { return Repr >= OK_Imm; } - unsigned getImmCode() const { assert(isImm()); return Repr-OK_Imm; } + unsigned getImmCode() const { + assert(isImm()); + return Repr - OK_Imm; + } void printManglingSuffix(raw_ostream &OS, ImmPredicateSet &ImmPredicates, bool StripImmCodes) const { @@ -123,12 +134,11 @@ struct OperandsSignature { OS << 'i'; if (!StripImmCodes) if (unsigned Code = getImmCode()) - OS << "_" << ImmPredicates.getPredicate(Code-1).getFnName(); + OS << "_" << ImmPredicates.getPredicate(Code - 1).getFnName(); } } }; - SmallVector<OpKind, 3> Operands; bool
operator<(const OperandsSignature &O) const { @@ -162,15 +172,17 @@ struct OperandsSignature { void emitImmediatePredicate(raw_ostream &OS, ImmPredicateSet &ImmPredicates) { bool EmittedAnything = false; for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - if (!Operands[i].isImm()) continue; + if (!Operands[i].isImm()) + continue; unsigned Code = Operands[i].getImmCode(); - if (Code == 0) continue; + if (Code == 0) + continue; if (EmittedAnything) OS << " &&\n "; - TreePredicateFn PredFn = ImmPredicates.getPredicate(Code-1); + TreePredicateFn PredFn = ImmPredicates.getPredicate(Code - 1); // Emit the type check. TreePattern *TP = PredFn.getOrigPatFragRecord(); @@ -179,7 +191,7 @@ struct OperandsSignature { "Cannot use variable value types with fast isel"); OS << "VT == " << getEnumName(VVT.getSimple().SimpleTy) << " && "; - OS << PredFn.getFnName() << "(imm" << i <<')'; + OS << PredFn.getFnName() << "(imm" << i << ')'; EmittedAnything = true; } } @@ -189,8 +201,7 @@ struct OperandsSignature { /// are supported, false otherwise. /// bool initialize(TreePatternNode *InstPatNode, const CodeGenTarget &Target, - MVT::SimpleValueType VT, - ImmPredicateSet &ImmediatePredicates, + MVT::SimpleValueType VT, ImmPredicateSet &ImmediatePredicates, const CodeGenRegisterClass *OrigDstRC) { if (InstPatNode->isLeaf()) return false; @@ -229,21 +240,20 @@ struct OperandsSignature { if (Rec->getValueAsBit("FastIselShouldIgnore")) return false; - PredNo = ImmediatePredicates.getIDFor(PredFn)+1; + PredNo = ImmediatePredicates.getIDFor(PredFn) + 1; } Operands.push_back(OpKind::getImm(PredNo)); continue; } - // For now, filter out any operand with a predicate. // For now, filter out any operand with multiple values. if (!Op->getPredicateCalls().empty() || Op->getNumTypes() != 1) return false; if (!Op->isLeaf()) { - if (Op->getOperator()->getName() == "fpimm") { + if (Op->getOperator()->getName() == "fpimm") { Operands.push_back(OpKind::getFP()); continue; } @@ -347,7 +357,6 @@ struct OperandsSignature { } } - void PrintManglingSuffix(raw_ostream &OS, const std::vector &PR, ImmPredicateSet &ImmPredicates, bool StripImmCodes = false) const { @@ -380,7 +389,7 @@ class FastISelMap { typedef std::map TypeRetPredMap; typedef std::map OpcodeTypeRetPredMap; typedef std::map - OperandsOpcodeTypeRetPredMap; + OperandsOpcodeTypeRetPredMap; OperandsOpcodeTypeRetPredMap SimplePatterns; @@ -389,22 +398,22 @@ class FastISelMap { MVT::SimpleValueType, std::string>> SimplePatternsCheck; - std::map > - SignaturesWithConstantForms; + std::map> + SignaturesWithConstantForms; StringRef InstNS; ImmPredicateSet ImmediatePredicates; + public: explicit FastISelMap(StringRef InstNS); void collectPatterns(CodeGenDAGPatterns &CGP); void printImmediatePredicates(raw_ostream &OS); void printFunctionDefinitions(raw_ostream &OS); + private: - void emitInstructionCode(raw_ostream &OS, - const OperandsSignature &Operands, - const PredMap &PM, - const std::string &RetVTName); + void emitInstructionCode(raw_ostream &OS, const OperandsSignature &Operands, + const PredMap &PM, const std::string &RetVTName); }; } // End anonymous namespace @@ -433,7 +442,7 @@ static std::string PhyRegForNode(TreePatternNode *Op, return PhysReg; PhysReg += cast(OpLeafRec->getValue("Namespace")->getValue()) - ->getValue(); + ->getValue(); PhysReg += "::"; PhysReg += Target.getRegBank().getReg(OpLeafRec)->getName(); return PhysReg; @@ -443,14 +452,15 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { const CodeGenTarget &Target = CGP.getTargetInfo(); 
// Scan through all the patterns and record the simple ones. - for (CodeGenDAGPatterns::ptm_iterator I = CGP.ptm_begin(), - E = CGP.ptm_end(); I != E; ++I) { + for (CodeGenDAGPatterns::ptm_iterator I = CGP.ptm_begin(), E = CGP.ptm_end(); + I != E; ++I) { const PatternToMatch &Pattern = *I; // For now, just look at Instructions, so that we don't have to worry // about emitting multiple instructions for a pattern. TreePatternNode *Dst = Pattern.getDstPattern(); - if (Dst->isLeaf()) continue; + if (Dst->isLeaf()) + continue; Record *Op = Dst->getOperator(); if (!Op->isSubClassOf("Instruction")) continue; @@ -495,7 +505,8 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { } else { // If this isn't a leaf, then continue since the register classes are // a bit too complicated for now. - if (!Dst->getChild(1)->isLeaf()) continue; + if (!Dst->getChild(1)->isLeaf()) + continue; DefInit *SR = dyn_cast<DefInit>(Dst->getChild(1)->getLeafValue()); if (SR) @@ -506,16 +517,20 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { // Inspect the pattern. TreePatternNode *InstPatNode = Pattern.getSrcPattern(); - if (!InstPatNode) continue; - if (InstPatNode->isLeaf()) continue; + if (!InstPatNode) + continue; + if (InstPatNode->isLeaf()) + continue; // Ignore multiple result nodes for now. - if (InstPatNode->getNumTypes() > 1) continue; + if (InstPatNode->getNumTypes() > 1) + continue; Record *InstPatOp = InstPatNode->getOperator(); std::string OpcodeName = getOpcodeName(InstPatOp, CGP); MVT::SimpleValueType RetVT = MVT::isVoid; - if (InstPatNode->getNumTypes()) RetVT = InstPatNode->getSimpleType(0); + if (InstPatNode->getNumTypes()) + RetVT = InstPatNode->getSimpleType(0); MVT::SimpleValueType VT = RetVT; if (InstPatNode->getNumChildren()) { assert(InstPatNode->getChild(0)->getNumTypes() == 1); @@ -546,7 +561,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { if (PhysReg.empty()) { if (DstIndex >= Dst->getNumChildren() || Dst->getChild(DstIndex)->getName() != - InstPatNode->getChild(i)->getName()) { + InstPatNode->getChild(i)->getName()) { FoundNonSimplePattern = true; break; } @@ -568,21 +583,16 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { raw_string_ostream SuffixOS(ManglingSuffix); Operands.PrintManglingSuffix(SuffixOS, ImmediatePredicates, true); if (!StringSwitch<bool>(ManglingSuffix) - .Cases("", "r", "rr", "ri", "i", "f", true) - .Default(false)) + .Cases("", "r", "rr", "ri", "i", "f", true) + .Default(false)) continue; // Get the predicate that guards this pattern. std::string PredicateCheck = Pattern.getPredicateCheck(); // Ok, we found a pattern that we can handle. Remember it.
- InstructionMemo Memo( - Pattern.getDstPattern()->getOperator()->getName(), - DstRC, - SubRegNo, - PhysRegInputs, - PredicateCheck - ); + InstructionMemo Memo(Pattern.getDstPattern()->getOperator()->getName(), + DstRC, SubRegNo, PhysRegInputs, PredicateCheck); int complexity = Pattern.getPatternComplexity(CGP); @@ -590,7 +600,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { std::make_tuple(Operands, OpcodeName, VT, RetVT, PredicateCheck)); if (!inserted_simple_pattern.second) { PrintFatalError(Pattern.getSrcRecord()->getLoc(), - "Duplicate predicate in FastISel table!"); + "Duplicate predicate in FastISel table!"); } // Note: Instructions with the same complexity will appear in the order @@ -602,8 +612,8 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { // them down to a signature that doesn't have predicates so that we can // associate them with the stripped predicate version. if (Operands.hasAnyImmediateCodes()) { - SignaturesWithConstantForms[Operands.getWithoutImmCodes()] - .push_back(Operands); + SignaturesWithConstantForms[Operands.getWithoutImmCodes()].push_back( + Operands); } } } @@ -645,7 +655,8 @@ void FastISelMap::emitInstructionCode(raw_ostream &OS, if (OneHadNoPredicate) { PrintFatalError("Multiple instructions match and one with no " "predicate came before one with a predicate! " - "name:" + Memo.Name + " predicate: " + PredicateCheck); + "name:" + + Memo.Name + " predicate: " + PredicateCheck); } OS << " if (" + PredicateCheck + ") {\n"; OS << " "; @@ -669,8 +680,8 @@ void FastISelMap::emitInstructionCode(raw_ostream &OS, Operands.PrintArguments(OS, Memo.PhysRegs); OS << ");\n"; } else { - OS << "extractsubreg(" << RetVTName - << ", Op0, " << Memo.SubRegNo << ");\n"; + OS << "extractsubreg(" << RetVTName << ", Op0, " << Memo.SubRegNo + << ");\n"; } if (!PredicateCheck.empty()) { @@ -685,7 +696,6 @@ void FastISelMap::emitInstructionCode(raw_ostream &OS, OS << "\n"; } - void FastISelMap::printFunctionDefinitions(raw_ostream &OS) { // Now emit code for all the patterns that we collected. for (const auto &SimplePattern : SimplePatterns) { @@ -762,8 +772,7 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) { } // Emit one function for the opcode that demultiplexes based on the type. - OS << "unsigned fastEmit_" - << getLegalCName(Opcode) << "_"; + OS << "unsigned fastEmit_" << getLegalCName(Opcode) << "_"; Operands.PrintManglingSuffix(OS, ImmediatePredicates); OS << "(MVT VT, MVT RetVT"; if (!Operands.empty()) @@ -809,8 +818,8 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) { // constrained forms of the immediate (e.g., 32-bit sext immediate in a // 64-bit operand), check them first. - std::map >::iterator MI - = SignaturesWithConstantForms.find(Operands); + std::map>::iterator MI = + SignaturesWithConstantForms.find(Operands); if (MI != SignaturesWithConstantForms.end()) { // Unique any duplicates out of the list. 
       llvm::sort(MI->second);
@@ -840,8 +849,8 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) {
 
     for (const auto &I : OTM) {
       const std::string &Opcode = I.first;
-      OS << "    case " << Opcode << ": return fastEmit_"
-         << getLegalCName(Opcode) << "_";
+      OS << "    case " << Opcode << ": return fastEmit_" << getLegalCName(Opcode)
+         << "_";
       Operands.PrintManglingSuffix(OS, ImmediatePredicates);
       OS << "(VT, RetVT";
       if (!Operands.empty())
@@ -862,7 +871,8 @@ static void EmitFastISel(RecordKeeper &RK, raw_ostream &OS) {
   CodeGenDAGPatterns CGP(RK);
   const CodeGenTarget &Target = CGP.getTargetInfo();
   emitSourceFileHeader("\"Fast\" Instruction Selector for the " +
-                       Target.getName().str() + " target", OS);
+                           Target.getName().str() + " target",
+                       OS);
 
   // Determine the target's namespace name.
   StringRef InstNS = Target.getInstNamespace();
diff --git a/llvm/utils/TableGen/InfoByHwMode.cpp b/llvm/utils/TableGen/InfoByHwMode.cpp
index 7e4ab53..6d9a35a 100644
--- a/llvm/utils/TableGen/InfoByHwMode.cpp
+++ b/llvm/utils/TableGen/InfoByHwMode.cpp
@@ -11,8 +11,8 @@
 // data).
 //===----------------------------------------------------------------------===//
 
-#include "CodeGenTarget.h"
 #include "InfoByHwMode.h"
+#include "CodeGenTarget.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/Debug.h"
@@ -44,7 +44,7 @@ ValueTypeByHwMode::ValueTypeByHwMode(Record *R, MVT T) : ValueTypeByHwMode(T) {
     PtrAddrSpace = R->getValueAsInt("AddrSpace");
 }
 
-bool ValueTypeByHwMode::operator== (const ValueTypeByHwMode &T) const {
+bool ValueTypeByHwMode::operator==(const ValueTypeByHwMode &T) const {
   assert(isValid() && T.isValid() && "Invalid type in assignment");
   bool Simple = isSimple();
   if (Simple != T.isSimple())
@@ -55,7 +55,7 @@ bool ValueTypeByHwMode::operator== (const ValueTypeByHwMode &T) const {
   return Map == T.Map;
 }
 
-bool ValueTypeByHwMode::operator< (const ValueTypeByHwMode &T) const {
+bool ValueTypeByHwMode::operator<(const ValueTypeByHwMode &T) const {
   assert(isValid() && T.isValid() && "Invalid type in comparison");
   // Default order for maps.
   return Map < T.Map;
@@ -86,7 +86,7 @@ void ValueTypeByHwMode::writeToStream(raw_ostream &OS) const {
     return;
   }
 
-  std::vector<const PairType*> Pairs;
+  std::vector<const PairType *> Pairs;
   for (const auto &P : Map)
     Pairs.push_back(&P);
   llvm::sort(Pairs, deref<std::less<PairType>>());
@@ -100,9 +100,7 @@ void ValueTypeByHwMode::writeToStream(raw_ostream &OS) const {
 }
 
 LLVM_DUMP_METHOD
-void ValueTypeByHwMode::dump() const {
-  dbgs() << *this << '\n';
-}
+void ValueTypeByHwMode::dump() const { dbgs() << *this << '\n'; }
 
 ValueTypeByHwMode llvm::getValueTypeByHwMode(Record *Rec,
                                              const CodeGenHwModes &CGH) {
@@ -123,24 +121,22 @@ RegSizeInfo::RegSizeInfo(Record *R, const CodeGenHwModes &CGH) {
   SpillAlignment = R->getValueAsInt("SpillAlignment");
 }
 
-bool RegSizeInfo::operator< (const RegSizeInfo &I) const {
+bool RegSizeInfo::operator<(const RegSizeInfo &I) const {
   return std::tie(RegSize, SpillSize, SpillAlignment) <
          std::tie(I.RegSize, I.SpillSize, I.SpillAlignment);
 }
 
 bool RegSizeInfo::isSubClassOf(const RegSizeInfo &I) const {
-  return RegSize <= I.RegSize &&
-         SpillAlignment && I.SpillAlignment % SpillAlignment == 0 &&
-         SpillSize <= I.SpillSize;
+  return RegSize <= I.RegSize && SpillAlignment &&
+         I.SpillAlignment % SpillAlignment == 0 && SpillSize <= I.SpillSize;
 }
 
 void RegSizeInfo::writeToStream(raw_ostream &OS) const {
-  OS << "[R=" << RegSize << ",S=" << SpillSize
-     << ",A=" << SpillAlignment << ']';
+  OS << "[R=" << RegSize << ",S=" << SpillSize << ",A=" << SpillAlignment
+     << ']';
 }
 
-RegSizeInfoByHwMode::RegSizeInfoByHwMode(Record *R,
-                                         const CodeGenHwModes &CGH) {
+RegSizeInfoByHwMode::RegSizeInfoByHwMode(Record *R, const CodeGenHwModes &CGH) {
   const HwModeSelect &MS = CGH.getHwModeSelect(R);
   for (const HwModeSelect::PairType &P : MS.Items) {
     auto I = Map.insert({P.first, RegSizeInfo(P.second, CGH)});
@@ -149,12 +145,12 @@ RegSizeInfoByHwMode::RegSizeInfoByHwMode(Record *R,
   }
 }
 
-bool RegSizeInfoByHwMode::operator< (const RegSizeInfoByHwMode &I) const {
+bool RegSizeInfoByHwMode::operator<(const RegSizeInfoByHwMode &I) const {
   unsigned M0 = Map.begin()->first;
   return get(M0) < I.get(M0);
 }
 
-bool RegSizeInfoByHwMode::operator== (const RegSizeInfoByHwMode &I) const {
+bool RegSizeInfoByHwMode::operator==(const RegSizeInfoByHwMode &I) const {
   unsigned M0 = Map.begin()->first;
   return get(M0) == I.get(M0);
 }
@@ -164,8 +160,8 @@ bool RegSizeInfoByHwMode::isSubClassOf(const RegSizeInfoByHwMode &I) const {
   return get(M0).isSubClassOf(I.get(M0));
 }
 
-bool RegSizeInfoByHwMode::hasStricterSpillThan(const RegSizeInfoByHwMode &I)
-      const {
+bool RegSizeInfoByHwMode::hasStricterSpillThan(
+    const RegSizeInfoByHwMode &I) const {
   unsigned M0 = Map.begin()->first;
   const RegSizeInfo &A0 = get(M0);
   const RegSizeInfo &B0 = I.get(M0);
@@ -175,7 +171,7 @@ void RegSizeInfoByHwMode::writeToStream(raw_ostream &OS) const {
   typedef typename decltype(Map)::value_type PairType;
-  std::vector<const PairType*> Pairs;
+  std::vector<const PairType *> Pairs;
   for (const auto &P : Map)
     Pairs.push_back(&P);
   llvm::sort(Pairs, deref<std::less<PairType>>());
@@ -187,7 +183,8 @@ void RegSizeInfoByHwMode::writeToStream(raw_ostream &OS) const {
   OS << '}';
 }
 
-EncodingInfoByHwMode::EncodingInfoByHwMode(Record *R, const CodeGenHwModes &CGH) {
+EncodingInfoByHwMode::EncodingInfoByHwMode(Record *R,
+                                           const CodeGenHwModes &CGH) {
   const HwModeSelect &MS = CGH.getHwModeSelect(R);
   for (const HwModeSelect::PairType &P : MS.Items) {
     assert(P.second && P.second->isSubClassOf("InstructionEncoding") &&
@@ -199,18 +196,18 @@ EncodingInfoByHwMode::EncodingInfoByHwMode(Record *R, const CodeGenHwModes &CGH)
 }
 
 namespace llvm {
-  raw_ostream &operator<<(raw_ostream &OS, const ValueTypeByHwMode &T) {
-    T.writeToStream(OS);
-    return OS;
-  }
+raw_ostream &operator<<(raw_ostream &OS, const ValueTypeByHwMode &T) {
+  T.writeToStream(OS);
+  return OS;
+}
 
-  raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfo &T) {
-    T.writeToStream(OS);
-    return OS;
-  }
+raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfo &T) {
+  T.writeToStream(OS);
+  return OS;
+}
 
-  raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfoByHwMode &T) {
-    T.writeToStream(OS);
-    return OS;
-  }
+raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfoByHwMode &T) {
+  T.writeToStream(OS);
+  return OS;
 }
+} // namespace llvm
diff --git a/llvm/utils/TableGen/InfoByHwMode.h b/llvm/utils/TableGen/InfoByHwMode.h
index 4692ab2..5f53295 100644
--- a/llvm/utils/TableGen/InfoByHwMode.h
+++ b/llvm/utils/TableGen/InfoByHwMode.h
@@ -40,8 +40,7 @@ enum : unsigned {
 };
 
 template <typename InfoT>
-void union_modes(const InfoByHwMode<InfoT> &A,
-                 const InfoByHwMode<InfoT> &B,
+void union_modes(const InfoByHwMode<InfoT> &A, const InfoByHwMode<InfoT> &B,
                  SmallVectorImpl<unsigned> &Modes) {
   auto AI = A.begin();
   auto BI = B.begin();
@@ -85,9 +84,8 @@ void union_modes(const InfoByHwMode<InfoT> &A,
     Modes.push_back(DefaultMode);
 }
 
-template <typename InfoT>
-struct InfoByHwMode {
-  typedef std::map<unsigned,InfoT> MapType;
+template <typename InfoT> struct InfoByHwMode {
+  typedef std::map<unsigned, InfoT> MapType;
   typedef typename MapType::value_type PairType;
   typedef typename MapType::iterator iterator;
   typedef typename MapType::const_iterator const_iterator;
@@ -98,11 +96,11 @@ struct InfoByHwMode {
   LLVM_ATTRIBUTE_ALWAYS_INLINE
   iterator begin() { return Map.begin(); }
   LLVM_ATTRIBUTE_ALWAYS_INLINE
-  iterator end()   { return Map.end(); }
+  iterator end() { return Map.end(); }
   LLVM_ATTRIBUTE_ALWAYS_INLINE
   const_iterator begin() const { return Map.begin(); }
   LLVM_ATTRIBUTE_ALWAYS_INLINE
-  const_iterator end() const   { return Map.end(); }
+  const_iterator end() const { return Map.end(); }
   LLVM_ATTRIBUTE_ALWAYS_INLINE
   bool empty() const { return Map.empty(); }
 
@@ -156,15 +154,13 @@ protected:
 struct ValueTypeByHwMode : public InfoByHwMode<MVT> {
   ValueTypeByHwMode(Record *R, const CodeGenHwModes &CGH);
   ValueTypeByHwMode(Record *R, MVT T);
-  ValueTypeByHwMode(MVT T) { Map.insert({DefaultMode,T}); }
+  ValueTypeByHwMode(MVT T) { Map.insert({DefaultMode, T}); }
   ValueTypeByHwMode() = default;
 
-  bool operator== (const ValueTypeByHwMode &T) const;
-  bool operator< (const ValueTypeByHwMode &T) const;
+  bool operator==(const ValueTypeByHwMode &T) const;
+  bool operator<(const ValueTypeByHwMode &T) const;
 
-  bool isValid() const {
-    return !Map.empty();
-  }
+  bool isValid() const { return !Map.empty(); }
   MVT getType(unsigned Mode) const { return get(Mode); }
   MVT &getOrCreateTypeForMode(unsigned Mode, MVT Type);
 
@@ -178,8 +174,7 @@ struct ValueTypeByHwMode : public InfoByHwMode<MVT> {
   }
 };
 
-ValueTypeByHwMode getValueTypeByHwMode(Record *Rec,
-                                       const CodeGenHwModes &CGH);
+ValueTypeByHwMode getValueTypeByHwMode(Record *Rec, const CodeGenHwModes &CGH);
 
 struct RegSizeInfo {
   unsigned RegSize;
@@ -188,14 +183,12 @@ struct RegSizeInfo {
   RegSizeInfo(Record *R, const CodeGenHwModes &CGH);
   RegSizeInfo() = default;
 
-  bool operator< (const RegSizeInfo &I) const;
-  bool operator== (const RegSizeInfo &I) const {
+  bool operator<(const RegSizeInfo &I) const;
+  bool operator==(const RegSizeInfo &I) const {
     return std::tie(RegSize, SpillSize, SpillAlignment) ==
            std::tie(I.RegSize, I.SpillSize, I.SpillAlignment);
   }
-  bool operator!= (const RegSizeInfo &I) const {
-    return !(*this == I);
-  }
+  bool operator!=(const RegSizeInfo &I) const { return !(*this == I); }
 
   bool isSubClassOf(const RegSizeInfo &I) const;
   void writeToStream(raw_ostream &OS) const;
@@ -204,9 +197,9 @@ struct RegSizeInfo {
 struct RegSizeInfoByHwMode : public InfoByHwMode<RegSizeInfo> {
   RegSizeInfoByHwMode(Record *R, const CodeGenHwModes &CGH);
   RegSizeInfoByHwMode() = default;
-  bool operator< (const RegSizeInfoByHwMode &VI) const;
-  bool operator== (const RegSizeInfoByHwMode &VI) const;
-  bool operator!= (const RegSizeInfoByHwMode &VI) const {
+  bool operator<(const RegSizeInfoByHwMode &VI) const;
+  bool operator==(const RegSizeInfoByHwMode &VI) const;
+  bool operator!=(const RegSizeInfoByHwMode &VI) const {
     return !(*this == VI);
   }
 
@@ -224,7 +217,7 @@ raw_ostream &operator<<(raw_ostream &OS, const ValueTypeByHwMode &T);
 raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfo &T);
 raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfoByHwMode &T);
 
-struct EncodingInfoByHwMode : public InfoByHwMode<Record*> {
+struct EncodingInfoByHwMode : public InfoByHwMode<Record *> {
   EncodingInfoByHwMode(Record *R, const CodeGenHwModes &CGH);
   EncodingInfoByHwMode() = default;
 };
diff --git a/llvm/utils/TableGen/InstrDocsEmitter.cpp b/llvm/utils/TableGen/InstrDocsEmitter.cpp
index 616e7b5..efabf6b 100644
--- a/llvm/utils/TableGen/InstrDocsEmitter.cpp
+++ b/llvm/utils/TableGen/InstrDocsEmitter.cpp
@@ -44,11 +44,18 @@ static std::string escapeForRST(StringRef Str) {
   for (char C : Str) {
     switch (C) {
     // We want special characters to be shown as their C escape codes.
-    case '\n': Result += "\\n"; break;
-    case '\t': Result += "\\t"; break;
+    case '\n':
+      Result += "\\n";
+      break;
+    case '\t':
+      Result += "\\t";
+      break;
     // Underscore at the end of a line has a special meaning in rst.
-    case '_': Result += "\\_"; break;
-    default: Result += C;
+    case '_':
+      Result += "\\_";
+      break;
+    default:
+      Result += C;
     }
   }
   return Result;
@@ -96,7 +103,10 @@ static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) {
     std::vector<const char *> FlagStrings;
 #define xstr(s) str(s)
 #define str(s) #s
-#define FLAG(f) if (II->f) { FlagStrings.push_back(str(f)); }
+#define FLAG(f)                                                                \
+  if (II->f) {                                                                 \
+    FlagStrings.push_back(str(f));                                             \
+  }
     FLAG(isReturn)
     FLAG(isEHScopeReturn)
     FLAG(isBranch)
@@ -111,9 +121,9 @@ static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) {
     FLAG(isTrap)
     FLAG(canFoldAsLoad)
     FLAG(mayLoad)
-    //FLAG(mayLoad_Unset) // Deliberately omitted.
+    // FLAG(mayLoad_Unset) // Deliberately omitted.
     FLAG(mayStore)
-    //FLAG(mayStore_Unset) // Deliberately omitted.
+    // FLAG(mayStore_Unset) // Deliberately omitted.
     FLAG(isPredicable)
     FLAG(isConvertibleToThreeAddress)
     FLAG(isCommutable)
@@ -125,7 +135,7 @@ static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) {
     FLAG(hasCtrlDep)
     FLAG(isNotDuplicable)
    FLAG(hasSideEffects)
-    //FLAG(hasSideEffects_Unset) // Deliberately omitted.
+    // FLAG(hasSideEffects_Unset) // Deliberately omitted.
     FLAG(isAsCheapAsAMove)
     FLAG(hasExtraSrcRegAllocReq)
     FLAG(hasExtraDefRegAllocReq)
diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp
index dbc5c22..2d08447 100644
--- a/llvm/utils/TableGen/InstrInfoEmitter.cpp
+++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp
@@ -53,8 +53,8 @@ class InstrInfoEmitter {
   const CodeGenSchedModels &SchedModels;
 
 public:
-  InstrInfoEmitter(RecordKeeper &R):
-    Records(R), CDP(R), SchedModels(CDP.getTargetInfo().getSchedModels()) {}
+  InstrInfoEmitter(RecordKeeper &R)
+      : Records(R), CDP(R), SchedModels(CDP.getTargetInfo().getSchedModels()) {}
 
   // run - Output the instruction set description.
   void run(raw_ostream &OS);
@@ -69,8 +69,8 @@ private:
   /// The keys of this map are maps which have OpName enum values as their keys
   /// and instruction operand indices as their values. The values of this map
   /// are lists of instruction names.
-  typedef std::map<std::map<unsigned, unsigned>,
-                   std::vector<std::string>> OpNameMapTy;
+  typedef std::map<std::map<unsigned, unsigned>, std::vector<std::string>>
+      OpNameMapTy;
   typedef std::map<std::string, unsigned>::iterator StrUintMapIter;
 
   /// Generate member functions in the target-specific GenInstrInfo class.
@@ -94,13 +94,14 @@ private:
   void emitOperandTypeMappings(
       raw_ostream &OS, const CodeGenTarget &Target,
       ArrayRef<const CodeGenInstruction *> NumberedInstructions);
-  void initOperandMapData(
-      ArrayRef<const CodeGenInstruction*> NumberedInstructions,
-      StringRef Namespace,
-      std::map<std::string, unsigned> &Operands,
-      OpNameMapTy &OperandMap);
-  void emitOperandNameMappings(raw_ostream &OS, const CodeGenTarget &Target,
-      ArrayRef<const CodeGenInstruction*> NumberedInstructions);
+  void
+  initOperandMapData(ArrayRef<const CodeGenInstruction *> NumberedInstructions,
+                     StringRef Namespace,
+                     std::map<std::string, unsigned> &Operands,
+                     OpNameMapTy &OperandMap);
+  void emitOperandNameMappings(
+      raw_ostream &OS, const CodeGenTarget &Target,
+      ArrayRef<const CodeGenInstruction *> NumberedInstructions);
 
   void emitLogicalOperandSizeMappings(
       raw_ostream &OS, StringRef Namespace,
@@ -193,8 +194,7 @@ InstrInfoEmitter::GetOperandInfo(const CodeGenInstruction &Inst) {
 
       // Fill in constraint info.
       Res += ", ";
-      const CGIOperandList::ConstraintInfo &Constraint =
-        Op.Constraints[j];
+      const CGIOperandList::ConstraintInfo &Constraint = Op.Constraints[j];
       if (Constraint.isNone())
         Res += "0";
       else if (Constraint.isEarlyClobber())
@@ -246,10 +246,9 @@ void InstrInfoEmitter::EmitOperandInfo(raw_ostream &OS,
 /// each instructions.  This is used to generate the OperandMap table as
 /// well as the getNamedOperandIdx() function.
 void InstrInfoEmitter::initOperandMapData(
-    ArrayRef<const CodeGenInstruction*> NumberedInstructions,
-    StringRef Namespace,
-    std::map<std::string, unsigned> &Operands,
-    OpNameMapTy &OperandMap) {
+    ArrayRef<const CodeGenInstruction *> NumberedInstructions,
+    StringRef Namespace, std::map<std::string, unsigned> &Operands,
+    OpNameMapTy &OperandMap) {
   unsigned NumOperands = 0;
   for (const CodeGenInstruction *Inst : NumberedInstructions) {
     if (!Inst->TheDef->getValueAsBit("UseNamedOperandTable"))
@@ -259,13 +258,13 @@ void InstrInfoEmitter::initOperandMapData(
       StrUintMapIter I = Operands.find(Info.Name);
 
       if (I == Operands.end()) {
-        I = Operands.insert(Operands.begin(),
-                            std::pair<std::string, unsigned>(Info.Name, NumOperands++));
+        I = Operands.insert(Operands.begin(), std::pair<std::string, unsigned>(
+                                                  Info.Name, NumOperands++));
       }
       OpList[I->second] = Info.MIOperandNo;
     }
-    OperandMap[OpList].push_back(Namespace.str() + "::" +
-                                 Inst->TheDef->getName().str());
+    OperandMap[OpList].push_back(Namespace.str() +
+                                 "::" + Inst->TheDef->getName().str());
   }
 }
 
@@ -280,9 +279,9 @@ void InstrInfoEmitter::initOperandMapData(
 /// - A function called getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
 ///   for looking up the operand index for an instruction, given a value from
 ///   OpName enum
-void InstrInfoEmitter::emitOperandNameMappings(raw_ostream &OS,
-           const CodeGenTarget &Target,
-           ArrayRef<const CodeGenInstruction*> NumberedInstructions) {
+void InstrInfoEmitter::emitOperandNameMappings(
+    raw_ostream &OS, const CodeGenTarget &Target,
+    ArrayRef<const CodeGenInstruction *> NumberedInstructions) {
   StringRef Namespace = Target.getInstNamespace();
   std::string OpNameNS = "OpName";
   // Map of operand names to their enumeration value.  This will be used to
@@ -380,7 +379,8 @@ void InstrInfoEmitter::emitOperandTypeMappings(
     }
   }
 
-  OS << "  OPERAND_TYPE_LIST_END" << "\n};\n";
+  OS << "  OPERAND_TYPE_LIST_END"
+     << "\n};\n";
   OS << "} // end namespace OpTypes\n";
   OS << "} // end namespace " << Namespace << "\n";
   OS << "} // end namespace llvm\n";
@@ -685,7 +685,7 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS,
 
   for (const Record *Rec : TIIPredicates) {
     OS << "bool " << Rec->getValueAsString("FunctionName")
-        << "(const MCInst &MI);\n";
+       << "(const MCInst &MI);\n";
   }
 
   OS << "void verifyInstructionPredicates(unsigned Opcode, const FeatureBitset "
@@ -939,7 +939,7 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
 
   // Collect all of the instruction's implicit uses and defs.
   Records.startTimer("Collect uses/defs");
-  std::map<std::vector<Record*>, unsigned> EmittedLists;
+  std::map<std::vector<Record *>, unsigned> EmittedLists;
   std::vector<std::vector<Record *>> ImplicitLists;
   unsigned ImplicitListSize = 0;
   for (const CodeGenInstruction *II : Target.getInstructionsByEnumValue()) {
@@ -1017,7 +1017,7 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
   InstrNames.emitStringLiteralDef(OS, Twine("extern const char ") + TargetName +
                                           "InstrNameData[]");
 
-  OS << "extern const unsigned " << TargetName <<"InstrNameIndices[] = {";
+  OS << "extern const unsigned " << TargetName << "InstrNameIndices[] = {";
   Num = 0;
   for (const CodeGenInstruction *Inst : NumberedInstructions) {
     // Newline every eight entries.
@@ -1104,7 +1104,6 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
         "unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u);\n"
      << "  ~" << ClassName << "() override = default;\n";
 
-
   OS << "\n};\n} // end namespace llvm\n";
 
   OS << "#endif // GET_INSTRINFO_HEADER\n\n";
@@ -1180,8 +1179,8 @@ void InstrInfoEmitter::emitRecord(
   int MinOperands = 0;
   if (!Inst.Operands.empty())
     // Each logical operand can be multiple MI operands.
-    MinOperands = Inst.Operands.back().MIOperandNo +
-                  Inst.Operands.back().MINumOperands;
+    MinOperands =
+        Inst.Operands.back().MIOperandNo + Inst.Operands.back().MINumOperands;
 
   OS << "  { ";
   OS << Num << ",\t" << MinOperands << ",\t" << Inst.Operands.NumDefs << ",\t"
@@ -1202,49 +1201,88 @@ void InstrInfoEmitter::emitRecord(
   OS << OperandInfoMap.find(OperandInfo)->second << ",\t0";
 
   // Emit all of the target independent flags...
-  if (Inst.isPreISelOpcode)    OS << "|(1ULL<<MCID::PreISelOpcode)";
+  if (Inst.isPreISelOpcode)
+    OS << "|(1ULL<<MCID::PreISelOpcode)";
   BitsInit *TSF = Inst.TheDef->getValueAsBitsInit("TSFlags");
diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp
index 28604c5..f7ae5ed 100644
--- a/llvm/utils/TableGen/IntrinsicEmitter.cpp
+++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp
@@ -60,8 +60,8 @@ public:
                 raw_ostream &OS);
   void EmitGenerator(const CodeGenIntrinsicTable &Ints, raw_ostream &OS);
   void EmitAttributes(const CodeGenIntrinsicTable &Ints, raw_ostream &OS);
-  void EmitIntrinsicToBuiltinMap(const CodeGenIntrinsicTable &Ints, bool IsClang,
-                                 raw_ostream &OS);
+  void EmitIntrinsicToBuiltinMap(const CodeGenIntrinsicTable &Ints,
+                                 bool IsClang, raw_ostream &OS);
 };
 } // End anonymous namespace
 
@@ -204,7 +204,7 @@ void IntrinsicEmitter::EmitIITInfo(raw_ostream &OS) {
 }
 
 void IntrinsicEmitter::EmitTargetInfo(const CodeGenIntrinsicTable &Ints,
-                                    raw_ostream &OS) {
+                                      raw_ostream &OS) {
   OS << "// Target mapping\n";
   OS << "#ifdef GET_INTRINSIC_TARGET_DATA\n";
   OS << "struct IntrinsicTargetInfo {\n"
@@ -238,10 +238,10 @@ void IntrinsicEmitter::EmitIntrinsicToOverloadTable(
   OS << "  0";
   for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
     // Add one to the index so we emit a null bit for the invalid #0 intrinsic.
-    if ((i+1)%8 == 0)
+    if ((i + 1) % 8 == 0)
       OS << ",\n  0";
     if (Ints[i].isOverloaded)
-      OS << " | (1<<" << (i+1)%8 << ')';
+      OS << " | (1<<" << (i + 1) % 8 << ')';
   }
   OS << "\n};\n\n";
   // OTable contains a true bit at the position if the intrinsic is overloaded.
@@ -271,7 +271,7 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints,
   // capture it in this vector, otherwise store a ~0U.
   std::vector<unsigned> FixedEncodings;
 
-  SequenceToOffsetTable<std::vector<unsigned char> > LongEncodingTable;
+  SequenceToOffsetTable<std::vector<unsigned char>> LongEncodingTable;
 
   std::vector<unsigned char> TypeSig;
 
@@ -292,7 +292,7 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints,
         Failed = true;
         break;
       }
-      Result = (Result << 4) | TypeSig[e-i-1];
+      Result = (Result << 4) | TypeSig[e - i - 1];
     }
 
     // If this could be encoded into a 31-bit word, return it.
@@ -330,7 +330,6 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints,
     TypeSig.clear();
     ComputeFixedEncoding(Ints[i], TypeSig);
 
-
     // Otherwise, emit the offset into the long encoding table.  We emit it this
     // way so that it is easier to read the offset in the .def file.
     OS << "(1U<<31) | " << LongEncodingTable.get(TypeSig) << ", ";
@@ -344,7 +343,7 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints,
   LongEncodingTable.emit(OS, printIITEntry);
   OS << "  255\n};\n\n";
 
-  OS << "#endif\n\n";  // End of GET_INTRINSIC_GENERATOR_GLOBAL
+  OS << "#endif\n\n"; // End of GET_INTRINSIC_GENERATOR_GLOBAL
 }
 
 namespace {
@@ -393,7 +392,8 @@ std::optional<bool> compareFnAttributes(const CodeGenIntrinsic *L,
   // Try to order by readonly/readnone attribute.
   uint32_t LK = L->ME.toIntValue();
   uint32_t RK = R->ME.toIntValue();
-  if (LK != RK) return (LK > RK);
+  if (LK != RK)
+    return (LK > RK);
 
   return std::nullopt;
 }
@@ -438,8 +438,7 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
     if (!UniqArgAttributes.try_emplace(Attrs, ID).second)
       continue;
 
-    assert(is_sorted(Attrs) &&
-           "Argument attributes are not sorted");
+    assert(is_sorted(Attrs) && "Argument attributes are not sorted");
 
     OS << "  case " << ID << ":\n";
     OS << "    return AttributeSet::get(C, {\n";
@@ -473,8 +472,8 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
         OS << "      Attribute::get(C, Attribute::ImmArg),\n";
         break;
       case CodeGenIntrinsic::Alignment:
-        OS << "      Attribute::get(C, Attribute::Alignment, "
-           << Attr.Value << "),\n";
+        OS << "      Attribute::get(C, Attribute::Alignment, " << Attr.Value
+           << "),\n";
         break;
      case CodeGenIntrinsic::Dereferenceable:
         OS << "      Attribute::get(C, Attribute::Dereferenceable, "
@@ -489,7 +488,7 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
   OS << "}\n\n";
 
   // Compute unique function attribute sets.
-  std::map<const CodeGenIntrinsic*, unsigned, FnAttributeComparator>
+  std::map<const CodeGenIntrinsic *, unsigned, FnAttributeComparator>
       UniqFnAttributes;
   OS << "static AttributeSet getIntrinsicFnAttributeSet("
      << "LLVMContext &C, unsigned ID) {\n"
@@ -542,17 +541,18 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
   OS << "AttributeList Intrinsic::getAttributes(LLVMContext &C, ID id) {\n";
 
   // Compute the maximum number of attribute arguments and the map
-  typedef std::map<const CodeGenIntrinsic*, unsigned, AttributeComparator> UniqAttrMapTy;
+  typedef std::map<const CodeGenIntrinsic *, unsigned, AttributeComparator>
+      UniqAttrMapTy;
   UniqAttrMapTy UniqAttributes;
   unsigned maxArgAttrs = 0;
   unsigned AttrNum = 0;
   for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
     const CodeGenIntrinsic &intrinsic = Ints[i];
     maxArgAttrs =
-      std::max(maxArgAttrs, unsigned(intrinsic.ArgumentAttributes.size()));
+        std::max(maxArgAttrs, unsigned(intrinsic.ArgumentAttributes.size()));
     unsigned &N = UniqAttributes[&intrinsic];
-    if (N) continue;
+    if (N)
+      continue;
     N = ++AttrNum;
     assert(N < 65536 && "Too many unique attributes for table!");
   }
@@ -564,8 +564,8 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
   for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
     const CodeGenIntrinsic &intrinsic = Ints[i];
 
-    OS << "    " << UniqAttributes[&intrinsic] << ", // "
-       << intrinsic.Name << "\n";
+    OS << "    " << UniqAttributes[&intrinsic] << ", // " << intrinsic.Name
+       << "\n";
   }
   OS << "  };\n\n";
diff --git a/llvm/utils/TableGen/OptParserEmitter.cpp b/llvm/utils/TableGen/OptParserEmitter.cpp
index 257cd44..0f08119 100644
--- a/llvm/utils/TableGen/OptParserEmitter.cpp
+++ b/llvm/utils/TableGen/OptParserEmitter.cpp
@@ -196,9 +196,9 @@ static MarshallingInfo createMarshallingInfo(const Record &R) {
 /// working with those options when given an input command line.
 static void EmitOptParser(RecordKeeper &Records, raw_ostream &OS) {
   // Get the option groups and options.
-  const std::vector<Record*> &Groups =
-      Records.getAllDerivedDefinitions("OptionGroup");
-  std::vector<Record*> Opts = Records.getAllDerivedDefinitions("Option");
+  const std::vector<Record *> &Groups =
+      Records.getAllDerivedDefinitions("OptionGroup");
+  std::vector<Record *> Opts = Records.getAllDerivedDefinitions("Option");
 
   emitSourceFileHeader("Option Parsing Definitions", OS);
 
@@ -423,8 +423,7 @@ static void EmitOptParser(RecordKeeper &Records, raw_ostream &OS) {
       write_cstring(OS, R.getValueAsString("Values"));
     else if (!isa<UnsetInit>(R.getValueInit("ValuesCode"))) {
       OS << getOptionName(R) << "_Values";
-    }
-    else
+    } else
       OS << "nullptr";
   };
 
diff --git a/llvm/utils/TableGen/PredicateExpander.cpp b/llvm/utils/TableGen/PredicateExpander.cpp
index 0b9b6389f..d0a35ff 100644
--- a/llvm/utils/TableGen/PredicateExpander.cpp
+++ b/llvm/utils/TableGen/PredicateExpander.cpp
@@ -101,7 +101,6 @@ void PredicateExpander::expandCheckRegOperand(raw_ostream &OS, int OpIndex,
   OS << Reg->getName();
 }
 
-
 void PredicateExpander::expandCheckRegOperandSimple(raw_ostream &OS,
                                                     int OpIndex,
                                                     StringRef FunctionMapper) {
@@ -487,7 +486,8 @@ void STIPredicateExpander::expandPrologue(raw_ostream &OS,
   OS << "unsigned ProcessorID = getSchedModel().getProcessorID();\n";
 }
 
-void STIPredicateExpander::expandOpcodeGroup(raw_ostream &OS, const OpcodeGroup &Group,
+void STIPredicateExpander::expandOpcodeGroup(raw_ostream &OS,
+                                             const OpcodeGroup &Group,
                                              bool ShouldUpdateOpcodeMask) {
   const OpcodeInfo &OI = Group.getOpcodeInfo();
   for (const PredicateInfo &PI : OI.getPredicates()) {
diff --git a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
index e07fb91..7f692f2 100644
--- a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
+++ b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
@@ -27,19 +27,19 @@ class PseudoLoweringEmitter {
     enum MapKind { Operand, Imm, Reg };
     MapKind Kind;
     union {
-      unsigned Operand;   // Operand number mapped to.
-      uint64_t Imm;       // Integer immedate value.
-      Record *Reg;        // Physical register.
+      unsigned Operand; // Operand number mapped to.
+      uint64_t Imm;     // Integer immedate value.
+      Record *Reg;      // Physical register.
     } Data;
   };
   struct PseudoExpansion {
-    CodeGenInstruction Source;   // The source pseudo instruction definition.
-    CodeGenInstruction Dest;     // The destination instruction to lower to.
+    CodeGenInstruction Source; // The source pseudo instruction definition.
+    CodeGenInstruction Dest;   // The destination instruction to lower to.
     IndexedMap<OpData> OperandMap;
 
     PseudoExpansion(CodeGenInstruction &s, CodeGenInstruction &d,
-                    IndexedMap<OpData> &m) :
-      Source(s), Dest(d), OperandMap(m) {}
+                    IndexedMap<OpData> &m)
+        : Source(s), Dest(d), OperandMap(m) {}
   };
 
   RecordKeeper &Records;
@@ -57,6 +57,7 @@ class PseudoLoweringEmitter {
                                  unsigned BaseIdx);
   void evaluateExpansion(Record *Pseudo);
   void emitLoweringEmitter(raw_ostream &o);
+
 public:
   PseudoLoweringEmitter(RecordKeeper &R) : Records(R), Target(R) {}
 
@@ -69,9 +70,9 @@ public:
 
 // The pseudo expansion really should take a list of dags, not just
 // a single dag, so we can do fancier things.
-unsigned PseudoLoweringEmitter::
-addDagOperandMapping(Record *Rec, DagInit *Dag, CodeGenInstruction &Insn,
-                     IndexedMap<OpData> &OperandMap, unsigned BaseIdx) {
+unsigned PseudoLoweringEmitter::addDagOperandMapping(
+    Record *Rec, DagInit *Dag, CodeGenInstruction &Insn,
+    IndexedMap<OpData> &OperandMap, unsigned BaseIdx) {
   unsigned OpsAdded = 0;
   for (unsigned i = 0, e = Dag->getNumArgs(); i != e; ++i) {
     if (DefInit *DI = dyn_cast<DefInit>(Dag->getArg(i))) {
@@ -92,9 +93,9 @@ addDagOperandMapping(Record *Rec, DagInit *Dag, CodeGenInstruction &Insn,
       // FIXME: Are the message operand types backward?
       if (DI->getDef() != Insn.Operands[BaseIdx + i].Rec) {
         PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
-                        "', operand type '" + DI->getDef()->getName() +
-                        "' does not match expansion operand type '" +
-                        Insn.Operands[BaseIdx + i].Rec->getName() + "'");
+                            "', operand type '" + DI->getDef()->getName() +
+                            "' does not match expansion operand type '" +
+                            Insn.Operands[BaseIdx + i].Rec->getName() + "'");
         PrintFatalNote(DI->getDef(),
                        "Value was assigned at the following location:");
       }
@@ -118,7 +119,7 @@ addDagOperandMapping(Record *Rec, DagInit *Dag, CodeGenInstruction &Insn,
       // Just add the operands recursively. This is almost certainly
       // a constant value for a complex operand (> 1 MI operand).
       unsigned NewOps =
-        addDagOperandMapping(Rec, SubDag, Insn, OperandMap, BaseIdx + i);
+          addDagOperandMapping(Rec, SubDag, Insn, OperandMap, BaseIdx + i);
       OpsAdded += NewOps;
       // Since we added more than one, we also need to adjust the base.
       BaseIdx += NewOps - 1;
@@ -140,15 +141,15 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
   DefInit *OpDef = dyn_cast<DefInit>(Dag->getOperator());
   if (!OpDef) {
     PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
-                    "', result operator is not a record");
+                        "', result operator is not a record");
     PrintFatalNote(Rec->getValue("ResultInst"),
                    "Result was assigned at the following location:");
   }
   Record *Operator = OpDef->getDef();
   if (!Operator->isSubClassOf("Instruction")) {
     PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
-                    "', result operator '" + Operator->getName() +
-                    "' is not an instruction");
+                        "', result operator '" + Operator->getName() +
+                        "' is not an instruction");
     PrintFatalNote(Rec->getValue("ResultInst"),
                    "Result was assigned at the following location:");
   }
@@ -157,16 +158,16 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
 
   if (Insn.isCodeGenOnly || Insn.isPseudo) {
     PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
-                    "', result operator '" + Operator->getName() +
-                    "' cannot be a pseudo instruction");
+                        "', result operator '" + Operator->getName() +
+                        "' cannot be a pseudo instruction");
     PrintFatalNote(Rec->getValue("ResultInst"),
                    "Result was assigned at the following location:");
   }
 
   if (Insn.Operands.size() != Dag->getNumArgs()) {
     PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
-                    "', result operator '" + Operator->getName() +
-                    "' has the wrong number of operands");
+                        "', result operator '" + Operator->getName() +
+                        "' has the wrong number of operands");
     PrintFatalNote(Rec->getValue("ResultInst"),
                    "Result was assigned at the following location:");
   }
@@ -201,11 +202,11 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
     if (OperandMap[Insn.Operands[i].MIOperandNo].Kind != OpData::Operand)
       continue;
     StringMap<unsigned>::iterator SourceOp =
-      SourceOperands.find(Dag->getArgNameStr(i));
+        SourceOperands.find(Dag->getArgNameStr(i));
     if (SourceOp == SourceOperands.end()) {
       PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
-                      "', output operand '" + Dag->getArgNameStr(i) +
-                      "' has no matching source operand");
+                          "', output operand '" + Dag->getArgNameStr(i) +
+                          "' has no matching source operand");
       PrintFatalNote(Rec->getValue("ResultInst"),
                      "Value was assigned at the following location:");
     }
@@ -213,7 +214,7 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
     // MachineInstr operand.
     for (unsigned I = 0, E = Insn.Operands[i].MINumOperands; I != E; ++I)
       OperandMap[Insn.Operands[i].MIOperandNo + I].Data.Operand =
-        SourceOp->getValue();
+          SourceOp->getValue();
 
     LLVM_DEBUG(dbgs() << "    " << SourceOp->getValue() << " ==> " << i
                       << "\n");
@@ -226,7 +227,8 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
   // Emit file header.
   emitSourceFileHeader("Pseudo-instruction MC lowering Source Fragment", o);
 
-  o << "bool " << Target.getName() + "AsmPrinter" << "::\n"
+  o << "bool " << Target.getName() + "AsmPrinter"
+    << "::\n"
     << "emitPseudoExpansionLowering(MCStreamer &OutStreamer,\n"
    << "                            const MachineInstr *MI) {\n";
 
@@ -236,12 +238,12 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
   for (auto &Expansion : Expansions) {
     CodeGenInstruction &Source = Expansion.Source;
     CodeGenInstruction &Dest = Expansion.Dest;
-    o << "  case " << Source.Namespace << "::"
-      << Source.TheDef->getName() << ": {\n"
+    o << "  case " << Source.Namespace << "::" << Source.TheDef->getName()
+      << ": {\n"
      << "    MCInst TmpInst;\n"
      << "    MCOperand MCOp;\n"
-      << "    TmpInst.setOpcode(" << Dest.Namespace << "::"
-      << Dest.TheDef->getName() << ");\n";
+      << "    TmpInst.setOpcode(" << Dest.Namespace
+      << "::" << Dest.TheDef->getName() << ");\n";
 
     // Copy the operands from the source instruction.
     // FIXME: Instruction operands with defaults values (predicates and cc_out
@@ -252,29 +254,29 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
       o << "    // Operand: " << DestOperand.Name << "\n";
       for (unsigned i = 0, e = DestOperand.MINumOperands; i != e; ++i) {
         switch (Expansion.OperandMap[MIOpNo + i].Kind) {
-          case OpData::Operand:
+        case OpData::Operand:
           o << "    lowerOperand(MI->getOperand("
-            << Source.Operands[Expansion.OperandMap[MIOpNo].Data
-                .Operand].MIOperandNo + i
+            << Source.Operands[Expansion.OperandMap[MIOpNo].Data.Operand]
+                       .MIOperandNo +
+                   i
            << "), MCOp);\n"
            << "    TmpInst.addOperand(MCOp);\n";
          break;
-          case OpData::Imm:
+        case OpData::Imm:
           o << "    TmpInst.addOperand(MCOperand::createImm("
             << Expansion.OperandMap[MIOpNo + i].Data.Imm << "));\n";
           break;
-          case OpData::Reg: {
-            Record *Reg = Expansion.OperandMap[MIOpNo + i].Data.Reg;
-            o << "    TmpInst.addOperand(MCOperand::createReg(";
-            // "zero_reg" is special.
-            if (Reg->getName() == "zero_reg")
-              o << "0";
-            else
-              o << Reg->getValueAsString("Namespace") << "::"
-                << Reg->getName();
-            o << "));\n";
-            break;
-          }
+        case OpData::Reg: {
+          Record *Reg = Expansion.OperandMap[MIOpNo + i].Data.Reg;
+          o << "    TmpInst.addOperand(MCOperand::createReg(";
+          // "zero_reg" is special.
+          if (Reg->getName() == "zero_reg")
+            o << "0";
+          else
+            o << Reg->getValueAsString("Namespace") << "::" << Reg->getName();
+          o << "));\n";
+          break;
+        }
         }
       }
       MIOpNo += DestOperand.MINumOperands;
diff --git a/llvm/utils/TableGen/RegisterBankEmitter.cpp b/llvm/utils/TableGen/RegisterBankEmitter.cpp
index f851d9a..8b59411 100644
--- a/llvm/utils/TableGen/RegisterBankEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterBankEmitter.cpp
@@ -46,7 +46,9 @@ public:
   /// Get the human-readable name for the bank.
   StringRef getName() const { return TheDef.getValueAsString("Name"); }
 
   /// Get the name of the enumerator in the ID enumeration.
-  std::string getEnumeratorName() const { return (TheDef.getName() + "ID").str(); }
+  std::string getEnumeratorName() const {
+    return (TheDef.getName() + "ID").str();
+  }
 
   /// Get the name of the array holding the register class coverage data;
   std::string getCoverageArrayName() const {
@@ -212,8 +214,7 @@
 }
 
 void RegisterBankEmitter::emitBaseClassImplementation(
-    raw_ostream &OS, StringRef TargetName,
-    std::vector<RegisterBank> &Banks) {
+    raw_ostream &OS, StringRef TargetName, std::vector<RegisterBank> &Banks) {
   const CodeGenRegBank &RegisterClassHierarchy = Target.getRegBank();
   const CodeGenHwModes &CGH = Target.getHwModes();
 
@@ -229,7 +230,8 @@ void RegisterBankEmitter::emitBaseClassImplementation(
       OS << "const uint32_t " << Bank.getCoverageArrayName() << "[] = {\n";
       unsigned LowestIdxInWord = 0;
       for (const auto &RCs : RCsGroupedByWord) {
-        OS << "    // " << LowestIdxInWord << "-" << (LowestIdxInWord + 31) << "\n";
+        OS << "    // " << LowestIdxInWord << "-" << (LowestIdxInWord + 31)
+           << "\n";
         for (const auto &RC : RCs) {
           OS << "    (1u << (" << RC->getQualifiedIdName() << " - "
              << LowestIdxInWord << ")) |\n";
diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index cff9777..8919e07 100644
--- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -101,8 +101,8 @@ private:
 } // end anonymous namespace
 
 // runEnums - Print out enum values for all of the registers.
-void RegisterInfoEmitter::runEnums(raw_ostream &OS,
-                                   CodeGenTarget &Target, CodeGenRegBank &Bank) {
+void RegisterInfoEmitter::runEnums(raw_ostream &OS, CodeGenTarget &Target,
+                                   CodeGenRegBank &Bank) {
   const auto &Registers = Bank.getRegisters();
 
   // Register enums are stored as uint16_t in the tables. Make sure we'll fit.
@@ -129,7 +129,7 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
     OS << "  " << Reg.getName() << " = " << Reg.EnumValue << ",\n";
   assert(Registers.size() == Registers.back().EnumValue &&
         "Register enum value mismatch!");
-  OS << "  NUM_TARGET_REGS // " << Registers.size()+1 << "\n";
+  OS << "  NUM_TARGET_REGS // " << Registers.size() + 1 << "\n";
   OS << "};\n";
   if (!Namespace.empty())
     OS << "} // end namespace " << Namespace << "\n";
@@ -152,7 +152,8 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
     OS << "} // end namespace " << Namespace << "\n\n";
   }
 
-  const std::vector<Record*> &RegAltNameIndices = Target.getRegAltNameIndices();
+  const std::vector<Record *> &RegAltNameIndices =
+      Target.getRegAltNameIndices();
 
   // If the only definition is the default NoRegAltName, we don't need to
   // emit anything.
   if (RegAltNameIndices.size() > 1) {
@@ -188,7 +189,7 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
     OS << "namespace " << Namespace << " {\n";
     OS << "enum RegisterPressureSets {\n";
     unsigned NumSets = Bank.getNumRegPressureSets();
-    for (unsigned i = 0; i < NumSets; ++i ) {
+    for (unsigned i = 0; i < NumSets; ++i) {
       const RegUnitSet &RegUnits = Bank.getRegSetAt(i);
       OS << "  " << RegUnits.Name << " = " << i << ",\n";
     }
@@ -201,13 +202,11 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
   OS << "#endif // GET_REGINFO_ENUM\n\n";
 }
 
-static void printInt(raw_ostream &OS, int Val) {
-  OS << Val;
-}
+static void printInt(raw_ostream &OS, int Val) { OS << Val; }
 
-void RegisterInfoEmitter::
-EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
-                    const std::string &ClassName) {
+void RegisterInfoEmitter::EmitRegUnitPressure(raw_ostream &OS,
+                                              const CodeGenRegBank &RegBank,
+                                              const std::string &ClassName) {
   unsigned NumRCs = RegBank.getRegClasses().size();
   unsigned NumSets = RegBank.getNumRegPressureSets();
 
@@ -254,8 +253,7 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
     }
     OS << "};\n"
        << "  return RUWeightTable[RegUnit];\n";
-  }
-  else {
+  } else {
     OS << "  // All register units have unit weight.\n"
       << "  return 1;\n";
   }
@@ -271,7 +269,7 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
      << "getRegPressureSetName(unsigned Idx) const {\n"
      << "  static const char *PressureNameTable[] = {\n";
   unsigned MaxRegUnitWeight = 0;
-  for (unsigned i = 0; i < NumSets; ++i ) {
+  for (unsigned i = 0; i < NumSets; ++i) {
     const RegUnitSet &RegUnits = RegBank.getRegSetAt(i);
     MaxRegUnitWeight = std::max(MaxRegUnitWeight, RegUnits.Weight);
     OS << "    \"" << RegUnits.Name << "\",\n";
@@ -287,10 +285,10 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
         "{\n"
     << "  static const " << getMinimalTypeForRange(MaxRegUnitWeight, 32)
     << " PressureLimitTable[] = {\n";
-  for (unsigned i = 0; i < NumSets; ++i ) {
+  for (unsigned i = 0; i < NumSets; ++i) {
     const RegUnitSet &RegUnits = RegBank.getRegSetAt(i);
-    OS << "    " << RegUnits.Weight << ",  \t// " << i << ": "
-       << RegUnits.Name << "\n";
+    OS << "    " << RegUnits.Weight << ",  \t// " << i << ": " << RegUnits.Name
+       << "\n";
   }
   OS << "  };\n"
      << "  return PressureLimitTable[Idx];\n"
@@ -353,7 +351,7 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
     << "}\n\n";
 }
 
-using DwarfRegNumsMapPair = std::pair<Record*, std::vector<int64_t>>;
+using DwarfRegNumsMapPair = std::pair<Record *, std::vector<int64_t>>;
 using DwarfRegNumsVecTy = std::vector<DwarfRegNumsMapPair>;
 
 static void finalizeDwarfRegNumsKeys(DwarfRegNumsVecTy &DwarfRegNums) {
@@ -419,7 +417,7 @@ void RegisterInfoEmitter::EmitRegMappingTables(
 
       // Store the mapping sorted by the LLVM reg num so lookup can be done
       // with a binary search.
-      std::map<uint64_t, Record*> Dwarf2LMap;
+      std::map<uint64_t, Record *> Dwarf2LMap;
       for (auto &DwarfRegNum : DwarfRegNums) {
         int DwarfRegNo = DwarfRegNum.second[I];
         if (DwarfRegNo < 0)
@@ -531,8 +529,8 @@ void RegisterInfoEmitter::EmitRegMapping(
     else
       OS << "EHFlavour";
     OS << ") {\n"
-     << "  default:\n"
-     << "    llvm_unreachable(\"Unknown DWARF flavour\");\n";
+       << "  default:\n"
+       << "    llvm_unreachable(\"Unknown DWARF flavour\");\n";
 
     for (unsigned i = 0, e = maxLength; i != e; ++i) {
       OS << "  case " << i << ":\n";
@@ -540,14 +538,14 @@ void RegisterInfoEmitter::EmitRegMapping(
       if (!isCtor)
         OS << "RI->";
       std::string Tmp;
-      raw_string_ostream(Tmp) << Namespace
-                              << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i
-                              << "Dwarf2L";
+      raw_string_ostream(Tmp)
+          << Namespace << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i
"DwarfFlavour" : "EHFlavour") << i + << "Dwarf2L"; OS << "mapDwarfRegsToLLVMRegs(" << Tmp << ", " << Tmp << "Size, "; if (j == 0) - OS << "false"; - else - OS << "true"; + OS << "false"; + else + OS << "true"; OS << ");\n"; OS << " break;\n"; } @@ -571,14 +569,14 @@ void RegisterInfoEmitter::EmitRegMapping( if (!isCtor) OS << "RI->"; std::string Tmp; - raw_string_ostream(Tmp) << Namespace - << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i - << "L2Dwarf"; + raw_string_ostream(Tmp) + << Namespace << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i + << "L2Dwarf"; OS << "mapLLVMRegsToDwarfRegs(" << Tmp << ", " << Tmp << "Size, "; if (j == 0) - OS << "false"; - else - OS << "true"; + OS << "false"; + else + OS << "true"; OS << ");\n"; OS << " break;\n"; } @@ -588,8 +586,7 @@ void RegisterInfoEmitter::EmitRegMapping( // Print a BitVector as a sequence of hex numbers using a little-endian mapping. // Width is the number of bits per hex number. -static void printBitVectorAsHex(raw_ostream &OS, - const BitVector &Bits, +static void printBitVectorAsHex(raw_ostream &OS, const BitVector &Bits, unsigned Width) { assert(Width <= 32 && "Width too large"); unsigned Digits = (Width + 3) / 4; @@ -604,16 +601,15 @@ static void printBitVectorAsHex(raw_ostream &OS, // Helper to emit a set of bits into a constant byte array. class BitVectorEmitter { BitVector Values; + public: void add(unsigned v) { if (v >= Values.size()) - Values.resize(((v/8)+1)*8); // Round up to the next byte. + Values.resize(((v / 8) + 1) * 8); // Round up to the next byte. Values[v] = true; } - void print(raw_ostream &OS) { - printBitVectorAsHex(OS, Values, 8); - } + void print(raw_ostream &OS) { printBitVectorAsHex(OS, Values, 8); } }; static void printSimpleValueType(raw_ostream &OS, MVT::SimpleValueType VT) { @@ -650,9 +646,8 @@ static DiffVec &diffEncode(DiffVec &V, SparseBitVector<> List) { return V; } -template -static -DiffVec &diffEncode(DiffVec &V, unsigned InitVal, Iter Begin, Iter End) { +template +static DiffVec &diffEncode(DiffVec &V, unsigned InitVal, Iter Begin, Iter End) { assert(V.empty() && "Clear DiffVec before diffEncode."); unsigned Val = InitVal; for (Iter I = Begin; I != End; ++I) { @@ -672,7 +667,7 @@ static void printMask(raw_ostream &OS, LaneBitmask Val) { // Try to combine Idx's compose map into Vec if it is compatible. // Return false if it's not possible. static bool combine(const CodeGenSubRegIndex *Idx, - SmallVectorImpl &Vec) { + SmallVectorImpl &Vec) { const CodeGenSubRegIndex::CompMap &Map = Idx->getComposites(); for (const auto &I : Map) { CodeGenSubRegIndex *&Entry = Vec[I.first->EnumValue - 1]; @@ -683,17 +678,15 @@ static bool combine(const CodeGenSubRegIndex *Idx, // All entries are compatible. Make it so. 
   for (const auto &I : Map) {
     auto *&Entry = Vec[I.first->EnumValue - 1];
-    assert((!Entry || Entry == I.second) &&
-           "Expected EnumValue to be unique");
+    assert((!Entry || Entry == I.second) && "Expected EnumValue to be unique");
     Entry = I.second;
   }
   return true;
 }
 
-void
-RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
-                                              CodeGenRegBank &RegBank,
-                                              const std::string &ClName) {
+void RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
+                                                   CodeGenRegBank &RegBank,
+                                                   const std::string &ClName) {
   const auto &SubRegIndices = RegBank.getSubRegIndices();
   OS << "unsigned " << ClName
      << "::composeSubRegIndicesImpl(unsigned IdxA, unsigned IdxB) const {\n";
@@ -707,7 +700,7 @@ RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
 
   // Map each Sub-register index to a compatible table row.
   SmallVector<unsigned, 4> RowMap;
-  SmallVector<SmallVector<CodeGenSubRegIndex*, 4>, 4> Rows;
+  SmallVector<SmallVector<CodeGenSubRegIndex *, 4>, 4> Rows;
 
   auto SubRegIndicesSize =
       std::distance(SubRegIndices.begin(), SubRegIndices.end());
@@ -760,10 +753,8 @@ RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
   OS << "}\n\n";
 }
 
-void
-RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
-                                                    CodeGenRegBank &RegBank,
-                                                    const std::string &ClName) {
+void RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(
+    raw_ostream &OS, CodeGenRegBank &RegBank, const std::string &ClName) {
   // See the comments in computeSubRegLaneMasks() for our goal here.
   const auto &SubRegIndices = RegBank.getSubRegIndices();
 
@@ -771,8 +762,8 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
   SmallVector<unsigned, 4> SubReg2SequenceIndexMap;
   SmallVector<SmallVector<MaskRolPair, 1>, 4> Sequences;
   for (const auto &Idx : SubRegIndices) {
-    const SmallVector<MaskRolPair, 1> &IdxSequence
-      = Idx.CompositionLaneMaskTransform;
+    const SmallVector<MaskRolPair, 1> &IdxSequence =
+        Idx.CompositionLaneMaskTransform;
 
     unsigned Found = ~0u;
     unsigned SIdx = 0;
@@ -807,7 +798,7 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
       OS << format(", %2u }, ", P.RotateLeft);
     }
     OS << "{ LaneBitmask::getNone(), 0 }";
-    if (s+1 != se)
+    if (s + 1 != se)
       OS << ", ";
     OS << " // Sequence " << Idx << "\n";
     Idx += Sequence.size() + 1;
@@ -820,7 +811,7 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
   for (size_t i = 0, e = SubRegIndices.size(); i != e; ++i) {
     OS << "    ";
     OS << SubReg2SequenceIndexMap[i];
-    if (i+1 != e)
+    if (i + 1 != e)
       OS << ",";
     OS << " // to " << SubRegIndices[i].getName() << "\n";
   }
@@ -829,15 +820,18 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
   OS << "LaneBitmask " << ClName
     << "::composeSubRegIndexLaneMaskImpl(unsigned IdxA, LaneBitmask LaneMask)"
        " const {\n"
-       "  --IdxA; assert(IdxA < " << SubRegIndices.size()
+       "  --IdxA; assert(IdxA < "
+     << SubRegIndices.size()
      << " && \"Subregister index out of bounds\");\n"
        "  LaneBitmask Result;\n"
        "  for (const MaskRolOp *Ops =\n"
       "       &LaneMaskComposeSequences[CompositeSequences[IdxA]];\n"
       "       Ops->Mask.any(); ++Ops) {\n"
-       "    LaneBitmask::Type M = LaneMask.getAsInteger() & Ops->Mask.getAsInteger();\n"
+       "    LaneBitmask::Type M = LaneMask.getAsInteger() & "
+       "Ops->Mask.getAsInteger();\n"
       "    if (unsigned S = Ops->RotateLeft)\n"
-       "      Result |= LaneBitmask((M << S) | (M >> (LaneBitmask::BitWidth - S)));\n"
+       "      Result |= LaneBitmask((M << S) | (M >> (LaneBitmask::BitWidth - "
+       "S)));\n"
       "    else\n"
       "      Result |= LaneBitmask(M);\n"
       "  }\n"
@@ -848,7 +842,8 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
     << "::reverseComposeSubRegIndexLaneMaskImpl(unsigned IdxA, "
       " LaneBitmask LaneMask) const {\n"
       "  LaneMask &= getSubRegIndexLaneMask(IdxA);\n"
-       "  --IdxA; assert(IdxA < " << SubRegIndices.size()
+       "  --IdxA; assert(IdxA < "
+     << SubRegIndices.size()
     << " && \"Subregister index out of bounds\");\n"
       "  LaneBitmask Result;\n"
       "  for (const MaskRolOp *Ops =\n"
@@ -856,7 +851,8 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
       "       Ops->Mask.any(); ++Ops) {\n"
       "    LaneBitmask::Type M = LaneMask.getAsInteger();\n"
       "    if (unsigned S = Ops->RotateLeft)\n"
-       "      Result |= LaneBitmask((M >> S) | (M << (LaneBitmask::BitWidth - S)));\n"
+       "      Result |= LaneBitmask((M >> S) | (M << (LaneBitmask::BitWidth - "
+       "S)));\n"
       "    else\n"
       "      Result |= LaneBitmask(M);\n"
       "  }\n"
@@ -867,9 +863,8 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
 
 //
 // runMCDesc - Print out MC register descriptions.
 //
-void
-RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
-                               CodeGenRegBank &RegBank) {
+void RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
+                                    CodeGenRegBank &RegBank) {
   emitSourceFileHeader("MC Register Information", OS);
 
   OS << "\n#ifdef GET_REGINFO_MC_DESC\n";
@@ -880,7 +875,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
   auto &SubRegIndices = RegBank.getSubRegIndices();
   // The lists of sub-registers and super-registers go in the same array.  That
   // allows us to share suffixes.
-  typedef std::vector<const CodeGenRegister*> RegVec;
+  typedef std::vector<const CodeGenRegister *> RegVec;
 
   // Differentially encoded lists.
   SequenceToOffsetTable<DiffVec> DiffSeqs;
@@ -894,7 +889,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
 
   // Keep track of sub-register names as well. These are not differentially
   // encoded.
-  typedef SmallVector<const CodeGenSubRegIndex*, 4> SubRegIdxVec;
+  typedef SmallVector<const CodeGenSubRegIndex *, 4> SubRegIdxVec;
   SequenceToOffsetTable<SubRegIdxVec, deref<std::less<>>> SubRegIdxSeqs;
   SmallVector<SubRegIdxVec, 4> SubRegIdxLists(Regs.size());
 
@@ -907,7 +902,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
     RegStrings.add(std::string(Reg.getName()));
 
     // Compute the ordered sub-register list.
-    SetVector<const CodeGenRegister*> SR;
+    SetVector<const CodeGenRegister *> SR;
     Reg.addSubRegsPreOrder(SR, RegBank);
     diffEncode(SubRegLists[i], Reg.EnumValue, SR.begin(), SR.end());
     DiffSeqs.add(SubRegLists[i]);
@@ -961,8 +956,8 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
   OS << "};\n\n";
 
   // Emit the table of sub-register index sizes.
-  OS << "extern const MCRegisterInfo::SubRegCoveredBits "
-     << TargetName << "SubRegIdxRanges[] = {\n";
+  OS << "extern const MCRegisterInfo::SubRegCoveredBits " << TargetName
+     << "SubRegIdxRanges[] = {\n";
   OS << "  { " << (uint16_t)-1 << ", " << (uint16_t)-1 << " },\n";
   for (const auto &Idx : SubRegIndices) {
     OS << "  { " << Idx.Offset << ", " << Idx.Size << " },\t// "
@@ -995,13 +990,13 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
        << LaneMaskSeqs.get(RegUnitLaneMasks[i]) << " },\n";
     ++i;
   }
-  OS << "};\n\n";      // End of register descriptors...
+  OS << "};\n\n"; // End of register descriptors...
 
   // Emit the table of register unit roots. Each regunit has one or two root
   // registers.
OS << "extern const MCPhysReg " << TargetName << "RegUnitRoots[][2] = {\n"; for (unsigned i = 0, e = RegBank.getNumNativeRegUnits(); i != e; ++i) { - ArrayRef Roots = RegBank.getRegUnit(i).getRoots(); + ArrayRef Roots = RegBank.getRegUnit(i).getRoots(); assert(!Roots.empty() && "All regunits must have a root register."); assert(Roots.size() <= 2 && "More than two roots not supported yet."); OS << " { "; @@ -1021,7 +1016,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target, // Emit the register enum value arrays for each RegisterClass for (const auto &RC : RegisterClasses) { - ArrayRef Order = RC.getOrder(); + ArrayRef Order = RC.getOrder(); // Give the register class a legal C name if it's anonymous. const std::string &Name = RC.getName(); @@ -1092,7 +1087,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target, } OS << " " << Value << ",\n"; } - OS << "};\n"; // End of HW encoding table + OS << "};\n"; // End of HW encoding table // MCRegisterInfo initialization routine. OS << "static inline void Init" << TargetName @@ -1117,9 +1112,9 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target, OS << "#endif // GET_REGINFO_MC_DESC\n\n"; } -void -RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target, - CodeGenRegBank &RegBank) { +void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, + CodeGenTarget &Target, + CodeGenRegBank &RegBank) { emitSourceFileHeader("Register Information Header Fragment", OS); OS << "\n#ifdef GET_REGINFO_HEADER\n"; @@ -1175,8 +1170,10 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target, << " const MachineFunction &MF);\n"; const auto &RegisterClasses = RegBank.getRegClasses(); - if (llvm::any_of(RegisterClasses, [](const auto &RC) { return RC.getBaseClassOrder(); })) { - OS << " const TargetRegisterClass *getPhysRegBaseClass(MCRegister Reg) const override;\n"; + if (llvm::any_of(RegisterClasses, + [](const auto &RC) { return RC.getBaseClassOrder(); })) { + OS << " const TargetRegisterClass *getPhysRegBaseClass(MCRegister Reg) " + "const override;\n"; } OS << "};\n\n"; @@ -1200,9 +1197,8 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target, // // runTargetDesc - Output the target register and register file descriptions. // -void -RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, - CodeGenRegBank &RegBank){ +void RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, + CodeGenRegBank &RegBank) { emitSourceFileHeader("Target Register and Register Classes Information", OS); OS << "\n#ifdef GET_REGINFO_TARGET_DESC\n"; @@ -1219,11 +1215,11 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, const auto &SubRegIndices = RegBank.getSubRegIndices(); // Collect all registers belonging to any allocatable class. - std::set AllocatableRegs; + std::set AllocatableRegs; // Collect allocatable registers. for (const auto &RC : RegisterClasses) { - ArrayRef Order = RC.getOrder(); + ArrayRef Order = RC.getOrder(); if (RC.Allocatable) AllocatableRegs.insert(Order.begin(), Order.end()); @@ -1297,7 +1293,6 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, } OS << "};\n"; - OS << "\nstatic const TargetRegisterClass *const " << "NullRegClasses[] = { nullptr };\n\n"; @@ -1320,7 +1315,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, // Every bit mask present in the list has at least one bit set. // Compress the sub-reg index lists. 
-  typedef std::vector<const CodeGenSubRegIndex*> IdxList;
+  typedef std::vector<const CodeGenSubRegIndex *> IdxList;
   SmallVector<IdxList, 8> SuperRegIdxLists(RegisterClasses.size());
   SequenceToOffsetTable<IdxList, deref<std::less<>>> SuperRegIdxSeqs;
   BitVector MaskBV(RegisterClasses.size());
 
@@ -1354,14 +1349,14 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
 
   // Emit NULL terminated super-class lists.
   for (const auto &RC : RegisterClasses) {
-    ArrayRef<CodeGenRegisterClass*> Supers = RC.getSuperClasses();
+    ArrayRef<CodeGenRegisterClass *> Supers = RC.getSuperClasses();
 
     // Skip classes without supers.  We can reuse NullRegClasses.
     if (Supers.empty())
       continue;
 
-    OS << "static const TargetRegisterClass *const "
-       << RC.getName() << "Superclasses[] = {\n";
+    OS << "static const TargetRegisterClass *const " << RC.getName()
+       << "Superclasses[] = {\n";
     for (const auto *Super : Supers)
       OS << "  &" << Super->getQualifiedName() << "RegClass,\n";
     OS << "  nullptr\n};\n\n";
@@ -1371,12 +1366,12 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
   for (const auto &RC : RegisterClasses) {
     if (!RC.AltOrderSelect.empty()) {
       OS << "\nstatic inline unsigned " << RC.getName()
-         << "AltOrderSelect(const MachineFunction &MF) {"
-         << RC.AltOrderSelect << "}\n\n"
+         << "AltOrderSelect(const MachineFunction &MF) {" << RC.AltOrderSelect
+         << "}\n\n"
         << "static ArrayRef<MCPhysReg> " << RC.getName()
        << "GetRawAllocationOrder(const MachineFunction &MF) {\n";
-      for (unsigned oi = 1 , oe = RC.getNumOrders(); oi != oe; ++oi) {
-        ArrayRef<Record*> Elems = RC.getOrder(oi);
+      for (unsigned oi = 1, oe = RC.getNumOrders(); oi != oe; ++oi) {
+        ArrayRef<Record *> Elems = RC.getOrder(oi);
         if (!Elems.empty()) {
           OS << "  static const MCPhysReg AltOrder" << oi << "[] = {";
           for (unsigned elem = 0; elem != Elems.size(); ++elem)
@@ -1556,8 +1551,8 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
         EnumValue = SubRegClass->EnumValue + 1;
       }
 
-      OS << "    " << EnumValue << ",\t// "
-         << RC.getName() << ':' << Idx.getName();
+      OS << "    " << EnumValue << ",\t// " << RC.getName() << ':'
+         << Idx.getName();
 
       if (MatchingSubClass) {
         CodeGenRegisterClass *SubRegClass = MatchingSubClass->second;
@@ -1581,7 +1576,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
   // Emit register base class mapper
   if (!RegisterClasses.empty()) {
     // Collect base classes
-    SmallVector<const CodeGenRegisterClass*> BaseClasses;
+    SmallVector<const CodeGenRegisterClass *> BaseClasses;
     for (const auto &RC : RegisterClasses) {
      if (RC.getBaseClassOrder())
        BaseClasses.push_back(&RC);
@@ -1592,9 +1587,10 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
 
     // Apply order
     struct BaseClassOrdering {
-      bool operator()(const CodeGenRegisterClass *LHS, const CodeGenRegisterClass *RHS) const {
-        return std::pair(*LHS->getBaseClassOrder(), LHS->EnumValue)
-             < std::pair(*RHS->getBaseClassOrder(), RHS->EnumValue);
+      bool operator()(const CodeGenRegisterClass *LHS,
+                      const CodeGenRegisterClass *RHS) const {
+        return std::pair(*LHS->getBaseClassOrder(), LHS->EnumValue) <
+               std::pair(*RHS->getBaseClassOrder(), RHS->EnumValue);
       }
     };
     llvm::stable_sort(BaseClasses, BaseClassOrdering());
@@ -1638,8 +1634,8 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
   OS << "extern const char " << TargetName << "RegClassStrings[];\n";
   OS << "extern const MCPhysReg " << TargetName << "RegUnitRoots[][2];\n";
   OS << "extern const uint16_t " << TargetName << "SubRegIdxLists[];\n";
-  OS << "extern const MCRegisterInfo::SubRegCoveredBits "
-     << TargetName << "SubRegIdxRanges[];\n";
+  OS << "extern const MCRegisterInfo::SubRegCoveredBits " << TargetName
"SubRegIdxRanges[];\n"; OS << "extern const uint16_t " << TargetName << "RegEncodingTable[];\n"; EmitRegMappingTables(OS, Regs, true); @@ -1673,16 +1669,15 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, OS << "}\n\n"; // Emit CalleeSavedRegs information. - std::vector CSRSets = - Records.getAllDerivedDefinitions("CalleeSavedRegs"); + std::vector CSRSets = + Records.getAllDerivedDefinitions("CalleeSavedRegs"); for (unsigned i = 0, e = CSRSets.size(); i != e; ++i) { Record *CSRSet = CSRSets[i]; const SetTheory::RecVec *Regs = RegBank.getSets().expand(CSRSet); assert(Regs && "Cannot expand CalleeSavedRegs instance"); // Emit the *_SaveList list of callee-saved registers. - OS << "static const MCPhysReg " << CSRSet->getName() - << "_SaveList[] = { "; + OS << "static const MCPhysReg " << CSRSet->getName() << "_SaveList[] = { "; for (unsigned r = 0, re = Regs->size(); r != re; ++r) OS << getQualifiedName((*Regs)[r]) << ", "; OS << "0 };\n"; @@ -1693,11 +1688,11 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, // Check for an optional OtherPreserved set. // Add those registers to RegMask, but not to SaveList. if (DagInit *OPDag = - dyn_cast(CSRSet->getValueInit("OtherPreserved"))) { + dyn_cast(CSRSet->getValueInit("OtherPreserved"))) { SetTheory::RecSet OPSet; RegBank.getSets().evaluate(OPDag, OPSet, CSRSet->getLoc()); Covered |= RegBank.computeCoveredRegisters( - ArrayRef(OPSet.begin(), OPSet.end())); + ArrayRef(OPSet.begin(), OPSet.end())); } // Add all constant physical registers to the preserved mask: @@ -1709,8 +1704,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, Covered |= RegBank.computeCoveredRegisters( ArrayRef(ConstantSet.begin(), ConstantSet.end())); - OS << "static const uint32_t " << CSRSet->getName() - << "_RegMask[] = { "; + OS << "static const uint32_t " << CSRSet->getName() << "_RegMask[] = { "; printBitVectorAsHex(OS, Covered, 32); OS << "};\n"; } @@ -1795,7 +1789,8 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, } OS << "}\n\n"; - OS << "const " << TargetName << "FrameLowering *\n" << TargetName + OS << "const " << TargetName << "FrameLowering *\n" + << TargetName << "GenRegisterInfo::getFrameLowering(const MachineFunction &MF) {\n" << " return static_cast(\n" << " MF.getSubtarget().getFrameLowering());\n" @@ -1827,7 +1822,7 @@ void RegisterInfoEmitter::debugDump(raw_ostream &OS) { CodeGenRegBank &RegBank = Target.getRegBank(); const CodeGenHwModes &CGH = Target.getHwModes(); unsigned NumModes = CGH.getNumModeIds(); - auto getModeName = [CGH] (unsigned M) -> StringRef { + auto getModeName = [CGH](unsigned M) -> StringRef { if (M == 0) return "Default"; return CGH.getMode(M).Name; @@ -1883,9 +1878,10 @@ void RegisterInfoEmitter::debugDump(raw_ostream &OS) { OS << '\n'; OS << "\tCoveredBySubregs: " << R.CoveredBySubRegs << '\n'; OS << "\tHasDisjunctSubRegs: " << R.HasDisjunctSubRegs << '\n'; - for (std::pair P : R.getSubRegs()) { - OS << "\tSubReg " << P.first->getName() - << " = " << P.second->getName() << '\n'; + for (std::pair P : + R.getSubRegs()) { + OS << "\tSubReg " << P.first->getName() << " = " << P.second->getName() + << '\n'; } } } diff --git a/llvm/utils/TableGen/SDNodeProperties.h b/llvm/utils/TableGen/SDNodeProperties.h index 66a04e6..5715423 100644 --- a/llvm/utils/TableGen/SDNodeProperties.h +++ b/llvm/utils/TableGen/SDNodeProperties.h @@ -34,6 +34,6 @@ enum SDNP { unsigned parseSDPatternOperatorProperties(Record *R); -} +} // namespace llvm 
#endif diff --git a/llvm/utils/TableGen/SearchableTableEmitter.cpp b/llvm/utils/TableGen/SearchableTableEmitter.cpp index d75a9e9..0cce798a 100644 --- a/llvm/utils/TableGen/SearchableTableEmitter.cpp +++ b/llvm/utils/TableGen/SearchableTableEmitter.cpp @@ -134,7 +134,7 @@ private: Twine("Entry for field '") + Field.Name + "' is null"); return std::string(Entry->first); } - PrintFatalError(Loc, Twine("invalid field type for field '") + Field.Name + + PrintFatalError(Loc, Twine("invalid field type for field '") + Field.Name + "'; expected: bit, bits, string, or code"); } @@ -173,7 +173,7 @@ private: return "uint32_t"; if (NumBits <= 64) return "uint64_t"; - PrintFatalError(Index.Loc, Twine("In table '") + Table.Name + + PrintFatalError(Index.Loc, Twine("In table '") + Table.Name + "' lookup method '" + Index.Name + "', key field '" + Field.Name + "' of type bits is too large"); @@ -425,7 +425,7 @@ void SearchableTableEmitter::emitLookupFunction(const GenericTable &Table, OS << " struct KeyType {\n"; for (const auto &Field : Index.Fields) { - OS << " " << searchableFieldType(Table, Index, Field, TypeInTempStruct) + OS << " " << searchableFieldType(Table, Index, Field, TypeInTempStruct) << " " << Field.Name << ";\n"; } OS << " };\n"; @@ -436,7 +436,7 @@ void SearchableTableEmitter::emitLookupFunction(const GenericTable &Table, if (isa(Field.RecType)) { OS << ".upper()"; if (IsPrimary) - PrintFatalError(Index.Loc, + PrintFatalError(Index.Loc, Twine("In table '") + Table.Name + "', use a secondary lookup method for " "case-insensitive comparison of field '" + @@ -580,7 +580,7 @@ std::unique_ptr SearchableTableEmitter::parseSearchIndex( Twine("In table '") + Table.Name + "', 'PrimaryKey' or 'Key' refers to nonexistent field '" + FieldName + "'"); - + Index->Fields.push_back(*Field); } @@ -643,11 +643,11 @@ void SearchableTableEmitter::collectTableEntries( } else { RecTy *Ty = resolveTypes(Field.RecType, TI->getType()); if (!Ty) - PrintFatalError(EntryRec->getValue(Field.Name), + PrintFatalError(EntryRec->getValue(Field.Name), Twine("Field '") + Field.Name + "' of table '" + - Table.Name + "' entry has incompatible type: " + - TI->getType()->getAsString() + " vs. " + - Field.RecType->getAsString()); + Table.Name + "' entry has incompatible type: " + + TI->getType()->getAsString() + " vs. " + + Field.RecType->getAsString()); Field.RecType = Ty; } } @@ -702,7 +702,7 @@ void SearchableTableEmitter::run(raw_ostream &OS) { StringRef FilterClass = EnumRec->getValueAsString("FilterClass"); Enum->Class = Records.getClass(FilterClass); if (!Enum->Class) - PrintFatalError(EnumRec->getValue("FilterClass"), + PrintFatalError(EnumRec->getValue("FilterClass"), Twine("Enum FilterClass '") + FilterClass + "' does not exist"); @@ -723,11 +723,13 @@ void SearchableTableEmitter::run(raw_ostream &OS) { for (const auto &FieldName : Fields) { Table->Fields.emplace_back(FieldName); // Construct a GenericField. 
- if (auto TypeOfRecordVal = TableRec->getValue(("TypeOf_" + FieldName).str())) { - if (!parseFieldType(Table->Fields.back(), TypeOfRecordVal->getValue())) { - PrintError(TypeOfRecordVal, - Twine("Table '") + Table->Name + - "' has invalid 'TypeOf_" + FieldName + + if (auto TypeOfRecordVal = + TableRec->getValue(("TypeOf_" + FieldName).str())) { + if (!parseFieldType(Table->Fields.back(), + TypeOfRecordVal->getValue())) { + PrintError(TypeOfRecordVal, + Twine("Table '") + Table->Name + "' has invalid 'TypeOf_" + + FieldName + "': " + TypeOfRecordVal->getValue()->getAsString()); PrintFatalNote("The 'TypeOf_xxx' field must be a string naming a " "GenericEnum record, or \"code\""); @@ -737,9 +739,9 @@ void SearchableTableEmitter::run(raw_ostream &OS) { StringRef FilterClass = TableRec->getValueAsString("FilterClass"); if (!Records.getClass(FilterClass)) - PrintFatalError(TableRec->getValue("FilterClass"), - Twine("Table FilterClass '") + - FilterClass + "' does not exist"); + PrintFatalError(TableRec->getValue("FilterClass"), + Twine("Table FilterClass '") + FilterClass + + "' does not exist"); RecordVal *FilterClassFieldVal = TableRec->getValue("FilterClassField"); std::vector Definitions = @@ -779,14 +781,14 @@ void SearchableTableEmitter::run(raw_ostream &OS) { Record *TableRec = IndexRec->getValueAsDef("Table"); auto It = TableMap.find(TableRec); if (It == TableMap.end()) - PrintFatalError(IndexRec->getValue("Table"), + PrintFatalError(IndexRec->getValue("Table"), Twine("SearchIndex '") + IndexRec->getName() + "' refers to nonexistent table '" + TableRec->getName()); GenericTable &Table = *It->second; Table.Indices.push_back( - parseSearchIndex(Table, IndexRec->getValue("Key"), IndexRec->getName(), + parseSearchIndex(Table, IndexRec->getValue("Key"), IndexRec->getName(), IndexRec->getValueAsListOfStrings("Key"), IndexRec->getValueAsBit("EarlyOut"))); } diff --git a/llvm/utils/TableGen/SequenceToOffsetTable.h b/llvm/utils/TableGen/SequenceToOffsetTable.h index 77a404d..7db39a9 100644 --- a/llvm/utils/TableGen/SequenceToOffsetTable.h +++ b/llvm/utils/TableGen/SequenceToOffsetTable.h @@ -44,7 +44,7 @@ static inline void printChar(raw_ostream &OS, char C) { /// /// @tparam SeqT The sequence container. (vector or string). /// @tparam Less A stable comparator for SeqT elements. -template > +template > class SequenceToOffsetTable { typedef typename SeqT::value_type ElemT; @@ -53,8 +53,8 @@ class SequenceToOffsetTable { struct SeqLess { Less L; bool operator()(const SeqT &A, const SeqT &B) const { - return std::lexicographical_compare(A.rbegin(), A.rend(), - B.rbegin(), B.rend(), L); + return std::lexicographical_compare(A.rbegin(), A.rend(), B.rbegin(), + B.rend(), L); } }; @@ -153,15 +153,15 @@ public: /// emit - Print out the table as the body of an array initializer. /// Use the Print function to print elements. 
- void emit(raw_ostream &OS, - void (*Print)(raw_ostream&, ElemT), + void emit(raw_ostream &OS, void (*Print)(raw_ostream &, ElemT), const char *Term = "0") const { assert((empty() || Entries) && "Call layout() before emit()"); for (typename SeqMap::const_iterator I = Seqs.begin(), E = Seqs.end(); I != E; ++I) { OS << " /* " << I->second << " */ "; for (typename SeqT::const_iterator SI = I->first.begin(), - SE = I->first.end(); SI != SE; ++SI) { + SE = I->first.end(); + SI != SE; ++SI) { Print(OS, *SI); OS << ", "; } diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 3922518..b1502ea 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -51,9 +51,9 @@ struct LessRecordFieldFieldName { }; class SubtargetEmitter { - // Each processor has a SchedClassDesc table with an entry for each SchedClass. - // The SchedClassDesc table indexes into a global write resource table, write - // latency table, and read advance table. + // Each processor has a SchedClassDesc table with an entry for each + // SchedClass. The SchedClassDesc table indexes into a global write resource + // table, write latency table, and read advance table. struct SchedClassTables { std::vector> ProcSchedClasses; std::vector WriteProcResources; @@ -89,20 +89,18 @@ class SubtargetEmitter { const DenseMap &FeatureMap); unsigned CPUKeyValues(raw_ostream &OS, const DenseMap &FeatureMap); - void FormItineraryStageString(const std::string &Names, - Record *ItinData, std::string &ItinString, - unsigned &NStages); - void FormItineraryOperandCycleString(Record *ItinData, std::string &ItinString, + void FormItineraryStageString(const std::string &Names, Record *ItinData, + std::string &ItinString, unsigned &NStages); + void FormItineraryOperandCycleString(Record *ItinData, + std::string &ItinString, unsigned &NOperandCycles); - void FormItineraryBypassString(const std::string &Names, - Record *ItinData, - std::string &ItinString, unsigned NOperandCycles); - void EmitStageAndOperandCycleData(raw_ostream &OS, - std::vector> - &ProcItinLists); + void FormItineraryBypassString(const std::string &Names, Record *ItinData, + std::string &ItinString, + unsigned NOperandCycles); + void EmitStageAndOperandCycleData( + raw_ostream &OS, std::vector> &ProcItinLists); void EmitItineraries(raw_ostream &OS, - std::vector> - &ProcItinLists); + std::vector> &ProcItinLists); unsigned EmitRegisterFileTables(const CodeGenProcModel &ProcModel, raw_ostream &OS); void EmitLoadStoreQueueInfo(const CodeGenProcModel &ProcModel, @@ -153,15 +151,16 @@ public: void SubtargetEmitter::Enumeration(raw_ostream &OS, DenseMap &FeatureMap) { // Get all records of class and sort - std::vector DefList = - Records.getAllDerivedDefinitions("SubtargetFeature"); + std::vector DefList = + Records.getAllDerivedDefinitions("SubtargetFeature"); llvm::sort(DefList, LessRecord()); unsigned N = DefList.size(); if (N == 0) return; if (N + 1 > MAX_SUBTARGET_FEATURES) - PrintFatalError("Too many subtarget features! Bump MAX_SUBTARGET_FEATURES."); + PrintFatalError( + "Too many subtarget features! 
Bump MAX_SUBTARGET_FEATURES."); OS << "namespace " << Target << " {\n"; @@ -248,8 +247,8 @@ void SubtargetEmitter::EmitSubtargetInfoMacroCalls(raw_ostream &OS) { unsigned SubtargetEmitter::FeatureKeyValues( raw_ostream &OS, const DenseMap &FeatureMap) { // Gather and sort all the features - std::vector FeatureList = - Records.getAllDerivedDefinitions("SubtargetFeature"); + std::vector FeatureList = + Records.getAllDerivedDefinitions("SubtargetFeature"); if (FeatureList.empty()) return 0; @@ -269,13 +268,14 @@ unsigned SubtargetEmitter::FeatureKeyValues( StringRef CommandLineName = Feature->getValueAsString("Name"); StringRef Desc = Feature->getValueAsString("Desc"); - if (CommandLineName.empty()) continue; + if (CommandLineName.empty()) + continue; - // Emit as { "feature", "description", { featureEnum }, { i1 , i2 , ... , in } } + // Emit as { "feature", "description", { featureEnum }, { i1 , i2 , ... , in + // } } OS << " { " << "\"" << CommandLineName << "\", " - << "\"" << Desc << "\", " - << Target << "::" << Name << ", "; + << "\"" << Desc << "\", " << Target << "::" << Name << ", "; RecVec ImpliesList = Feature->getValueAsListOfDefs("Implies"); @@ -299,8 +299,8 @@ unsigned SubtargetEmitter::CPUKeyValues(raw_ostream &OS, const DenseMap &FeatureMap) { // Gather and sort processor information - std::vector ProcessorList = - Records.getAllDerivedDefinitions("Processor"); + std::vector ProcessorList = + Records.getAllDerivedDefinitions("Processor"); llvm::sort(ProcessorList, LessRecordFieldName()); // Begin processor table @@ -324,7 +324,7 @@ SubtargetEmitter::CPUKeyValues(raw_ostream &OS, // Emit the scheduler model pointer. const std::string &ProcModelName = - SchedModels.getModelForProc(Processor).ModelName; + SchedModels.getModelForProc(Processor).ModelName; OS << ", &" << ProcModelName << " },\n"; } @@ -363,7 +363,8 @@ void SubtargetEmitter::FormItineraryStageString(const std::string &Name, for (unsigned j = 0, M = UnitList.size(); j < M;) { // Add name and bitwise or ItinString += Name + "FU::" + UnitList[j]->getName().str(); - if (++j < M) ItinString += " | "; + if (++j < M) + ItinString += " | "; } int TimeInc = Stage->getValueAsInt("TimeInc"); @@ -374,7 +375,8 @@ void SubtargetEmitter::FormItineraryStageString(const std::string &Name, // Close off stage ItinString += " }"; - if (++i < N) ItinString += ", "; + if (++i < N) + ItinString += ", "; } } @@ -383,11 +385,11 @@ void SubtargetEmitter::FormItineraryStageString(const std::string &Name, // operand cycle initialization for the specified itinerary. N is the // number of operands that has cycles specified. // -void SubtargetEmitter::FormItineraryOperandCycleString(Record *ItinData, - std::string &ItinString, unsigned &NOperandCycles) { +void SubtargetEmitter::FormItineraryOperandCycleString( + Record *ItinData, std::string &ItinString, unsigned &NOperandCycles) { // Get operand cycle list std::vector OperandCycleList = - ItinData->getValueAsListOfInts("OperandCycles"); + ItinData->getValueAsListOfInts("OperandCycles"); // For each operand cycle NOperandCycles = OperandCycleList.size(); @@ -422,12 +424,10 @@ void SubtargetEmitter::FormItineraryBypassString(const std::string &Name, // cycle tables. Create a list of InstrItinerary objects (ProcItinLists) indexed // by CodeGenSchedClass::Index. 
// -void SubtargetEmitter:: -EmitStageAndOperandCycleData(raw_ostream &OS, - std::vector> - &ProcItinLists) { +void SubtargetEmitter::EmitStageAndOperandCycleData( + raw_ostream &OS, std::vector> &ProcItinLists) { // Multiple processor models may share an itinerary record. Emit it once. - SmallPtrSet ItinsDefSet; + SmallPtrSet ItinsDefSet; // Emit functional units for all the itineraries. for (const CodeGenProcModel &ProcModel : SchedModels.procModels()) { @@ -452,30 +452,31 @@ EmitStageAndOperandCycleData(raw_ostream &OS, RecVec BPs = ProcModel.ItinsDef->getValueAsListOfDefs("BP"); if (!BPs.empty()) { OS << "\n// Pipeline forwarding paths for itineraries \"" << Name - << "\"\n" << "namespace " << Name << "Bypass {\n"; + << "\"\n" + << "namespace " << Name << "Bypass {\n"; OS << " const unsigned NoBypass = 0;\n"; for (unsigned j = 0, BPN = BPs.size(); j < BPN; ++j) - OS << " const unsigned " << BPs[j]->getName() - << " = 1 << " << j << ";\n"; + OS << " const unsigned " << BPs[j]->getName() << " = 1 << " << j + << ";\n"; OS << "} // end namespace " << Name << "Bypass\n"; } } // Begin stages table - std::string StageTable = "\nextern const llvm::InstrStage " + Target + - "Stages[] = {\n"; + std::string StageTable = + "\nextern const llvm::InstrStage " + Target + "Stages[] = {\n"; StageTable += " { 0, 0, 0, llvm::InstrStage::Required }, // No itinerary\n"; // Begin operand cycle table - std::string OperandCycleTable = "extern const unsigned " + Target + - "OperandCycles[] = {\n"; + std::string OperandCycleTable = + "extern const unsigned " + Target + "OperandCycles[] = {\n"; OperandCycleTable += " 0, // No itinerary\n"; // Begin pipeline bypass table - std::string BypassTable = "extern const unsigned " + Target + - "ForwardingPaths[] = {\n"; + std::string BypassTable = + "extern const unsigned " + Target + "ForwardingPaths[] = {\n"; BypassTable += " 0, // No itinerary\n"; // For each Itinerary across all processors, add a unique entry to the stages, @@ -485,7 +486,7 @@ EmitStageAndOperandCycleData(raw_ostream &OS, std::map ItinStageMap, ItinOperandMap; for (const CodeGenProcModel &ProcModel : SchedModels.procModels()) { // Add process itinerary to the list. - ProcItinLists.resize(ProcItinLists.size()+1); + ProcItinLists.resize(ProcItinLists.size() + 1); // If this processor defines no itineraries, then leave the itinerary list // empty. @@ -542,19 +543,20 @@ EmitStageAndOperandCycleData(raw_ostream &OS, // Check to see if operand cycle already exists and create if it doesn't uint16_t FindOperandCycle = 0; if (NOperandCycles > 0) { - std::string ItinOperandString = ItinOperandCycleString+ItinBypassString; + std::string ItinOperandString = + ItinOperandCycleString + ItinBypassString; FindOperandCycle = ItinOperandMap[ItinOperandString]; if (FindOperandCycle == 0) { // Emit as cycle, // index OperandCycleTable += ItinOperandCycleString + ", // "; std::string OperandIdxComment = itostr(OperandCycleCount); if (NOperandCycles > 1) - OperandIdxComment += "-" - + itostr(OperandCycleCount + NOperandCycles - 1); + OperandIdxComment += + "-" + itostr(OperandCycleCount + NOperandCycles - 1); OperandCycleTable += OperandIdxComment + "\n"; // Record Itin class number. 
- ItinOperandMap[ItinOperandCycleString] = - FindOperandCycle = OperandCycleCount; + ItinOperandMap[ItinOperandCycleString] = FindOperandCycle = + OperandCycleCount; // Emit as bypass, // index BypassTable += ItinBypassString + ", // " + OperandIdxComment + "\n"; OperandCycleCount += NOperandCycles; @@ -599,17 +601,17 @@ EmitStageAndOperandCycleData(raw_ostream &OS, // Itineraries for each processor. The Itinerary lists are indexed on // CodeGenSchedClass::Index. // -void SubtargetEmitter:: -EmitItineraries(raw_ostream &OS, - std::vector> &ProcItinLists) { +void SubtargetEmitter::EmitItineraries( + raw_ostream &OS, std::vector> &ProcItinLists) { // Multiple processor models may share an itinerary record. Emit it once. - SmallPtrSet ItinsDefSet; + SmallPtrSet ItinsDefSet; // For each processor's machine model - std::vector>::iterator - ProcItinListsIter = ProcItinLists.begin(); + std::vector>::iterator ProcItinListsIter = + ProcItinLists.begin(); for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(), - PE = SchedModels.procModelEnd(); PI != PE; ++PI, ++ProcItinListsIter) { + PE = SchedModels.procModelEnd(); + PI != PE; ++PI, ++ProcItinListsIter) { Record *ItinsDef = PI->ItinsDef; if (!ItinsDefSet.insert(ItinsDef).second) @@ -636,13 +638,10 @@ EmitItineraries(raw_ostream &OS, // Emit Itinerary in the form of // { firstStage, lastStage, firstCycle, lastCycle } // index - OS << " { " << - Intinerary.NumMicroOps << ", " << - Intinerary.FirstStage << ", " << - Intinerary.LastStage << ", " << - Intinerary.FirstOperandCycle << ", " << - Intinerary.LastOperandCycle << " }" << - ", // " << j << " " << SchedModels.getSchedClass(j).Name << "\n"; + OS << " { " << Intinerary.NumMicroOps << ", " << Intinerary.FirstStage + << ", " << Intinerary.LastStage << ", " << Intinerary.FirstOperandCycle + << ", " << Intinerary.LastOperandCycle << " }" + << ", // " << j << " " << SchedModels.getSchedClass(j).Name << "\n"; } // End processor itinerary table OS << " { 0, uint16_t(~0U), uint16_t(~0U), uint16_t(~0U), uint16_t(~0U) }" @@ -840,13 +839,11 @@ void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel, NumUnits += RU->getValueAsInt("NumUnits"); SubUnitsOffset += RU->getValueAsInt("NumUnits"); } - } - else { + } else { // Find the SuperIdx if (PRDef->getValueInit("Super")->isComplete()) { - SuperDef = - SchedModels.findProcResUnits(PRDef->getValueAsDef("Super"), - ProcModel, PRDef->getLoc()); + SuperDef = SchedModels.findProcResUnits(PRDef->getValueAsDef("Super"), + ProcModel, PRDef->getLoc()); SuperIdx = ProcModel.getProcResourceIdx(SuperDef); } NumUnits = PRDef->getValueAsInt("NumUnits"); @@ -862,7 +859,7 @@ void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel, } else { OS << "nullptr"; } - OS << "}, // #" << i+1; + OS << "}, // #" << i + 1; if (SuperDef) OS << ", Super=" << SuperDef->getName(); OS << "\n"; @@ -872,8 +869,9 @@ void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel, // Find the WriteRes Record that defines processor resources for this // SchedWrite. -Record *SubtargetEmitter::FindWriteResources( - const CodeGenSchedRW &SchedWrite, const CodeGenProcModel &ProcModel) { +Record * +SubtargetEmitter::FindWriteResources(const CodeGenSchedRW &SchedWrite, + const CodeGenProcModel &ProcModel) { // Check if the SchedWrite is already subtarget-specific and directly // specifies a set of processor resources. 
@@ -883,16 +881,18 @@ Record *SubtargetEmitter::FindWriteResources( Record *AliasDef = nullptr; for (Record *A : SchedWrite.Aliases) { const CodeGenSchedRW &AliasRW = - SchedModels.getSchedRW(A->getValueAsDef("AliasRW")); + SchedModels.getSchedRW(A->getValueAsDef("AliasRW")); if (AliasRW.TheDef->getValueInit("SchedModel")->isComplete()) { Record *ModelDef = AliasRW.TheDef->getValueAsDef("SchedModel"); if (&SchedModels.getProcModel(ModelDef) != &ProcModel) continue; } if (AliasDef) - PrintFatalError(AliasRW.TheDef->getLoc(), "Multiple aliases " - "defined for processor " + ProcModel.ModelName + - " Ensure only one SchedAlias exists per RW."); + PrintFatalError(AliasRW.TheDef->getLoc(), + "Multiple aliases " + "defined for processor " + + ProcModel.ModelName + + " Ensure only one SchedAlias exists per RW."); AliasDef = AliasRW.TheDef; } if (AliasDef && AliasDef->isSubClassOf("SchedWriteRes")) @@ -903,12 +903,12 @@ Record *SubtargetEmitter::FindWriteResources( for (Record *WR : ProcModel.WriteResDefs) { if (!WR->isSubClassOf("WriteRes")) continue; - if (AliasDef == WR->getValueAsDef("WriteType") - || SchedWrite.TheDef == WR->getValueAsDef("WriteType")) { + if (AliasDef == WR->getValueAsDef("WriteType") || + SchedWrite.TheDef == WR->getValueAsDef("WriteType")) { if (ResDef) { PrintFatalError(WR->getLoc(), "Resources are defined for both " - "SchedWrite and its alias on processor " + - ProcModel.ModelName); + "SchedWrite and its alias on processor " + + ProcModel.ModelName); } ResDef = WR; } @@ -918,7 +918,7 @@ Record *SubtargetEmitter::FindWriteResources( if (!ResDef) { PrintFatalError(ProcModel.ModelDef->getLoc(), Twine("Processor does not define resources for ") + - SchedWrite.TheDef->getName()); + SchedWrite.TheDef->getName()); } return ResDef; } @@ -935,16 +935,18 @@ Record *SubtargetEmitter::FindReadAdvance(const CodeGenSchedRW &SchedRead, Record *AliasDef = nullptr; for (Record *A : SchedRead.Aliases) { const CodeGenSchedRW &AliasRW = - SchedModels.getSchedRW(A->getValueAsDef("AliasRW")); + SchedModels.getSchedRW(A->getValueAsDef("AliasRW")); if (AliasRW.TheDef->getValueInit("SchedModel")->isComplete()) { Record *ModelDef = AliasRW.TheDef->getValueAsDef("SchedModel"); if (&SchedModels.getProcModel(ModelDef) != &ProcModel) continue; } if (AliasDef) - PrintFatalError(AliasRW.TheDef->getLoc(), "Multiple aliases " - "defined for processor " + ProcModel.ModelName + - " Ensure only one SchedAlias exists per RW."); + PrintFatalError(AliasRW.TheDef->getLoc(), + "Multiple aliases " + "defined for processor " + + ProcModel.ModelName + + " Ensure only one SchedAlias exists per RW."); AliasDef = AliasRW.TheDef; } if (AliasDef && AliasDef->isSubClassOf("SchedReadAdvance")) @@ -955,12 +957,12 @@ Record *SubtargetEmitter::FindReadAdvance(const CodeGenSchedRW &SchedRead, for (Record *RA : ProcModel.ReadAdvanceDefs) { if (!RA->isSubClassOf("ReadAdvance")) continue; - if (AliasDef == RA->getValueAsDef("ReadType") - || SchedRead.TheDef == RA->getValueAsDef("ReadType")) { + if (AliasDef == RA->getValueAsDef("ReadType") || + SchedRead.TheDef == RA->getValueAsDef("ReadType")) { if (ResDef) { PrintFatalError(RA->getLoc(), "Resources are defined for both " - "SchedRead and its alias on processor " + - ProcModel.ModelName); + "SchedRead and its alias on processor " + + ProcModel.ModelName); } ResDef = RA; } @@ -970,7 +972,7 @@ Record *SubtargetEmitter::FindReadAdvance(const CodeGenSchedRW &SchedRead, if (!ResDef && SchedRead.TheDef->getName() != "ReadDefault") { PrintFatalError(ProcModel.ModelDef->getLoc(), 
Twine("Processor does not define resources for ") + - SchedRead.TheDef->getName()); + SchedRead.TheDef->getName()); } return ResDef; } @@ -994,11 +996,10 @@ void SubtargetEmitter::ExpandProcResources( if (SubDef->isSubClassOf("ProcResGroup")) { // Disallow this for simplicitly. PrintFatalError(SubDef->getLoc(), "Processor resource group " - " cannot be a super resources."); + " cannot be a super resources."); } - Record *SuperDef = - SchedModels.findProcResUnits(SubDef->getValueAsDef("Super"), PM, - SubDef->getLoc()); + Record *SuperDef = SchedModels.findProcResUnits( + SubDef->getValueAsDef("Super"), PM, SubDef->getLoc()); PRVec.push_back(SuperDef); ReleaseAtCycles.push_back(ReleaseAtCycles[i]); AcquireAtCycles.push_back(AcquireAtCycles[i]); @@ -1010,7 +1011,7 @@ void SubtargetEmitter::ExpandProcResources( continue; RecVec SuperResources = PR->getValueAsListOfDefs("Resources"); RecIter SubI = SubResources.begin(), SubE = SubResources.end(); - for( ; SubI != SubE; ++SubI) { + for (; SubI != SubE; ++SubI) { if (!is_contained(SuperResources, *SubI)) { break; } @@ -1051,7 +1052,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, // A Variant SchedClass has no resources of its own. bool HasVariants = false; for (const CodeGenSchedTransition &CGT : - make_range(SC.Transitions.begin(), SC.Transitions.end())) { + make_range(SC.Transitions.begin(), SC.Transitions.end())) { if (CGT.ProcIndex == ProcModel.Index) { HasVariants = true; break; @@ -1114,8 +1115,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, std::vector ReadAdvanceEntries; for (unsigned W : Writes) { IdxVec WriteSeq; - SchedModels.expandRWSeqForProc(W, WriteSeq, /*IsRead=*/false, - ProcModel); + SchedModels.expandRWSeqForProc(W, WriteSeq, /*IsRead=*/false, ProcModel); // For each operand, create a latency entry. MCWriteLatencyEntry WLEntry; @@ -1125,7 +1125,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, // If this Write is not referenced by a ReadAdvance, don't distinguish it // from other WriteLatency entries. if (!SchedModels.hasReadOfWrite( - SchedModels.getSchedWrite(WriteID).TheDef)) { + SchedModels.getSchedWrite(WriteID).TheDef)) { WriteID = 0; } WLEntry.WriteResourceID = WriteID; @@ -1133,7 +1133,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, for (unsigned WS : WriteSeq) { Record *WriteRes = - FindWriteResources(SchedModels.getSchedWrite(WS), ProcModel); + FindWriteResources(SchedModels.getSchedWrite(WS), ProcModel); // Mark the parent class as invalid for unsupported write types. 
if (WriteRes->getValueAsBit("Unsupported")) { @@ -1170,7 +1170,8 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, .concat(Twine(ReleaseAtCycles.size()))); } - if (!AcquireAtCycles.empty() && AcquireAtCycles.size() != PRVec.size()) { + if (!AcquireAtCycles.empty() && + AcquireAtCycles.size() != PRVec.size()) { PrintFatalError( WriteRes->getLoc(), Twine("Inconsistent resource cycles: size(AcquireAtCycles) != " @@ -1197,8 +1198,8 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, ExpandProcResources(PRVec, ReleaseAtCycles, AcquireAtCycles, ProcModel); assert(AcquireAtCycles.size() == ReleaseAtCycles.size()); - for (unsigned PRIdx = 0, PREnd = PRVec.size(); - PRIdx != PREnd; ++PRIdx) { + for (unsigned PRIdx = 0, PREnd = PRVec.size(); PRIdx != PREnd; + ++PRIdx) { MCWriteProcResEntry WPREntry; WPREntry.ProcResourceIdx = ProcModel.getProcResourceIdx(PRVec[PRIdx]); assert(WPREntry.ProcResourceIdx && "Bad ProcResourceIdx"); @@ -1220,9 +1221,9 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, // serially, rather than multiple parallel uses. This is important for // in-order machine where the resource consumption is a hazard. unsigned WPRIdx = 0, WPREnd = WriteProcResources.size(); - for( ; WPRIdx != WPREnd; ++WPRIdx) { - if (WriteProcResources[WPRIdx].ProcResourceIdx - == WPREntry.ProcResourceIdx) { + for (; WPRIdx != WPREnd; ++WPRIdx) { + if (WriteProcResources[WPRIdx].ProcResourceIdx == + WPREntry.ProcResourceIdx) { // TODO: multiple use of the same resources would // require either 1. thinking of how to handle multiple // intervals for the same resource in @@ -1245,10 +1246,10 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, } // Create an entry for each operand Read in this SchedClass. // Entries must be sorted first by UseIdx then by WriteResourceID. - for (unsigned UseIdx = 0, EndIdx = Reads.size(); - UseIdx != EndIdx; ++UseIdx) { + for (unsigned UseIdx = 0, EndIdx = Reads.size(); UseIdx != EndIdx; + ++UseIdx) { Record *ReadAdvance = - FindReadAdvance(SchedModels.getSchedRead(Reads[UseIdx]), ProcModel); + FindReadAdvance(SchedModels.getSchedRead(Reads[UseIdx]), ProcModel); if (!ReadAdvance) continue; @@ -1267,7 +1268,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, } } llvm::sort(WriteIDs); - for(unsigned W : WriteIDs) { + for (unsigned W : WriteIDs) { MCReadAdvanceEntry RAEntry; RAEntry.UseIdx = UseIdx; RAEntry.WriteResourceID = W; @@ -1288,9 +1289,9 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, SCDesc.NumWriteProcResEntries = WriteProcResources.size(); std::vector::iterator WPRPos = - std::search(SchedTables.WriteProcResources.begin(), - SchedTables.WriteProcResources.end(), - WriteProcResources.begin(), WriteProcResources.end()); + std::search(SchedTables.WriteProcResources.begin(), + SchedTables.WriteProcResources.end(), + WriteProcResources.begin(), WriteProcResources.end()); if (WPRPos != SchedTables.WriteProcResources.end()) SCDesc.WriteProcResIdx = WPRPos - SchedTables.WriteProcResources.begin(); else { @@ -1300,10 +1301,9 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, } // Latency entries must remain in operand order. 
SCDesc.NumWriteLatencyEntries = WriteLatencies.size(); - std::vector::iterator WLPos = - std::search(SchedTables.WriteLatencies.begin(), - SchedTables.WriteLatencies.end(), - WriteLatencies.begin(), WriteLatencies.end()); + std::vector::iterator WLPos = std::search( + SchedTables.WriteLatencies.begin(), SchedTables.WriteLatencies.end(), + WriteLatencies.begin(), WriteLatencies.end()); if (WLPos != SchedTables.WriteLatencies.end()) { unsigned idx = WLPos - SchedTables.WriteLatencies.begin(); SCDesc.WriteLatencyIdx = idx; @@ -1312,8 +1312,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, std::string::npos) { SchedTables.WriterNames[idx + i] += std::string("_") + WriterNames[i]; } - } - else { + } else { SCDesc.WriteLatencyIdx = SchedTables.WriteLatencies.size(); llvm::append_range(SchedTables.WriteLatencies, WriteLatencies); llvm::append_range(SchedTables.WriterNames, WriterNames); @@ -1321,9 +1320,9 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, // ReadAdvanceEntries must remain in operand order. SCDesc.NumReadAdvanceEntries = ReadAdvanceEntries.size(); std::vector::iterator RAPos = - std::search(SchedTables.ReadAdvanceEntries.begin(), - SchedTables.ReadAdvanceEntries.end(), - ReadAdvanceEntries.begin(), ReadAdvanceEntries.end()); + std::search(SchedTables.ReadAdvanceEntries.begin(), + SchedTables.ReadAdvanceEntries.end(), + ReadAdvanceEntries.begin(), ReadAdvanceEntries.end()); if (RAPos != SchedTables.ReadAdvanceEntries.end()) SCDesc.ReadAdvanceIdx = RAPos - SchedTables.ReadAdvanceEntries.begin(); else { @@ -1355,8 +1354,8 @@ void SubtargetEmitter::EmitSchedClassTables(SchedClassTables &SchedTables, // Emit global WriteLatencyTable. OS << "\n// {Cycles, WriteResourceID}\n" - << "extern const llvm::MCWriteLatencyEntry " - << Target << "WriteLatencyTable[] = {\n" + << "extern const llvm::MCWriteLatencyEntry " << Target + << "WriteLatencyTable[] = {\n" << " { 0, 0}, // Invalid\n"; for (unsigned WLIdx = 1, WLEnd = SchedTables.WriteLatencies.size(); WLIdx != WLEnd; ++WLIdx) { @@ -1371,8 +1370,8 @@ void SubtargetEmitter::EmitSchedClassTables(SchedClassTables &SchedTables, // Emit global ReadAdvanceTable. OS << "\n// {UseIdx, WriteResourceID, Cycles}\n" - << "extern const llvm::MCReadAdvanceEntry " - << Target << "ReadAdvanceTable[] = {\n" + << "extern const llvm::MCReadAdvanceEntry " << Target + << "ReadAdvanceTable[] = {\n" << " {0, 0, 0}, // Invalid\n"; for (unsigned RAIdx = 1, RAEnd = SchedTables.ReadAdvanceEntries.size(); RAIdx != RAEnd; ++RAIdx) { @@ -1388,22 +1387,23 @@ void SubtargetEmitter::EmitSchedClassTables(SchedClassTables &SchedTables, // Emit a SchedClass table for each processor. for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(), - PE = SchedModels.procModelEnd(); PI != PE; ++PI) { + PE = SchedModels.procModelEnd(); + PI != PE; ++PI) { if (!PI->hasInstrSchedModel()) continue; std::vector &SCTab = - SchedTables.ProcSchedClasses[1 + (PI - SchedModels.procModelBegin())]; + SchedTables.ProcSchedClasses[1 + (PI - SchedModels.procModelBegin())]; OS << "\n// {Name, NumMicroOps, BeginGroup, EndGroup, RetireOOO," << " WriteProcResIdx,#, WriteLatencyIdx,#, ReadAdvanceIdx,#}\n"; - OS << "static const llvm::MCSchedClassDesc " - << PI->ModelName << "SchedClasses[] = {\n"; + OS << "static const llvm::MCSchedClassDesc " << PI->ModelName + << "SchedClasses[] = {\n"; // The first class is always invalid. We no way to distinguish it except by // name and position. 
- assert(SchedModels.getSchedClass(0).Name == "NoInstrModel" - && "invalid class not first"); + assert(SchedModels.getSchedClass(0).Name == "NoInstrModel" && + "invalid class not first"); OS << " {DBGFIELD(\"InvalidSchedClass\") " << MCSchedClassDesc::InvalidNumMicroOps << ", false, false, false, 0, 0, 0, 0, 0, 0},\n"; @@ -1414,17 +1414,15 @@ void SubtargetEmitter::EmitSchedClassTables(SchedClassTables &SchedTables, OS << " {DBGFIELD(\"" << SchedClass.Name << "\") "; if (SchedClass.Name.size() < 18) OS.indent(18 - SchedClass.Name.size()); - OS << MCDesc.NumMicroOps - << ", " << ( MCDesc.BeginGroup ? "true" : "false" ) - << ", " << ( MCDesc.EndGroup ? "true" : "false" ) - << ", " << ( MCDesc.RetireOOO ? "true" : "false" ) - << ", " << format("%2d", MCDesc.WriteProcResIdx) - << ", " << MCDesc.NumWriteProcResEntries - << ", " << format("%2d", MCDesc.WriteLatencyIdx) - << ", " << MCDesc.NumWriteLatencyEntries - << ", " << format("%2d", MCDesc.ReadAdvanceIdx) - << ", " << MCDesc.NumReadAdvanceEntries - << "}, // #" << SCIdx << '\n'; + OS << MCDesc.NumMicroOps << ", " << (MCDesc.BeginGroup ? "true" : "false") + << ", " << (MCDesc.EndGroup ? "true" : "false") << ", " + << (MCDesc.RetireOOO ? "true" : "false") << ", " + << format("%2d", MCDesc.WriteProcResIdx) << ", " + << MCDesc.NumWriteProcResEntries << ", " + << format("%2d", MCDesc.WriteLatencyIdx) << ", " + << MCDesc.NumWriteLatencyEntries << ", " + << format("%2d", MCDesc.ReadAdvanceIdx) << ", " + << MCDesc.NumReadAdvanceEntries << "}, // #" << SCIdx << '\n'; } OS << "}; // " << PI->ModelName << "SchedClasses\n"; } @@ -1439,9 +1437,10 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) { // Emit processor resource table. if (PM.hasInstrSchedModel()) EmitProcessorResources(PM, OS); - else if(!PM.ProcResourceDefs.empty()) - PrintFatalError(PM.ModelDef->getLoc(), "SchedMachineModel defines " - "ProcResources without defining WriteRes SchedWriteRes"); + else if (!PM.ProcResourceDefs.empty()) + PrintFatalError(PM.ModelDef->getLoc(), + "SchedMachineModel defines " + "ProcResources without defining WriteRes SchedWriteRes"); // Begin processor itinerary properties OS << "\n"; @@ -1454,13 +1453,13 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) { EmitProcessorProp(OS, PM.ModelDef, "MispredictPenalty", ','); bool PostRAScheduler = - (PM.ModelDef ? PM.ModelDef->getValueAsBit("PostRAScheduler") : false); + (PM.ModelDef ? PM.ModelDef->getValueAsBit("PostRAScheduler") : false); - OS << " " << (PostRAScheduler ? "true" : "false") << ", // " + OS << " " << (PostRAScheduler ? "true" : "false") << ", // " << "PostRAScheduler\n"; bool CompleteModel = - (PM.ModelDef ? PM.ModelDef->getValueAsBit("CompleteModel") : false); + (PM.ModelDef ? PM.ModelDef->getValueAsBit("CompleteModel") : false); OS << " " << (CompleteModel ? 
"true" : "false") << ", // " << "CompleteModel\n"; @@ -1473,11 +1472,14 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) { OS << " " << PM.Index << ", // Processor ID\n"; if (PM.hasInstrSchedModel()) - OS << " " << PM.ModelName << "ProcResources" << ",\n" - << " " << PM.ModelName << "SchedClasses" << ",\n" - << " " << PM.ProcResourceDefs.size()+1 << ",\n" - << " " << (SchedModels.schedClassEnd() - - SchedModels.schedClassBegin()) << ",\n"; + OS << " " << PM.ModelName << "ProcResources" + << ",\n" + << " " << PM.ModelName << "SchedClasses" + << ",\n" + << " " << PM.ProcResourceDefs.size() + 1 << ",\n" + << " " + << (SchedModels.schedClassEnd() - SchedModels.schedClassBegin()) + << ",\n"; else OS << " nullptr, nullptr, 0, 0," << " // No instruction-level machine model.\n"; @@ -1669,8 +1671,8 @@ void SubtargetEmitter::emitSchedModelHelpersImpl( // Construct a switch statement where the condition is a check on the // scheduling class identifier. There is a `case` for every variant class // defined by the processor models of this target. - // Each `case` implements a number of rules to resolve (i.e. to transition from) - // a variant scheduling class to another scheduling class. Rules are + // Each `case` implements a number of rules to resolve (i.e. to transition + // from) a variant scheduling class to another scheduling class. Rules are // described by instances of CodeGenSchedTransition. Note that transitions may // not be valid for all processors. OS << " switch (SchedClass) {\n"; @@ -1781,8 +1783,8 @@ void SubtargetEmitter::EmitHwModeCheck(const std::string &ClassName, OS << "unsigned " << ClassName << "::getHwMode() const {\n"; for (unsigned M = 1, NumModes = CGH.getNumModeIds(); M != NumModes; ++M) { const HwMode &HM = CGH.getMode(M); - OS << " if (checkFeatures(\"" << HM.Features - << "\")) return " << M << ";\n"; + OS << " if (checkFeatures(\"" << HM.Features << "\")) return " << M + << ";\n"; } OS << " return 0;\n}\n"; } @@ -1808,8 +1810,8 @@ void SubtargetEmitter::emitGetMacroFusions(const std::string &ClassName, // Produces a subtarget specific function for parsing // the subtarget features string. 
void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS) { - std::vector Features = - Records.getAllDerivedDefinitions("SubtargetFeature"); + std::vector Features = + Records.getAllDerivedDefinitions("SubtargetFeature"); llvm::sort(Features, LessRecord()); OS << "// ParseSubtargetFeatures - Parses features string setting specified\n" @@ -1836,15 +1838,12 @@ void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS) { StringRef Value = R->getValueAsString("Value"); StringRef FieldName = R->getValueAsString("FieldName"); - if (Value=="true" || Value=="false") - OS << " if (Bits[" << Target << "::" - << Instance << "]) " - << FieldName << " = " << Value << ";\n"; + if (Value == "true" || Value == "false") + OS << " if (Bits[" << Target << "::" << Instance << "]) " << FieldName + << " = " << Value << ";\n"; else - OS << " if (Bits[" << Target << "::" - << Instance << "] && " - << FieldName << " < " << Value << ") " - << FieldName << " = " << Value << ";\n"; + OS << " if (Bits[" << Target << "::" << Instance << "] && " << FieldName + << " < " << Value << ") " << FieldName << " = " << Value << ";\n"; } OS << "}\n"; @@ -1955,15 +1954,15 @@ void SubtargetEmitter::run(raw_ostream &OS) { OS << Target << "SubTypeKV, "; else OS << "std::nullopt, "; - OS << '\n'; OS.indent(22); - OS << Target << "WriteProcResTable, " - << Target << "WriteLatencyTable, " + OS << '\n'; + OS.indent(22); + OS << Target << "WriteProcResTable, " << Target << "WriteLatencyTable, " << Target << "ReadAdvanceTable, "; - OS << '\n'; OS.indent(22); + OS << '\n'; + OS.indent(22); if (SchedModels.hasItineraries()) { - OS << Target << "Stages, " - << Target << "OperandCycles, " - << Target << "ForwardingPaths"; + OS << Target << "Stages, " << Target << "OperandCycles, " << Target + << "ForwardingPaths"; } else OS << "nullptr, nullptr, nullptr"; OS << ");\n}\n\n"; @@ -2027,12 +2026,12 @@ void SubtargetEmitter::run(raw_ostream &OS) { OS << "namespace llvm {\n"; OS << "extern const llvm::SubtargetFeatureKV " << Target << "FeatureKV[];\n"; OS << "extern const llvm::SubtargetSubTypeKV " << Target << "SubTypeKV[];\n"; - OS << "extern const llvm::MCWriteProcResEntry " - << Target << "WriteProcResTable[];\n"; - OS << "extern const llvm::MCWriteLatencyEntry " - << Target << "WriteLatencyTable[];\n"; - OS << "extern const llvm::MCReadAdvanceEntry " - << Target << "ReadAdvanceTable[];\n"; + OS << "extern const llvm::MCWriteProcResEntry " << Target + << "WriteProcResTable[];\n"; + OS << "extern const llvm::MCWriteLatencyEntry " << Target + << "WriteLatencyTable[];\n"; + OS << "extern const llvm::MCReadAdvanceEntry " << Target + << "ReadAdvanceTable[];\n"; if (SchedModels.hasItineraries()) { OS << "extern const llvm::InstrStage " << Target << "Stages[];\n"; @@ -2051,15 +2050,15 @@ void SubtargetEmitter::run(raw_ostream &OS) { OS << "ArrayRef(" << Target << "SubTypeKV, " << NumProcs << "), "; else OS << "std::nullopt, "; - OS << '\n'; OS.indent(24); - OS << Target << "WriteProcResTable, " - << Target << "WriteLatencyTable, " + OS << '\n'; + OS.indent(24); + OS << Target << "WriteProcResTable, " << Target << "WriteLatencyTable, " << Target << "ReadAdvanceTable, "; - OS << '\n'; OS.indent(24); + OS << '\n'; + OS.indent(24); if (SchedModels.hasItineraries()) { - OS << Target << "Stages, " - << Target << "OperandCycles, " - << Target << "ForwardingPaths"; + OS << Target << "Stages, " << Target << "OperandCycles, " << Target + << "ForwardingPaths"; } else OS << "nullptr, nullptr, nullptr"; OS << ") {}\n\n"; diff --git 
a/llvm/utils/TableGen/SubtargetFeatureInfo.cpp b/llvm/utils/TableGen/SubtargetFeatureInfo.cpp
index 52afb4d..819abfa 100644
--- a/llvm/utils/TableGen/SubtargetFeatureInfo.cpp
+++ b/llvm/utils/TableGen/SubtargetFeatureInfo.cpp
@@ -81,7 +81,7 @@ void SubtargetFeatureInfo::emitNameTable(
   uint64_t IndexUB = 0;
   for (const auto &SF : SubtargetFeatures)
     if (IndexUB <= SF.second.Index)
-      IndexUB = SF.second.Index+1;
+      IndexUB = SF.second.Index + 1;
 
   std::vector<std::string> Names;
   if (IndexUB > 0)
diff --git a/llvm/utils/TableGen/SubtargetFeatureInfo.h b/llvm/utils/TableGen/SubtargetFeatureInfo.h
index 9401004..b1016ff 100644
--- a/llvm/utils/TableGen/SubtargetFeatureInfo.h
+++ b/llvm/utils/TableGen/SubtargetFeatureInfo.h
@@ -18,7 +18,8 @@ namespace llvm {
 struct SubtargetFeatureInfo;
-using SubtargetFeatureInfoMap = std::map<Record *, SubtargetFeatureInfo, LessRecordByID>;
+using SubtargetFeatureInfoMap =
+    std::map<Record *, SubtargetFeatureInfo, LessRecordByID>;
 
 /// Helper class for storing information on a subtarget feature which
 /// participates in instruction matching.
diff --git a/llvm/utils/TableGen/TableGenBackends.h b/llvm/utils/TableGen/TableGenBackends.h
index 3afe6b0..e0d12ab 100644
--- a/llvm/utils/TableGen/TableGenBackends.h
+++ b/llvm/utils/TableGen/TableGenBackends.h
@@ -46,7 +46,6 @@
 // backends, this means that the EmitFoo function is the only thing not in
 // the anonymous namespace.
 
-
 // FIXME: Reorganize TableGen so that build dependencies can be more
 // accurately expressed. Currently, touching any of the emitters (or
 // anything that they transitively depend on) causes everything dependent
@@ -57,7 +56,6 @@
 // TableGen binary with as few dependencies as possible on the rest of
 // LLVM.
 
-
 namespace llvm {
 
 class raw_ostream;
diff --git a/llvm/utils/TableGen/Types.cpp b/llvm/utils/TableGen/Types.cpp
index aca8e36..35b79b3 100644
--- a/llvm/utils/TableGen/Types.cpp
+++ b/llvm/utils/TableGen/Types.cpp
@@ -15,7 +15,9 @@
 using namespace llvm;
 
-const char *llvm::getMinimalTypeForRange(uint64_t Range, unsigned MaxSize LLVM_ATTRIBUTE_UNUSED) {
+const char *
+llvm::getMinimalTypeForRange(uint64_t Range,
+                             unsigned MaxSize LLVM_ATTRIBUTE_UNUSED) {
   // TODO: The original callers only used 32 and 64 so these are the only
   // values permitted. Rather than widen the supported values we should
   // allow 64 for the callers that currently use 32 and remove the
diff --git a/llvm/utils/TableGen/Types.h b/llvm/utils/TableGen/Types.h
index f369d61..74f0f9f 100644
--- a/llvm/utils/TableGen/Types.h
+++ b/llvm/utils/TableGen/Types.h
@@ -16,6 +16,6 @@ namespace llvm {
 /// MaxSize indicates the largest size of integer to consider (in bits) and only
 /// supports values of at least 32.
 const char *getMinimalTypeForRange(uint64_t Range, unsigned MaxSize = 64);
-}
+} // namespace llvm
 
 #endif
diff --git a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
index dc037e4..2cf86d3 100644
--- a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
+++ b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
@@ -63,7 +63,8 @@ void emitWebAssemblyDisassemblerTables(
     // should be the canonical one. This determines which variant gets
     // printed in a disassembly. We want e.g. "call" not "i32.call", and
     // "end" when we don't know if its "end_loop" or "end_block" etc.
-    bool IsCanonicalExisting = CGIP.second->TheDef->getValueAsBit("IsCanonical");
+    bool IsCanonicalExisting =
+        CGIP.second->TheDef->getValueAsBit("IsCanonical");
     // We already have one marked explicitly as canonical, so keep it.
if (IsCanonicalExisting) continue; @@ -126,7 +127,8 @@ void emitWebAssemblyDisassemblerTables( ++J) { size_t K = 0; for (; K < CurOperandList.size(); ++K) { - if (OperandTable[J + K] != CurOperandList[K]) break; + if (OperandTable[J + K] != CurOperandList[K]) + break; } if (K == CurOperandList.size()) { OperandStart = J; diff --git a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp index fef8dc7..e4db995 100644 --- a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp @@ -83,7 +83,8 @@ void X86CompressEVEXTablesEmitter::printTable(const std::vector &Table, void X86CompressEVEXTablesEmitter::printCheckPredicate( const PredicateInstMap &PredicateInsts, raw_ostream &OS) { - OS << "static bool checkPredicate(unsigned Opc, const X86Subtarget *Subtarget) {\n" + OS << "static bool checkPredicate(unsigned Opc, const X86Subtarget " + "*Subtarget) {\n" << " switch (Opc) {\n" << " default: return true;\n"; for (const auto &[Key, Val] : PredicateInsts) { @@ -207,9 +208,9 @@ void X86CompressEVEXTablesEmitter::run(raw_ostream &OS) { NewInst = &TempInst; } } else { - // For each pre-compression instruction look for a match in the appropriate - // vector (instructions with the same opcode) using function object - // IsMatch. + // For each pre-compression instruction look for a match in the + // appropriate vector (instructions with the same opcode) using function + // object IsMatch. auto Match = llvm::find_if(CompressedInsts[Opcode], IsMatch(Inst)); if (Match != CompressedInsts[Opcode].end()) NewInst = *Match; @@ -225,7 +226,7 @@ void X86CompressEVEXTablesEmitter::run(raw_ostream &OS) { return Name == "HasAVXNECONVERT" || Name == "HasAVXVNNI" || Name == "HasAVXIFMA"; }); - if(It!= Predicates.end()) + if (It != Predicates.end()) PredicateInsts[*It].push_back(NewInst); } diff --git a/llvm/utils/TableGen/X86DisassemblerShared.h b/llvm/utils/TableGen/X86DisassemblerShared.h index 093f220..f60fd47 100644 --- a/llvm/utils/TableGen/X86DisassemblerShared.h +++ b/llvm/utils/TableGen/X86DisassemblerShared.h @@ -49,9 +49,7 @@ struct OpcodeDecision { struct ContextDecision { OpcodeDecision opcodeDecisions[llvm::X86Disassembler::IC_max]; - ContextDecision() { - memset(opcodeDecisions, 0, sizeof(opcodeDecisions)); - } + ContextDecision() { memset(opcodeDecisions, 0, sizeof(opcodeDecisions)); } }; #endif diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp index 23886a3..588d9b2 100644 --- a/llvm/utils/TableGen/X86DisassemblerTables.cpp +++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp @@ -41,8 +41,9 @@ static inline const char *stringForContext(InstructionContext insnContext) { break; #define ENUM_ENTRY_K_B(n, r, d) \ ENUM_ENTRY(n, r, d) \ - ENUM_ENTRY(n##_K_B, r, d) ENUM_ENTRY(n##_KZ, r, d) ENUM_ENTRY(n##_K, r, d) \ - ENUM_ENTRY(n##_B, r, d) ENUM_ENTRY(n##_KZ_B, r, d) + ENUM_ENTRY(n##_K_B, r, d) \ + ENUM_ENTRY(n##_KZ, r, d) \ + ENUM_ENTRY(n##_K, r, d) ENUM_ENTRY(n##_B, r, d) ENUM_ENTRY(n##_KZ_B, r, d) INSTRUCTION_CONTEXTS #undef ENUM_ENTRY #undef ENUM_ENTRY_K_B @@ -595,8 +596,8 @@ static inline bool outranks(InstructionContext upper, #define ENUM_ENTRY_K_B(n, r, d) \ ENUM_ENTRY(n, r, d) \ ENUM_ENTRY(n##_K_B, r, d) \ - ENUM_ENTRY(n##_KZ_B, r, d) ENUM_ENTRY(n##_KZ, r, d) ENUM_ENTRY(n##_K, r, d) \ - ENUM_ENTRY(n##_B, r, d) + ENUM_ENTRY(n##_KZ_B, r, d) \ + ENUM_ENTRY(n##_KZ, r, d) ENUM_ENTRY(n##_K, r, d) ENUM_ENTRY(n##_B, r, d) static int 
ranks[IC_max] = {INSTRUCTION_CONTEXTS}; #undef ENUM_ENTRY #undef ENUM_ENTRY_K_B @@ -822,7 +823,8 @@ void DisassemblerTables::emitContextDecision(raw_ostream &o1, raw_ostream &o2, } i2--; - o2.indent(i2) << "}};" << "\n"; + o2.indent(i2) << "}};" + << "\n"; } void DisassemblerTables::emitInstructionInfo(raw_ostream &o, @@ -859,7 +861,8 @@ void DisassemblerTables::emitInstructionInfo(raw_ostream &o, } o << " },\n"; } - o << "};" << "\n\n"; + o << "};" + << "\n\n"; o.indent(i * 2) << "static const struct InstructionSpecifier "; o << INSTRUCTIONS_STR "[" << InstructionSpecifiers.size() << "] = {\n"; @@ -885,7 +888,8 @@ void DisassemblerTables::emitInstructionInfo(raw_ostream &o, } i--; - o.indent(i * 2) << "};" << "\n"; + o.indent(i * 2) << "};" + << "\n"; } void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const { @@ -1004,7 +1008,8 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const { } i--; - o.indent(i * 2) << "};" << "\n"; + o.indent(i * 2) << "};" + << "\n"; } void DisassemblerTables::emitContextDecisions(raw_ostream &o1, raw_ostream &o2, diff --git a/llvm/utils/TableGen/X86DisassemblerTables.h b/llvm/utils/TableGen/X86DisassemblerTables.h index 4fbc58b..0f38274 100644 --- a/llvm/utils/TableGen/X86DisassemblerTables.h +++ b/llvm/utils/TableGen/X86DisassemblerTables.h @@ -91,8 +91,8 @@ private: /// @param ModRMTableNum - next table number for adding to ModRMTable. /// @param decision - The ModR/M decision to emit. This decision has 256 /// entries - emitModRMDecision decides how to compact it. - void emitModRMDecision(raw_ostream &o1, raw_ostream &o2, - unsigned &i1, unsigned &i2, unsigned &ModRMTableNum, + void emitModRMDecision(raw_ostream &o1, raw_ostream &o2, unsigned &i1, + unsigned &i2, unsigned &ModRMTableNum, ModRMDecision &decision) const; /// emitOpcodeDecision - Emits an OpcodeDecision and all its subsidiary ModR/M @@ -119,8 +119,8 @@ private: /// @param ModRMTableNum - next table number for adding to ModRMTable. /// @param decision - The OpcodeDecision to emit along with its subsidiary /// structures. - void emitOpcodeDecision(raw_ostream &o1, raw_ostream &o2, - unsigned &i1, unsigned &i2, unsigned &ModRMTableNum, + void emitOpcodeDecision(raw_ostream &o1, raw_ostream &o2, unsigned &i1, + unsigned &i2, unsigned &ModRMTableNum, OpcodeDecision &decision) const; /// emitContextDecision - Emits a ContextDecision and all its subsidiary @@ -153,9 +153,9 @@ private: /// @param decision - The ContextDecision to emit along with its subsidiary /// structures. /// @param name - The name for the ContextDecision. - void emitContextDecision(raw_ostream &o1, raw_ostream &o2, - unsigned &i1, unsigned &i2, unsigned &ModRMTableNum, - ContextDecision &decision, const char* name) const; + void emitContextDecision(raw_ostream &o1, raw_ostream &o2, unsigned &i1, + unsigned &i2, unsigned &ModRMTableNum, + ContextDecision &decision, const char *name) const; /// emitInstructionInfo - Prints the instruction specifier table, which has /// one entry for each instruction, and contains name and operand @@ -200,7 +200,8 @@ private: /// IC is the context corresponding to the mask 0x00, and there are 256 /// possible masks. /// - /// @param o - The output stream to which the context table should be written. + /// @param o - The output stream to which the context table should be + /// written. /// @param i - The indent level for use with the stream. 
void emitContextTable(raw_ostream &o, uint32_t &i) const; @@ -213,9 +214,8 @@ private: /// @param i1 - The indent level to use with stream o1. /// @param i2 - The indent level to use with stream o2. /// @param ModRMTableNum - next table number for adding to ModRMTable. - void emitContextDecisions(raw_ostream &o1, raw_ostream &o2, - unsigned &i1, unsigned &i2, - unsigned &ModRMTableNum) const; + void emitContextDecisions(raw_ostream &o1, raw_ostream &o2, unsigned &i1, + unsigned &i2, unsigned &ModRMTableNum) const; /// setTableFields - Uses a ModRMFilter to set the appropriate entries in a /// ModRMDecision to refer to a particular instruction ID. @@ -224,10 +224,9 @@ private: /// @param filter - The filter to use in deciding which entries to populate. /// @param uid - The unique ID to set matching entries to. /// @param opcode - The opcode of the instruction, for error reporting. - void setTableFields(ModRMDecision &decision, - const ModRMFilter &filter, - InstrUID uid, - uint8_t opcode); + void setTableFields(ModRMDecision &decision, const ModRMFilter &filter, + InstrUID uid, uint8_t opcode); + public: /// Constructor - Allocates space for the class decisions and clears them. DisassemblerTables(); @@ -247,7 +246,8 @@ public: /// @param insnContext - The context to use (IC, IC_64BIT, etc.) /// @param opcode - The last byte of the opcode (not counting any escape /// or extended opcodes). - /// @param filter - The ModRMFilter that decides which ModR/M byte values + /// @param filter - The ModRMFilter that decides which ModR/M byte + /// values /// correspond to the desired instruction. /// @param uid - The unique ID of the instruction. /// @param is32bit - Instructon is only 32-bit @@ -255,23 +255,17 @@ public: /// @param ignoresVEX_L - Instruction ignores VEX.L /// @param ignoresVEX_W - Instruction ignores VEX.W /// @param AddrSize - Instructions address size 16/32/64. 0 is unspecified - void setTableFields(OpcodeType type, - InstructionContext insnContext, - uint8_t opcode, - const ModRMFilter &filter, - InstrUID uid, - bool is32bit, - bool noPrefix, - bool ignoresVEX_L, - bool ignoresVEX_W, - unsigned AddrSize); + void setTableFields(OpcodeType type, InstructionContext insnContext, + uint8_t opcode, const ModRMFilter &filter, InstrUID uid, + bool is32bit, bool noPrefix, bool ignoresVEX_L, + bool ignoresVEX_W, unsigned AddrSize); /// specForUID - Returns the instruction specifier for a given unique /// instruction ID. Used when resolving collisions. /// /// @param uid - The unique ID of the instruction. /// @return - A reference to the instruction specifier. - InstructionSpecifier& specForUID(InstrUID uid) { + InstructionSpecifier &specForUID(InstrUID uid) { if (uid >= InstructionSpecifiers.size()) InstructionSpecifiers.resize(uid + 1); @@ -282,9 +276,7 @@ public: // from any instructions added to the tables. // @return - true if there were; false otherwise. 
- bool hasConflicts() { - return HasConflicts; - } + bool hasConflicts() { return HasConflicts; } }; } // namespace X86Disassembler diff --git a/llvm/utils/TableGen/X86ModRMFilters.cpp b/llvm/utils/TableGen/X86ModRMFilters.cpp index cf75070..9cfb91c 100644 --- a/llvm/utils/TableGen/X86ModRMFilters.cpp +++ b/llvm/utils/TableGen/X86ModRMFilters.cpp @@ -10,14 +10,14 @@ using namespace llvm::X86Disassembler; -void ModRMFilter::anchor() { } +void ModRMFilter::anchor() {} -void DumbFilter::anchor() { } +void DumbFilter::anchor() {} -void ModFilter::anchor() { } +void ModFilter::anchor() {} -void ExtendedFilter::anchor() { } +void ExtendedFilter::anchor() {} -void ExtendedRMFilter::anchor() { } +void ExtendedRMFilter::anchor() {} -void ExactFilter::anchor() { } +void ExactFilter::anchor() {} diff --git a/llvm/utils/TableGen/X86ModRMFilters.h b/llvm/utils/TableGen/X86ModRMFilters.h index d2169a8..b579f22 100644 --- a/llvm/utils/TableGen/X86ModRMFilters.h +++ b/llvm/utils/TableGen/X86ModRMFilters.h @@ -27,9 +27,10 @@ namespace X86Disassembler { /// ModR/M bytes. class ModRMFilter { virtual void anchor(); + public: /// Destructor - Override as necessary. - virtual ~ModRMFilter() { } + virtual ~ModRMFilter() {} /// isDumb - Indicates whether this filter returns the same value for /// any value of the ModR/M byte. @@ -50,14 +51,11 @@ public: /// for operands. class DumbFilter : public ModRMFilter { void anchor() override; + public: - bool isDumb() const override { - return true; - } + bool isDumb() const override { return true; } - bool accepts(uint8_t modRM) const override { - return true; - } + bool accepts(uint8_t modRM) const override { return true; } }; /// ModFilter - Filters based on the mod bits [bits 7-6] of the ModR/M byte. @@ -66,6 +64,7 @@ public: class ModFilter : public ModRMFilter { void anchor() override; bool R; + public: /// Constructor /// @@ -86,6 +85,7 @@ class ExtendedFilter : public ModRMFilter { void anchor() override; bool R; uint8_t NNN; + public: /// Constructor /// @@ -95,9 +95,9 @@ public: ExtendedFilter(bool r, uint8_t nnn) : R(r), NNN(nnn) {} bool accepts(uint8_t modRM) const override { - return (((R && ((modRM & 0xc0) == 0xc0)) || - (!R && ((modRM & 0xc0) != 0xc0))) && - (((modRM & 0x38) >> 3) == NNN)); + return ( + ((R && ((modRM & 0xc0) == 0xc0)) || (!R && ((modRM & 0xc0) != 0xc0))) && + (((modRM & 0x38) >> 3) == NNN)); } }; @@ -107,6 +107,7 @@ class ExtendedRMFilter : public ModRMFilter { void anchor() override; bool R; uint8_t NNN; + public: /// Constructor /// @@ -116,8 +117,7 @@ public: ExtendedRMFilter(bool r, uint8_t nnn) : R(r), NNN(nnn) {} bool accepts(uint8_t modRM) const override { - return ((R && ((modRM & 0xc0) == 0xc0)) && - ((modRM & 0x7) == NNN)); + return ((R && ((modRM & 0xc0) == 0xc0)) && ((modRM & 0x7) == NNN)); } }; /// ExactFilter - The occasional extended opcode (such as VMCALL or MONITOR) @@ -125,15 +125,14 @@ public: class ExactFilter : public ModRMFilter { void anchor() override; uint8_t ModRM; + public: /// Constructor /// /// \param modRM The required value of the full ModR/M byte. 
ExactFilter(uint8_t modRM) : ModRM(modRM) {} - bool accepts(uint8_t modRM) const override { - return (ModRM == modRM); - } + bool accepts(uint8_t modRM) const override { return (ModRM == modRM); } }; } // namespace X86Disassembler diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp index 18f9610..873f3ae 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.cpp +++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp @@ -1,4 +1,4 @@ -//===- X86RecognizableInstr.cpp - Disassembler instruction spec --*- C++ -*-===// +//===- X86RecognizableInstr.cpp - Disassembler instruction spec -*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/utils/TableGen/X86RecognizableInstr.h b/llvm/utils/TableGen/X86RecognizableInstr.h index 007c700..549fc5b 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.h +++ b/llvm/utils/TableGen/X86RecognizableInstr.h @@ -1,4 +1,4 @@ -//===- X86RecognizableInstr.h - Disassembler instruction spec ----*- C++ -*-===// +//===- X86RecognizableInstr.h - Disassembler instruction spec ---*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. -- cgit v1.1 From 2f8e37d20114ecb223caaa5a72e8b7c13daf9f34 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 9 Feb 2024 09:36:05 +0100 Subject: [SROA] Unfold gep of index select (#80983) SROA currently supports converting a gep of select into select of gep if the select is in the pointer operand. This patch expands support to selects in an index operand. This is intended to address the regression reported in https://github.com/llvm/llvm-project/pull/68882#issuecomment-1924909922. --- llvm/lib/Transforms/Scalar/SROA.cpp | 59 ++++++++++++++++++++------- llvm/test/Transforms/SROA/select-gep.ll | 72 ++++++++++++++++++++++++++------- 2 files changed, 103 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index e92e245..138dc38 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -3942,30 +3942,62 @@ private: return false; } - // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2) + // Fold gep (select cond, ptr1, ptr2), idx + // => select cond, gep(ptr1, idx), gep(ptr2, idx) + // and gep ptr, (select cond, idx1, idx2) + // => select cond, gep(ptr, idx1), gep(ptr, idx2) bool foldGEPSelect(GetElementPtrInst &GEPI) { - if (!GEPI.hasAllConstantIndices()) - return false; + // Check whether the GEP has exactly one select operand and all indices + // will become constant after the transform. 
+ SelectInst *Sel = dyn_cast<SelectInst>(GEPI.getPointerOperand()); + for (Value *Op : GEPI.indices()) { + if (auto *SI = dyn_cast<SelectInst>(Op)) { + if (Sel) + return false; + + Sel = SI; + if (!isa<ConstantInt>(Sel->getTrueValue()) || + !isa<ConstantInt>(Sel->getFalseValue())) + return false; + continue; + } - SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand()); + if (!isa<ConstantInt>(Op)) + return false; + } + + if (!Sel) + return false; LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):" << "\n original: " << *Sel << "\n " << GEPI); + auto GetNewOps = [&](Value *SelOp) { + SmallVector<Value *> NewOps; + for (Value *Op : GEPI.operands()) + if (Op == Sel) + NewOps.push_back(SelOp); + else + NewOps.push_back(Op); + return NewOps; + }; + + Value *True = Sel->getTrueValue(); + Value *False = Sel->getFalseValue(); + SmallVector<Value *> TrueOps = GetNewOps(True); + SmallVector<Value *> FalseOps = GetNewOps(False); + IRB.SetInsertPoint(&GEPI); - SmallVector<Value *, 4> Index(GEPI.indices()); bool IsInBounds = GEPI.isInBounds(); Type *Ty = GEPI.getSourceElementType(); - Value *True = Sel->getTrueValue(); - Value *NTrue = IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep", - IsInBounds); - - Value *False = Sel->getFalseValue(); + Value *NTrue = IRB.CreateGEP(Ty, TrueOps[0], ArrayRef(TrueOps).drop_front(), + True->getName() + ".sroa.gep", IsInBounds); - Value *NFalse = IRB.CreateGEP(Ty, False, Index, - False->getName() + ".sroa.gep", IsInBounds); + Value *NFalse = + IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(), + False->getName() + ".sroa.gep", IsInBounds); Value *NSel = IRB.CreateSelect(Sel->getCondition(), NTrue, NFalse, Sel->getName() + ".sroa.sel"); @@ -4039,8 +4071,7 @@ private: } bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { - if (isa<SelectInst>(GEPI.getPointerOperand()) && - foldGEPSelect(GEPI)) + if (foldGEPSelect(GEPI)) return true; if (isa<PHINode>(GEPI.getPointerOperand()) && diff --git a/llvm/test/Transforms/SROA/select-gep.ll b/llvm/test/Transforms/SROA/select-gep.ll index 56924a0..1342a2c 100644 --- a/llvm/test/Transforms/SROA/select-gep.ll +++ b/llvm/test/Transforms/SROA/select-gep.ll @@ -155,14 +155,24 @@ bb: ret i32 %load } - +; Test gep of index select unfolding on an alloca that is splittable, but not +; promotable. The allocas here will be optimized away by subsequent passes.
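+; Editorial sketch of the unfold exercised by these tests (value names are
+; illustrative, not taken from the generated CHECK lines):
+;   %idx = select i1 %c, i64 24, i64 0
+;   %gep = getelementptr inbounds i8, ptr %alloca, i64 %idx
+; becomes
+;   %gep.true = getelementptr inbounds i8, ptr %alloca, i64 24
+;   %gep.false = getelementptr inbounds i8, ptr %alloca, i64 0
+;   %ptr.sel = select i1 %c, ptr %gep.true, ptr %gep.false
+; so each arm is a constant-offset gep that SROA can assign to a slice.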
define i32 @test_select_idx_memcpy(i1 %c, ptr %p) { ; CHECK-LABEL: @test_select_idx_memcpy( -; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [20 x i64], align 8 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[ALLOCA]], ptr [[P:%.*]], i64 160, i1 false) +; CHECK-NEXT: [[ALLOCA_SROA_0:%.*]] = alloca [4 x i8], align 8 +; CHECK-NEXT: [[ALLOCA_SROA_2:%.*]] = alloca [20 x i8], align 4 +; CHECK-NEXT: [[ALLOCA_SROA_22:%.*]] = alloca [4 x i8], align 8 +; CHECK-NEXT: [[ALLOCA_SROA_3:%.*]] = alloca [132 x i8], align 4 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[ALLOCA_SROA_0]], ptr align 1 [[P:%.*]], i64 4, i1 false) +; CHECK-NEXT: [[ALLOCA_SROA_2_0_P_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 4 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ALLOCA_SROA_2]], ptr align 1 [[ALLOCA_SROA_2_0_P_SROA_IDX]], i64 20, i1 false) +; CHECK-NEXT: [[ALLOCA_SROA_22_0_P_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 24 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[ALLOCA_SROA_22]], ptr align 1 [[ALLOCA_SROA_22_0_P_SROA_IDX]], i64 4, i1 false) +; CHECK-NEXT: [[ALLOCA_SROA_3_0_P_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 28 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ALLOCA_SROA_3]], ptr align 1 [[ALLOCA_SROA_3_0_P_SROA_IDX]], i64 132, i1 false) ; CHECK-NEXT: [[IDX:%.*]] = select i1 [[C:%.*]], i64 24, i64 0 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 [[IDX]] -; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: [[IDX_SROA_SEL:%.*]] = select i1 [[C]], ptr [[ALLOCA_SROA_22]], ptr [[ALLOCA_SROA_0]] +; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[IDX_SROA_SEL]], align 4 ; CHECK-NEXT: ret i32 [[RES]] ; %alloca = alloca [20 x i64], align 8 @@ -173,16 +183,13 @@ define i32 @test_select_idx_memcpy(i1 %c, ptr %p) { ret i32 %res } +; Test gep of index select unfolding on an alloca that is splittable and +; promotable. define i32 @test_select_idx_mem2reg(i1 %c) { ; CHECK-LABEL: @test_select_idx_mem2reg( -; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [20 x i64], align 8 -; CHECK-NEXT: store i32 1, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 24 -; CHECK-NEXT: store i32 2, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[IDX:%.*]] = select i1 [[C:%.*]], i64 24, i64 0 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 [[IDX]] -; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[GEP2]], align 4 -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: [[RES_SROA_SPECULATED:%.*]] = select i1 [[C]], i32 2, i32 1 +; CHECK-NEXT: ret i32 [[RES_SROA_SPECULATED]] ; %alloca = alloca [20 x i64], align 8 store i32 1, ptr %alloca @@ -194,6 +201,9 @@ define i32 @test_select_idx_mem2reg(i1 %c) { ret i32 %res } +; Test gep of index select unfolding on an alloca that escaped, and as such +; is not splittable or promotable. +; FIXME: Ideally, no transform would take place in this case. 
define i32 @test_select_idx_escaped(i1 %c, ptr %p) { ; CHECK-LABEL: @test_select_idx_escaped( ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [20 x i64], align 8 ; CHECK-NEXT: store i32 1, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 24 ; CHECK-NEXT: store i32 2, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[IDX:%.*]] = select i1 [[C:%.*]], i64 24, i64 0 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 [[IDX]] -; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[GEP2]], align 4 +; CHECK-NEXT: [[DOTSROA_GEP:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 24 +; CHECK-NEXT: [[DOTSROA_GEP1:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 0 +; CHECK-NEXT: [[IDX_SROA_SEL:%.*]] = select i1 [[C]], ptr [[DOTSROA_GEP]], ptr [[DOTSROA_GEP1]] +; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[IDX_SROA_SEL]], align 4 ; CHECK-NEXT: ret i32 [[RES]] ; %alloca = alloca [20 x i64], align 8 @@ -217,6 +229,38 @@ define i32 @test_select_idx_escaped(i1 %c, ptr %p) { ret i32 %res } +; FIXME: Should we allow recursive select unfolding if all the leaves are +; constants? +define i32 @test_select_idx_nested(i1 %c, i1 %c2) { +; CHECK-LABEL: @test_select_idx_nested( +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [20 x i64], align 8 +; CHECK-NEXT: store i32 1, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 8 +; CHECK-NEXT: store i32 2, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 24 +; CHECK-NEXT: store i32 3, ptr [[GEP2]], align 4 +; CHECK-NEXT: [[IDX1:%.*]] = select i1 [[C:%.*]], i64 24, i64 0 +; CHECK-NEXT: [[IDX2:%.*]] = select i1 [[C2:%.*]], i64 [[IDX1]], i64 8 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 [[IDX2]] +; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[GEP3]], align 4 +; CHECK-NEXT: ret i32 [[RES]] +; + %alloca = alloca [20 x i64], align 8 + store i32 1, ptr %alloca + %gep1 = getelementptr inbounds i8, ptr %alloca, i64 8 + store i32 2, ptr %gep1 + %gep2 = getelementptr inbounds i8, ptr %alloca, i64 24 + store i32 3, ptr %gep2 + %idx1 = select i1 %c, i64 24, i64 0 + %idx2 = select i1 %c2, i64 %idx1, i64 8 + %gep3 = getelementptr inbounds i8, ptr %alloca, i64 %idx2 + %res = load i32, ptr %gep3, align 4 + ret i32 %res +} + +; The following cases involve non-constant indices and should not be +; transformed. + define i32 @test_select_idx_not_constant1(i1 %c, ptr %p, i64 %arg) { ; CHECK-LABEL: @test_select_idx_not_constant1( ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [20 x i64], align 8 -- cgit v1.1 From 38b54c72ca83fd56830b13d2a8d7749887b77922 Mon Sep 17 00:00:00 2001 From: Jean Perier Date: Fri, 9 Feb 2024 00:50:48 -0800 Subject: [flang] fix shared library builds after #81166 Fix https://lab.llvm.org/buildbot/#/builders/268/builds/7826 IsDerivedTypeWithLengthParameter cannot be used here because it would make libFortranEvaluate dependent on libFortranSemantics. Replace it with a loop through the parameter values. --- flang/lib/Evaluate/characteristics.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/flang/lib/Evaluate/characteristics.cpp b/flang/lib/Evaluate/characteristics.cpp index c14a422..80b0f34 100644 --- a/flang/lib/Evaluate/characteristics.cpp +++ b/flang/lib/Evaluate/characteristics.cpp @@ -474,9 +474,13 @@ bool DummyDataObject::IsPassedByDescriptor(bool isBindC) const { // Need to pass dynamic type info in a descriptor.
return true; } else if (const auto *derived{GetDerivedTypeSpec(type.type())}) { - if (const semantics::Scope *scope = derived->scope()) { - // Need to pass length type parameters in a descriptor if any. - return scope->IsDerivedTypeWithLengthParameter(); + if (!derived->parameters().empty()) { + for (const auto &param : derived->parameters()) { + if (param.second.isLen()) { + // Need to pass length type parameters in a descriptor. + return true; + } + } + } } else if (isBindC && type.type().IsAssumedLengthCharacter()) { // Fortran 2018 18.3.6 point 2 (5) -- cgit v1.1 From f6610578653fd47535a18284c688d725943ee8c3 Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Fri, 9 Feb 2024 09:57:38 +0100 Subject: Revert "[AMDGPU] Compiler should synthesize private buffer resource descriptor from flat_scratch_init" (#81234) Reverts llvm/llvm-project#79586 This broke the AMDGPU OpenMP Offload buildbot. The typical error message was that the GPU attempted to read beyond the largest legal address. Error message: AMDGPU fatal error 1: Received error in queue 0x7f8363f22000: HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION: The agent attempted to access memory beyond the largest legal address. --- llvm/docs/AMDGPUUsage.rst | 10 +- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 108 +++---- llvm/lib/Target/AMDGPU/SIFrameLowering.h | 14 +- .../AMDGPU/GlobalISel/call-outgoing-stack-args.ll | 10 +- .../abi-attribute-hints-undefined-behavior.ll | 18 +- .../blender-no-live-segment-at-def-implicit-def.ll | 5 +- .../AMDGPU/branch-folding-implicit-def-subreg.ll | 7 +- llvm/test/CodeGen/AMDGPU/call-argument-types.ll | 329 +++++++++------------ llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll | 30 +- llvm/test/CodeGen/AMDGPU/call-waitcnt.ll | 29 +- .../CodeGen/AMDGPU/callee-special-input-vgprs.ll | 6 +- llvm/test/CodeGen/AMDGPU/cc-update.ll | 84 +++--- .../AMDGPU/cross-block-use-is-not-abi-copy.ll | 10 +- .../CodeGen/AMDGPU/indirect-call-known-callees.ll | 9 +- llvm/test/CodeGen/AMDGPU/indirect-call.ll | 20 +- .../AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll | 5 +- llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 60 ++-- .../CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 5 +- .../CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll | 15 +- .../CodeGen/AMDGPU/lower-module-lds-via-table.ll | 15 +- ...machine-sink-temporal-divergence-swdev407790.ll | 14 +- .../CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll | 15 +- llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll | 7 +- .../CodeGen/AMDGPU/tuple-allocation-failure.ll | 14 +- llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll | 5 +- 25 files changed, 350 insertions(+), 494 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 3019968..6b24171 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -5530,13 +5530,9 @@ If the *Target Properties* column of :ref:`amdgpu-processor-table` specifies Instead the flat SCRATCH instructions are used. Otherwise, Private Segment Buffer SGPR register is used to initialize 4 SGPRs -that are used as a V# to access scratch.
CP uses the value provided by the +runtime. It is used, together with Scratch Wavefront Offset as an offset, to +access the private memory space using a segment address. See :ref:`amdgpu-amdhsa-initial-kernel-execution-state`. The scratch V# is a four-aligned SGPR and always selected for the kernel as diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 6327a81..d02aee7 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -379,8 +379,7 @@ public: } // namespace llvm // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` -// and return the FlatScratchInit Register used -Register SIFrameLowering::emitEntryFunctionFlatScratchInit( +void SIFrameLowering::emitEntryFunctionFlatScratchInit( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register ScratchWaveOffsetReg) const { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -400,7 +399,6 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit( Register FlatScrInitLo; Register FlatScrInitHi; - Register FlatScratchInitReg; if (ST.isAmdPalOS()) { // Extract the scratch offset from the descriptor in the GIT // Find unused reg to load flat scratch init into MachineRegisterInfo &MRI = MF.getRegInfo(); + Register FlatScrInit = AMDGPU::NoRegister; ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF); unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2; AllSGPR64s = AllSGPR64s.slice( @@ -418,28 +417,16 @@ for (MCPhysReg Reg : AllSGPR64s) { if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) && MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { - FlatScratchInitReg = Reg; + FlatScrInit = Reg; break; } } + assert(FlatScrInit && "Failed to find free register for scratch init"); - } else { - FlatScratchInitReg = - MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); - - MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.addLiveIn(FlatScratchInitReg); - MBB.addLiveIn(FlatScratchInitReg); - } - - assert(FlatScratchInitReg && "Failed to find free register for scratch init"); - - FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); - FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); - - if (ST.isAmdPalOS()) { + FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0); + FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1); - buildGitPtr(MBB, I, DL, TII, FlatScratchInitReg); + buildGitPtr(MBB, I, DL, TII, FlatScrInit); // We now have the GIT ptr - now get the scratch descriptor from the entry // at offset 0 (or offset 16 for a compute shader). @@ -454,8 +441,8 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit( MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ?
16 : 0; const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); - BuildMI(MBB, I, DL, LoadDwordX2, FlatScratchInitReg) - .addReg(FlatScratchInitReg) + BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit) + .addReg(FlatScrInit) .addImm(EncodedOffset) // offset .addImm(0) // cpol .addMemOperand(MMO); @@ -463,9 +450,20 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit( // Mask the offset in [47:0] of the descriptor const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32); auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi) - .addReg(FlatScrInitHi) - .addImm(0xffff); + .addReg(FlatScrInitHi) + .addImm(0xffff); And->getOperand(3).setIsDead(); // Mark SCC as dead. + } else { + Register FlatScratchInitReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); + assert(FlatScratchInitReg); + + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(FlatScratchInitReg); + MBB.addLiveIn(FlatScratchInitReg); + + FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); } // Do a 64-bit pointer add. @@ -488,21 +486,20 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit( addReg(FlatScrInitHi). addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI | (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); - return FlatScratchInitReg; + return; } - assert(ST.getGeneration() == AMDGPUSubtarget::GFX9); - + // For GFX9. BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) - .addReg(FlatScrInitLo) - .addReg(ScratchWaveOffsetReg); + .addReg(FlatScrInitLo) + .addReg(ScratchWaveOffsetReg); auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI) .addReg(FlatScrInitHi) .addImm(0); Addc->getOperand(3).setIsDead(); // Mark SCC as dead. - return AMDGPU::FLAT_SCR; + return; } assert(ST.getGeneration() < AMDGPUSubtarget::GFX9); @@ -523,7 +520,6 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit( .addReg(FlatScrInitLo, RegState::Kill) .addImm(8); LShr->getOperand(3).setIsDead(); // Mark SCC as dead. - return AMDGPU::FLAT_SCR; } // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not @@ -615,15 +611,11 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + const Function &F = MF.getFunction(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); assert(MFI->isEntryFunction()); - bool NeedsFlatScratchInit = - MFI->getUserSGPRInfo().hasFlatScratchInit() && - (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || - (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); - Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); @@ -649,7 +641,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // Now that we have fixed the reserved SRSRC we need to locate the // (potentially) preloaded SRSRC.
Register PreloadedScratchRsrcReg; - if (ST.isAmdHsaOrMesa(MF.getFunction()) && !NeedsFlatScratchInit) { + if (ST.isAmdHsaOrMesa(F)) { PreloadedScratchRsrcReg = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); if (ScratchRsrcReg && PreloadedScratchRsrcReg) { @@ -705,30 +697,33 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); } + bool NeedsFlatScratchInit = + MFI->getUserSGPRInfo().hasFlatScratchInit() && + (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || + (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); + if ((NeedsFlatScratchInit || ScratchRsrcReg) && PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) { MRI.addLiveIn(PreloadedScratchWaveOffsetReg); MBB.addLiveIn(PreloadedScratchWaveOffsetReg); } - Register FlatScratchInit; if (NeedsFlatScratchInit) { - FlatScratchInit = - emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); + emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); } if (ScratchRsrcReg) { - emitEntryFunctionScratchRsrcRegSetup( - MF, MBB, I, DL, FlatScratchInit, ScratchRsrcReg, - PreloadedScratchRsrcReg, ScratchWaveOffsetReg); + emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL, + PreloadedScratchRsrcReg, + ScratchRsrcReg, ScratchWaveOffsetReg); } } // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, Register FlatScratchInit, Register ScratchRsrcReg, - Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const { + const DebugLoc &DL, Register PreloadedScratchRsrcReg, + Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -776,8 +771,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( .addImm(21) .addReg(Rsrc03); } - } else if (ST.isMesaGfxShader(Fn) || - (!FlatScratchInit.isValid() && !PreloadedScratchRsrcReg)) { + } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) { assert(!ST.isAmdHsaOrMesa(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); @@ -836,26 +830,6 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( .addImm(Rsrc23 >> 32) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); } else if (ST.isAmdHsaOrMesa(Fn)) { - - if (FlatScratchInit) { - const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); - Register Lo_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - Register Hi_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); - uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), - TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1)) - .addReg(FlatScratchInit) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - BuildMI(MBB, I, DL, SMovB32, Lo_32) - .addImm(Rsrc23 & 0xffffffff) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, SMovB32, Hi_32) - .addImm(Rsrc23 >> 32) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - return; - } - assert(PreloadedScratchRsrcReg); if (ScratchRsrcReg != PreloadedScratchRsrcReg) { diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index f706d48..b3feb75 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -67,19 +67,19 @@ public:
MachineBasicBlock::iterator MI) const override; private: - Register - emitEntryFunctionFlatScratchInit(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const DebugLoc &DL, - Register ScratchWaveOffsetReg) const; + void emitEntryFunctionFlatScratchInit(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + Register ScratchWaveOffsetReg) const; Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const; void emitEntryFunctionScratchRsrcRegSetup( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - Register FlatScratchInit, Register ScratchRsrcReg, - Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const; + Register PreloadedPrivateBufferReg, Register ScratchRsrcReg, + Register ScratchWaveOffsetReg) const; public: bool hasFP(const MachineFunction &MF) const override; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index 61bc28b..2465298 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -13,11 +13,10 @@ define amdgpu_kernel void @kernel_caller_stack() { ; MUBUF-LABEL: kernel_caller_stack: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; MUBUF-NEXT: s_mov_b32 s2, -1 ; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; MUBUF-NEXT: s_mov_b32 s3, 0xe00000 +; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_mov_b32 s32, 0 -; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch +; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; MUBUF-NEXT: v_mov_b32_e32 v0, 10 @@ -62,10 +61,9 @@ define amdgpu_kernel void @kernel_caller_byval() { ; MUBUF-LABEL: kernel_caller_byval: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; MUBUF-NEXT: s_mov_b32 s2, -1 ; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; MUBUF-NEXT: s_mov_b32 s3, 0xe00000 -; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch +; MUBUF-NEXT: s_add_u32 s0, s0, s7 +; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll index 609b5e6..a439c0f 100644 --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -48,20 +48,19 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 { ; FIXEDABI-SDAG-LABEL: parent_kernel_missing_inputs: ; FIXEDABI-SDAG: ; %bb.0: ; FIXEDABI-SDAG-NEXT: s_add_i32 s4, s4, s9 -; FIXEDABI-SDAG-NEXT: s_mov_b32 s2, -1 -; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; FIXEDABI-SDAG-NEXT: s_mov_b32 s3, 0x11e80000 +; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FIXEDABI-SDAG-NEXT: s_add_u32 s0, s0, s9 ; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; FIXEDABI-SDAG-NEXT: s_mov_b64 s[0:1], flat_scratch +; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s14, s8 ; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v31, v0, v2 ; FIXEDABI-SDAG-NEXT: 
s_mov_b64 s[8:9], 0 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s12, s6 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s13, s7 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s32, 0 +; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5] ; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4 ; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12 @@ -71,20 +70,19 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 { ; FIXEDABI-GISEL-LABEL: parent_kernel_missing_inputs: ; FIXEDABI-GISEL: ; %bb.0: ; FIXEDABI-GISEL-NEXT: s_add_i32 s4, s4, s9 -; FIXEDABI-GISEL-NEXT: s_mov_b32 s2, -1 -; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; FIXEDABI-GISEL-NEXT: s_mov_b32 s3, 0x11e80000 +; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s0, s9 ; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2 -; FIXEDABI-GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch +; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s14, s8 ; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v31, v0, v1 ; FIXEDABI-GISEL-NEXT: s_mov_b64 s[8:9], 0 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s12, s6 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s13, s7 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s32, 0 +; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5] ; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4 ; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll index 74c6bb5..7c8d40c 100644 --- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll +++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll @@ -10,9 +10,8 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; CHECK-NEXT: s_load_dwordx8 s[36:43], s[6:7], 0x0 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index c06f213..5a128c7 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -5,14 +5,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-LABEL: name: f1 ; GFX90A: bb.0.bb: ; GFX90A-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr15, $sgpr10_sgpr11 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr15, $sgpr10_sgpr11 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr32 = S_MOV_B32 0 ; GFX90A-NEXT: $flat_scr_lo = S_ADD_U32 $sgpr10, $sgpr15, implicit-def $scc ; GFX90A-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr11, 0, implicit-def dead $scc, implicit 
$scc - ; GFX90A-NEXT: $sgpr2 = S_MOV_B32 4294967295, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $sgpr3 = S_MOV_B32 14680064, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $sgpr0_sgpr1 = COPY $flat_scr, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr15, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9 ; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index f72d22b..863bd0d 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -129,13 +129,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; HSA-LABEL: test_call_external_void_func_i1_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 @@ -235,9 +234,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 @@ -341,9 +339,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 @@ -425,13 +422,12 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; HSA-LABEL: test_call_external_void_func_i8_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 @@ 
-529,9 +525,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 @@ -630,9 +625,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 @@ -713,13 +707,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_i16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 @@ -816,9 +809,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_sshort v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 @@ -917,9 +909,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 @@ -1000,13 +991,12 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; HSA-LABEL: test_call_external_void_func_i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 42 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 @@ 
-1088,14 +1078,13 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_i64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: v_mov_b32_e32 v1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 @@ -1193,13 +1182,12 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_mov_b32 s4, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_mov_b32 s5, s4 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 @@ -1290,16 +1278,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2i64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 @@ -1404,13 +1391,12 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_mov_b32 s4, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_mov_b32 s5, s4 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v4, 1 ; HSA-NEXT: v_mov_b32_e32 v5, 2 ; HSA-NEXT: s_mov_b32 s32, 0 @@ -1528,13 +1514,12 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_mov_b32 s4, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_mov_b32 s5, s4 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v4, 1 ; HSA-NEXT: v_mov_b32_e32 v5, 2 ; HSA-NEXT: v_mov_b32_e32 v6, 3 @@ -1620,13 +1605,12 @@ 
define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_f16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x4400 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 @@ -1705,13 +1689,12 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 4.0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 @@ -1793,14 +1776,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 @@ -1886,15 +1868,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 4.0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 @@ -1987,17 +1968,16 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v5f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 
2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 4.0 ; HSA-NEXT: v_mov_b32_e32 v3, -1.0 ; HSA-NEXT: v_mov_b32_e32 v4, 0.5 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 @@ -2079,14 +2059,13 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_f64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 0x40100000 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 @@ -2175,16 +2154,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2f64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 ; HSA-NEXT: v_mov_b32_e32 v3, 0x40100000 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 @@ -2280,11 +2258,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3f64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 @@ -2292,6 +2268,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; HSA-NEXT: v_mov_b32_e32 v4, 0 ; HSA-NEXT: v_mov_b32_e32 v5, 0x40200000 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 @@ -2380,15 +2357,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; HSA-LABEL: test_call_external_void_func_v2i16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; 
HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 @@ -2480,15 +2456,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; HSA-LABEL: test_call_external_void_func_v3i16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 @@ -2581,15 +2556,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; HSA-LABEL: test_call_external_void_func_v3f16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 @@ -2673,14 +2647,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3i16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 ; HSA-NEXT: v_mov_b32_e32 v1, 3 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 @@ -2764,14 +2737,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3f16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; HSA-NEXT: v_mov_b32_e32 v1, 0x4400 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 @@ -2863,15 +2835,14 @@ 
define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; HSA-LABEL: test_call_external_void_func_v4i16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 @@ -2957,14 +2928,13 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v4i16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 ; HSA-NEXT: v_mov_b32_e32 v1, 0x40003 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 @@ -3055,15 +3025,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; HSA-LABEL: test_call_external_void_func_v2f16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 @@ -3151,15 +3120,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; HSA-LABEL: test_call_external_void_func_v2i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 @@ -3242,14 +3210,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; 
HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 @@ -3335,15 +3302,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; HSA-LABEL: test_call_external_void_func_v3i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: v_mov_b32_e32 v1, 4 ; HSA-NEXT: v_mov_b32_e32 v2, 5 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 @@ -3432,16 +3398,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; HSA-LABEL: test_call_external_void_func_v3i32_i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: v_mov_b32_e32 v1, 4 ; HSA-NEXT: v_mov_b32_e32 v2, 5 ; HSA-NEXT: v_mov_b32_e32 v3, 6 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 @@ -3528,15 +3493,14 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; HSA-LABEL: test_call_external_void_func_v4i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 @@ -3626,16 +3590,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v4i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; 
HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 @@ -3728,17 +3691,16 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v5i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: v_mov_b32_e32 v4, 5 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 @@ -3841,14 +3803,13 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 @@ -3954,11 +3915,9 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v8i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 @@ -3968,6 +3927,7 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; HSA-NEXT: v_mov_b32_e32 v6, 7 ; HSA-NEXT: v_mov_b32_e32 v7, 8 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 @@ -4078,6 +4038,7 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4085,9 +4046,7 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 ; HSA-NEXT: buffer_load_dwordx4 
v[12:15], off, s[4:7], 0 offset:48 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 @@ -4224,6 +4183,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4235,10 +4195,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 @@ -4401,10 +4359,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 @@ -4509,15 +4466,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; ; HSA-LABEL: test_call_external_i32_func_i32_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 42 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_mov_b32 s39, 0x1100f000 ; HSA-NEXT: s_mov_b32 s38, -1 ; HSA-NEXT: s_getpc_b64 s[4:5] @@ -4625,14 +4581,13 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 ; HSA-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 @@ -4747,11 +4702,9 @@ define 
amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; HSA-NEXT: v_mov_b32_e32 v0, 8 @@ -4759,6 +4712,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; HSA-NEXT: s_movk_i32 s32, 0x400 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 @@ -4923,11 +4877,9 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; HSA-NEXT: v_mov_b32_e32 v0, 8 @@ -4935,6 +4887,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; HSA-NEXT: s_movk_i32 s32, 0x800 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 @@ -5132,13 +5085,12 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 @@ -5387,15 +5339,14 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; HSA-LABEL: stack_passed_arg_alignment_v32i32_f64: ; HSA: ; %bb.0: ; %entry ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; HSA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x80 ; HSA-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: 
s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: v_mov_b32_e32 v0, s23 ; HSA-NEXT: v_mov_b32_e32 v1, s6 diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll index 8e2fca5..c62a082 100644 --- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll @@ -11,11 +11,10 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_siz ; CHECK-LABEL: known_x_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 20, v2 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -32,10 +31,9 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_siz ; CHECK-LABEL: known_y_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v2, 20, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -52,10 +50,9 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_siz ; CHECK-LABEL: known_z_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -72,10 +69,9 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si ; CHECK-LABEL: known_yz_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -92,10 +88,9 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_si ; CHECK-LABEL: known_xz_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshlrev_b32_e32 v31, 10, v1 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -113,10 +108,9 @@ define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_s ; CHECK-LABEL: known_xyz_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 
v31, 0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll index 6db5eff..616e5f0 100644 --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -7,13 +7,12 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_mov_b32 s3, 0xe00000 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: ds_read_b32 v0, v0 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 @@ -31,11 +30,10 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_mov_b32 s3, 0xe00000 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_store_dword v0, v0, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -54,12 +52,11 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 { define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_mov_b32 s3, 0xe00000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -77,12 +74,11 @@ define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) # define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call_return_val: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_mov_b32 s3, 0xe00000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -103,13 +99,12 @@ define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 { ; GCN: ; %bb.0: ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0xe00000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; 
GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index 0705d49..9f535a9 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -165,7 +165,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 { ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 ; FIXEDABI: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI: v_or_b32_e32 v31, v0, v1 +; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1 ; FIXEDABI-NOT: v0 ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 @@ -181,7 +181,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 { ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 ; FIXEDABI: v_lshlrev_b32_e32 v1, 20, v2 -; FIXEDABI: v_or_b32_e32 v31, v0, v1 +; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1 ; FIXEDABI-NOT: v0 ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 @@ -198,7 +198,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 { ; FIXEDABI-NOT: v2 ; FIXEDABI:v_lshlrev_b32_e32 v0, 20, v2 ; FIXEDABI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI: v_or_b32_e32 v31, v1, v0 +; FIXEDABI-NEXT: v_or_b32_e32 v31, v1, v0 ; FIXEDABI-NOT: v0 ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll index 7188883..6f42fd0 100644 --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -68,14 +68,13 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_call: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_mov_b32 s2, -1 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX803-NEXT: s_mov_b32 s3, 0x11e80000 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX803-NEXT: s_add_u32 s0, s0, s15 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -89,12 +88,11 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX900-LABEL: test_kern_call: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX900-NEXT: s_mov_b32 s2, -1 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_mov_b32 s3, 0xe00000 +; GFX900-NEXT: s_add_u32 s0, s0, s15 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -114,12 +112,11 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1010-NEXT: s_mov_b32 s2, -1 -; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000 -; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11] +; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX1010-NEXT: 
s_getpc_b64 s[16:17] ; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 ; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 @@ -151,14 +148,13 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_stack_and_call: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_mov_b32 s2, -1 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX803-NEXT: s_mov_b32 s3, 0x11e80000 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX803-NEXT: s_add_u32 s0, s0, s15 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 @@ -175,12 +171,11 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX900-LABEL: test_kern_stack_and_call: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX900-NEXT: s_mov_b32 s2, -1 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_mov_b32 s3, 0xe00000 +; GFX900-NEXT: s_add_u32 s0, s0, s15 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 @@ -204,11 +199,10 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1010-NEXT: v_mov_b32_e32 v3, 0 -; GFX1010-NEXT: s_mov_b32 s2, -1 -; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000 -; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11] -; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 @@ -317,14 +311,13 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX803-LABEL: test_force_fp_kern_call: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_mov_b32 s2, -1 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX803-NEXT: s_mov_b32 s3, 0x11e80000 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX803-NEXT: s_add_u32 s0, s0, s15 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -339,12 +332,11 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX900-LABEL: test_force_fp_kern_call: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX900-NEXT: s_mov_b32 s2, -1 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_mov_b32 s3, 0xe00000 +; GFX900-NEXT: s_add_u32 s0, s0, s15 ; 
GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -366,12 +358,11 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1010-NEXT: s_mov_b32 s2, -1 -; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000 -; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11] +; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX1010-NEXT: s_getpc_b64 s[16:17] ; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 ; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 @@ -422,15 +413,14 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX803-LABEL: test_force_fp_kern_stack_and_call: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_mov_b32 s2, -1 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX803-NEXT: s_mov_b32 s3, 0x11e80000 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX803-NEXT: s_add_u32 s0, s0, s15 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: s_mov_b32 s33, 0 -; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 @@ -447,13 +437,12 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX900-LABEL: test_force_fp_kern_stack_and_call: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX900-NEXT: s_mov_b32 s2, -1 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_mov_b32 s3, 0xe00000 +; GFX900-NEXT: s_add_u32 s0, s0, s15 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX900-NEXT: s_mov_b32 s33, 0 -; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 @@ -478,11 +467,10 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1010-NEXT: v_mov_b32_e32 v3, 0 -; GFX1010-NEXT: s_mov_b32 s2, -1 -; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000 -; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11] -; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll index 68c632a..11871db 100644 --- 
a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -180,9 +180,8 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 { ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_load_dword s8, s[6:7], 0x0 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0xe00000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s8, 0 @@ -230,9 +229,8 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 { ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_load_dword s8, s[6:7], 0x0 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0xe00000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s8, 0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index 2d019ef..47110d9 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -13,6 +13,8 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s7 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -23,17 +25,14 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX9-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, 1, s7 -; GFX9-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-NEXT: s_cmp_eq_u32 s4, 1 -; GFX9-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_cselect_b32 s5, s13, s11 ; GFX9-NEXT: s_cselect_b32 s4, s12, s10 -; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index a66ed93..408199b 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -12,9 +12,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_add_i32 s12, s12, s17 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s13, s15 ; GCN-NEXT: s_mov_b32 s12, s14 ; GCN-NEXT: s_getpc_b64 s[14:15] @@ -38,9 +37,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) { ; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GISEL-NEXT: s_add_i32 s12, s12, s17 ; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GISEL-NEXT: s_mov_b32 s2, -1 -; GISEL-NEXT: s_mov_b32 s3, 0x1e8f000 -; GISEL-NEXT: 
s_mov_b64 s[0:1], flat_scratch +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GISEL-NEXT: s_mov_b32 s13, s15 ; GISEL-NEXT: s_mov_b32 s12, s14 ; GISEL-NEXT: s_getpc_b64 s[14:15] @@ -69,9 +67,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_add_i32 s12, s12, s17 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s13, s15 ; GCN-NEXT: s_mov_b32 s12, s14 ; GCN-NEXT: s_getpc_b64 s[14:15] @@ -96,9 +93,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) { ; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GISEL-NEXT: s_add_i32 s12, s12, s17 ; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GISEL-NEXT: s_mov_b32 s2, -1 -; GISEL-NEXT: s_mov_b32 s3, 0x1e8f000 -; GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GISEL-NEXT: s_mov_b32 s13, s15 ; GISEL-NEXT: s_mov_b32 s12, s14 ; GISEL-NEXT: s_getpc_b64 s[14:15] diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index 8843efd..6e90554 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -11,9 +11,8 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: s_mov_b32 s33, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index 4851c4f..66f31bb 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -118,11 +118,10 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 @@ -178,11 +177,10 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 
s7, s7, use_module@gotpcrel32@hi+12 @@ -238,11 +236,10 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 @@ -298,11 +295,10 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 @@ -345,6 +341,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -353,9 +351,6 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s15, 0 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -375,15 +370,14 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -416,6 +410,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -424,9 +420,6 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 ; 
CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s15, 2 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -446,15 +439,14 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -487,6 +479,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -495,9 +489,6 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s15, 1 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -517,15 +508,14 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -558,6 +548,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -566,9 +558,6 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 
s[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s15, 3 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -588,15 +577,14 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll index 26271a0..61818da 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll @@ -45,9 +45,8 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, function_lds_id@gotpcrel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll index f780188..bb7c43f 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -164,9 +164,8 @@ define amdgpu_kernel void @k01() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 @@ -199,9 +198,8 @@ define amdgpu_kernel void @k23() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 @@ -242,9 +240,8 @@ define amdgpu_kernel void @k123() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll 
b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll index fa4b93f..4d73436 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -229,9 +229,8 @@ define amdgpu_kernel void @k01() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 @@ -269,9 +268,8 @@ define amdgpu_kernel void @k23() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 @@ -312,9 +310,8 @@ define amdgpu_kernel void @k123() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index e17f311..138a6a8 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -44,18 +44,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; CHECK-NEXT: s_load_dwordx8 s[44:51], s[6:7], 0x0 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v40, v0 ; CHECK-NEXT: s_add_u32 s42, s34, 40 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_addc_u32 s43, s35, 0 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] -; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] ; CHECK-NEXT: s_mov_b32 s33, s14 ; CHECK-NEXT: s_mov_b32 s40, s13 ; CHECK-NEXT: s_mov_b32 s41, s12 @@ -782,18 +781,17 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; CHECK-NEXT: s_load_dwordx2 s[44:45], s[6:7], 0x10 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7] +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v40, v0 ; CHECK-NEXT: s_add_u32 s42, s36, 40 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] ; CHECK-NEXT: s_addc_u32 s43, s37, 
0 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] -; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] ; CHECK-NEXT: s_mov_b32 s33, s14 ; CHECK-NEXT: s_mov_b32 s40, s13 ; CHECK-NEXT: s_mov_b32 s41, s12 diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll index 70a9bbb..f70441e 100644 --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -69,9 +69,8 @@ define amdgpu_kernel void @kernel_call() { ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -129,9 +128,8 @@ define amdgpu_kernel void @kernel_tailcall() { ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -242,9 +240,8 @@ define protected amdgpu_kernel void @kernel() { ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[16:17] diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index e6d9c0d..e7c5aaf 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -45,8 +45,8 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s3, 0xe00000 +; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: s_mul_i32 s4, s4, s5 @@ -55,9 +55,8 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; GFX9-NEXT: s_add_u32 s6, s6, indirect@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, indirect@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: s_mov_b64 s[0:1], flat_scratch -; GFX9-NEXT: v_mad_u32_u24 v0, v1, s5, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mad_u32_u24 v0, v1, s5, v0 ; GFX9-NEXT: v_add_lshl_u32 v0, v0, v2, 3 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: ds_write_b64 v0, v[3:4] diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 8d8459f..1118cc3 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ 
-45,8 +45,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[38:39], 0x18 ; GLOBALNESS1-NEXT: s_load_dword s7, s[38:39], 0x20 ; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s15 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 +; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s74, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) @@ -71,10 +73,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 -; GLOBALNESS1-NEXT: s_mov_b32 s2, -1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3 -; GLOBALNESS1-NEXT: s_mov_b32 s3, 0xe00000 -; GLOBALNESS1-NEXT: s_mov_b64 s[0:1], flat_scratch ; GLOBALNESS1-NEXT: s_mov_b32 s68, s14 ; GLOBALNESS1-NEXT: s_mov_b32 s69, s13 ; GLOBALNESS1-NEXT: s_mov_b32 s70, s12 @@ -333,8 +332,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[38:39], 0x18 ; GLOBALNESS0-NEXT: s_load_dword s7, s[38:39], 0x20 ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s15 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 +; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) @@ -359,10 +360,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 -; GLOBALNESS0-NEXT: s_mov_b32 s2, -1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3 -; GLOBALNESS0-NEXT: s_mov_b32 s3, 0xe00000 -; GLOBALNESS0-NEXT: s_mov_b64 s[0:1], flat_scratch ; GLOBALNESS0-NEXT: s_mov_b32 s66, s14 ; GLOBALNESS0-NEXT: s_mov_b32 s67, s13 ; GLOBALNESS0-NEXT: s_mov_b32 s68, s12 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll index 7d759089..7840559 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll @@ -14,9 +14,8 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id ; CHECK-NEXT: s_addc_u32 s11, s11, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v5, 42 -- cgit v1.1 From d72e8c259637991c8d0be642a5ab2bfeb19b1c6e Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Fri, 9 Feb 2024 09:57:59 +0100 Subject: [NFC] Add b9079ba to git-blame-ignore-revs (#81233) --- .git-blame-ignore-revs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index ea84e31..1f498a8 
100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -78,3 +78,6 @@ f6d557ee34b6bbdb1dc32f29e34b4a4a8ad35e81 082b89b25faae3e45a023caf51b65ca0f02f377f 0ba22f51d128bee9d69756c56c4678097270e10b 84da0e1bb75f8666cf222d2f600f37bebb9ea389 + +# [NFC] clang-format utils/TableGen (#80973) +b9079baaddfed5e604fbfaa1d81a7a1c38e78c26 -- cgit v1.1 From df2513c80bbd444ce97d28961bd5c20ffd7d3c44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 9 Feb 2024 09:18:47 +0100 Subject: [clang][Interp] Fix three-way comparison detection Instead of using !T && CPlusPlus, just check the BinaryOperator's opcode. Turns out we also hit this code path for some assignments of structs in C++. --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 2 +- clang/test/SemaCXX/conditional-expr.cpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 21bc29f..bf45615 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -464,7 +464,7 @@ bool ByteCodeExprGen::VisitBinaryOperator(const BinaryOperator *BO) { // Special case for C++'s three-way/spaceship operator <=>, which // returns a std::{strong,weak,partial}_ordering (which is a class, so doesn't // have a PrimType). - if (!T && Ctx.getLangOpts().CPlusPlus) { + if (!T && BO->getOpcode() == BO_Cmp) { if (DiscardResult) return true; const ComparisonCategoryInfo *CmpInfo = diff --git a/clang/test/SemaCXX/conditional-expr.cpp b/clang/test/SemaCXX/conditional-expr.cpp index 9a5e2ba..01effaa 100644 --- a/clang/test/SemaCXX/conditional-expr.cpp +++ b/clang/test/SemaCXX/conditional-expr.cpp @@ -1,5 +1,7 @@ // RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify=expected,expected-cxx11 -std=c++11 -Wsign-conversion %s +// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify=expected,expected-cxx11 -std=c++11 -Wsign-conversion %s -fexperimental-new-constant-interpreter // RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify=expected,expected-cxx17 -std=c++17 -Wsign-conversion %s +// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify=expected,expected-cxx17 -std=c++17 -Wsign-conversion %s -fexperimental-new-constant-interpreter // C++ rules for ?: are a lot stricter than C rules, and have to take into // account more conversion options. 
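For context, a minimal sketch (assuming C++20; the type and names below are illustrative, not taken from the patch or its tests) of the two expression shapes the fix distinguishes. Both statements have a class-typed, non-primitive result, which is why testing !T plus the C++ language mode alone was not enough:

#include <compare>

struct S {
  int V;
  auto operator<=>(const S &) const = default; // deduces std::strong_ordering
};

constexpr bool F() {
  S A{1}, B{2}, C{3};
  auto Ord = A <=> B; // BO_Cmp: needs the comparison-category handling
  C = A;              // BO_Assign: also a class-typed result, but not a <=>
  return Ord < 0;     // 1 < 2, so F() is true
}
static_assert(F());

Checking the opcode directly lets the struct assignment fall through to the regular binary-operator handling instead of the ordering path.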
-- cgit v1.1

From 79e43eb3e20484bdb6f32eecc336742dd721fcc9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?=
Date: Fri, 9 Feb 2024 10:11:51 +0100
Subject: [clang][Interp] Protect ArrayDecay ops against dummy pointers

---
 clang/lib/AST/Interp/Interp.h          |  3 +++
 clang/test/AST/Interp/arrays.cpp       | 20 ++++++++++++++++++++
 clang/test/SemaCXX/self-comparison.cpp |  1 +
 3 files changed, 24 insertions(+)

diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 1299a70..bcabd93 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -1891,6 +1891,9 @@ inline bool ArrayElemPop(InterpState &S, CodePtr OpPC, uint32_t Index) {
 inline bool ArrayDecay(InterpState &S, CodePtr OpPC) {
   const Pointer &Ptr = S.Stk.pop<Pointer>();
 
+  if (Ptr.isDummy())
+    return false;
+
   if (!Ptr.isUnknownSizeArray()) {
     S.Stk.push<Pointer>(Ptr.atIndex(0));
     return true;
diff --git a/clang/test/AST/Interp/arrays.cpp b/clang/test/AST/Interp/arrays.cpp
index e14ff34..dedfa01 100644
--- a/clang/test/AST/Interp/arrays.cpp
+++ b/clang/test/AST/Interp/arrays.cpp
@@ -598,3 +598,23 @@ namespace NonConstReads {
   const int y = 0;
   int yy[y];
 }
+
+namespace SelfComparison {
+  struct S {
+    int field;
+    static int static_field;
+    int array[4];
+  };
+
+  struct T {
+    int field;
+    static int static_field;
+    int array[4];
+    S s;
+  };
+
+  int struct_test(S s1, S s2, S *s3, T t) {
+    return s3->array[t.field] == s3->array[t.field]; // expected-warning {{self-comparison always evaluates to true}} \
+                                                     // ref-warning {{self-comparison always evaluates to true}}
+  };
+}
diff --git a/clang/test/SemaCXX/self-comparison.cpp b/clang/test/SemaCXX/self-comparison.cpp
index 72127f1..c3c8755 100644
--- a/clang/test/SemaCXX/self-comparison.cpp
+++ b/clang/test/SemaCXX/self-comparison.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++2a
+// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++2a -fexperimental-new-constant-interpreter
 
 int foo(int x) {
   return x == x; // expected-warning {{self-comparison always evaluates to true}}
-- cgit v1.1

From 9e73656af524a2c592978aec91de67316c5ce69f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?=
Date: Fri, 9 Feb 2024 10:23:54 +0100
Subject: [clang][Interp] Support ExpressionTraitExprs

Just push a constant bool value.
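Expression traits are Embarcadero-style intrinsics whose value Sema already knows at compile time, so the byte-code compiler only needs to materialize that constant bool. An illustrative use, not taken from the patch and assuming a C++11-or-later compile so static_assert is available:

    int n;
    static_assert(__is_lvalue_expr(n), "a named variable is an lvalue");
    static_assert(!__is_lvalue_expr(1), "a literal is not an lvalue");
    static_assert(__is_rvalue_expr(1), "a literal is an rvalue");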
---
 clang/lib/AST/Interp/ByteCodeExprGen.cpp | 7 +++++++
 clang/lib/AST/Interp/ByteCodeExprGen.h   | 1 +
 clang/test/SemaCXX/expression-traits.cpp | 1 +
 3 files changed, 9 insertions(+)

diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index bf45615..2539e75 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -2051,6 +2051,13 @@ bool ByteCodeExprGen<Emitter>::VisitCXXInheritedCtorInitExpr(
   return this->emitCall(F, E);
 }
 
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitExpressionTraitExpr(
+    const ExpressionTraitExpr *E) {
+  assert(Ctx.getLangOpts().CPlusPlus);
+  return this->emitConstBool(E->getValue(), E);
+}
+
 template <class Emitter> bool ByteCodeExprGen<Emitter>::discard(const Expr *E) {
   if (E->containsErrors())
     return false;
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h
index c908a9b..ae216f5 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.h
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.h
@@ -112,6 +112,7 @@ public:
   bool VisitChooseExpr(const ChooseExpr *E);
   bool VisitObjCBoolLiteralExpr(const ObjCBoolLiteralExpr *E);
   bool VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *E);
+  bool VisitExpressionTraitExpr(const ExpressionTraitExpr *E);
 
 protected:
   bool visitExpr(const Expr *E) override;
diff --git a/clang/test/SemaCXX/expression-traits.cpp b/clang/test/SemaCXX/expression-traits.cpp
index a76f0c4..64ddca0 100644
--- a/clang/test/SemaCXX/expression-traits.cpp
+++ b/clang/test/SemaCXX/expression-traits.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -std=c++98 -fsyntax-only -verify -fcxx-exceptions %s
+// RUN: %clang_cc1 -std=c++98 -fsyntax-only -verify -fcxx-exceptions %s -fexperimental-new-constant-interpreter
 //
 // Tests for "expression traits" intrinsics such as __is_lvalue_expr.
-- cgit v1.1

From 5609bd83c3bd39a7522b05b32decc9e3c8ad08ae Mon Sep 17 00:00:00 2001
From: Owen Pan
Date: Fri, 9 Feb 2024 01:49:39 -0800
Subject: Revert "[clang-format] Update FormatToken::isSimpleTypeSpecifier() (#80241)"

This reverts commit 763139afc19ddf2e0f0265dc828ce8e5fbe92530.

It seems that LangOpts is not initialized before use.
---
 clang/include/clang/Format/Format.h   |  2 --
 clang/lib/Format/FormatToken.cpp      | 35 ++++++++++++++++++++++++++++++++++-
 clang/lib/Format/FormatTokenLexer.cpp |  7 +++----
 clang/lib/Format/FormatTokenLexer.h   |  1 +
 4 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index bb63d33..cb14d98 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -5175,8 +5175,6 @@ tooling::Replacements sortUsingDeclarations(const FormatStyle &Style,
                                             ArrayRef<tooling::Range> Ranges,
                                             StringRef FileName = "");
 
-extern LangOptions LangOpts;
-
 /// Returns the ``LangOpts`` that the formatter expects you to set.
 ///
 /// \param Style determines specific settings for lexing mode.
diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp
index 69f751d..b791c5a 100644
--- a/clang/lib/Format/FormatToken.cpp
+++ b/clang/lib/Format/FormatToken.cpp
@@ -34,8 +34,41 @@ const char *getTokenTypeName(TokenType Type) {
   return nullptr;
 }
 
+// FIXME: This is copy&pasted from Sema. Put it in a common place and remove
+// duplication.
 bool FormatToken::isSimpleTypeSpecifier() const {
-  return Tok.isSimpleTypeSpecifier(LangOpts);
+  switch (Tok.getKind()) {
+  case tok::kw_short:
+  case tok::kw_long:
+  case tok::kw___int64:
+  case tok::kw___int128:
+  case tok::kw_signed:
+  case tok::kw_unsigned:
+  case tok::kw_void:
+  case tok::kw_char:
+  case tok::kw_int:
+  case tok::kw_half:
+  case tok::kw_float:
+  case tok::kw_double:
+  case tok::kw___bf16:
+  case tok::kw__Float16:
+  case tok::kw___float128:
+  case tok::kw___ibm128:
+  case tok::kw_wchar_t:
+  case tok::kw_bool:
+#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:
+#include "clang/Basic/TransformTypeTraits.def"
+  case tok::annot_typename:
+  case tok::kw_char8_t:
+  case tok::kw_char16_t:
+  case tok::kw_char32_t:
+  case tok::kw_typeof:
+  case tok::kw_decltype:
+  case tok::kw__Atomic:
+    return true;
+  default:
+    return false;
+  }
 }
 
 bool FormatToken::isTypeOrIdentifier() const {
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index 31b2b7e..a87d0ba 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -22,20 +22,18 @@
 namespace clang {
 namespace format {
 
-LangOptions LangOpts;
-
 FormatTokenLexer::FormatTokenLexer(
     const SourceManager &SourceMgr, FileID ID, unsigned Column,
     const FormatStyle &Style, encoding::Encoding Encoding,
     llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
     IdentifierTable &IdentTable)
     : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
-      Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
+      Column(Column), TrailingWhitespace(0),
+      LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
       Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
       Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
       FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
       MacroBlockEndRegex(Style.MacroBlockEnd) {
-  LangOpts = getFormattingLangOpts(Style);
   Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
   Lex->SetKeepWhitespaceMode(true);
 
@@ -1444,6 +1442,7 @@ void FormatTokenLexer::readRawToken(FormatToken &Tok) {
 
 void FormatTokenLexer::resetLexer(unsigned Offset) {
   StringRef Buffer = SourceMgr.getBufferData(ID);
+  LangOpts = getFormattingLangOpts(Style);
   Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
                       Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
   Lex->SetKeepWhitespaceMode(true);
diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h
index 52838f1..65dd733 100644
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@@ -120,6 +120,7 @@ private:
   unsigned Column;
   unsigned TrailingWhitespace;
   std::unique_ptr<Lexer> Lex;
+  LangOptions LangOpts;
   const SourceManager &SourceMgr;
   FileID ID;
   const FormatStyle &Style;
-- cgit v1.1

From 245d7727d51548c3d5d867b69b1f9b1efff2502e Mon Sep 17 00:00:00 2001
From: Owen Pan
Date: Fri, 9 Feb 2024 01:53:47 -0800
Subject: Revert "[clang-format] Fix an out-of-bounds bug uncovered by 763139afc19d"

This reverts commit 173e674ba55eb93e8af43f2eece7feffe9954b34.

Actually, NotTokens[5] in QualifierFixerTest.cpp is not out of bounds.
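The root cause behind the LangOpts revert above is an ordering hazard: a file-scope LangOptions object was written only inside the FormatTokenLexer constructor, so any caller that queried a token before a lexer was constructed read a default-constructed object. A minimal sketch of the hazard, with hypothetical names rather than clang-format code:

    #include <cassert>

    struct Options { bool CPlusPlus = false; };
    Options GlobalOpts; // file-scope state, default-constructed

    struct TokenSource {
      TokenSource(bool IsCpp) { GlobalOpts.CPlusPlus = IsCpp; } // sole writer
    };

    bool isCppKeywordContext() { return GlobalOpts.CPlusPlus; } // reader

    int main() {
      assert(!isCppKeywordContext()); // read before any TokenSource exists:
                                      // silently uses the default value
      TokenSource TS(/*IsCpp=*/true);
      assert(isCppKeywordContext()); // now reflects the real configuration
    }

Keeping LangOpts as a member initialized in the constructor's init list, as the revert restores, removes the window in which the global is stale.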
--- clang/unittests/Format/QualifierFixerTest.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/unittests/Format/QualifierFixerTest.cpp b/clang/unittests/Format/QualifierFixerTest.cpp index 4e1768d..324366c 100644 --- a/clang/unittests/Format/QualifierFixerTest.cpp +++ b/clang/unittests/Format/QualifierFixerTest.cpp @@ -1100,6 +1100,8 @@ TEST_F(QualifierFixerTest, IsQualifierType) { NotTokens[3], ConfiguredTokens)); EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( NotTokens[4], ConfiguredTokens)); + EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( + NotTokens[5], ConfiguredTokens)); EXPECT_FALSE( LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[0])); @@ -1111,6 +1113,8 @@ TEST_F(QualifierFixerTest, IsQualifierType) { LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[3])); EXPECT_FALSE( LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[4])); + EXPECT_FALSE( + LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[5])); } TEST_F(QualifierFixerTest, IsMacro) { -- cgit v1.1 From c227eca73970c65d9663e6d65abe3f9daef2a25f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 9 Feb 2024 11:01:07 +0100 Subject: [clang][Interp][NFC] Convert test case to verify=expected,both style --- clang/test/AST/Interp/arrays.cpp | 247 +++++++++++++-------------------------- 1 file changed, 83 insertions(+), 164 deletions(-) diff --git a/clang/test/AST/Interp/arrays.cpp b/clang/test/AST/Interp/arrays.cpp index dedfa01..3c06ab5 100644 --- a/clang/test/AST/Interp/arrays.cpp +++ b/clang/test/AST/Interp/arrays.cpp @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify %s -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++20 -verify %s -// RUN: %clang_cc1 -verify=ref %s -// RUN: %clang_cc1 -verify=ref -std=c++20 %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++20 -verify=expected,both %s +// RUN: %clang_cc1 -verify=ref,both %s +// RUN: %clang_cc1 -verify=ref,both -std=c++20 %s constexpr int m = 3; constexpr const int *foo[][5] = { @@ -73,53 +73,40 @@ static_assert(getElementFromEnd(data, 5, 0) == 1, ""); static_assert(getElementFromEnd(data, 5, 4) == 5, ""); constexpr int getFirstElem(const int *a) { - return a[0]; // expected-note {{read of dereferenced null pointer}} \ - // ref-note {{read of dereferenced null pointer}} + return a[0]; // both-note {{read of dereferenced null pointer}} } -static_assert(getFirstElem(nullptr) == 1, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} +static_assert(getFirstElem(nullptr) == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} constexpr static int arr[2] = {1,2}; constexpr static int arr2[2] = {3,4}; constexpr int *p1 = nullptr; -constexpr int *p2 = p1 + 1; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot perform pointer arithmetic on null pointer}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{cannot perform pointer arithmetic on null pointer}} +constexpr int *p2 = p1 + 1; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot perform pointer arithmetic on null pointer}} constexpr int *p3 = p1 + 0; 
constexpr int *p4 = p1 - 0; constexpr int *p5 = 0 + p1; -constexpr int *p6 = 0 - p1; // expected-error {{invalid operands to binary expression}} \ - // ref-error {{invalid operands to binary expression}} +constexpr int *p6 = 0 - p1; // both-error {{invalid operands to binary expression}} constexpr int const * ap1 = &arr[0]; -constexpr int const * ap2 = ap1 + 3; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot refer to element 3 of array of 2}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{cannot refer to element 3 of array of 2}} - -constexpr auto ap3 = arr - 1; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot refer to element -1}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{cannot refer to element -1}} +constexpr int const * ap2 = ap1 + 3; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot refer to element 3 of array of 2}} + +constexpr auto ap3 = arr - 1; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot refer to element -1}} constexpr int k1 = &arr[1] - &arr[0]; static_assert(k1 == 1, ""); static_assert((&arr[0] - &arr[1]) == -1, ""); -constexpr int k2 = &arr2[1] - &arr[0]; // expected-error {{must be initialized by a constant expression}} \ - // ref-error {{must be initialized by a constant expression}} +constexpr int k2 = &arr2[1] - &arr[0]; // both-error {{must be initialized by a constant expression}} static_assert((arr + 0) == arr, ""); static_assert(&arr[0] == arr, ""); static_assert(*(&arr[0]) == 1, ""); static_assert(*(&arr[1]) == 2, ""); -constexpr const int *OOB = (arr + 3) - 3; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot refer to element 3 of array of 2}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{cannot refer to element 3 of array of 2}} +constexpr const int *OOB = (arr + 3) - 3; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot refer to element 3 of array of 2}} template constexpr T getElementOf(T* array, int i) { @@ -135,11 +122,8 @@ constexpr T& getElementOfArray(T (&array)[N], int I) { static_assert(getElementOfArray(foo[2], 3) == &m, ""); -static_assert(data[0] == 4, ""); // expected-error{{failed}} \ - // expected-note{{5 == 4}} \ - // ref-error{{failed}} \ - // ref-note{{5 == 4}} - +static_assert(data[0] == 4, ""); // both-error{{failed}} \ + // both-note{{5 == 4}} constexpr int dynamic[] = { f, 3, 2 + 5, data[3], *getElementOf(foo[2], 3) @@ -185,21 +169,15 @@ struct fred y [] = { [0] = { .s[0] = 'q' } }; namespace indices { constexpr int first[] = {1}; - constexpr int firstValue = first[2]; // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{cannot refer to element 2 of array of 1}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot refer to element 2 of array of 1}} + constexpr int firstValue = first[2]; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot refer to element 2 of array of 1}} constexpr int second[10] = {17}; - constexpr int secondValue = second[10];// ref-error {{must be initialized by a constant expression}} \ - // ref-note {{read of dereferenced one-past-the-end pointer}} \ - // expected-error {{must be initialized by a constant expression}} \ - // 
expected-note {{read of dereferenced one-past-the-end pointer}} - - constexpr int negative = second[-2]; // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{cannot refer to element -2 of array of 10}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot refer to element -2 of array of 10}} + constexpr int secondValue = second[10];// both-error {{must be initialized by a constant expression}} \ + // both-note {{read of dereferenced one-past-the-end pointer}} \ + + constexpr int negative = second[-2]; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot refer to element -2 of array of 10}} }; namespace DefaultInit { @@ -222,12 +200,9 @@ public: class AU { public: int a; - constexpr AU() : a(5 / 0) {} // expected-warning {{division by zero is undefined}} \ - // expected-note 2{{division by zero}} \ - // expected-error {{never produces a constant expression}} \ - // ref-error {{never produces a constant expression}} \ - // ref-note 2{{division by zero}} \ - // ref-warning {{division by zero is undefined}} + constexpr AU() : a(5 / 0) {} // both-warning {{division by zero is undefined}} \ + // both-note 2{{division by zero}} \ + // both-error {{never produces a constant expression}} }; class B { public: @@ -241,13 +216,10 @@ static_assert(b.a[1].a == 12, ""); class BU { public: AU a[2]; - constexpr BU() {} // expected-note {{in call to 'AU()'}} \ - // ref-note {{in call to 'AU()'}} + constexpr BU() {} // both-note {{in call to 'AU()'}} }; -constexpr BU bu; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{in call to 'BU()'}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{in call to 'BU()'}} +constexpr BU bu; // both-error {{must be initialized by a constant expression}} \ + // both-note {{in call to 'BU()'}} namespace IncDec { constexpr int getNextElem(const int *A, int I) { @@ -311,62 +283,43 @@ namespace IncDec { } static_assert(getSecondToLast2() == 3, ""); - constexpr int bad1() { // ref-error {{never produces a constant expression}} \ - // expected-error {{never produces a constant expression}} + constexpr int bad1() { // both-error {{never produces a constant expression}} const int *e = E + 3; e++; // This is fine because it's a one-past-the-end pointer - return *e; // expected-note 2{{read of dereferenced one-past-the-end pointer}} \ - // ref-note 2{{read of dereferenced one-past-the-end pointer}} + return *e; // both-note 2{{read of dereferenced one-past-the-end pointer}} } - static_assert(bad1() == 0, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} + static_assert(bad1() == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} - constexpr int bad2() { // ref-error {{never produces a constant expression}} \ - // expected-error {{never produces a constant expression}} + constexpr int bad2() { // both-error {{never produces a constant expression}} const int *e = E + 4; - e++; // expected-note 2{{cannot refer to element 5 of array of 4 elements}} \ - // ref-note 2{{cannot refer to element 5 of array of 4 elements}} + e++; // both-note 2{{cannot refer to element 5 of array of 4 elements}} return *e; // This is UB as well } - static_assert(bad2() == 0, ""); // expected-error {{not an integral constant expression}} \ - // 
expected-note {{in call to}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} + static_assert(bad2() == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} - - constexpr int bad3() { // ref-error {{never produces a constant expression}} \ - // expected-error {{never produces a constant expression}} + constexpr int bad3() { // both-error {{never produces a constant expression}} const int *e = E; - e--; // expected-note 2{{cannot refer to element -1 of array of 4 elements}} \ - // ref-note 2{{cannot refer to element -1 of array of 4 elements}} + e--; // both-note 2{{cannot refer to element -1 of array of 4 elements}} return *e; // This is UB as well } - static_assert(bad3() == 0, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} + static_assert(bad3() == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} constexpr int nullptr1(bool Pre) { int *a = nullptr; if (Pre) - ++a; // ref-note {{arithmetic on null pointer}} \ - // expected-note {{arithmetic on null pointer}} + ++a; // both-note {{arithmetic on null pointer}} else - a++; // ref-note {{arithmetic on null pointer}} \ - // expected-note {{arithmetic on null pointer}} + a++; // both-note {{arithmetic on null pointer}} return 1; } - static_assert(nullptr1(true) == 1, ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} - - static_assert(nullptr1(false) == 1, ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} + static_assert(nullptr1(true) == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} + + static_assert(nullptr1(false) == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} }; namespace ZeroInit { @@ -425,28 +378,20 @@ namespace NoInitMapLeak { #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdivision-by-zero" #pragma clang diagnostic ignored "-Wc++20-extensions" - constexpr int testLeak() { // expected-error {{never produces a constant expression}} \ - // ref-error {{never produces a constant expression}} + constexpr int testLeak() { // both-error {{never produces a constant expression}} int a[2]; a[0] = 1; // interrupts interpretation. 
- (void)(1 / 0); // expected-note 2{{division by zero}} \ - // ref-note 2{{division by zero}} - + (void)(1 / 0); // both-note 2{{division by zero}} return 1; } #pragma clang diagnostic pop - static_assert(testLeak() == 1, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'testLeak()'}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'testLeak()'}} + static_assert(testLeak() == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'testLeak()'}} - - constexpr int a[] = {1,2,3,4/0,5}; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{division by zero}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{division by zero}} \ + constexpr int a[] = {1,2,3,4/0,5}; // both-error {{must be initialized by a constant expression}} \ + // both-note {{division by zero}} \ // ref-note {{declared here}} /// FIXME: This should fail in the new interpreter as well. @@ -456,18 +401,13 @@ namespace NoInitMapLeak { static_assert(b == 1, ""); // ref-error {{not an integral constant expression}} \ // ref-note {{not a constant expression}} - constexpr int f() { // expected-error {{never produces a constant expression}} \ - // ref-error {{never produces a constant expression}} - int a[] = {19,2,3/0,4}; // expected-note 2{{division by zero}} \ - // expected-warning {{is undefined}} \ - // ref-note 2{{division by zero}} \ - // ref-warning {{is undefined}} + constexpr int f() { // both-error {{never produces a constant expression}} + int a[] = {19,2,3/0,4}; // both-note 2{{division by zero}} \ + // both-warning {{is undefined}} return 1; } - static_assert(f() == 1, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} + static_assert(f() == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} } namespace Incomplete { @@ -477,38 +417,27 @@ namespace Incomplete { }; constexpr Foo F{}; - constexpr const int *A = F.a; // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{array-to-pointer decay of array member without known bound}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{array-to-pointer decay of array member without known bound}} - - constexpr const int *B = F.a + 1; // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{array-to-pointer decay of array member without known bound}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{array-to-pointer decay of array member without known bound}} - - constexpr int C = *F.a; // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{array-to-pointer decay of array member without known bound}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{array-to-pointer decay of array member without known bound}} + constexpr const int *A = F.a; // both-error {{must be initialized by a constant expression}} \ + // both-note {{array-to-pointer decay of array member without known bound}} + constexpr const int *B = F.a + 1; // both-error {{must be initialized by a constant expression}} \ + // both-note {{array-to-pointer decay of array member without known bound}} + constexpr int C = *F.a; // both-error {{must be 
initialized by a constant expression}} \ + // both-note {{array-to-pointer decay of array member without known bound}} /// These are from test/SemaCXX/constant-expression-cxx11.cpp /// and are the only tests using the 'indexing of array without known bound' diagnostic. /// We currently diagnose them differently. extern int arr[]; // expected-note 3{{declared here}} - constexpr int *c = &arr[1]; // ref-error {{must be initialized by a constant expression}} \ + constexpr int *c = &arr[1]; // both-error {{must be initialized by a constant expression}} \ // ref-note {{indexing of array without known bound}} \ - // expected-error {{must be initialized by a constant expression}} \ // expected-note {{read of non-constexpr variable 'arr'}} - constexpr int *d = &arr[1]; // ref-error {{must be initialized by a constant expression}} \ + constexpr int *d = &arr[1]; // both-error {{must be initialized by a constant expression}} \ // ref-note {{indexing of array without known bound}} \ - // expected-error {{must be initialized by a constant expression}} \ // expected-note {{read of non-constexpr variable 'arr'}} - constexpr int *e = arr + 1; // ref-error {{must be initialized by a constant expression}} \ + constexpr int *e = arr + 1; // both-error {{must be initialized by a constant expression}} \ // ref-note {{indexing of array without known bound}} \ - // expected-error {{must be initialized by a constant expression}} \ // expected-note {{read of non-constexpr variable 'arr'}} } @@ -528,8 +457,7 @@ namespace GH69115 { if (C) return; // Invalid in constexpr. - (void)(1 / 0); // expected-warning {{undefined}} \ - // ref-warning {{undefined}} + (void)(1 / 0); // both-warning {{undefined}} } class F { @@ -569,23 +497,15 @@ namespace GH69115 { namespace NonConstReads { #if __cplusplus >= 202002L - void *p = nullptr; // ref-note {{declared here}} \ - // expected-note {{declared here}} - - int arr[!p]; // ref-error {{not allowed at file scope}} \ - // expected-error {{not allowed at file scope}} \ - // ref-warning {{variable length arrays}} \ - // ref-note {{read of non-constexpr variable 'p'}} \ - // expected-warning {{variable length arrays}} \ - // expected-note {{read of non-constexpr variable 'p'}} - int z; // ref-note {{declared here}} \ - // expected-note {{declared here}} - int a[z]; // ref-error {{not allowed at file scope}} \ - // expected-error {{not allowed at file scope}} \ - // ref-warning {{variable length arrays}} \ - // ref-note {{read of non-const variable 'z'}} \ - // expected-warning {{variable length arrays}} \ - // expected-note {{read of non-const variable 'z'}} + void *p = nullptr; // both-note {{declared here}} + + int arr[!p]; // both-error {{not allowed at file scope}} \ + // both-warning {{variable length arrays}} \ + // both-note {{read of non-constexpr variable 'p'}} + int z; // both-note {{declared here}} + int a[z]; // both-error {{not allowed at file scope}} \ + // both-warning {{variable length arrays}} \ + // both-note {{read of non-const variable 'z'}} #else void *p = nullptr; int arr[!p]; // ref-error {{not allowed at file scope}} \ @@ -614,7 +534,6 @@ namespace SelfComparison { }; int struct_test(S s1, S s2, S *s3, T t) { - return s3->array[t.field] == s3->array[t.field]; // expected-warning {{self-comparison always evaluates to true}} \ - // ref-warning {{self-comparison always evaluates to true}} + return s3->array[t.field] == s3->array[t.field]; // both-warning {{self-comparison always evaluates to true}} }; } -- cgit v1.1 From 02362b1ad1c07a01714b195d769400dd40dbfd04 
Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Fri, 9 Feb 2024 02:11:44 -0800 Subject: [clang-format] Check token size in QualifierFixerTest.cpp --- clang/unittests/Format/QualifierFixerTest.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/unittests/Format/QualifierFixerTest.cpp b/clang/unittests/Format/QualifierFixerTest.cpp index 324366c..0aa755a 100644 --- a/clang/unittests/Format/QualifierFixerTest.cpp +++ b/clang/unittests/Format/QualifierFixerTest.cpp @@ -1055,6 +1055,7 @@ TEST_F(QualifierFixerTest, IsQualifierType) { auto Tokens = annotate( "const static inline auto restrict int double long constexpr friend"); + ASSERT_EQ(Tokens.size(), 11u) << Tokens; EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( Tokens[0], ConfiguredTokens)); @@ -1089,6 +1090,7 @@ TEST_F(QualifierFixerTest, IsQualifierType) { EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[9])); auto NotTokens = annotate("for while do Foo Bar "); + ASSERT_EQ(NotTokens.size(), 6u) << Tokens; EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( NotTokens[0], ConfiguredTokens)); @@ -1120,6 +1122,7 @@ TEST_F(QualifierFixerTest, IsQualifierType) { TEST_F(QualifierFixerTest, IsMacro) { auto Tokens = annotate("INT INTPR Foo int"); + ASSERT_EQ(Tokens.size(), 5u) << Tokens; EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isPossibleMacro(Tokens[0])); EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isPossibleMacro(Tokens[1])); -- cgit v1.1 From ac3bd2bd530127786741bd9f164d66a3c3f40961 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 9 Feb 2024 11:20:19 +0100 Subject: [LoopReroll] Remove unused and unmaintained pass (#80972) Remove the LoopReroll pass, which is both unused (in any default pipeline) and unmaintained, with numerous open correctness issues (https://github.com/llvm/llvm-project/issues?q=is%3Aissue+is%3Aopen+loop-reroll). The removal is in line with https://discourse.llvm.org/t/rfc-disallow-unmaintained-unused-passes/75151. There is also a defunct `-freroll-loops` option in clang, which I'll remove separately. Migrated from https://reviews.llvm.org/D150684. 
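For context, rerolling reversed manual unrolling; the example from the deleted pass's own header comment (quoted in the removed source below) is representative:

    int foo(int a);
    void bar(int *x) {
      for (int i = 0; i < 500; i += 3) {
        foo(i);
        foo(i + 1);
        foo(i + 2);
      }
    }
    // was rerolled into:
    //   for (int i = 0; i < 500; ++i)
    //     foo(i);

Before this change the transform could still be requested explicitly, e.g. via something like "opt -passes=loop-reroll" (the registration being deleted from PassRegistry.def below); afterwards such invocations fail with an unknown-pass error.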
--- llvm/include/llvm/Transforms/Scalar/LoopReroll.h | 25 - llvm/lib/Passes/PassBuilder.cpp | 1 - llvm/lib/Passes/PassRegistry.def | 1 - llvm/lib/Transforms/Scalar/CMakeLists.txt | 1 - llvm/lib/Transforms/Scalar/LoopRerollPass.cpp | 1679 -------------------- llvm/test/Transforms/LoopReroll/basic.ll | 976 ------------ llvm/test/Transforms/LoopReroll/basic32iters.ll | 328 ---- llvm/test/Transforms/LoopReroll/complex_reroll.ll | 237 --- llvm/test/Transforms/LoopReroll/external_use.ll | 60 - llvm/test/Transforms/LoopReroll/extra_instr.ll | 361 ----- llvm/test/Transforms/LoopReroll/indvar_with_ext.ll | 184 --- llvm/test/Transforms/LoopReroll/negative.ll | 48 - llvm/test/Transforms/LoopReroll/nonconst_lb.ll | 168 -- llvm/test/Transforms/LoopReroll/ptrindvar.ll | 125 -- llvm/test/Transforms/LoopReroll/reduction.ll | 132 -- llvm/test/Transforms/LoopReroll/reroll_with_dbg.ll | 130 -- 16 files changed, 4456 deletions(-) delete mode 100644 llvm/include/llvm/Transforms/Scalar/LoopReroll.h delete mode 100644 llvm/lib/Transforms/Scalar/LoopRerollPass.cpp delete mode 100644 llvm/test/Transforms/LoopReroll/basic.ll delete mode 100644 llvm/test/Transforms/LoopReroll/basic32iters.ll delete mode 100644 llvm/test/Transforms/LoopReroll/complex_reroll.ll delete mode 100644 llvm/test/Transforms/LoopReroll/external_use.ll delete mode 100644 llvm/test/Transforms/LoopReroll/extra_instr.ll delete mode 100644 llvm/test/Transforms/LoopReroll/indvar_with_ext.ll delete mode 100644 llvm/test/Transforms/LoopReroll/negative.ll delete mode 100644 llvm/test/Transforms/LoopReroll/nonconst_lb.ll delete mode 100644 llvm/test/Transforms/LoopReroll/ptrindvar.ll delete mode 100644 llvm/test/Transforms/LoopReroll/reduction.ll delete mode 100644 llvm/test/Transforms/LoopReroll/reroll_with_dbg.ll diff --git a/llvm/include/llvm/Transforms/Scalar/LoopReroll.h b/llvm/include/llvm/Transforms/Scalar/LoopReroll.h deleted file mode 100644 index 496e8df..0000000 --- a/llvm/include/llvm/Transforms/Scalar/LoopReroll.h +++ /dev/null @@ -1,25 +0,0 @@ -//===- LoopReroll.h - Loop rerolling pass ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_SCALAR_LOOPREROLL_H -#define LLVM_TRANSFORMS_SCALAR_LOOPREROLL_H - -#include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" - -namespace llvm { - -class LoopRerollPass : public PassInfoMixin { -public: - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, LPMUpdater &U); -}; - -} // end namespace llvm - -#endif // LLVM_TRANSFORMS_SCALAR_LOOPREROLL_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 7c306c4..007dc76 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -216,7 +216,6 @@ #include "llvm/Transforms/Scalar/LoopLoadElimination.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Scalar/LoopPredication.h" -#include "llvm/Transforms/Scalar/LoopReroll.h" #include "llvm/Transforms/Scalar/LoopRotation.h" #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" #include "llvm/Transforms/Scalar/LoopSink.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 4451180..6cb87fb 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -599,7 +599,6 @@ LOOP_PASS("loop-idiom", LoopIdiomRecognizePass()) LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass()) LOOP_PASS("loop-predication", LoopPredicationPass()) LOOP_PASS("loop-reduce", LoopStrengthReducePass()) -LOOP_PASS("loop-reroll", LoopRerollPass()) LOOP_PASS("loop-simplifycfg", LoopSimplifyCFGPass()) LOOP_PASS("loop-unroll-full", LoopFullUnrollPass()) LOOP_PASS("loop-versioning-licm", LoopVersioningLICMPass()) diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt index 2dd2703..5527efa 100644 --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -40,7 +40,6 @@ add_llvm_component_library(LLVMScalarOpts LoopLoadElimination.cpp LoopPassManager.cpp LoopPredication.cpp - LoopRerollPass.cpp LoopRotation.cpp LoopSimplifyCFG.cpp LoopStrengthReduce.cpp diff --git a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp deleted file mode 100644 index 7f62526..0000000 --- a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ /dev/null @@ -1,1679 +0,0 @@ -//===- LoopReroll.cpp - Loop rerolling pass -------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass implements a simple loop reroller. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/MapVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Use.h" -#include "llvm/IR/User.h" -#include "llvm/IR/Value.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar/LoopReroll.h" -#include "llvm/Transforms/Utils.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" -#include -#include -#include -#include -#include -#include - -using namespace llvm; - -#define DEBUG_TYPE "loop-reroll" - -STATISTIC(NumRerolledLoops, "Number of rerolled loops"); - -static cl::opt -NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400), - cl::Hidden, - cl::desc("The maximum number of failures to tolerate" - " during fuzzy matching. (default: 400)")); - -// This loop re-rolling transformation aims to transform loops like this: -// -// int foo(int a); -// void bar(int *x) { -// for (int i = 0; i < 500; i += 3) { -// foo(i); -// foo(i+1); -// foo(i+2); -// } -// } -// -// into a loop like this: -// -// void bar(int *x) { -// for (int i = 0; i < 500; ++i) -// foo(i); -// } -// -// It does this by looking for loops that, besides the latch code, are composed -// of isomorphic DAGs of instructions, with each DAG rooted at some increment -// to the induction variable, and where each DAG is isomorphic to the DAG -// rooted at the induction variable (excepting the sub-DAGs which root the -// other induction-variable increments). In other words, we're looking for loop -// bodies of the form: -// -// %iv = phi [ (preheader, ...), (body, %iv.next) ] -// f(%iv) -// %iv.1 = add %iv, 1 <-- a root increment -// f(%iv.1) -// %iv.2 = add %iv, 2 <-- a root increment -// f(%iv.2) -// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment -// f(%iv.scale_m_1) -// ... -// %iv.next = add %iv, scale -// %cmp = icmp(%iv, ...) -// br %cmp, header, exit -// -// where each f(i) is a set of instructions that, collectively, are a function -// only of i (and other loop-invariant values). 
-// -// As a special case, we can also reroll loops like this: -// -// int foo(int); -// void bar(int *x) { -// for (int i = 0; i < 500; ++i) { -// x[3*i] = foo(0); -// x[3*i+1] = foo(0); -// x[3*i+2] = foo(0); -// } -// } -// -// into this: -// -// void bar(int *x) { -// for (int i = 0; i < 1500; ++i) -// x[i] = foo(0); -// } -// -// in which case, we're looking for inputs like this: -// -// %iv = phi [ (preheader, ...), (body, %iv.next) ] -// %scaled.iv = mul %iv, scale -// f(%scaled.iv) -// %scaled.iv.1 = add %scaled.iv, 1 -// f(%scaled.iv.1) -// %scaled.iv.2 = add %scaled.iv, 2 -// f(%scaled.iv.2) -// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 -// f(%scaled.iv.scale_m_1) -// ... -// %iv.next = add %iv, 1 -// %cmp = icmp(%iv, ...) -// br %cmp, header, exit - -namespace { - - enum IterationLimits { - /// The maximum number of iterations that we'll try and reroll. - IL_MaxRerollIterations = 32, - /// The bitvector index used by loop induction variables and other - /// instructions that belong to all iterations. - IL_All, - IL_End - }; - - class LoopReroll { - public: - LoopReroll(AliasAnalysis *AA, LoopInfo *LI, ScalarEvolution *SE, - TargetLibraryInfo *TLI, DominatorTree *DT, bool PreserveLCSSA) - : AA(AA), LI(LI), SE(SE), TLI(TLI), DT(DT), - PreserveLCSSA(PreserveLCSSA) {} - bool runOnLoop(Loop *L); - - protected: - AliasAnalysis *AA; - LoopInfo *LI; - ScalarEvolution *SE; - TargetLibraryInfo *TLI; - DominatorTree *DT; - bool PreserveLCSSA; - - using SmallInstructionVector = SmallVector; - using SmallInstructionSet = SmallPtrSet; - using TinyInstructionVector = SmallVector; - - // Map between induction variable and its increment - DenseMap IVToIncMap; - - // For loop with multiple induction variables, remember the ones used only to - // control the loop. - TinyInstructionVector LoopControlIVs; - - // A chain of isomorphic instructions, identified by a single-use PHI - // representing a reduction. Only the last value may be used outside the - // loop. - struct SimpleLoopReduction { - SimpleLoopReduction(Instruction *P, Loop *L) : Instructions(1, P) { - assert(isa(P) && "First reduction instruction must be a PHI"); - add(L); - } - - bool valid() const { - return Valid; - } - - Instruction *getPHI() const { - assert(Valid && "Using invalid reduction"); - return Instructions.front(); - } - - Instruction *getReducedValue() const { - assert(Valid && "Using invalid reduction"); - return Instructions.back(); - } - - Instruction *get(size_t i) const { - assert(Valid && "Using invalid reduction"); - return Instructions[i+1]; - } - - Instruction *operator [] (size_t i) const { return get(i); } - - // The size, ignoring the initial PHI. - size_t size() const { - assert(Valid && "Using invalid reduction"); - return Instructions.size()-1; - } - - using iterator = SmallInstructionVector::iterator; - using const_iterator = SmallInstructionVector::const_iterator; - - iterator begin() { - assert(Valid && "Using invalid reduction"); - return std::next(Instructions.begin()); - } - - const_iterator begin() const { - assert(Valid && "Using invalid reduction"); - return std::next(Instructions.begin()); - } - - iterator end() { return Instructions.end(); } - const_iterator end() const { return Instructions.end(); } - - protected: - bool Valid = false; - SmallInstructionVector Instructions; - - void add(Loop *L); - }; - - // The set of all reductions, and state tracking of possible reductions - // during loop instruction processing. 
- struct ReductionTracker { - using SmallReductionVector = SmallVector; - - // Add a new possible reduction. - void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); } - - // Setup to track possible reductions corresponding to the provided - // rerolling scale. Only reductions with a number of non-PHI instructions - // that is divisible by the scale are considered. Three instructions sets - // are filled in: - // - A set of all possible instructions in eligible reductions. - // - A set of all PHIs in eligible reductions - // - A set of all reduced values (last instructions) in eligible - // reductions. - void restrictToScale(uint64_t Scale, - SmallInstructionSet &PossibleRedSet, - SmallInstructionSet &PossibleRedPHISet, - SmallInstructionSet &PossibleRedLastSet) { - PossibleRedIdx.clear(); - PossibleRedIter.clear(); - Reds.clear(); - - for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i) - if (PossibleReds[i].size() % Scale == 0) { - PossibleRedLastSet.insert(PossibleReds[i].getReducedValue()); - PossibleRedPHISet.insert(PossibleReds[i].getPHI()); - - PossibleRedSet.insert(PossibleReds[i].getPHI()); - PossibleRedIdx[PossibleReds[i].getPHI()] = i; - for (Instruction *J : PossibleReds[i]) { - PossibleRedSet.insert(J); - PossibleRedIdx[J] = i; - } - } - } - - // The functions below are used while processing the loop instructions. - - // Are the two instructions both from reductions, and furthermore, from - // the same reduction? - bool isPairInSame(Instruction *J1, Instruction *J2) { - DenseMap::iterator J1I = PossibleRedIdx.find(J1); - if (J1I != PossibleRedIdx.end()) { - DenseMap::iterator J2I = PossibleRedIdx.find(J2); - if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second) - return true; - } - - return false; - } - - // The two provided instructions, the first from the base iteration, and - // the second from iteration i, form a matched pair. If these are part of - // a reduction, record that fact. - void recordPair(Instruction *J1, Instruction *J2, unsigned i) { - if (PossibleRedIdx.count(J1)) { - assert(PossibleRedIdx.count(J2) && - "Recording reduction vs. non-reduction instruction?"); - - PossibleRedIter[J1] = 0; - PossibleRedIter[J2] = i; - - int Idx = PossibleRedIdx[J1]; - assert(Idx == PossibleRedIdx[J2] && - "Recording pair from different reductions?"); - Reds.insert(Idx); - } - } - - // The functions below can be called after we've finished processing all - // instructions in the loop, and we know which reductions were selected. - - bool validateSelected(); - void replaceSelected(); - - protected: - // The vector of all possible reductions (for any scale). - SmallReductionVector PossibleReds; - - DenseMap PossibleRedIdx; - DenseMap PossibleRedIter; - DenseSet Reds; - }; - - // A DAGRootSet models an induction variable being used in a rerollable - // loop. For example, - // - // x[i*3+0] = y1 - // x[i*3+1] = y2 - // x[i*3+2] = y3 - // - // Base instruction -> i*3 - // +---+----+ - // / | \ - // ST[y1] +1 +2 <-- Roots - // | | - // ST[y2] ST[y3] - // - // There may be multiple DAGRoots, for example: - // - // x[i*2+0] = ... (1) - // x[i*2+1] = ... (1) - // x[i*2+4] = ... (2) - // x[i*2+5] = ... (2) - // x[(i+1234)*2+5678] = ... (3) - // x[(i+1234)*2+5679] = ... (3) - // - // The loop will be rerolled by adding a new loop induction variable, - // one for the Base instruction in each DAGRootSet. - // - struct DAGRootSet { - Instruction *BaseInst; - SmallInstructionVector Roots; - - // The instructions between IV and BaseInst (but not including BaseInst). 
- SmallInstructionSet SubsumedInsts; - }; - - // The set of all DAG roots, and state tracking of all roots - // for a particular induction variable. - struct DAGRootTracker { - DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV, - ScalarEvolution *SE, AliasAnalysis *AA, - TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI, - bool PreserveLCSSA, - DenseMap &IncrMap, - TinyInstructionVector LoopCtrlIVs) - : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI), - PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap), - LoopControlIVs(LoopCtrlIVs) {} - - /// Stage 1: Find all the DAG roots for the induction variable. - bool findRoots(); - - /// Stage 2: Validate if the found roots are valid. - bool validate(ReductionTracker &Reductions); - - /// Stage 3: Assuming validate() returned true, perform the - /// replacement. - /// @param BackedgeTakenCount The backedge-taken count of L. - void replace(const SCEV *BackedgeTakenCount); - - protected: - using UsesTy = MapVector; - - void findRootsRecursive(Instruction *IVU, - SmallInstructionSet SubsumedInsts); - bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts); - bool collectPossibleRoots(Instruction *Base, - std::map &Roots); - bool validateRootSet(DAGRootSet &DRS); - - bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet); - void collectInLoopUserSet(const SmallInstructionVector &Roots, - const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet &Users); - void collectInLoopUserSet(Instruction *Root, - const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet &Users); - - UsesTy::iterator nextInstr(int Val, UsesTy &In, - const SmallInstructionSet &Exclude, - UsesTy::iterator *StartI=nullptr); - bool isBaseInst(Instruction *I); - bool isRootInst(Instruction *I); - bool instrDependsOn(Instruction *I, - UsesTy::iterator Start, - UsesTy::iterator End); - void replaceIV(DAGRootSet &DRS, const SCEV *Start, const SCEV *IncrExpr); - - LoopReroll *Parent; - - // Members of Parent, replicated here for brevity. - Loop *L; - ScalarEvolution *SE; - AliasAnalysis *AA; - TargetLibraryInfo *TLI; - DominatorTree *DT; - LoopInfo *LI; - bool PreserveLCSSA; - - // The loop induction variable. - Instruction *IV; - - // Loop step amount. - int64_t Inc; - - // Loop reroll count; if Inc == 1, this records the scaling applied - // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ; - // If Inc is not 1, Scale = Inc. - uint64_t Scale; - - // The roots themselves. - SmallVector RootSets; - - // All increment instructions for IV. - SmallInstructionVector LoopIncs; - - // Map of all instructions in the loop (in order) to the iterations - // they are used in (or specially, IL_All for instructions - // used in the loop increment mechanism). 
- UsesTy Uses; - - // Map between induction variable and its increment - DenseMap &IVToIncMap; - - TinyInstructionVector LoopControlIVs; - }; - - // Check if it is a compare-like instruction whose user is a branch - bool isCompareUsedByBranch(Instruction *I) { - auto *TI = I->getParent()->getTerminator(); - if (!isa(TI) || !isa(I)) - return false; - return I->hasOneUse() && TI->getOperand(0) == I; - }; - - bool isLoopControlIV(Loop *L, Instruction *IV); - void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); - void collectPossibleReductions(Loop *L, - ReductionTracker &Reductions); - bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, - const SCEV *BackedgeTakenCount, ReductionTracker &Reductions); - }; - -} // end anonymous namespace - -// Returns true if the provided instruction is used outside the given loop. -// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in -// non-loop blocks to be outside the loop. -static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { - for (User *U : I->users()) { - if (!L->contains(cast(U))) - return true; - } - return false; -} - -// Check if an IV is only used to control the loop. There are two cases: -// 1. It only has one use which is loop increment, and the increment is only -// used by comparison and the PHI (could has sext with nsw in between), and the -// comparison is only used by branch. -// 2. It is used by loop increment and the comparison, the loop increment is -// only used by the PHI, and the comparison is used only by the branch. -bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) { - unsigned IVUses = IV->getNumUses(); - if (IVUses != 2 && IVUses != 1) - return false; - - for (auto *User : IV->users()) { - int32_t IncOrCmpUses = User->getNumUses(); - bool IsCompInst = isCompareUsedByBranch(cast(User)); - - // User can only have one or two uses. - if (IncOrCmpUses != 2 && IncOrCmpUses != 1) - return false; - - // Case 1 - if (IVUses == 1) { - // The only user must be the loop increment. - // The loop increment must have two uses. - if (IsCompInst || IncOrCmpUses != 2) - return false; - } - - // Case 2 - if (IVUses == 2 && IncOrCmpUses != 1) - return false; - - // The users of the IV must be a binary operation or a comparison - if (auto *BO = dyn_cast(User)) { - if (BO->getOpcode() == Instruction::Add) { - // Loop Increment - // User of Loop Increment should be either PHI or CMP - for (auto *UU : User->users()) { - if (PHINode *PN = dyn_cast(UU)) { - if (PN != IV) - return false; - } - // Must be a CMP or an ext (of a value with nsw) then CMP - else { - auto *UUser = cast(UU); - // Skip SExt if we are extending an nsw value - // TODO: Allow ZExt too - if (BO->hasNoSignedWrap() && UUser->hasOneUse() && - isa(UUser)) - UUser = cast(*(UUser->user_begin())); - if (!isCompareUsedByBranch(UUser)) - return false; - } - } - } else - return false; - // Compare : can only have one use, and must be branch - } else if (!IsCompInst) - return false; - } - return true; -} - -// Collect the list of loop induction variables with respect to which it might -// be possible to reroll the loop. 
-void LoopReroll::collectPossibleIVs(Loop *L, - SmallInstructionVector &PossibleIVs) { - for (Instruction &IV : L->getHeader()->phis()) { - if (!IV.getType()->isIntegerTy() && !IV.getType()->isPointerTy()) - continue; - - if (const SCEVAddRecExpr *PHISCEV = - dyn_cast(SE->getSCEV(&IV))) { - if (PHISCEV->getLoop() != L) - continue; - if (!PHISCEV->isAffine()) - continue; - const auto *IncSCEV = dyn_cast(PHISCEV->getStepRecurrence(*SE)); - if (IncSCEV) { - IVToIncMap[&IV] = IncSCEV->getValue()->getSExtValue(); - LLVM_DEBUG(dbgs() << "LRR: Possible IV: " << IV << " = " << *PHISCEV - << "\n"); - - if (isLoopControlIV(L, &IV)) { - LoopControlIVs.push_back(&IV); - LLVM_DEBUG(dbgs() << "LRR: Loop control only IV: " << IV - << " = " << *PHISCEV << "\n"); - } else - PossibleIVs.push_back(&IV); - } - } - } -} - -// Add the remainder of the reduction-variable chain to the instruction vector -// (the initial PHINode has already been added). If successful, the object is -// marked as valid. -void LoopReroll::SimpleLoopReduction::add(Loop *L) { - assert(!Valid && "Cannot add to an already-valid chain"); - - // The reduction variable must be a chain of single-use instructions - // (including the PHI), except for the last value (which is used by the PHI - // and also outside the loop). - Instruction *C = Instructions.front(); - if (C->user_empty()) - return; - - do { - C = cast(*C->user_begin()); - if (C->hasOneUse()) { - if (!C->isBinaryOp()) - return; - - if (!(isa(Instructions.back()) || - C->isSameOperationAs(Instructions.back()))) - return; - - Instructions.push_back(C); - } - } while (C->hasOneUse()); - - if (Instructions.size() < 2 || - !C->isSameOperationAs(Instructions.back()) || - C->use_empty()) - return; - - // C is now the (potential) last instruction in the reduction chain. - for (User *U : C->users()) { - // The only in-loop user can be the initial PHI. - if (L->contains(cast(U))) - if (cast(U) != Instructions.front()) - return; - } - - Instructions.push_back(C); - Valid = true; -} - -// Collect the vector of possible reduction variables. -void LoopReroll::collectPossibleReductions(Loop *L, - ReductionTracker &Reductions) { - BasicBlock *Header = L->getHeader(); - for (BasicBlock::iterator I = Header->begin(), - IE = Header->getFirstInsertionPt(); I != IE; ++I) { - if (!isa(I)) - continue; - if (!I->getType()->isSingleValueType()) - continue; - - SimpleLoopReduction SLR(&*I, L); - if (!SLR.valid()) - continue; - - LLVM_DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " - << SLR.size() << " chained instructions)\n"); - Reductions.addSLR(SLR); - } -} - -// Collect the set of all users of the provided root instruction. This set of -// users contains not only the direct users of the root instruction, but also -// all users of those users, and so on. There are two exceptions: -// -// 1. Instructions in the set of excluded instructions are never added to the -// use set (even if they are users). This is used, for example, to exclude -// including root increments in the use set of the primary IV. -// -// 2. Instructions in the set of final instructions are added to the use set -// if they are users, but their users are not added. This is used, for -// example, to prevent a reduction update from forcing all later reduction -// updates into the use set. 
-void LoopReroll::DAGRootTracker::collectInLoopUserSet( - Instruction *Root, const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet &Users) { - SmallInstructionVector Queue(1, Root); - while (!Queue.empty()) { - Instruction *I = Queue.pop_back_val(); - if (!Users.insert(I).second) - continue; - - if (!Final.count(I)) - for (Use &U : I->uses()) { - Instruction *User = cast(U.getUser()); - if (PHINode *PN = dyn_cast(User)) { - // Ignore "wrap-around" uses to PHIs of this loop's header. - if (PN->getIncomingBlock(U) == L->getHeader()) - continue; - } - - if (L->contains(User) && !Exclude.count(User)) { - Queue.push_back(User); - } - } - - // We also want to collect single-user "feeder" values. - for (Use &U : I->operands()) { - if (Instruction *Op = dyn_cast(U)) - if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) && - !Final.count(Op)) - Queue.push_back(Op); - } - } -} - -// Collect all of the users of all of the provided root instructions (combined -// into a single set). -void LoopReroll::DAGRootTracker::collectInLoopUserSet( - const SmallInstructionVector &Roots, - const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet &Users) { - for (Instruction *Root : Roots) - collectInLoopUserSet(Root, Exclude, Final, Users); -} - -static bool isUnorderedLoadStore(Instruction *I) { - if (LoadInst *LI = dyn_cast(I)) - return LI->isUnordered(); - if (StoreInst *SI = dyn_cast(I)) - return SI->isUnordered(); - if (MemIntrinsic *MI = dyn_cast(I)) - return !MI->isVolatile(); - return false; -} - -/// Return true if IVU is a "simple" arithmetic operation. -/// This is used for narrowing the search space for DAGRoots; only arithmetic -/// and GEPs can be part of a DAGRoot. -static bool isSimpleArithmeticOp(User *IVU) { - if (Instruction *I = dyn_cast(IVU)) { - switch (I->getOpcode()) { - default: return false; - case Instruction::Add: - case Instruction::Sub: - case Instruction::Mul: - case Instruction::Shl: - case Instruction::AShr: - case Instruction::LShr: - case Instruction::GetElementPtr: - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - return true; - } - } - return false; -} - -static bool isLoopIncrement(User *U, Instruction *IV) { - BinaryOperator *BO = dyn_cast(U); - - if ((BO && BO->getOpcode() != Instruction::Add) || - (!BO && !isa(U))) - return false; - - for (auto *UU : U->users()) { - PHINode *PN = dyn_cast(UU); - if (PN && PN == IV) - return true; - } - return false; -} - -bool LoopReroll::DAGRootTracker:: -collectPossibleRoots(Instruction *Base, std::map &Roots) { - SmallInstructionVector BaseUsers; - - for (auto *I : Base->users()) { - ConstantInt *CI = nullptr; - - if (isLoopIncrement(I, IV)) { - LoopIncs.push_back(cast(I)); - continue; - } - - // The root nodes must be either GEPs, ORs or ADDs. - if (auto *BO = dyn_cast(I)) { - if (BO->getOpcode() == Instruction::Add || - BO->getOpcode() == Instruction::Or) - CI = dyn_cast(BO->getOperand(1)); - } else if (auto *GEP = dyn_cast(I)) { - Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1); - CI = dyn_cast(LastOperand); - } - - if (!CI) { - if (Instruction *II = dyn_cast(I)) { - BaseUsers.push_back(II); - continue; - } else { - LLVM_DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I - << "\n"); - return false; - } - } - - int64_t V = std::abs(CI->getValue().getSExtValue()); - if (Roots.find(V) != Roots.end()) - // No duplicates, please. - return false; - - Roots[V] = cast(I); - } - - // Make sure we have at least two roots. 
-  if (Roots.empty() || (Roots.size() == 1 && BaseUsers.empty()))
-    return false;
-
-  // If we found non-loop-inc, non-root users of Base, assume they are
-  // for the zeroth root index. This is because "add %a, 0" gets optimized
-  // away.
-  if (BaseUsers.size()) {
-    if (Roots.find(0) != Roots.end()) {
-      LLVM_DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
-      return false;
-    }
-    Roots[0] = Base;
-  }
-
-  // Calculate the number of users of the base, or lowest indexed, iteration.
-  unsigned NumBaseUses = BaseUsers.size();
-  if (NumBaseUses == 0)
-    NumBaseUses = Roots.begin()->second->getNumUses();
-
-  // Check that every node has the same number of users.
-  for (auto &KV : Roots) {
-    if (KV.first == 0)
-      continue;
-    if (!KV.second->hasNUses(NumBaseUses)) {
-      LLVM_DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
-                        << "#Base=" << NumBaseUses
-                        << ", #Root=" << KV.second->getNumUses() << "\n");
-      return false;
-    }
-  }
-
-  return true;
-}
-
-void LoopReroll::DAGRootTracker::
-findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
-  // Does the user look like it could be part of a root set?
-  // All its users must be simple arithmetic ops.
-  if (I->hasNUsesOrMore(IL_MaxRerollIterations + 1))
-    return;
-
-  if (I != IV && findRootsBase(I, SubsumedInsts))
-    return;
-
-  SubsumedInsts.insert(I);
-
-  for (User *V : I->users()) {
-    Instruction *I = cast<Instruction>(V);
-    if (is_contained(LoopIncs, I))
-      continue;
-
-    if (!isSimpleArithmeticOp(I))
-      continue;
-
-    // The recursive call makes a copy of SubsumedInsts.
-    findRootsRecursive(I, SubsumedInsts);
-  }
-}
-
-bool LoopReroll::DAGRootTracker::validateRootSet(DAGRootSet &DRS) {
-  if (DRS.Roots.empty())
-    return false;
-
-  // If the value of the base instruction is used outside the loop, we cannot
-  // reroll the loop. Checking the other root instructions is unnecessary
-  // because they don't match any base instructions if their values are used
-  // outside.
-  if (hasUsesOutsideLoop(DRS.BaseInst, L))
-    return false;
-
-  // Consider a DAGRootSet with N-1 roots (so N different values including
-  // BaseInst).
-  // Define d = Roots[0] - BaseInst, which should be the same as
-  // Roots[I] - Roots[I-1] for all I in [1..N).
-  // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
-  // loop iteration J.
-  //
-  // Now, for the loop iterations to be consecutive:
-  //   D = d * N
-  const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
-  if (!ADR)
-    return false;
-
-  // Check that the first root is evenly spaced.
-  unsigned N = DRS.Roots.size() + 1;
-  const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), ADR);
-  if (isa<SCEVCouldNotCompute>(StepSCEV) || StepSCEV->getType()->isPointerTy())
-    return false;
-  const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
-  if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV))
-    return false;
-
-  // Check that the remaining roots are evenly spaced.
-  for (unsigned i = 1; i < N - 1; ++i) {
-    const SCEV *NewStepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[i]),
-                                               SE->getSCEV(DRS.Roots[i-1]));
-    if (NewStepSCEV != StepSCEV)
-      return false;
-  }
-
-  return true;
-}
-
-bool LoopReroll::DAGRootTracker::
-findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
-  // The base of a RootSet must be an AddRec, so it can be erased.
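To see the validateRootSet arithmetic on a concrete loop, take the first test removed later in this patch (@bar, which calls foo(i), foo(i+1), and foo(i+2) with i stepping by 3): the base is the add-recurrence {0,+,3}, the two roots are i+1 and i+2, so N = 2 + 1 = 3 and d = (i+1) - i = 1, and the base advances D = 3 per iteration, satisfying D = d * N = 1 * 3.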
- const auto *IVU_ADR = dyn_cast(SE->getSCEV(IVU)); - if (!IVU_ADR || IVU_ADR->getLoop() != L) - return false; - - std::map V; - if (!collectPossibleRoots(IVU, V)) - return false; - - // If we didn't get a root for index zero, then IVU must be - // subsumed. - if (V.find(0) == V.end()) - SubsumedInsts.insert(IVU); - - // Partition the vector into monotonically increasing indexes. - DAGRootSet DRS; - DRS.BaseInst = nullptr; - - SmallVector PotentialRootSets; - - for (auto &KV : V) { - if (!DRS.BaseInst) { - DRS.BaseInst = KV.second; - DRS.SubsumedInsts = SubsumedInsts; - } else if (DRS.Roots.empty()) { - DRS.Roots.push_back(KV.second); - } else if (V.find(KV.first - 1) != V.end()) { - DRS.Roots.push_back(KV.second); - } else { - // Linear sequence terminated. - if (!validateRootSet(DRS)) - return false; - - // Construct a new DAGRootSet with the next sequence. - PotentialRootSets.push_back(DRS); - DRS.BaseInst = KV.second; - DRS.Roots.clear(); - } - } - - if (!validateRootSet(DRS)) - return false; - - PotentialRootSets.push_back(DRS); - - RootSets.append(PotentialRootSets.begin(), PotentialRootSets.end()); - - return true; -} - -bool LoopReroll::DAGRootTracker::findRoots() { - Inc = IVToIncMap[IV]; - - assert(RootSets.empty() && "Unclean state!"); - if (std::abs(Inc) == 1) { - for (auto *IVU : IV->users()) { - if (isLoopIncrement(IVU, IV)) - LoopIncs.push_back(cast(IVU)); - } - findRootsRecursive(IV, SmallInstructionSet()); - LoopIncs.push_back(IV); - } else { - if (!findRootsBase(IV, SmallInstructionSet())) - return false; - } - - // Ensure all sets have the same size. - if (RootSets.empty()) { - LLVM_DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n"); - return false; - } - for (auto &V : RootSets) { - if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) { - LLVM_DEBUG( - dbgs() - << "LRR: Aborting because not all root sets have the same size\n"); - return false; - } - } - - Scale = RootSets[0].Roots.size() + 1; - - if (Scale > IL_MaxRerollIterations) { - LLVM_DEBUG(dbgs() << "LRR: Aborting - too many iterations found. " - << "#Found=" << Scale - << ", #Max=" << IL_MaxRerollIterations << "\n"); - return false; - } - - LLVM_DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale - << "\n"); - - return true; -} - -bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) { - // Populate the MapVector with all instructions in the block, in order first, - // so we can iterate over the contents later in perfect order. - for (auto &I : *L->getHeader()) { - Uses[&I].resize(IL_End); - } - - SmallInstructionSet Exclude; - for (auto &DRS : RootSets) { - Exclude.insert(DRS.Roots.begin(), DRS.Roots.end()); - Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end()); - Exclude.insert(DRS.BaseInst); - } - Exclude.insert(LoopIncs.begin(), LoopIncs.end()); - - for (auto &DRS : RootSets) { - DenseSet VBase; - collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase); - for (auto *I : VBase) { - Uses[I].set(0); - } - - unsigned Idx = 1; - for (auto *Root : DRS.Roots) { - DenseSet V; - collectInLoopUserSet(Root, Exclude, PossibleRedSet, V); - - // While we're here, check the use sets are the same size. - if (V.size() != VBase.size()) { - LLVM_DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n"); - return false; - } - - for (auto *I : V) { - Uses[I].set(Idx); - } - ++Idx; - } - - // Make sure our subsumed instructions are remembered too. 
- for (auto *I : DRS.SubsumedInsts) { - Uses[I].set(IL_All); - } - } - - // Make sure the loop increments are also accounted for. - - Exclude.clear(); - for (auto &DRS : RootSets) { - Exclude.insert(DRS.Roots.begin(), DRS.Roots.end()); - Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end()); - Exclude.insert(DRS.BaseInst); - } - - DenseSet V; - collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V); - for (auto *I : V) { - if (I->mayHaveSideEffects()) { - LLVM_DEBUG(dbgs() << "LRR: Aborting - " - << "An instruction which does not belong to any root " - << "sets must not have side effects: " << *I); - return false; - } - Uses[I].set(IL_All); - } - - return true; -} - -/// Get the next instruction in "In" that is a member of set Val. -/// Start searching from StartI, and do not return anything in Exclude. -/// If StartI is not given, start from In.begin(). -LoopReroll::DAGRootTracker::UsesTy::iterator -LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In, - const SmallInstructionSet &Exclude, - UsesTy::iterator *StartI) { - UsesTy::iterator I = StartI ? *StartI : In.begin(); - while (I != In.end() && (I->second.test(Val) == 0 || - Exclude.contains(I->first))) - ++I; - return I; -} - -bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) { - for (auto &DRS : RootSets) { - if (DRS.BaseInst == I) - return true; - } - return false; -} - -bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) { - for (auto &DRS : RootSets) { - if (is_contained(DRS.Roots, I)) - return true; - } - return false; -} - -/// Return true if instruction I depends on any instruction between -/// Start and End. -bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I, - UsesTy::iterator Start, - UsesTy::iterator End) { - for (auto *U : I->users()) { - for (auto It = Start; It != End; ++It) - if (U == It->first) - return true; - } - return false; -} - -static bool isIgnorableInst(const Instruction *I) { - if (isa(I)) - return true; - const IntrinsicInst* II = dyn_cast(I); - if (!II) - return false; - switch (II->getIntrinsicID()) { - default: - return false; - case Intrinsic::annotation: - case Intrinsic::ptr_annotation: - case Intrinsic::var_annotation: - // TODO: the following intrinsics may also be allowed: - // lifetime_start, lifetime_end, invariant_start, invariant_end - return true; - } - return false; -} - -bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { - // We now need to check for equivalence of the use graph of each root with - // that of the primary induction variable (excluding the roots). Our goal - // here is not to solve the full graph isomorphism problem, but rather to - // catch common cases without a lot of work. As a result, we will assume - // that the relative order of the instructions in each unrolled iteration - // is the same (although we will not make an assumption about how the - // different iterations are intermixed). Note that while the order must be - // the same, the instructions may not be in the same basic block. - - // An array of just the possible reductions for this scale factor. When we - // collect the set of all users of some root instructions, these reduction - // instructions are treated as 'final' (their uses are not considered). - // This is important because we don't want the root use set to search down - // the reduction chain. 
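As an illustration of the matching strategy that follows (hypothetical IR, not from the source): with scale 3 and a body made of three isomorphic groups of the form %aK = load; %bK = add %aK, 1; store %bK for K = 0, 1, 2, iteration 1 is validated by pairing %a0 with %a1, %b0 with %b1, and the corresponding stores in order; iteration 2 then pairs the base group with %a2 and %b2 the same way.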
- SmallInstructionSet PossibleRedSet; - SmallInstructionSet PossibleRedLastSet; - SmallInstructionSet PossibleRedPHISet; - Reductions.restrictToScale(Scale, PossibleRedSet, - PossibleRedPHISet, PossibleRedLastSet); - - // Populate "Uses" with where each instruction is used. - if (!collectUsedInstructions(PossibleRedSet)) - return false; - - // Make sure we mark the reduction PHIs as used in all iterations. - for (auto *I : PossibleRedPHISet) { - Uses[I].set(IL_All); - } - - // Make sure we mark loop-control-only PHIs as used in all iterations. See - // comment above LoopReroll::isLoopControlIV for more information. - BasicBlock *Header = L->getHeader(); - for (Instruction *LoopControlIV : LoopControlIVs) { - for (auto *U : LoopControlIV->users()) { - Instruction *IVUser = dyn_cast(U); - // IVUser could be loop increment or compare - Uses[IVUser].set(IL_All); - for (auto *UU : IVUser->users()) { - Instruction *UUser = dyn_cast(UU); - // UUser could be compare, PHI or branch - Uses[UUser].set(IL_All); - // Skip SExt - if (isa(UUser)) { - UUser = dyn_cast(*(UUser->user_begin())); - Uses[UUser].set(IL_All); - } - // Is UUser a compare instruction? - if (UU->hasOneUse()) { - Instruction *BI = dyn_cast(*UUser->user_begin()); - if (BI == cast(Header->getTerminator())) - Uses[BI].set(IL_All); - } - } - } - } - - // Make sure all instructions in the loop are in one and only one - // set. - for (auto &KV : Uses) { - if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) { - LLVM_DEBUG( - dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: " - << *KV.first << " (#uses=" << KV.second.count() << ")\n"); - return false; - } - } - - LLVM_DEBUG(for (auto &KV - : Uses) { - dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n"; - }); - - BatchAAResults BatchAA(*AA); - for (unsigned Iter = 1; Iter < Scale; ++Iter) { - // In addition to regular aliasing information, we need to look for - // instructions from later (future) iterations that have side effects - // preventing us from reordering them past other instructions with side - // effects. - bool FutureSideEffects = false; - AliasSetTracker AST(BatchAA); - // The map between instructions in f(%iv.(i+1)) and f(%iv). - DenseMap BaseMap; - - // Compare iteration Iter to the base. - SmallInstructionSet Visited; - auto BaseIt = nextInstr(0, Uses, Visited); - auto RootIt = nextInstr(Iter, Uses, Visited); - auto LastRootIt = Uses.begin(); - - while (BaseIt != Uses.end() && RootIt != Uses.end()) { - Instruction *BaseInst = BaseIt->first; - Instruction *RootInst = RootIt->first; - - // Skip over the IV or root instructions; only match their users. - bool Continue = false; - if (isBaseInst(BaseInst)) { - Visited.insert(BaseInst); - BaseIt = nextInstr(0, Uses, Visited); - Continue = true; - } - if (isRootInst(RootInst)) { - LastRootIt = RootIt; - Visited.insert(RootInst); - RootIt = nextInstr(Iter, Uses, Visited); - Continue = true; - } - if (Continue) continue; - - if (!BaseInst->isSameOperationAs(RootInst)) { - // Last chance saloon. We don't try and solve the full isomorphism - // problem, but try and at least catch the case where two instructions - // *of different types* are round the wrong way. We won't be able to - // efficiently tell, given two ADD instructions, which way around we - // should match them, but given an ADD and a SUB, we can at least infer - // which one is which. - // - // This should allow us to deal with a greater subset of the isomorphism - // problem. 
It does however change a linear algorithm into a quadratic
-        // one, so limit the number of probes we do.
-        auto TryIt = RootIt;
-        unsigned N = NumToleratedFailedMatches;
-        while (TryIt != Uses.end() &&
-               !BaseInst->isSameOperationAs(TryIt->first) &&
-               N--) {
-          ++TryIt;
-          TryIt = nextInstr(Iter, Uses, Visited, &TryIt);
-        }
-
-        if (TryIt == Uses.end() || TryIt == RootIt ||
-            instrDependsOn(TryIt->first, RootIt, TryIt)) {
-          LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
-                            << *BaseInst << " vs. " << *RootInst << "\n");
-          return false;
-        }
-
-        RootIt = TryIt;
-        RootInst = TryIt->first;
-      }
-
-      // All instructions between the last root and this root
-      // may belong to some other iteration. If they belong to a
-      // future iteration, then they're dangerous to alias with.
-      //
-      // Note that because we allow a limited amount of flexibility in the
-      // order that we visit nodes, LastRootIt might be *before* RootIt, in
-      // which case we've already checked this set of instructions so we
-      // shouldn't do anything.
-      for (; LastRootIt < RootIt; ++LastRootIt) {
-        Instruction *I = LastRootIt->first;
-        if (LastRootIt->second.find_first() < (int)Iter)
-          continue;
-        if (I->mayWriteToMemory())
-          AST.add(I);
-        // Note: This is specifically guarded by a check on isa<PHINode>,
-        // which, while a valid (somewhat arbitrary) micro-optimization, is
-        // needed because otherwise isSafeToSpeculativelyExecute returns
-        // false on PHI nodes.
-        if (!isa<PHINode>(I) && !isUnorderedLoadStore(I) &&
-            !isSafeToSpeculativelyExecute(I))
-          // Intervening instructions cause side effects.
-          FutureSideEffects = true;
-      }
-
-      // Make sure that this instruction, which is in the use set of this
-      // root instruction, does not also belong to the base set or the set of
-      // some other root instruction.
-      if (RootIt->second.count() > 1) {
-        LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
-                          << " vs. " << *RootInst << " (prev. case overlap)\n");
-        return false;
-      }
-
-      // Make sure that we don't alias with any instruction in the alias set
-      // tracker. If we do, then we depend on a future iteration, and we
-      // can't reroll.
-      if (RootInst->mayReadFromMemory()) {
-        for (auto &K : AST) {
-          if (isModOrRefSet(K.aliasesUnknownInst(RootInst, BatchAA))) {
-            LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
-                              << *BaseInst << " vs. " << *RootInst
-                              << " (depends on future store)\n");
-            return false;
-          }
-        }
-      }
-
-      // If we've passed an instruction from a future iteration that may have
-      // side effects, and this instruction might also, then we can't reorder
-      // them, and this matching fails. As an exception, we allow the alias
-      // set tracker to handle regular (unordered) load/store dependencies.
-      if (FutureSideEffects && ((!isUnorderedLoadStore(BaseInst) &&
-                                 !isSafeToSpeculativelyExecute(BaseInst)) ||
-                                (!isUnorderedLoadStore(RootInst) &&
-                                 !isSafeToSpeculativelyExecute(RootInst)))) {
-        LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
-                          << " vs. " << *RootInst
-                          << " (side effects prevent reordering)\n");
-        return false;
-      }
-
-      // For instructions that are part of a reduction, if the operation is
-      // associative, then don't bother matching the operands (because we
-      // already know that the instructions are isomorphic, and the order
-      // within the iteration does not matter). For non-associative reductions,
-      // we do need to match the operands, because we need to reject
-      // out-of-order instructions within an iteration!
-      // For example (assume floating-point addition), we need to reject this:
-      //   x += a[i];   x += b[i];
-      //   x += a[i+1]; x += b[i+1];
-      //   x += b[i+2]; x += a[i+2];
-      bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
-
-      if (!(InReduction && BaseInst->isAssociative())) {
-        bool Swapped = false, SomeOpMatched = false;
-        for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
-          Value *Op2 = RootInst->getOperand(j);
-
-          // If this is part of a reduction (and the operation is not
-          // associative), then we match all operands, but not those that are
-          // part of the reduction.
-          if (InReduction)
-            if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
-              if (Reductions.isPairInSame(RootInst, Op2I))
-                continue;
-
-          DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
-          if (BMI != BaseMap.end()) {
-            Op2 = BMI->second;
-          } else {
-            for (auto &DRS : RootSets) {
-              if (DRS.Roots[Iter-1] == (Instruction *)Op2) {
-                Op2 = DRS.BaseInst;
-                break;
-              }
-            }
-          }
-
-          if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
-            // If we've not already decided to swap the matched operands, and
-            // we've not already matched our first operand (note that we could
-            // have skipped matching the first operand because it is part of a
-            // reduction above), and the instruction is commutative, then try
-            // the swapped match.
-            if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
-                BaseInst->getOperand(!j) == Op2) {
-              Swapped = true;
-            } else {
-              LLVM_DEBUG(dbgs()
-                         << "LRR: iteration root match failed at " << *BaseInst
-                         << " vs. " << *RootInst << " (operand " << j << ")\n");
-              return false;
-            }
-          }
-
-          SomeOpMatched = true;
-        }
-      }
-
-      if ((!PossibleRedLastSet.count(BaseInst) &&
-           hasUsesOutsideLoop(BaseInst, L)) ||
-          (!PossibleRedLastSet.count(RootInst) &&
-           hasUsesOutsideLoop(RootInst, L))) {
-        LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
-                          << " vs. " << *RootInst << " (uses outside loop)\n");
-        return false;
-      }
-
-      Reductions.recordPair(BaseInst, RootInst, Iter);
-      BaseMap.insert(std::make_pair(RootInst, BaseInst));
-
-      LastRootIt = RootIt;
-      Visited.insert(BaseInst);
-      Visited.insert(RootInst);
-      BaseIt = nextInstr(0, Uses, Visited);
-      RootIt = nextInstr(Iter, Uses, Visited);
-    }
-    assert(BaseIt == Uses.end() && RootIt == Uses.end() &&
-           "Mismatched set sizes!");
-  }
-
-  LLVM_DEBUG(dbgs() << "LRR: Matched all iteration increments for " << *IV
-                    << "\n");
-
-  return true;
-}
-
-void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) {
-  BasicBlock *Header = L->getHeader();
-
-  // Compute the start and increment for each BaseInst before we start erasing
-  // instructions.
-  SmallVector<const SCEV *, 8> StartExprs;
-  SmallVector<const SCEV *, 8> IncrExprs;
-  for (auto &DRS : RootSets) {
-    const SCEVAddRecExpr *IVSCEV =
-        cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
-    StartExprs.push_back(IVSCEV->getStart());
-    IncrExprs.push_back(SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), IVSCEV));
-  }
-
-  // Remove instructions associated with non-base iterations.
-  for (Instruction &Inst : llvm::make_early_inc_range(llvm::reverse(*Header))) {
-    unsigned I = Uses[&Inst].find_first();
-    if (I > 0 && I < IL_All) {
-      LLVM_DEBUG(dbgs() << "LRR: removing: " << Inst << "\n");
-      Inst.eraseFromParent();
-    }
-  }
-
-  // Rewrite each BaseInst using SCEV.
-  for (size_t i = 0, e = RootSets.size(); i != e; ++i)
-    // Insert the new induction variable.
-    replaceIV(RootSets[i], StartExprs[i], IncrExprs[i]);
-
-  { // Limit the lifetime of SCEVExpander.
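The arithmetic in the expander block below can be checked against the @bar test removed later in this patch: the source loop runs i = 0, 3, ..., 498, so BackedgeTakenCount is 166, and with Scale = 3 the new exit bound becomes (166 + 1) * 3 - 1 = 500, exactly the icmp eq i32 %indvar, 500 that the rerolled CHECK lines expect.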
- BranchInst *BI = cast(Header->getTerminator()); - const DataLayout &DL = Header->getModule()->getDataLayout(); - SCEVExpander Expander(*SE, DL, "reroll"); - auto Zero = SE->getZero(BackedgeTakenCount->getType()); - auto One = SE->getOne(BackedgeTakenCount->getType()); - auto NewIVSCEV = SE->getAddRecExpr(Zero, One, L, SCEV::FlagAnyWrap); - Value *NewIV = - Expander.expandCodeFor(NewIVSCEV, BackedgeTakenCount->getType(), - Header->getFirstNonPHIOrDbg()); - // FIXME: This arithmetic can overflow. - auto TripCount = SE->getAddExpr(BackedgeTakenCount, One); - auto ScaledTripCount = SE->getMulExpr( - TripCount, SE->getConstant(BackedgeTakenCount->getType(), Scale)); - auto ScaledBECount = SE->getMinusSCEV(ScaledTripCount, One); - Value *TakenCount = - Expander.expandCodeFor(ScaledBECount, BackedgeTakenCount->getType(), - Header->getFirstNonPHIOrDbg()); - Value *Cond = - new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, TakenCount, "exitcond"); - BI->setCondition(Cond); - - if (BI->getSuccessor(1) != Header) - BI->swapSuccessors(); - } - - SimplifyInstructionsInBlock(Header, TLI); - DeleteDeadPHIs(Header, TLI); -} - -void LoopReroll::DAGRootTracker::replaceIV(DAGRootSet &DRS, - const SCEV *Start, - const SCEV *IncrExpr) { - BasicBlock *Header = L->getHeader(); - Instruction *Inst = DRS.BaseInst; - - const SCEV *NewIVSCEV = - SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap); - - { // Limit the lifetime of SCEVExpander. - const DataLayout &DL = Header->getModule()->getDataLayout(); - SCEVExpander Expander(*SE, DL, "reroll"); - Value *NewIV = Expander.expandCodeFor(NewIVSCEV, Inst->getType(), - Header->getFirstNonPHIOrDbg()); - - for (auto &KV : Uses) - if (KV.second.find_first() == 0) - KV.first->replaceUsesOfWith(Inst, NewIV); - } -} - -// Validate the selected reductions. All iterations must have an isomorphic -// part of the reduction chain and, for non-associative reductions, the chain -// entries must appear in order. -bool LoopReroll::ReductionTracker::validateSelected() { - // For a non-associative reduction, the chain entries must appear in order. - for (int i : Reds) { - int PrevIter = 0, BaseCount = 0, Count = 0; - for (Instruction *J : PossibleReds[i]) { - // Note that all instructions in the chain must have been found because - // all instructions in the function must have been assigned to some - // iteration. - int Iter = PossibleRedIter[J]; - if (Iter != PrevIter && Iter != PrevIter + 1 && - !PossibleReds[i].getReducedValue()->isAssociative()) { - LLVM_DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " - << J << "\n"); - return false; - } - - if (Iter != PrevIter) { - if (Count != BaseCount) { - LLVM_DEBUG(dbgs() - << "LRR: Iteration " << PrevIter << " reduction use count " - << Count << " is not equal to the base use count " - << BaseCount << "\n"); - return false; - } - - Count = 0; - } - - ++Count; - if (Iter == 0) - ++BaseCount; - - PrevIter = Iter; - } - } - - return true; -} - -// For all selected reductions, remove all parts except those in the first -// iteration (and the PHI). Replace outside uses of the reduced value with uses -// of the first-iteration reduced value (in other words, reroll the selected -// reductions). -void LoopReroll::ReductionTracker::replaceSelected() { - // Fixup reductions to refer to the last instruction associated with the - // first iteration (not the last). 
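For instance (a hypothetical scale-3 reduction): with x += a[i]; x += a[i+1]; x += a[i+2], the value live out of the loop was the third add in the chain; after rerolling, only the first-iteration add survives, so users outside the loop must be redirected to it, which is what the loop below computes.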
-  for (int i : Reds) {
-    int j = 0;
-    for (int e = PossibleReds[i].size(); j != e; ++j)
-      if (PossibleRedIter[PossibleReds[i][j]] != 0) {
-        --j;
-        break;
-      }
-
-    // Replace users with the new end-of-chain value.
-    SmallInstructionVector Users;
-    for (User *U : PossibleReds[i].getReducedValue()->users()) {
-      Users.push_back(cast<Instruction>(U));
-    }
-
-    for (Instruction *User : Users)
-      User->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
-                              PossibleReds[i][j]);
-  }
-}
-
-// Reroll the provided loop with respect to the provided induction variable.
-// Generally, we're looking for a loop like this:
-//
-// %iv = phi [ (preheader, ...), (body, %iv.next) ]
-// f(%iv)
-// %iv.1 = add %iv, 1                <-- a root increment
-// f(%iv.1)
-// %iv.2 = add %iv, 2                <-- a root increment
-// f(%iv.2)
-// %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
-// f(%iv.scale_m_1)
-// ...
-// %iv.next = add %iv, scale
-// %cmp = icmp(%iv, ...)
-// br %cmp, header, exit
-//
-// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
-// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
-// be intermixed with each other. The restriction imposed by this algorithm is
-// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
-// etc. be the same.
-//
-// First, we collect the use set of %iv, excluding the other increment roots.
-// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
-// times, having collected the use set of f(%iv.(i+1)), during which we:
-//   - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
-//     the next unmatched instruction in f(%iv.(i+1)).
-//   - Ensure that both matched instructions don't have any external users
-//     (with the exception of last-in-chain reduction instructions).
-//   - Track the (aliasing) write set, and other side effects, of all
-//     instructions that belong to future iterations that come before the
-//     matched instructions. If the matched instructions read from that write
-//     set, then f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
-//     f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
-//     if any of these future instructions had side effects (could not be
-//     speculatively executed), and so do the matched instructions, then we
-//     cannot reorder those side-effect-producing instructions, and rerolling
-//     fails.
-//
-// Finally, we make sure that all loop instructions are either loop-increment
-// roots, simple latch code, parts of validated reductions, or parts of f(%iv)
-// or some f(%iv.i). If all of that is true (and all reductions have been
-// validated), then we reroll the loop.
-bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
-                        const SCEV *BackedgeTakenCount,
-                        ReductionTracker &Reductions) {
-  DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
-                          IVToIncMap, LoopControlIVs);
-
-  if (!DAGRoots.findRoots())
-    return false;
-  LLVM_DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV
-                    << "\n");
-
-  if (!DAGRoots.validate(Reductions))
-    return false;
-  if (!Reductions.validateSelected())
-    return false;
-  // At this point, we've validated the rerolling, and we're committed to
-  // making changes!
- - Reductions.replaceSelected(); - DAGRoots.replace(BackedgeTakenCount); - - ++NumRerolledLoops; - return true; -} - -bool LoopReroll::runOnLoop(Loop *L) { - BasicBlock *Header = L->getHeader(); - LLVM_DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << "] Loop %" - << Header->getName() << " (" << L->getNumBlocks() - << " block(s))\n"); - - // For now, we'll handle only single BB loops. - if (L->getNumBlocks() > 1) - return false; - - if (!SE->hasLoopInvariantBackedgeTakenCount(L)) - return false; - - const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); - LLVM_DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n"); - LLVM_DEBUG(dbgs() << "LRR: backedge-taken count = " << *BackedgeTakenCount - << "\n"); - - // First, we need to find the induction variable with respect to which we can - // reroll (there may be several possible options). - SmallInstructionVector PossibleIVs; - IVToIncMap.clear(); - LoopControlIVs.clear(); - collectPossibleIVs(L, PossibleIVs); - - if (PossibleIVs.empty()) { - LLVM_DEBUG(dbgs() << "LRR: No possible IVs found\n"); - return false; - } - - ReductionTracker Reductions; - collectPossibleReductions(L, Reductions); - bool Changed = false; - - // For each possible IV, collect the associated possible set of 'root' nodes - // (i+1, i+2, etc.). - for (Instruction *PossibleIV : PossibleIVs) - if (reroll(PossibleIV, L, Header, BackedgeTakenCount, Reductions)) { - Changed = true; - break; - } - LLVM_DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n"); - - // Trip count of L has changed so SE must be re-evaluated. - if (Changed) - SE->forgetLoop(L); - - return Changed; -} - -PreservedAnalyses LoopRerollPass::run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, - LPMUpdater &U) { - return LoopReroll(&AR.AA, &AR.LI, &AR.SE, &AR.TLI, &AR.DT, true).runOnLoop(&L) - ? 
getLoopPassPreservedAnalyses() - : PreservedAnalyses::all(); -} diff --git a/llvm/test/Transforms/LoopReroll/basic.ll b/llvm/test/Transforms/LoopReroll/basic.ll deleted file mode 100644 index 92d3456..0000000 --- a/llvm/test/Transforms/LoopReroll/basic.ll +++ /dev/null @@ -1,976 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=loop-reroll -S | FileCheck %s -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; int foo(int a); -; void bar(int *x) { -; for (int i = 0; i < 500; i += 3) { -; foo(i); -; foo(i+1); -; foo(i+2); -; } -; } - -define void @bar(ptr nocapture readnone %x) #0 { -; CHECK-LABEL: define void @bar -; CHECK-SAME: (ptr nocapture readnone [[X:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 [[INDVAR]]) #[[ATTR1:[0-9]+]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i32 [[INDVAR]], 500 -; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %entry ], [ %add3, %for.body ] - %call = tail call i32 @foo(i32 %i.08) #1 - %add = add nsw i32 %i.08, 1 - %call1 = tail call i32 @foo(i32 %add) #1 - %add2 = add nsw i32 %i.08, 2 - %call3 = tail call i32 @foo(i32 %add2) #1 - %add3 = add nsw i32 %i.08, 3 - %exitcond = icmp sge i32 %add3, 500 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} - -declare i32 @foo(i32) - -; void hi1(int *x) { -; for (int i = 0; i < 1500; i += 3) { -; x[i] = foo(0); -; x[i+1] = foo(0); -; x[i+2] = foo(0); -; } -; } - -; Function Attrs: nounwind uwtable -define void @hi1(ptr nocapture %x) #0 { -; CHECK-LABEL: define void @hi1 -; CHECK-SAME: (ptr nocapture [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVAR]] to i32 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDVAR]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP0]], 1499 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %call = tail call i32 @foo(i32 0) #1 - %arrayidx = getelementptr inbounds i32, ptr %x, i64 %indvars.iv - store i32 %call, ptr %arrayidx, align 4 - %call1 = tail call i32 @foo(i32 0) #1 - %0 = add nsw i64 %indvars.iv, 1 - %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %0 - store i32 %call1, ptr %arrayidx3, align 4 - %call4 = tail call i32 @foo(i32 0) #1 - %1 = add nsw i64 %indvars.iv, 2 - %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %1 - store i32 
%call4, ptr %arrayidx7, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 3 - %2 = trunc i64 %indvars.iv.next to i32 - %cmp = icmp slt i32 %2, 1500 - br i1 %cmp, label %for.body, label %for.end - -for.end: ; preds = %for.body - ret void -} - -; void hi2(int *x) { -; for (int i = 0; i < 500; ++i) { -; x[3*i] = foo(0); -; x[3*i+1] = foo(0); -; x[3*i+2] = foo(0); -; } -; } - -; Function Attrs: nounwind uwtable -define void @hi2(ptr nocapture %x) #0 { -; CHECK-LABEL: define void @hi2 -; CHECK-SAME: (ptr nocapture [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1499 -; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %call = tail call i32 @foo(i32 0) #1 - %0 = mul nsw i64 %indvars.iv, 3 - %arrayidx = getelementptr inbounds i32, ptr %x, i64 %0 - store i32 %call, ptr %arrayidx, align 4 - %call1 = tail call i32 @foo(i32 0) #1 - %1 = add nsw i64 %0, 1 - %arrayidx4 = getelementptr inbounds i32, ptr %x, i64 %1 - store i32 %call1, ptr %arrayidx4, align 4 - %call5 = tail call i32 @foo(i32 0) #1 - %2 = add nsw i64 %0, 2 - %arrayidx9 = getelementptr inbounds i32, ptr %x, i64 %2 - store i32 %call5, ptr %arrayidx9, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 500 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} - -; void goo(float alpha, float *a, float *b) { -; for (int i = 0; i < 3200; i += 5) { -; a[i] += alpha * b[i]; -; a[i + 1] += alpha * b[i + 1]; -; a[i + 2] += alpha * b[i + 2]; -; a[i + 3] += alpha * b[i + 3]; -; a[i + 4] += alpha * b[i + 4]; -; } -; } - -; Function Attrs: nounwind uwtable -define void @goo(float %alpha, ptr nocapture %a, ptr nocapture readonly %b) #0 { -; CHECK-LABEL: define void @goo -; CHECK-SAME: (float [[ALPHA:%.*]], ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVAR]] to i32 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVAR]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP1]], [[ALPHA]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVAR]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP2]], [[MUL]] -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP0]], 3199 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret 
void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv - %0 = load float, ptr %arrayidx, align 4 - %mul = fmul float %0, %alpha - %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars.iv - %1 = load float, ptr %arrayidx2, align 4 - %add = fadd float %1, %mul - store float %add, ptr %arrayidx2, align 4 - %2 = add nsw i64 %indvars.iv, 1 - %arrayidx5 = getelementptr inbounds float, ptr %b, i64 %2 - %3 = load float, ptr %arrayidx5, align 4 - %mul6 = fmul float %3, %alpha - %arrayidx9 = getelementptr inbounds float, ptr %a, i64 %2 - %4 = load float, ptr %arrayidx9, align 4 - %add10 = fadd float %4, %mul6 - store float %add10, ptr %arrayidx9, align 4 - %5 = add nsw i64 %indvars.iv, 2 - %arrayidx13 = getelementptr inbounds float, ptr %b, i64 %5 - %6 = load float, ptr %arrayidx13, align 4 - %mul14 = fmul float %6, %alpha - %arrayidx17 = getelementptr inbounds float, ptr %a, i64 %5 - %7 = load float, ptr %arrayidx17, align 4 - %add18 = fadd float %7, %mul14 - store float %add18, ptr %arrayidx17, align 4 - %8 = add nsw i64 %indvars.iv, 3 - %arrayidx21 = getelementptr inbounds float, ptr %b, i64 %8 - %9 = load float, ptr %arrayidx21, align 4 - %mul22 = fmul float %9, %alpha - %arrayidx25 = getelementptr inbounds float, ptr %a, i64 %8 - %10 = load float, ptr %arrayidx25, align 4 - %add26 = fadd float %10, %mul22 - store float %add26, ptr %arrayidx25, align 4 - %11 = add nsw i64 %indvars.iv, 4 - %arrayidx29 = getelementptr inbounds float, ptr %b, i64 %11 - %12 = load float, ptr %arrayidx29, align 4 - %mul30 = fmul float %12, %alpha - %arrayidx33 = getelementptr inbounds float, ptr %a, i64 %11 - %13 = load float, ptr %arrayidx33, align 4 - %add34 = fadd float %13, %mul30 - store float %add34, ptr %arrayidx33, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5 - %14 = trunc i64 %indvars.iv.next to i32 - %cmp = icmp slt i32 %14, 3200 - br i1 %cmp, label %for.body, label %for.end - -for.end: ; preds = %for.body - ret void -} - -; void hoo(float alpha, float *a, float *b, int *ip) { -; for (int i = 0; i < 3200; i += 5) { -; a[i] += alpha * b[ip[i]]; -; a[i + 1] += alpha * b[ip[i + 1]]; -; a[i + 2] += alpha * b[ip[i + 2]]; -; a[i + 3] += alpha * b[ip[i + 3]]; -; a[i + 4] += alpha * b[ip[i + 4]]; -; } -; } - -; Function Attrs: nounwind uwtable -define void @hoo(float %alpha, ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %ip) #0 { -; CHECK-LABEL: define void @hoo -; CHECK-SAME: (float [[ALPHA:%.*]], ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], ptr nocapture readonly [[IP:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVAR]] to i32 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[IP]], i64 [[INDVAR]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM1]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP2]], [[ALPHA]] -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVAR]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr 
[[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[MUL]] -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP0]], 3199 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %ip, i64 %indvars.iv - %0 = load i32, ptr %arrayidx, align 4 - %idxprom1 = sext i32 %0 to i64 - %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %idxprom1 - %1 = load float, ptr %arrayidx2, align 4 - %mul = fmul float %1, %alpha - %arrayidx4 = getelementptr inbounds float, ptr %a, i64 %indvars.iv - %2 = load float, ptr %arrayidx4, align 4 - %add = fadd float %2, %mul - store float %add, ptr %arrayidx4, align 4 - %3 = add nsw i64 %indvars.iv, 1 - %arrayidx7 = getelementptr inbounds i32, ptr %ip, i64 %3 - %4 = load i32, ptr %arrayidx7, align 4 - %idxprom8 = sext i32 %4 to i64 - %arrayidx9 = getelementptr inbounds float, ptr %b, i64 %idxprom8 - %5 = load float, ptr %arrayidx9, align 4 - %mul10 = fmul float %5, %alpha - %arrayidx13 = getelementptr inbounds float, ptr %a, i64 %3 - %6 = load float, ptr %arrayidx13, align 4 - %add14 = fadd float %6, %mul10 - store float %add14, ptr %arrayidx13, align 4 - %7 = add nsw i64 %indvars.iv, 2 - %arrayidx17 = getelementptr inbounds i32, ptr %ip, i64 %7 - %8 = load i32, ptr %arrayidx17, align 4 - %idxprom18 = sext i32 %8 to i64 - %arrayidx19 = getelementptr inbounds float, ptr %b, i64 %idxprom18 - %9 = load float, ptr %arrayidx19, align 4 - %mul20 = fmul float %9, %alpha - %arrayidx23 = getelementptr inbounds float, ptr %a, i64 %7 - %10 = load float, ptr %arrayidx23, align 4 - %add24 = fadd float %10, %mul20 - store float %add24, ptr %arrayidx23, align 4 - %11 = add nsw i64 %indvars.iv, 3 - %arrayidx27 = getelementptr inbounds i32, ptr %ip, i64 %11 - %12 = load i32, ptr %arrayidx27, align 4 - %idxprom28 = sext i32 %12 to i64 - %arrayidx29 = getelementptr inbounds float, ptr %b, i64 %idxprom28 - %13 = load float, ptr %arrayidx29, align 4 - %mul30 = fmul float %13, %alpha - %arrayidx33 = getelementptr inbounds float, ptr %a, i64 %11 - %14 = load float, ptr %arrayidx33, align 4 - %add34 = fadd float %14, %mul30 - store float %add34, ptr %arrayidx33, align 4 - %15 = add nsw i64 %indvars.iv, 4 - %arrayidx37 = getelementptr inbounds i32, ptr %ip, i64 %15 - %16 = load i32, ptr %arrayidx37, align 4 - %idxprom38 = sext i32 %16 to i64 - %arrayidx39 = getelementptr inbounds float, ptr %b, i64 %idxprom38 - %17 = load float, ptr %arrayidx39, align 4 - %mul40 = fmul float %17, %alpha - %arrayidx43 = getelementptr inbounds float, ptr %a, i64 %15 - %18 = load float, ptr %arrayidx43, align 4 - %add44 = fadd float %18, %mul40 - store float %add44, ptr %arrayidx43, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5 - %19 = trunc i64 %indvars.iv.next to i32 - %cmp = icmp slt i32 %19, 3200 - br i1 %cmp, label %for.body, label %for.end - - - - -for.end: ; preds = %for.body - ret void -} - -; void multi1(int *x) { -; y = foo(0) -; for (int i = 0; i < 500; ++i) { -; x[3*i] = y; -; x[3*i+1] = y; -; x[3*i+2] = y; -; x[3*i+6] = y; -; x[3*i+7] = y; -; x[3*i+8] = y; -; } -; } - -; Function Attrs: nounwind uwtable -define void @multi1(ptr nocapture %x) #0 { -; CHECK-LABEL: define void @multi1 -; CHECK-SAME: 
(ptr nocapture [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDVARS_IV]], 6 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[TMP0]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1499 -; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - %call = tail call i32 @foo(i32 0) #1 - br label %for.body - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %0 = mul nsw i64 %indvars.iv, 3 - %arrayidx = getelementptr inbounds i32, ptr %x, i64 %0 - store i32 %call, ptr %arrayidx, align 4 - %1 = add nsw i64 %0, 1 - %arrayidx4 = getelementptr inbounds i32, ptr %x, i64 %1 - store i32 %call, ptr %arrayidx4, align 4 - %2 = add nsw i64 %0, 2 - %arrayidx9 = getelementptr inbounds i32, ptr %x, i64 %2 - store i32 %call, ptr %arrayidx9, align 4 - %3 = add nsw i64 %0, 6 - %arrayidx6 = getelementptr inbounds i32, ptr %x, i64 %3 - store i32 %call, ptr %arrayidx6, align 4 - %4 = add nsw i64 %0, 7 - %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %4 - store i32 %call, ptr %arrayidx7, align 4 - %5 = add nsw i64 %0, 8 - %arrayidx8 = getelementptr inbounds i32, ptr %x, i64 %5 - store i32 %call, ptr %arrayidx8, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 500 - br i1 %exitcond, label %for.end, label %for.body - - - -for.end: ; preds = %for.body - ret void -} - -; void multi2(int *x) { -; y = foo(0) -; for (int i = 0; i < 500; ++i) { -; x[3*i] = y; -; x[3*i+1] = y; -; x[3*i+2] = y; -; x[3*(i+1)] = y; -; x[3*(i+1)+1] = y; -; x[3*(i+1)+2] = y; -; } -; } - -; Function Attrs: nounwind uwtable -define void @multi2(ptr nocapture %x) #0 { -; CHECK-LABEL: define void @multi2 -; CHECK-SAME: (ptr nocapture [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[TMP0]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1499 -; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - %call = tail call i32 @foo(i32 0) #1 - br label %for.body - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %0 = mul nsw i64 %indvars.iv, 3 - %add = add nsw i64 
%indvars.iv, 1 - %newmul = mul nsw i64 %add, 3 - %arrayidx = getelementptr inbounds i32, ptr %x, i64 %0 - store i32 %call, ptr %arrayidx, align 4 - %1 = add nsw i64 %0, 1 - %arrayidx4 = getelementptr inbounds i32, ptr %x, i64 %1 - store i32 %call, ptr %arrayidx4, align 4 - %2 = add nsw i64 %0, 2 - %arrayidx9 = getelementptr inbounds i32, ptr %x, i64 %2 - store i32 %call, ptr %arrayidx9, align 4 - %arrayidx6 = getelementptr inbounds i32, ptr %x, i64 %newmul - store i32 %call, ptr %arrayidx6, align 4 - %3 = add nsw i64 %newmul, 1 - %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %3 - store i32 %call, ptr %arrayidx7, align 4 - %4 = add nsw i64 %newmul, 2 - %arrayidx8 = getelementptr inbounds i32, ptr %x, i64 %4 - store i32 %call, ptr %arrayidx8, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 500 - br i1 %exitcond, label %for.end, label %for.body - - - -for.end: ; preds = %for.body - ret void -} - -; void multi3(int *x) { -; y = foo(0) -; for (int i = 0; i < 500; ++i) { -; // Note: No zero index -; x[3*i+3] = y; -; x[3*i+4] = y; -; x[3*i+5] = y; -; } -; } - -; Function Attrs: nounwind uwtable -define void @multi3(ptr nocapture %x) #0 { -; CHECK-LABEL: define void @multi3 -; CHECK-SAME: (ptr nocapture [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[TMP0]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1499 -; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - %call = tail call i32 @foo(i32 0) #1 - br label %for.body - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %0 = mul nsw i64 %indvars.iv, 3 - %x0 = add nsw i64 %0, 3 - %add = add nsw i64 %indvars.iv, 1 - %arrayidx = getelementptr inbounds i32, ptr %x, i64 %x0 - store i32 %call, ptr %arrayidx, align 4 - %1 = add nsw i64 %0, 4 - %arrayidx4 = getelementptr inbounds i32, ptr %x, i64 %1 - store i32 %call, ptr %arrayidx4, align 4 - %2 = add nsw i64 %0, 5 - %arrayidx9 = getelementptr inbounds i32, ptr %x, i64 %2 - store i32 %call, ptr %arrayidx9, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 500 - br i1 %exitcond, label %for.end, label %for.body - - -for.end: ; preds = %for.body - ret void -} - -; int foo(int a); -; void bar2(int *x, int y, int z) { -; for (int i = 0; i < 500; i += 3) { -; foo(i+y+i*z); // Slightly reordered instruction order -; foo(i+1+y+(i+1)*z); -; foo(i+2+y+(i+2)*z); -; } -; } - -; Function Attrs: nounwind uwtable -define void @bar2(ptr nocapture readnone %x, i32 %y, i32 %z) #0 { -; CHECK-LABEL: define void @bar2 -; CHECK-SAME: (ptr nocapture readnone [[X:%.*]], i32 [[Y:%.*]], i32 [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDVAR]], [[Y]] -; CHECK-NEXT: 
[[TMP2:%.*]] = mul i32 [[INDVAR]], [[Z]] -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 [[TMP3]]) #[[ATTR1]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i32 [[INDVAR]], 500 -; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %entry ], [ %add3, %for.body ] - - %tmp1 = add i32 %i.08, %y - %tmp2 = mul i32 %i.08, %z - %tmp3 = add i32 %tmp2, %tmp1 - %call = tail call i32 @foo(i32 %tmp3) #1 - - %add = add nsw i32 %i.08, 1 - %tmp2a = mul i32 %add, %z - %tmp1a = add i32 %add, %y - %tmp3a = add i32 %tmp2a, %tmp1a - %calla = tail call i32 @foo(i32 %tmp3a) #1 - - %add2 = add nsw i32 %i.08, 2 - %tmp2b = mul i32 %add2, %z - %tmp1b = add i32 %add2, %y - %tmp3b = add i32 %tmp2b, %tmp1b - %callb = tail call i32 @foo(i32 %tmp3b) #1 - - %add3 = add nsw i32 %i.08, 3 - - %exitcond = icmp sge i32 %add3, 500 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} - -%struct.s = type { i32, i32 } - -; Function Attrs: nounwind uwtable -define void @gep1(ptr nocapture %x) #0 { -; CHECK-LABEL: define void @gep1 -; CHECK-SAME: (ptr nocapture [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = mul nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[X]], i64 [[TMP0]], i32 0 -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], 1 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[X]], i64 [[TMP1]], i32 0 -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[X]], i64 [[TMP2]], i32 0 -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX9]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 500 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - %call = tail call i32 @foo(i32 0) #1 - br label %for.body - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %0 = mul nsw i64 %indvars.iv, 3 - %arrayidx = getelementptr inbounds %struct.s, ptr %x, i64 %0, i32 0 - store i32 %call, ptr %arrayidx, align 4 - %1 = add nsw i64 %0, 1 - %arrayidx4 = getelementptr inbounds %struct.s, ptr %x, i64 %1, i32 0 - store i32 %call, ptr %arrayidx4, align 4 - %2 = add nsw i64 %0, 2 - %arrayidx9 = getelementptr inbounds %struct.s, ptr %x, i64 %2, i32 0 - store i32 %call, ptr %arrayidx9, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 500 - br i1 %exitcond, label %for.end, label %for.body - -; This test is a crash test only. 
-for.end: ; preds = %for.body - ret void -} - -define void @gep-indexing(ptr nocapture %x) { -; CHECK-LABEL: define void @gep-indexing -; CHECK-SAME: (ptr nocapture [[X:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[X]], i64 [[TMP0]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[SCEVGEP]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1499 -; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - %call = tail call i32 @foo(i32 0) #1 - br label %for.body - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %0 = mul nsw i64 %indvars.iv, 3 - %arrayidx = getelementptr inbounds i32, ptr %x, i64 %0 - store i32 %call, ptr %arrayidx, align 4 - %arrayidx4 = getelementptr inbounds i32, ptr %arrayidx, i64 1 - store i32 %call, ptr %arrayidx4, align 4 - %arrayidx9 = getelementptr inbounds i32, ptr %arrayidx, i64 2 - store i32 %call, ptr %arrayidx9, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 500 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} - - -define void @unordered_atomic_ops(ptr noalias %buf_0, ptr noalias %buf_1) { -; CHECK-LABEL: define void @unordered_atomic_ops -; CHECK-SAME: (ptr noalias [[BUF_0:%.*]], ptr noalias [[BUF_1:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BUF0_A:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVAR]] -; CHECK-NEXT: [[BUF1_A:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVAR]] -; CHECK-NEXT: [[VA:%.*]] = load atomic i32, ptr [[BUF0_A]] unordered, align 4 -; CHECK-NEXT: store atomic i32 [[VA]], ptr [[BUF1_A]] unordered, align 4 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVAR]], 3199 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: - %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %indvars.iv.next = add i32 %indvars.iv, 2 - %indvars.mid = add i32 %indvars.iv, 1 - %buf0_a = getelementptr i32, ptr %buf_0, i32 %indvars.iv - %buf0_b = getelementptr i32, ptr %buf_0, i32 %indvars.mid - %buf1_a = getelementptr i32, ptr %buf_1, i32 %indvars.iv - %buf1_b = getelementptr i32, ptr %buf_1, i32 %indvars.mid - %va = load atomic i32, ptr %buf0_a unordered, align 4 - %vb = load atomic i32, ptr %buf0_b unordered, align 4 - store atomic i32 %va, ptr %buf1_a unordered, align 4 - store atomic i32 %vb, ptr %buf1_b unordered, align 4 - %cmp = icmp slt i32 %indvars.iv.next, 3200 - br i1 %cmp, label %for.body, label %for.end - -for.end: - ret void -} - -define void @unordered_atomic_ops_nomatch(ptr noalias %buf_0, ptr noalias %buf_1) { -; Negative test -; CHECK-LABEL: define void @unordered_atomic_ops_nomatch -; CHECK-SAME: (ptr noalias [[BUF_0:%.*]], ptr noalias 
[[BUF_1:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[INDVARS_MID:%.*]] = add i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[BUF0_A:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[BUF0_B:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_MID]] -; CHECK-NEXT: [[BUF1_A:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[BUF1_B:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_MID]] -; CHECK-NEXT: [[VA:%.*]] = load atomic i32, ptr [[BUF0_A]] unordered, align 4 -; CHECK-NEXT: [[VB:%.*]] = load atomic i32, ptr [[BUF0_B]] unordered, align 4 -; CHECK-NEXT: store i32 [[VA]], ptr [[BUF1_A]], align 4 -; CHECK-NEXT: store atomic i32 [[VB]], ptr [[BUF1_B]] unordered, align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INDVARS_IV_NEXT]], 3200 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: - - %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %indvars.iv.next = add i32 %indvars.iv, 2 - %indvars.mid = add i32 %indvars.iv, 1 - %buf0_a = getelementptr i32, ptr %buf_0, i32 %indvars.iv - %buf0_b = getelementptr i32, ptr %buf_0, i32 %indvars.mid - %buf1_a = getelementptr i32, ptr %buf_1, i32 %indvars.iv - %buf1_b = getelementptr i32, ptr %buf_1, i32 %indvars.mid - %va = load atomic i32, ptr %buf0_a unordered, align 4 - %vb = load atomic i32, ptr %buf0_b unordered, align 4 - store i32 %va, ptr %buf1_a, align 4 ;; Not atomic - store atomic i32 %vb, ptr %buf1_b unordered, align 4 - %cmp = icmp slt i32 %indvars.iv.next, 3200 - br i1 %cmp, label %for.body, label %for.end - -for.end: - ret void -} - -define void @ordered_atomic_ops(ptr noalias %buf_0, ptr noalias %buf_1) { -; Negative test -; CHECK-LABEL: define void @ordered_atomic_ops -; CHECK-SAME: (ptr noalias [[BUF_0:%.*]], ptr noalias [[BUF_1:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[INDVARS_MID:%.*]] = add i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[BUF0_A:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[BUF0_B:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_MID]] -; CHECK-NEXT: [[BUF1_A:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[BUF1_B:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_MID]] -; CHECK-NEXT: [[VA:%.*]] = load atomic i32, ptr [[BUF0_A]] acquire, align 4 -; CHECK-NEXT: [[VB:%.*]] = load atomic i32, ptr [[BUF0_B]] acquire, align 4 -; CHECK-NEXT: store atomic i32 [[VA]], ptr [[BUF1_A]] release, align 4 -; CHECK-NEXT: store atomic i32 [[VB]], ptr [[BUF1_B]] release, align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INDVARS_IV_NEXT]], 3200 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: - - %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %indvars.iv.next = add i32 %indvars.iv, 2 - %indvars.mid = add i32 %indvars.iv, 1 - %buf0_a = getelementptr i32, ptr %buf_0, i32 %indvars.iv - %buf0_b = 
getelementptr i32, ptr %buf_0, i32 %indvars.mid - %buf1_a = getelementptr i32, ptr %buf_1, i32 %indvars.iv - %buf1_b = getelementptr i32, ptr %buf_1, i32 %indvars.mid - %va = load atomic i32, ptr %buf0_a acquire, align 4 - %vb = load atomic i32, ptr %buf0_b acquire, align 4 - store atomic i32 %va, ptr %buf1_a release, align 4 - store atomic i32 %vb, ptr %buf1_b release, align 4 - %cmp = icmp slt i32 %indvars.iv.next, 3200 - br i1 %cmp, label %for.body, label %for.end - -for.end: - ret void -} - -define void @unordered_atomic_ops_with_fence(ptr noalias %buf_0, ptr noalias %buf_1) { -; CHECK-LABEL: define void @unordered_atomic_ops_with_fence -; CHECK-SAME: (ptr noalias [[BUF_0:%.*]], ptr noalias [[BUF_1:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[INDVARS_MID:%.*]] = add i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[BUF0_A:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[BUF0_B:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_MID]] -; CHECK-NEXT: [[BUF1_A:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[BUF1_B:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_MID]] -; CHECK-NEXT: [[VA:%.*]] = load atomic i32, ptr [[BUF0_A]] unordered, align 4 -; CHECK-NEXT: [[VB:%.*]] = load atomic i32, ptr [[BUF0_B]] unordered, align 4 -; CHECK-NEXT: fence seq_cst -; CHECK-NEXT: store atomic i32 [[VA]], ptr [[BUF1_A]] unordered, align 4 -; CHECK-NEXT: store atomic i32 [[VB]], ptr [[BUF1_B]] unordered, align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INDVARS_IV_NEXT]], 3200 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: - - %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %indvars.iv.next = add i32 %indvars.iv, 2 - %indvars.mid = add i32 %indvars.iv, 1 - %buf0_a = getelementptr i32, ptr %buf_0, i32 %indvars.iv - %buf0_b = getelementptr i32, ptr %buf_0, i32 %indvars.mid - %buf1_a = getelementptr i32, ptr %buf_1, i32 %indvars.iv - %buf1_b = getelementptr i32, ptr %buf_1, i32 %indvars.mid - %va = load atomic i32, ptr %buf0_a unordered, align 4 - %vb = load atomic i32, ptr %buf0_b unordered, align 4 - fence seq_cst - store atomic i32 %va, ptr %buf1_a unordered, align 4 - store atomic i32 %vb, ptr %buf1_b unordered, align 4 - %cmp = icmp slt i32 %indvars.iv.next, 3200 - br i1 %cmp, label %for.body, label %for.end - -for.end: - ret void -} - -define void @pointer_bitcast_baseinst(ptr %arg, ptr %arg1, i64 %arg2) { -; CHECK-LABEL: define void @pointer_bitcast_baseinst -; CHECK-SAME: (ptr [[ARG:%.*]], ptr [[ARG1:%.*]], i64 [[ARG2:%.*]]) { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[ARG2]], -17 -; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 -; CHECK-NEXT: br label [[BB3:%.*]] -; CHECK: bb3: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[BB3]] ], [ 0, [[BB:%.*]] ] -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[INDVAR]], 3 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 1 -; CHECK-NEXT: [[INST5:%.*]] = shl nuw i64 [[TMP5]], 1 -; CHECK-NEXT: [[INST6:%.*]] = getelementptr i8, ptr [[ARG1]], i64 [[INST5]] -; CHECK-NEXT: [[INST8:%.*]] = load <8 x 
i16>, ptr [[INST6]], align 2 -; CHECK-NEXT: [[INST13:%.*]] = getelementptr i16, ptr [[ARG]], i64 [[TMP5]] -; CHECK-NEXT: store <8 x i16> [[INST8]], ptr [[INST13]], align 2 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR]], [[TMP3]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[BB19:%.*]], label [[BB3]] -; CHECK: bb19: -; CHECK-NEXT: ret void -; -bb: - br label %bb3 - -bb3: ; preds = %bb3, %bb - %inst = phi i64 [ 1, %bb ], [ %inst17, %bb3 ] - %inst4 = add nuw i64 %inst, 8 - %inst5 = shl nuw i64 %inst, 1 - %inst6 = getelementptr i8, ptr %arg1, i64 %inst5 - %inst8 = load <8 x i16>, ptr %inst6, align 2 - %inst9 = shl i64 %inst4, 1 - %inst10 = getelementptr i8, ptr %arg1, i64 %inst9 - %inst12 = load <8 x i16>, ptr %inst10, align 2 - %inst13 = getelementptr i16, ptr %arg, i64 %inst - store <8 x i16> %inst8, ptr %inst13, align 2 - %inst15 = getelementptr i16, ptr %arg, i64 %inst4 - store <8 x i16> %inst12, ptr %inst15, align 2 - %inst17 = add nuw nsw i64 %inst, 16 - %inst18 = icmp eq i64 %inst17, %arg2 - br i1 %inst18, label %bb19, label %bb3 - -bb19: ; preds = %bb3 - ret void -} - -define void @bad_step(ptr nocapture readnone %x) #0 { -; CHECK-LABEL: define void @bad_step -; CHECK-SAME: (ptr nocapture readnone [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 [[I_08]]) #[[ATTR1]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[I_08]], 2 -; CHECK-NEXT: [[CALL1:%.*]] = tail call i32 @foo(i32 [[ADD]]) #[[ATTR1]] -; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[I_08]], 3 -; CHECK-NEXT: [[CALL3:%.*]] = tail call i32 @foo(i32 [[ADD2]]) #[[ATTR1]] -; CHECK-NEXT: [[ADD3]] = add nsw i32 [[I_08]], 6 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp sge i32 [[ADD3]], 500 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %entry ], [ %add3, %for.body ] - %call = tail call i32 @foo(i32 %i.08) #1 - %add = add nsw i32 %i.08, 2 - %call1 = tail call i32 @foo(i32 %add) #1 - %add2 = add nsw i32 %i.08, 3 - %call3 = tail call i32 @foo(i32 %add2) #1 - %add3 = add nsw i32 %i.08, 6 - %exitcond = icmp sge i32 %add3, 500 - br i1 %exitcond, label %for.end, label %for.body - - -for.end: ; preds = %for.body - ret void -} - -@a = external global [2 x [512 x i64]], align 16 -@b = external global [512 x [4 x i64]], align 16 - -define void @ptr_step_crash() { -; CHECK-LABEL: define void @ptr_step_crash() { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY42_3:%.*]] -; CHECK: for.body42.3: -; CHECK-NEXT: [[K_2207_3:%.*]] = phi i32 [ -512, [[ENTRY:%.*]] ], [ [[INC63_3:%.*]], [[FOR_BODY42_3]] ] -; CHECK-NEXT: [[SUB46_3:%.*]] = add nsw i32 [[K_2207_3]], 512 -; CHECK-NEXT: [[IDXPROM47_3:%.*]] = zext i32 [[SUB46_3]] to i64 -; CHECK-NEXT: [[ARRAYIDX48_3:%.*]] = getelementptr inbounds [2 x [512 x i64]], ptr @a, i64 0, i64 0, i64 [[IDXPROM47_3]] -; CHECK-NEXT: [[ARRAYIDX55_3:%.*]] = getelementptr inbounds [512 x [4 x i64]], ptr @b, i64 0, i64 [[IDXPROM47_3]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX55_3]], align 8 -; CHECK-NEXT: [[INC63_3]] = add nsw i32 [[K_2207_3]], 1 -; CHECK-NEXT: br i1 true, label [[FOR_INC65_3:%.*]], label [[FOR_BODY42_3]] -; CHECK: for.inc65.3: -; CHECK-NEXT: ret void -; -entry: - br label 
%for.body42.3 - -for.body42.3: ; preds = %for.body42.3, %entry - %k.2207.3 = phi i32 [ -512, %entry ], [ %inc63.3, %for.body42.3 ] - %sub46.3 = add nsw i32 %k.2207.3, 512 - %idxprom47.3 = zext i32 %sub46.3 to i64 - %arrayidx48.3 = getelementptr inbounds [2 x [512 x i64]], ptr @a, i64 0, i64 0, i64 %idxprom47.3 - %arrayidx55.3 = getelementptr inbounds [512 x [4 x i64]], ptr @b, i64 0, i64 %idxprom47.3, i64 3 - %0 = load i64, ptr %arrayidx55.3, align 8 - %inc63.3 = add nsw i32 %k.2207.3, 1 - br i1 undef, label %for.inc65.3, label %for.body42.3 - -for.inc65.3: ; preds = %for.body42.3 - ret void -} - -attributes #0 = { nounwind uwtable } -attributes #1 = { nounwind } diff --git a/llvm/test/Transforms/LoopReroll/basic32iters.ll b/llvm/test/Transforms/LoopReroll/basic32iters.ll deleted file mode 100644 index edf38cb..0000000 --- a/llvm/test/Transforms/LoopReroll/basic32iters.ll +++ /dev/null @@ -1,328 +0,0 @@ -; RUN: opt < %s -passes=loop-reroll -verify-scev -S | FileCheck %s -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; void goo32(float alpha, float *a, float *b) { -; for (int i = 0; i < 3200; i += 32) { -; a[i] += alpha * b[i]; -; a[i + 1] += alpha * b[i + 1]; -; a[i + 2] += alpha * b[i + 2]; -; a[i + 3] += alpha * b[i + 3]; -; a[i + 4] += alpha * b[i + 4]; -; a[i + 5] += alpha * b[i + 5]; -; a[i + 6] += alpha * b[i + 6]; -; a[i + 7] += alpha * b[i + 7]; -; a[i + 8] += alpha * b[i + 8]; -; a[i + 9] += alpha * b[i + 9]; -; a[i + 10] += alpha * b[i + 10]; -; a[i + 11] += alpha * b[i + 11]; -; a[i + 12] += alpha * b[i + 12]; -; a[i + 13] += alpha * b[i + 13]; -; a[i + 14] += alpha * b[i + 14]; -; a[i + 15] += alpha * b[i + 15]; -; a[i + 16] += alpha * b[i + 16]; -; a[i + 17] += alpha * b[i + 17]; -; a[i + 18] += alpha * b[i + 18]; -; a[i + 19] += alpha * b[i + 19]; -; a[i + 20] += alpha * b[i + 20]; -; a[i + 21] += alpha * b[i + 21]; -; a[i + 22] += alpha * b[i + 22]; -; a[i + 23] += alpha * b[i + 23]; -; a[i + 24] += alpha * b[i + 24]; -; a[i + 25] += alpha * b[i + 25]; -; a[i + 26] += alpha * b[i + 26]; -; a[i + 27] += alpha * b[i + 27]; -; a[i + 28] += alpha * b[i + 28]; -; a[i + 29] += alpha * b[i + 29]; -; a[i + 30] += alpha * b[i + 30]; -; a[i + 31] += alpha * b[i + 31]; -; } -; } - -; Function Attrs: norecurse nounwind uwtable -define void @goo32(float %alpha, ptr %a, ptr readonly %b) #0 { -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv - %0 = load float, ptr %arrayidx, align 4 - %mul = fmul float %0, %alpha - %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars.iv - %1 = load float, ptr %arrayidx2, align 4 - %add = fadd float %1, %mul - store float %add, ptr %arrayidx2, align 4 - %2 = or disjoint i64 %indvars.iv, 1 - %arrayidx5 = getelementptr inbounds float, ptr %b, i64 %2 - %3 = load float, ptr %arrayidx5, align 4 - %mul6 = fmul float %3, %alpha - %arrayidx9 = getelementptr inbounds float, ptr %a, i64 %2 - %4 = load float, ptr %arrayidx9, align 4 - %add10 = fadd float %4, %mul6 - store float %add10, ptr %arrayidx9, align 4 - %5 = or disjoint i64 %indvars.iv, 2 - %arrayidx13 = getelementptr inbounds float, ptr %b, i64 %5 - %6 = load float, ptr %arrayidx13, align 4 - %mul14 = fmul float %6, %alpha - %arrayidx17 = getelementptr inbounds float, ptr %a, 
i64 %5 - %7 = load float, ptr %arrayidx17, align 4 - %add18 = fadd float %7, %mul14 - store float %add18, ptr %arrayidx17, align 4 - %8 = or disjoint i64 %indvars.iv, 3 - %arrayidx21 = getelementptr inbounds float, ptr %b, i64 %8 - %9 = load float, ptr %arrayidx21, align 4 - %mul22 = fmul float %9, %alpha - %arrayidx25 = getelementptr inbounds float, ptr %a, i64 %8 - %10 = load float, ptr %arrayidx25, align 4 - %add26 = fadd float %10, %mul22 - store float %add26, ptr %arrayidx25, align 4 - %11 = or disjoint i64 %indvars.iv, 4 - %arrayidx29 = getelementptr inbounds float, ptr %b, i64 %11 - %12 = load float, ptr %arrayidx29, align 4 - %mul30 = fmul float %12, %alpha - %arrayidx33 = getelementptr inbounds float, ptr %a, i64 %11 - %13 = load float, ptr %arrayidx33, align 4 - %add34 = fadd float %13, %mul30 - store float %add34, ptr %arrayidx33, align 4 - %14 = or disjoint i64 %indvars.iv, 5 - %arrayidx37 = getelementptr inbounds float, ptr %b, i64 %14 - %15 = load float, ptr %arrayidx37, align 4 - %mul38 = fmul float %15, %alpha - %arrayidx41 = getelementptr inbounds float, ptr %a, i64 %14 - %16 = load float, ptr %arrayidx41, align 4 - %add42 = fadd float %16, %mul38 - store float %add42, ptr %arrayidx41, align 4 - %17 = or disjoint i64 %indvars.iv, 6 - %arrayidx45 = getelementptr inbounds float, ptr %b, i64 %17 - %18 = load float, ptr %arrayidx45, align 4 - %mul46 = fmul float %18, %alpha - %arrayidx49 = getelementptr inbounds float, ptr %a, i64 %17 - %19 = load float, ptr %arrayidx49, align 4 - %add50 = fadd float %19, %mul46 - store float %add50, ptr %arrayidx49, align 4 - %20 = or disjoint i64 %indvars.iv, 7 - %arrayidx53 = getelementptr inbounds float, ptr %b, i64 %20 - %21 = load float, ptr %arrayidx53, align 4 - %mul54 = fmul float %21, %alpha - %arrayidx57 = getelementptr inbounds float, ptr %a, i64 %20 - %22 = load float, ptr %arrayidx57, align 4 - %add58 = fadd float %22, %mul54 - store float %add58, ptr %arrayidx57, align 4 - %23 = or disjoint i64 %indvars.iv, 8 - %arrayidx61 = getelementptr inbounds float, ptr %b, i64 %23 - %24 = load float, ptr %arrayidx61, align 4 - %mul62 = fmul float %24, %alpha - %arrayidx65 = getelementptr inbounds float, ptr %a, i64 %23 - %25 = load float, ptr %arrayidx65, align 4 - %add66 = fadd float %25, %mul62 - store float %add66, ptr %arrayidx65, align 4 - %26 = or disjoint i64 %indvars.iv, 9 - %arrayidx69 = getelementptr inbounds float, ptr %b, i64 %26 - %27 = load float, ptr %arrayidx69, align 4 - %mul70 = fmul float %27, %alpha - %arrayidx73 = getelementptr inbounds float, ptr %a, i64 %26 - %28 = load float, ptr %arrayidx73, align 4 - %add74 = fadd float %28, %mul70 - store float %add74, ptr %arrayidx73, align 4 - %29 = or disjoint i64 %indvars.iv, 10 - %arrayidx77 = getelementptr inbounds float, ptr %b, i64 %29 - %30 = load float, ptr %arrayidx77, align 4 - %mul78 = fmul float %30, %alpha - %arrayidx81 = getelementptr inbounds float, ptr %a, i64 %29 - %31 = load float, ptr %arrayidx81, align 4 - %add82 = fadd float %31, %mul78 - store float %add82, ptr %arrayidx81, align 4 - %32 = or disjoint i64 %indvars.iv, 11 - %arrayidx85 = getelementptr inbounds float, ptr %b, i64 %32 - %33 = load float, ptr %arrayidx85, align 4 - %mul86 = fmul float %33, %alpha - %arrayidx89 = getelementptr inbounds float, ptr %a, i64 %32 - %34 = load float, ptr %arrayidx89, align 4 - %add90 = fadd float %34, %mul86 - store float %add90, ptr %arrayidx89, align 4 - %35 = or disjoint i64 %indvars.iv, 12 - %arrayidx93 = getelementptr inbounds float, ptr %b, i64 %35 - %36 = load 
float, ptr %arrayidx93, align 4 - %mul94 = fmul float %36, %alpha - %arrayidx97 = getelementptr inbounds float, ptr %a, i64 %35 - %37 = load float, ptr %arrayidx97, align 4 - %add98 = fadd float %37, %mul94 - store float %add98, ptr %arrayidx97, align 4 - %38 = or disjoint i64 %indvars.iv, 13 - %arrayidx101 = getelementptr inbounds float, ptr %b, i64 %38 - %39 = load float, ptr %arrayidx101, align 4 - %mul102 = fmul float %39, %alpha - %arrayidx105 = getelementptr inbounds float, ptr %a, i64 %38 - %40 = load float, ptr %arrayidx105, align 4 - %add106 = fadd float %40, %mul102 - store float %add106, ptr %arrayidx105, align 4 - %41 = or disjoint i64 %indvars.iv, 14 - %arrayidx109 = getelementptr inbounds float, ptr %b, i64 %41 - %42 = load float, ptr %arrayidx109, align 4 - %mul110 = fmul float %42, %alpha - %arrayidx113 = getelementptr inbounds float, ptr %a, i64 %41 - %43 = load float, ptr %arrayidx113, align 4 - %add114 = fadd float %43, %mul110 - store float %add114, ptr %arrayidx113, align 4 - %44 = or disjoint i64 %indvars.iv, 15 - %arrayidx117 = getelementptr inbounds float, ptr %b, i64 %44 - %45 = load float, ptr %arrayidx117, align 4 - %mul118 = fmul float %45, %alpha - %arrayidx121 = getelementptr inbounds float, ptr %a, i64 %44 - %46 = load float, ptr %arrayidx121, align 4 - %add122 = fadd float %46, %mul118 - store float %add122, ptr %arrayidx121, align 4 - %47 = or disjoint i64 %indvars.iv, 16 - %arrayidx125 = getelementptr inbounds float, ptr %b, i64 %47 - %48 = load float, ptr %arrayidx125, align 4 - %mul126 = fmul float %48, %alpha - %arrayidx129 = getelementptr inbounds float, ptr %a, i64 %47 - %49 = load float, ptr %arrayidx129, align 4 - %add130 = fadd float %49, %mul126 - store float %add130, ptr %arrayidx129, align 4 - %50 = or disjoint i64 %indvars.iv, 17 - %arrayidx133 = getelementptr inbounds float, ptr %b, i64 %50 - %51 = load float, ptr %arrayidx133, align 4 - %mul134 = fmul float %51, %alpha - %arrayidx137 = getelementptr inbounds float, ptr %a, i64 %50 - %52 = load float, ptr %arrayidx137, align 4 - %add138 = fadd float %52, %mul134 - store float %add138, ptr %arrayidx137, align 4 - %53 = or disjoint i64 %indvars.iv, 18 - %arrayidx141 = getelementptr inbounds float, ptr %b, i64 %53 - %54 = load float, ptr %arrayidx141, align 4 - %mul142 = fmul float %54, %alpha - %arrayidx145 = getelementptr inbounds float, ptr %a, i64 %53 - %55 = load float, ptr %arrayidx145, align 4 - %add146 = fadd float %55, %mul142 - store float %add146, ptr %arrayidx145, align 4 - %56 = or disjoint i64 %indvars.iv, 19 - %arrayidx149 = getelementptr inbounds float, ptr %b, i64 %56 - %57 = load float, ptr %arrayidx149, align 4 - %mul150 = fmul float %57, %alpha - %arrayidx153 = getelementptr inbounds float, ptr %a, i64 %56 - %58 = load float, ptr %arrayidx153, align 4 - %add154 = fadd float %58, %mul150 - store float %add154, ptr %arrayidx153, align 4 - %59 = or disjoint i64 %indvars.iv, 20 - %arrayidx157 = getelementptr inbounds float, ptr %b, i64 %59 - %60 = load float, ptr %arrayidx157, align 4 - %mul158 = fmul float %60, %alpha - %arrayidx161 = getelementptr inbounds float, ptr %a, i64 %59 - %61 = load float, ptr %arrayidx161, align 4 - %add162 = fadd float %61, %mul158 - store float %add162, ptr %arrayidx161, align 4 - %62 = or disjoint i64 %indvars.iv, 21 - %arrayidx165 = getelementptr inbounds float, ptr %b, i64 %62 - %63 = load float, ptr %arrayidx165, align 4 - %mul166 = fmul float %63, %alpha - %arrayidx169 = getelementptr inbounds float, ptr %a, i64 %62 - %64 = load float, ptr 
%arrayidx169, align 4 - %add170 = fadd float %64, %mul166 - store float %add170, ptr %arrayidx169, align 4 - %65 = or disjoint i64 %indvars.iv, 22 - %arrayidx173 = getelementptr inbounds float, ptr %b, i64 %65 - %66 = load float, ptr %arrayidx173, align 4 - %mul174 = fmul float %66, %alpha - %arrayidx177 = getelementptr inbounds float, ptr %a, i64 %65 - %67 = load float, ptr %arrayidx177, align 4 - %add178 = fadd float %67, %mul174 - store float %add178, ptr %arrayidx177, align 4 - %68 = or disjoint i64 %indvars.iv, 23 - %arrayidx181 = getelementptr inbounds float, ptr %b, i64 %68 - %69 = load float, ptr %arrayidx181, align 4 - %mul182 = fmul float %69, %alpha - %arrayidx185 = getelementptr inbounds float, ptr %a, i64 %68 - %70 = load float, ptr %arrayidx185, align 4 - %add186 = fadd float %70, %mul182 - store float %add186, ptr %arrayidx185, align 4 - %71 = or disjoint i64 %indvars.iv, 24 - %arrayidx189 = getelementptr inbounds float, ptr %b, i64 %71 - %72 = load float, ptr %arrayidx189, align 4 - %mul190 = fmul float %72, %alpha - %arrayidx193 = getelementptr inbounds float, ptr %a, i64 %71 - %73 = load float, ptr %arrayidx193, align 4 - %add194 = fadd float %73, %mul190 - store float %add194, ptr %arrayidx193, align 4 - %74 = or disjoint i64 %indvars.iv, 25 - %arrayidx197 = getelementptr inbounds float, ptr %b, i64 %74 - %75 = load float, ptr %arrayidx197, align 4 - %mul198 = fmul float %75, %alpha - %arrayidx201 = getelementptr inbounds float, ptr %a, i64 %74 - %76 = load float, ptr %arrayidx201, align 4 - %add202 = fadd float %76, %mul198 - store float %add202, ptr %arrayidx201, align 4 - %77 = or disjoint i64 %indvars.iv, 26 - %arrayidx205 = getelementptr inbounds float, ptr %b, i64 %77 - %78 = load float, ptr %arrayidx205, align 4 - %mul206 = fmul float %78, %alpha - %arrayidx209 = getelementptr inbounds float, ptr %a, i64 %77 - %79 = load float, ptr %arrayidx209, align 4 - %add210 = fadd float %79, %mul206 - store float %add210, ptr %arrayidx209, align 4 - %80 = or disjoint i64 %indvars.iv, 27 - %arrayidx213 = getelementptr inbounds float, ptr %b, i64 %80 - %81 = load float, ptr %arrayidx213, align 4 - %mul214 = fmul float %81, %alpha - %arrayidx217 = getelementptr inbounds float, ptr %a, i64 %80 - %82 = load float, ptr %arrayidx217, align 4 - %add218 = fadd float %82, %mul214 - store float %add218, ptr %arrayidx217, align 4 - %83 = or disjoint i64 %indvars.iv, 28 - %arrayidx221 = getelementptr inbounds float, ptr %b, i64 %83 - %84 = load float, ptr %arrayidx221, align 4 - %mul222 = fmul float %84, %alpha - %arrayidx225 = getelementptr inbounds float, ptr %a, i64 %83 - %85 = load float, ptr %arrayidx225, align 4 - %add226 = fadd float %85, %mul222 - store float %add226, ptr %arrayidx225, align 4 - %86 = or disjoint i64 %indvars.iv, 29 - %arrayidx229 = getelementptr inbounds float, ptr %b, i64 %86 - %87 = load float, ptr %arrayidx229, align 4 - %mul230 = fmul float %87, %alpha - %arrayidx233 = getelementptr inbounds float, ptr %a, i64 %86 - %88 = load float, ptr %arrayidx233, align 4 - %add234 = fadd float %88, %mul230 - store float %add234, ptr %arrayidx233, align 4 - %89 = or disjoint i64 %indvars.iv, 30 - %arrayidx237 = getelementptr inbounds float, ptr %b, i64 %89 - %90 = load float, ptr %arrayidx237, align 4 - %mul238 = fmul float %90, %alpha - %arrayidx241 = getelementptr inbounds float, ptr %a, i64 %89 - %91 = load float, ptr %arrayidx241, align 4 - %add242 = fadd float %91, %mul238 - store float %add242, ptr %arrayidx241, align 4 - %92 = or disjoint i64 %indvars.iv, 31 - 
%arrayidx245 = getelementptr inbounds float, ptr %b, i64 %92 - %93 = load float, ptr %arrayidx245, align 4 - %mul246 = fmul float %93, %alpha - %arrayidx249 = getelementptr inbounds float, ptr %a, i64 %92 - %94 = load float, ptr %arrayidx249, align 4 - %add250 = fadd float %94, %mul246 - store float %add250, ptr %arrayidx249, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 32 - %cmp = icmp slt i64 %indvars.iv.next, 3200 - br i1 %cmp, label %for.body, label %for.end - -; CHECK-LABEL: @goo32 - -; CHECK: for.body: -; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ] -; CHECK: %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvar -; CHECK: %0 = load float, ptr %arrayidx, align 4 -; CHECK: %mul = fmul float %0, %alpha -; CHECK: %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvar -; CHECK: %1 = load float, ptr %arrayidx2, align 4 -; CHECK: %add = fadd float %1, %mul -; CHECK: store float %add, ptr %arrayidx2, align 4 -; CHECK: %indvar.next = add i64 %indvar, 1 -; CHECK: %exitcond = icmp eq i64 %indvar, 3199 -; CHECK: br i1 %exitcond, label %for.end, label %for.body -; CHECK: ret - -for.end: ; preds = %for.body - ret void -} - -attributes #0 = { nounwind uwtable } diff --git a/llvm/test/Transforms/LoopReroll/complex_reroll.ll b/llvm/test/Transforms/LoopReroll/complex_reroll.ll deleted file mode 100644 index 27139ee..0000000 --- a/llvm/test/Transforms/LoopReroll/complex_reroll.ll +++ /dev/null @@ -1,237 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=loop-reroll %s | FileCheck %s -declare i32 @goo(i32, i32) - -@buf = external global ptr -@aaa = global [16 x i8] c"\01\02\03\04\05\06\07\08\09\0A\0B\0C\0D\0E\0F\10", align 1 - -define i32 @test1(i32 %len) { -; CHECK-LABEL: @test1( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SUM44_020:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[WHILE_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVAR]] to i32 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr @aaa, i64 [[INDVAR]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[SCEVGEP]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP1]] to i64 -; CHECK-NEXT: [[ADD]] = add i64 [[CONV]], [[SUM44_020]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP0]], 15 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] -; CHECK: while.end: -; CHECK-NEXT: [[ADD9_LCSSA:%.*]] = phi i64 [ [[ADD]], [[WHILE_BODY]] ] -; CHECK-NEXT: [[CONV11:%.*]] = trunc i64 [[ADD9_LCSSA]] to i32 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @goo(i32 0, i32 [[CONV11]]) -; CHECK-NEXT: unreachable -; -entry: - br label %while.body - -while.body: - - %dec22 = phi i32 [ 4, %entry ], [ %dec, %while.body ] - %buf.021 = phi ptr [ @aaa, %entry ], [ %add.ptr, %while.body ] - %sum44.020 = phi i64 [ 0, %entry ], [ %add9, %while.body ] - %0 = load i8, ptr %buf.021, align 1 - %conv = zext i8 %0 to i64 - %add = add i64 %conv, %sum44.020 - %arrayidx1 = getelementptr inbounds i8, ptr %buf.021, i64 1 - %1 = load i8, ptr %arrayidx1, align 1 - %conv2 = zext i8 %1 to i64 - %add3 = add i64 %add, %conv2 - %arrayidx4 = getelementptr inbounds i8, ptr %buf.021, i64 2 - %2 = load i8, ptr %arrayidx4, align 1 - %conv5 = zext i8 %2 to i64 - %add6 = add i64 %add3, %conv5 - %arrayidx7 = getelementptr inbounds i8, ptr 
%buf.021, i64 3 - %3 = load i8, ptr %arrayidx7, align 1 - %conv8 = zext i8 %3 to i64 - %add9 = add i64 %add6, %conv8 - %add.ptr = getelementptr inbounds i8, ptr %buf.021, i64 4 - %dec = add nsw i32 %dec22, -1 - %tobool = icmp eq i32 %dec, 0 - br i1 %tobool, label %while.end, label %while.body - -while.end: ; preds = %while.body - %conv11 = trunc i64 %add9 to i32 - %call = tail call i32 @goo(i32 0, i32 %conv11) - unreachable -} - -define i32 @test2(i32 %N, ptr nocapture readonly %a, i32 %S) { -; CHECK-LABEL: @test2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP_9:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP_9]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.lr.ph: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.for.cond.cleanup_crit_edge: -; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[S_ADDR_0_LCSSA:%.*]] = phi i32 [ [[ADD2_LCSSA]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]] ], [ [[S:%.*]], [[ENTRY:%.*]] ] -; CHECK-NEXT: ret i32 [[S_ADDR_0_LCSSA]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_LR_PH]] ] -; CHECK-NEXT: [[S_ADDR_011:%.*]] = phi i32 [ [[S]], [[FOR_BODY_LR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVAR]] to i32 -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[INDVAR]], 2 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SCEVGEP]], align 4 -; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP6]], [[S_ADDR_011]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP4]], [[TMP3]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]], label [[FOR_BODY]] -; -entry: - %cmp.9 = icmp sgt i32 %N, 0 - br i1 %cmp.9, label %for.body.lr.ph, label %for.cond.cleanup - -for.body.lr.ph: - br label %for.body - -for.cond.for.cond.cleanup_crit_edge: - br label %for.cond.cleanup - -for.cond.cleanup: - %S.addr.0.lcssa = phi i32 [ %add2, %for.cond.for.cond.cleanup_crit_edge ], [ %S, %entry ] - ret i32 %S.addr.0.lcssa - -for.body: - - %i.012 = phi i32 [ 0, %for.body.lr.ph ], [ %add3, %for.body ] - %S.addr.011 = phi i32 [ %S, %for.body.lr.ph ], [ %add2, %for.body ] - %a.addr.010 = phi ptr [ %a, %for.body.lr.ph ], [ %incdec.ptr1, %for.body ] - %incdec.ptr = getelementptr inbounds i32, ptr %a.addr.010, i64 1 - %0 = load i32, ptr %a.addr.010, align 4 - %add = add nsw i32 %0, %S.addr.011 - %incdec.ptr1 = getelementptr inbounds i32, ptr %a.addr.010, i64 2 - %1 = load i32, ptr %incdec.ptr, align 4 - %add2 = add nsw i32 %add, %1 - %add3 = add nsw i32 %i.012, 2 - %cmp = icmp slt i32 %add3, %N - br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge -} - -define i32 @test3(ptr nocapture readonly %buf, i32 %len) #0 { -; CHECK-LABEL: @test3( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[LEN:%.*]], 1 -; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]] -; CHECK: while.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], -2 -; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = shl 
nuw i32 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1 -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[S_012:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ undef, [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVAR]] to i32 -; CHECK-NEXT: [[TMP5:%.*]] = mul nsw i64 [[INDVAR]], -4 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[BUF:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SCEVGEP]], align 4 -; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP6]], [[S_012]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP4]], [[TMP3]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ] -; CHECK-NEXT: br label [[WHILE_END]] -; CHECK: while.end: -; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[ADD2_LCSSA]], [[WHILE_END_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[S_0_LCSSA]] -; -entry: - %cmp10 = icmp sgt i32 %len, 1 - br i1 %cmp10, label %while.body.preheader, label %while.end - -while.body.preheader: ; preds = %entry - br label %while.body - -while.body: ; preds = %while.body.preheader, %while.body - - %i.013 = phi i32 [ %sub, %while.body ], [ %len, %while.body.preheader ] - %S.012 = phi i32 [ %add2, %while.body ], [ undef, %while.body.preheader ] - %buf.addr.011 = phi ptr [ %add.ptr, %while.body ], [ %buf, %while.body.preheader ] - %0 = load i32, ptr %buf.addr.011, align 4 - %add = add nsw i32 %0, %S.012 - %arrayidx1 = getelementptr inbounds i32, ptr %buf.addr.011, i64 -1 - %1 = load i32, ptr %arrayidx1, align 4 - %add2 = add nsw i32 %add, %1 - %add.ptr = getelementptr inbounds i32, ptr %buf.addr.011, i64 -2 - %sub = add nsw i32 %i.013, -2 - %cmp = icmp sgt i32 %sub, 1 - br i1 %cmp, label %while.body, label %while.end.loopexit - -while.end.loopexit: ; preds = %while.body - br label %while.end - -while.end: ; preds = %while.end.loopexit, %entry - %S.0.lcssa = phi i32 [ undef, %entry ], [ %add2, %while.end.loopexit ] - ret i32 %S.0.lcssa -} - -define i32 @test4(i32 %len) { -; CHECK-LABEL: @test4( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SUM44_020:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[WHILE_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVAR]] to i32 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr @aaa, i64 [[INDVAR]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[SCEVGEP]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP1]] to i64 -; CHECK-NEXT: [[ADD]] = add i64 [[CONV]], [[SUM44_020]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP0]], 23 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] -; CHECK: while.end: -; CHECK-NEXT: [[ADD9_LCSSA:%.*]] = phi i64 [ [[ADD]], [[WHILE_BODY]] ] -; CHECK-NEXT: [[CONV11:%.*]] = trunc i64 [[ADD9_LCSSA]] to i32 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @goo(i32 0, i32 [[CONV11]]) -; CHECK-NEXT: unreachable -; -entry: - br label %while.body - -while.body: - %a = phi i32 [ 4, %entry ], [ %a.next, %while.body ] - %b = phi i32 [ 6, %entry ], [ %b.next, %while.body ] - 
%buf.021 = phi ptr [ @aaa, %entry ], [ %add.ptr, %while.body ] - %sum44.020 = phi i64 [ 0, %entry ], [ %add9, %while.body ] - %0 = load i8, ptr %buf.021, align 1 - %conv = zext i8 %0 to i64 - %add = add i64 %conv, %sum44.020 - %arrayidx1 = getelementptr inbounds i8, ptr %buf.021, i64 1 - %1 = load i8, ptr %arrayidx1, align 1 - %conv2 = zext i8 %1 to i64 - %add3 = add i64 %add, %conv2 - %arrayidx4 = getelementptr inbounds i8, ptr %buf.021, i64 2 - %2 = load i8, ptr %arrayidx4, align 1 - %conv5 = zext i8 %2 to i64 - %add6 = add i64 %add3, %conv5 - %arrayidx7 = getelementptr inbounds i8, ptr %buf.021, i64 3 - %3 = load i8, ptr %arrayidx7, align 1 - %conv8 = zext i8 %3 to i64 - %add9 = add i64 %add6, %conv8 - %add.ptr = getelementptr inbounds i8, ptr %buf.021, i64 4 - %a.next = add nsw i32 %a, -1 - %b.next = add nsw i32 %b, -1 - %cond = add nsw i32 %a, %b - %tobool = icmp eq i32 %cond, 0 - br i1 %tobool, label %while.end, label %while.body - -while.end: ; preds = %while.body - %conv11 = trunc i64 %add9 to i32 - %call = tail call i32 @goo(i32 0, i32 %conv11) - unreachable -} - diff --git a/llvm/test/Transforms/LoopReroll/external_use.ll b/llvm/test/Transforms/LoopReroll/external_use.ll deleted file mode 100644 index 2124f3b..0000000 --- a/llvm/test/Transforms/LoopReroll/external_use.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: opt < %s -passes=loop-reroll -S | FileCheck %s - -; Check whether rerolling is rejected if values of the base and root -; instruction are used outside the loop block. - -; Only the base/root instructions except a loop increment instruction -define void @test1() { -entry: - br label %loop1 - -loop1: -;CHECK-LABEL: loop1: -;CHECK-NEXT: %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop1 ] -;CHECK-NEXT: %indvar.1 = add nsw i64 %indvar, 1 - - %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop1 ] - %indvar.1 = add nsw i64 %indvar, 1 - %indvar.next = add nsw i64 %indvar, 2 - %cmp = icmp slt i64 %indvar.next, 200 - br i1 %cmp, label %loop1, label %exit - -exit: - %var1 = phi i64 [ %indvar.1, %loop1 ] - %var2 = phi i64 [ %indvar, %loop1 ] - ret void -} - -; Both the base/root instructions and reduction instructions -define void @test2() { -entry: - br label %loop2 - -loop2: -;CHECK-LABEL: loop2: -;CHECK-NEXT: %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %loop2 ] -;CHECK-NEXT: %redvar = phi i32 [ 0, %entry ], [ %add.2, %loop2 ] -;CHECK-NEXT: %indvar.1 = add nuw nsw i32 %indvar, 1 -;CHECK-NEXT: %indvar.2 = add nuw nsw i32 %indvar, 2 - - %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %loop2 ] - %redvar = phi i32 [ 0, %entry ], [ %add.2, %loop2 ] - %indvar.1 = add nuw nsw i32 %indvar, 1 - %indvar.2 = add nuw nsw i32 %indvar, 2 - %mul.0 = mul nsw i32 %indvar, %indvar - %mul.1 = mul nsw i32 %indvar.1, %indvar.1 - %mul.2 = mul nsw i32 %indvar.2, %indvar.2 - %add.0 = add nsw i32 %redvar, %mul.0 - %add.1 = add nsw i32 %add.0, %mul.1 - %add.2 = add nsw i32 %add.1, %mul.2 - %indvar.next = add nuw nsw i32 %indvar, 3 - %cmp = icmp slt i32 %indvar.next, 300 - br i1 %cmp, label %loop2, label %exit - -exit: - %a = phi i32 [ %indvar, %loop2 ] - %b = phi i32 [ %indvar.1, %loop2 ] - %c = phi i32 [ %indvar.2, %loop2 ] - %x = phi i32 [ %add.2, %loop2 ] - ret void -} diff --git a/llvm/test/Transforms/LoopReroll/extra_instr.ll b/llvm/test/Transforms/LoopReroll/extra_instr.ll deleted file mode 100644 index 3114463..0000000 --- a/llvm/test/Transforms/LoopReroll/extra_instr.ll +++ /dev/null @@ -1,361 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: 
--version 2 -; RUN: opt -S -passes=loop-reroll %s | FileCheck %s -target triple = "aarch64--linux-gnu" - -define void @rerollable1(ptr nocapture %a) { -; CHECK-LABEL: define void @rerollable1 -; CHECK-SAME: (ptr nocapture [[A:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[IV]], 2 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 160 -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[IV]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 80 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[VALUE0:%.*]] = load i32, ptr [[SCEVGEP1]], align 4 -; CHECK-NEXT: store i32 [[VALUE0]], ptr [[SCEVGEP]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND2:%.*]] = icmp eq i64 [[IV]], 9 -; CHECK-NEXT: br i1 [[EXITCOND2]], label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: ret void -; -entry: - br label %loop - -loop: - - - ; base instruction - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - - ; NO unrerollable instructions - - ; extra simple arithmetic operations, used by root instructions - %plus20 = add nuw nsw i64 %iv, 20 - %plus10 = add nuw nsw i64 %iv, 10 - - ; root instruction 0 - %ldptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 0 - %value0 = load i32, ptr %ldptr0, align 4 - %stptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 0 - store i32 %value0, ptr %stptr0, align 4 - - ; root instruction 1 - %ldptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 1 - %value1 = load i32, ptr %ldptr1, align 4 - %stptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 1 - store i32 %value1, ptr %stptr1, align 4 - - ; loop-increment - %iv.next = add nuw nsw i64 %iv, 1 - - ; latch - %exitcond = icmp eq i64 %iv.next, 5 - br i1 %exitcond, label %exit, label %loop - -exit: - ret void -} - -define void @unrerollable1(ptr nocapture %a) { -; CHECK-LABEL: define void @unrerollable1 -; CHECK-SAME: (ptr nocapture [[A:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[STPTRX:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[IV]], i64 0 -; CHECK-NEXT: store i32 999, ptr [[STPTRX]], align 4 -; CHECK-NEXT: [[PLUS20:%.*]] = add nuw nsw i64 [[IV]], 20 -; CHECK-NEXT: [[PLUS10:%.*]] = add nuw nsw i64 [[IV]], 10 -; CHECK-NEXT: [[LDPTR0:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS20]], i64 0 -; CHECK-NEXT: [[VALUE0:%.*]] = load i32, ptr [[LDPTR0]], align 4 -; CHECK-NEXT: [[STPTR0:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS10]], i64 0 -; CHECK-NEXT: store i32 [[VALUE0]], ptr [[STPTR0]], align 4 -; CHECK-NEXT: [[LDPTR1:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS20]], i64 1 -; CHECK-NEXT: [[VALUE1:%.*]] = load i32, ptr [[LDPTR1]], align 4 -; CHECK-NEXT: [[STPTR1:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS10]], i64 1 -; CHECK-NEXT: store i32 [[VALUE1]], ptr [[STPTR1]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 5 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: ret void -; -entry: - br label 
%loop - -loop: - - - ; base instruction - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - - ; unrerollable instructions using %iv - %stptrx = getelementptr inbounds [2 x i32], ptr %a, i64 %iv, i64 0 - store i32 999, ptr %stptrx, align 4 - - ; extra simple arithmetic operations, used by root instructions - %plus20 = add nuw nsw i64 %iv, 20 - %plus10 = add nuw nsw i64 %iv, 10 - - ; root instruction 0 - %ldptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 0 - %value0 = load i32, ptr %ldptr0, align 4 - %stptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 0 - store i32 %value0, ptr %stptr0, align 4 - - ; root instruction 1 - %ldptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 1 - %value1 = load i32, ptr %ldptr1, align 4 - %stptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 1 - store i32 %value1, ptr %stptr1, align 4 - - ; loop-increment - %iv.next = add nuw nsw i64 %iv, 1 - - ; latch - %exitcond = icmp eq i64 %iv.next, 5 - br i1 %exitcond, label %exit, label %loop - -exit: - ret void -} - -define void @unrerollable2(ptr nocapture %a) { -; CHECK-LABEL: define void @unrerollable2 -; CHECK-SAME: (ptr nocapture [[A:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[STPTRX:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[IV_NEXT]], i64 0 -; CHECK-NEXT: store i32 999, ptr [[STPTRX]], align 4 -; CHECK-NEXT: [[PLUS20:%.*]] = add nuw nsw i64 [[IV]], 20 -; CHECK-NEXT: [[PLUS10:%.*]] = add nuw nsw i64 [[IV]], 10 -; CHECK-NEXT: [[LDPTR0:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS20]], i64 0 -; CHECK-NEXT: [[VALUE0:%.*]] = load i32, ptr [[LDPTR0]], align 4 -; CHECK-NEXT: [[STPTR0:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS10]], i64 0 -; CHECK-NEXT: store i32 [[VALUE0]], ptr [[STPTR0]], align 4 -; CHECK-NEXT: [[LDPTR1:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS20]], i64 1 -; CHECK-NEXT: [[VALUE1:%.*]] = load i32, ptr [[LDPTR1]], align 4 -; CHECK-NEXT: [[STPTR1:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS10]], i64 1 -; CHECK-NEXT: store i32 [[VALUE1]], ptr [[STPTR1]], align 4 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 5 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: ret void -; -entry: - br label %loop - -loop: - - - ; base instruction - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - - ; loop-increment - %iv.next = add nuw nsw i64 %iv, 1 - - ; unrerollable instructions using %iv.next - %stptrx = getelementptr inbounds [2 x i32], ptr %a, i64 %iv.next, i64 0 - store i32 999, ptr %stptrx, align 4 - - ; extra simple arithmetic operations, used by root instructions - %plus20 = add nuw nsw i64 %iv, 20 - %plus10 = add nuw nsw i64 %iv, 10 - - ; root instruction 0 - %ldptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 0 - %value0 = load i32, ptr %ldptr0, align 4 - %stptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 0 - store i32 %value0, ptr %stptr0, align 4 - - ; root instruction 1 - %ldptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 1 - %value1 = load i32, ptr %ldptr1, align 4 - %stptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 1 - store i32 %value1, ptr %stptr1, align 4 - - ; latch - %exitcond = icmp eq i64 %iv.next, 5 - br i1 
%exitcond, label %exit, label %loop - -exit: - ret void -} - -define dso_local void @rerollable2() { -; CHECK-LABEL: define dso_local void @rerollable2() { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[IV]], 24 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IV]], 20 -; CHECK-NEXT: [[IV_SCALED_DIV5:%.*]] = udiv i32 [[TMP1]], 5 -; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_DIV5]]) -; CHECK-NEXT: [[IV_SCALED_ADD4_DIV5:%.*]] = udiv i32 [[TMP0]], 5 -; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD4_DIV5]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV]], 8 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: ret void -; -entry: - br label %loop - -loop: - - - ; induction variable - %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] - - ; scale instruction - %iv.mul3 = mul nuw nsw i32 %iv, 3 - - ; extra simple arithmetic operations, used by root instructions - %iv.scaled = add nuw nsw i32 %iv.mul3, 20 - - ; NO unrerollable instructions - - ; root set 1 - - ; base instruction - %iv.scaled.div5 = udiv i32 %iv.scaled, 5 - tail call void @bar(i32 %iv.scaled.div5) - ; root instruction 0 - %iv.scaled.add1 = add nuw nsw i32 %iv.scaled, 1 - %iv.scaled.add1.div5 = udiv i32 %iv.scaled.add1, 5 - tail call void @bar(i32 %iv.scaled.add1.div5) - ; root instruction 2 - %iv.scaled.add2 = add nuw nsw i32 %iv.scaled, 2 - %iv.scaled.add2.div5 = udiv i32 %iv.scaled.add2, 5 - tail call void @bar(i32 %iv.scaled.add2.div5) - - ; root set 2 - - ; base instruction - %iv.scaled.add4 = add nuw nsw i32 %iv.scaled, 4 - %iv.scaled.add4.div5 = udiv i32 %iv.scaled.add4, 5 - tail call void @bar(i32 %iv.scaled.add4.div5) - ; root instruction 0 - %iv.scaled.add5 = add nuw nsw i32 %iv.scaled, 5 - %iv.scaled.add5.div5 = udiv i32 %iv.scaled.add5, 5 - tail call void @bar(i32 %iv.scaled.add5.div5) - ; root instruction 2 - %iv.scaled.add6 = add nuw nsw i32 %iv.scaled, 6 - %iv.scaled.add6.div5 = udiv i32 %iv.scaled.add6, 5 - tail call void @bar(i32 %iv.scaled.add6.div5) - - ; loop-increment - %iv.next = add nuw nsw i32 %iv, 1 - - ; latch - %cmp = icmp ult i32 %iv.next, 3 - br i1 %cmp, label %loop, label %exit - -exit: - ret void -} - -define dso_local void @unrerollable3() { -; CHECK-LABEL: define dso_local void @unrerollable3() { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[IV_MUL3:%.*]] = mul nuw nsw i32 [[IV]], 3 -; CHECK-NEXT: [[IV_SCALED:%.*]] = add nuw nsw i32 [[IV_MUL3]], 20 -; CHECK-NEXT: [[IV_MUL7:%.*]] = mul nuw nsw i32 [[IV]], 7 -; CHECK-NEXT: tail call void @bar(i32 [[IV_MUL7]]) -; CHECK-NEXT: [[IV_SCALED_DIV5:%.*]] = udiv i32 [[IV_SCALED]], 5 -; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_DIV5]]) -; CHECK-NEXT: [[IV_SCALED_ADD1:%.*]] = add nuw nsw i32 [[IV_SCALED]], 1 -; CHECK-NEXT: [[IV_SCALED_ADD1_DIV5:%.*]] = udiv i32 [[IV_SCALED_ADD1]], 5 -; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD1_DIV5]]) -; CHECK-NEXT: [[IV_SCALED_ADD2:%.*]] = add nuw nsw i32 [[IV_SCALED]], 2 -; CHECK-NEXT: [[IV_SCALED_ADD2_DIV5:%.*]] = udiv i32 [[IV_SCALED_ADD2]], 5 -; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD2_DIV5]]) -; CHECK-NEXT: [[IV_SCALED_ADD4:%.*]] = add nuw nsw i32 [[IV_SCALED]], 4 -; CHECK-NEXT: 
[[IV_SCALED_ADD4_DIV5:%.*]] = udiv i32 [[IV_SCALED_ADD4]], 5 -; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD4_DIV5]]) -; CHECK-NEXT: [[IV_SCALED_ADD5:%.*]] = add nuw nsw i32 [[IV_SCALED]], 5 -; CHECK-NEXT: [[IV_SCALED_ADD5_DIV5:%.*]] = udiv i32 [[IV_SCALED_ADD5]], 5 -; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD5_DIV5]]) -; CHECK-NEXT: [[IV_SCALED_ADD6:%.*]] = add nuw nsw i32 [[IV_SCALED]], 6 -; CHECK-NEXT: [[IV_SCALED_ADD6_DIV5:%.*]] = udiv i32 [[IV_SCALED_ADD6]], 5 -; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD6_DIV5]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV_NEXT]], 3 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] -; CHECK: exit: -; CHECK-NEXT: ret void -; -entry: - br label %loop - -loop: - - - ; induction variable - %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] - - ; scale instruction - %iv.mul3 = mul nuw nsw i32 %iv, 3 - - ; extra simple arithmetic operations, used by root instructions - %iv.scaled = add nuw nsw i32 %iv.mul3, 20 - - ; unrerollable instructions using %iv - %iv.mul7 = mul nuw nsw i32 %iv, 7 - tail call void @bar(i32 %iv.mul7) - - ; root set 1 - - ; base instruction - %iv.scaled.div5 = udiv i32 %iv.scaled, 5 - tail call void @bar(i32 %iv.scaled.div5) - ; root instruction 0 - %iv.scaled.add1 = add nuw nsw i32 %iv.scaled, 1 - %iv.scaled.add1.div5 = udiv i32 %iv.scaled.add1, 5 - tail call void @bar(i32 %iv.scaled.add1.div5) - ; root instruction 2 - %iv.scaled.add2 = add nuw nsw i32 %iv.scaled, 2 - %iv.scaled.add2.div5 = udiv i32 %iv.scaled.add2, 5 - tail call void @bar(i32 %iv.scaled.add2.div5) - - ; root set 2 - - ; base instruction - %iv.scaled.add4 = add nuw nsw i32 %iv.scaled, 4 - %iv.scaled.add4.div5 = udiv i32 %iv.scaled.add4, 5 - tail call void @bar(i32 %iv.scaled.add4.div5) - ; root instruction 0 - %iv.scaled.add5 = add nuw nsw i32 %iv.scaled, 5 - %iv.scaled.add5.div5 = udiv i32 %iv.scaled.add5, 5 - tail call void @bar(i32 %iv.scaled.add5.div5) - ; root instruction 2 - %iv.scaled.add6 = add nuw nsw i32 %iv.scaled, 6 - %iv.scaled.add6.div5 = udiv i32 %iv.scaled.add6, 5 - tail call void @bar(i32 %iv.scaled.add6.div5) - - ; loop-increment - %iv.next = add nuw nsw i32 %iv, 1 - - ; latch - %cmp = icmp ult i32 %iv.next, 3 - br i1 %cmp, label %loop, label %exit - -exit: - ret void -} - -declare dso_local void @bar(i32) diff --git a/llvm/test/Transforms/LoopReroll/indvar_with_ext.ll b/llvm/test/Transforms/LoopReroll/indvar_with_ext.ll deleted file mode 100644 index 3fcd43f..0000000 --- a/llvm/test/Transforms/LoopReroll/indvar_with_ext.ll +++ /dev/null @@ -1,184 +0,0 @@ -; RUN: opt -S -passes=loop-reroll %s | FileCheck %s -target triple = "aarch64--linux-gnu" - -define void @test(i32 %n, ptr %arrayidx200, ptr %arrayidx164, ptr %arrayidx172) { -entry: - %rem.i = srem i32 %n, 4 - %t22 = load float, ptr %arrayidx172, align 4 - %cmp.9 = icmp eq i32 %n, 0 - %t7 = sext i32 %n to i64 - br i1 %cmp.9, label %while.end, label %while.body.preheader - -while.body.preheader: - br label %while.body - -while.body: -;CHECK-LABEL: while.body: -;CHECK-NEXT: %indvar = phi i64 [ %indvar.next, %while.body ], [ 0, %while.body.preheader ] -;CHECK-NEXT: %arrayidx62.i = getelementptr inbounds float, ptr %arrayidx200, i64 %indvar -;CHECK-NEXT: %t1 = load float, ptr %arrayidx62.i, align 4 -;CHECK-NEXT: %arrayidx64.i = getelementptr inbounds float, ptr %arrayidx164, i64 %indvar -;CHECK-NEXT: %t2 = load float, ptr %arrayidx64.i, align 4 -;CHECK-NEXT: %mul65.i = fmul fast float %t2, %t22 
-;CHECK-NEXT: %add66.i = fadd fast float %mul65.i, %t1 -;CHECK-NEXT: store float %add66.i, ptr %arrayidx62.i, align 4 -;CHECK-NEXT: %indvar.next = add i64 %indvar, 1 -;CHECK-NEXT: %exitcond = icmp eq i64 %indvar, %{{[0-9]+}} -;CHECK-NEXT: br i1 %exitcond, label %while.end.loopexit, label %while.body - - %indvars.iv.i423 = phi i64 [ %indvars.iv.next.i424, %while.body ], [ 0, %while.body.preheader ] - %i.22.i = phi i32 [ %add103.i, %while.body ], [ %rem.i, %while.body.preheader ] - %arrayidx62.i = getelementptr inbounds float, ptr %arrayidx200, i64 %indvars.iv.i423 - %t1 = load float, ptr %arrayidx62.i, align 4 - %arrayidx64.i = getelementptr inbounds float, ptr %arrayidx164, i64 %indvars.iv.i423 - %t2 = load float, ptr %arrayidx64.i, align 4 - %mul65.i = fmul fast float %t2, %t22 - %add66.i = fadd fast float %mul65.i, %t1 - store float %add66.i, ptr %arrayidx62.i, align 4 - %t3 = add nsw i64 %indvars.iv.i423, 1 - %arrayidx71.i = getelementptr inbounds float, ptr %arrayidx200, i64 %t3 - %t4 = load float, ptr %arrayidx71.i, align 4 - %arrayidx74.i = getelementptr inbounds float, ptr %arrayidx164, i64 %t3 - %t5 = load float, ptr %arrayidx74.i, align 4 - %mul75.i = fmul fast float %t5, %t22 - %add76.i = fadd fast float %mul75.i, %t4 - store float %add76.i, ptr %arrayidx71.i, align 4 - %add103.i = add nsw i32 %i.22.i, 2 - %t6 = sext i32 %add103.i to i64 - %cmp58.i = icmp slt i64 %t6, %t7 - %indvars.iv.next.i424 = add i64 %indvars.iv.i423, 2 - br i1 %cmp58.i, label %while.body, label %while.end.loopexit - -while.end.loopexit: - br label %while.end - -while.end: - ret void -} - -; Function Attrs: noinline norecurse nounwind -define i32 @test2(i64 %n, ptr nocapture %x, ptr nocapture readonly %y) { -entry: - %cmp18 = icmp sgt i64 %n, 0 - br i1 %cmp18, label %for.body.preheader, label %for.end - -for.body.preheader: ; preds = %entry - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - -;CHECK-LABEL: for.body: -;CHECK-NEXT: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %for.body.preheader ] -;CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %y, i64 %indvar -;CHECK-NEXT: [[T1:%[0-9]+]] = load i32, ptr %arrayidx, align 4 -;CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %indvar -;CHECK-NEXT: store i32 [[T1]], ptr %arrayidx3, align 4 -;CHECK-NEXT: %indvar.next = add i64 %indvar, 1 -;CHECK-NEXT: %exitcond = icmp eq i64 %indvar, %{{[0-9]+}} -;CHECK-NEXT: br i1 %exitcond, label %for.end.loopexit, label %for.body - - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] - %arrayidx = getelementptr inbounds i32, ptr %y, i64 %indvars.iv - %0 = load i32, ptr %arrayidx, align 4 - %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %indvars.iv - store i32 %0, ptr %arrayidx3, align 4 - %1 = or disjoint i64 %indvars.iv, 1 - %arrayidx5 = getelementptr inbounds i32, ptr %y, i64 %1 - %2 = load i32, ptr %arrayidx5, align 4 - %arrayidx8 = getelementptr inbounds i32, ptr %x, i64 %1 - store i32 %2, ptr %arrayidx8, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 - %cmp = icmp slt i64 %indvars.iv.next, %n - br i1 %cmp, label %for.body, label %for.end.loopexit - -for.end.loopexit: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.end.loopexit, %entry - ret i32 0 -} - -; Function Attrs: noinline norecurse nounwind -define i32 @test3(i32 %n, ptr nocapture %x, ptr nocapture readonly %y) { -entry: - %cmp21 = icmp sgt i32 %n, 0 - br i1 %cmp21, label %for.body.preheader, label %for.end - -for.body.preheader: ; preds = 
%entry - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - -;CHECK-LABEL: for.body: -;CHECK: %add12 = add i8 %i.022, 2 -;CHECK-NEXT: %conv = sext i8 %add12 to i32 -;CHECK-NEXT: %cmp = icmp slt i32 %conv, %n -;CHECK-NEXT: br i1 %cmp, label %for.body, label %for.end.loopexit - - %conv23 = phi i32 [ %conv, %for.body ], [ 0, %for.body.preheader ] - %i.022 = phi i8 [ %add12, %for.body ], [ 0, %for.body.preheader ] - %idxprom = sext i8 %i.022 to i64 - %arrayidx = getelementptr inbounds i32, ptr %y, i64 %idxprom - %0 = load i32, ptr %arrayidx, align 4 - %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %idxprom - store i32 %0, ptr %arrayidx3, align 4 - %add = or disjoint i32 %conv23, 1 - %idxprom5 = sext i32 %add to i64 - %arrayidx6 = getelementptr inbounds i32, ptr %y, i64 %idxprom5 - %1 = load i32, ptr %arrayidx6, align 4 - %arrayidx10 = getelementptr inbounds i32, ptr %x, i64 %idxprom5 - store i32 %1, ptr %arrayidx10, align 4 - %add12 = add i8 %i.022, 2 - %conv = sext i8 %add12 to i32 - %cmp = icmp slt i32 %conv, %n - br i1 %cmp, label %for.body, label %for.end.loopexit - -for.end.loopexit: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.end.loopexit, %entry - ret i32 0 -} - -; Function Attrs: noinline norecurse nounwind -define i32 @test4(i64 %n, ptr nocapture %x, ptr nocapture readonly %y) { -entry: - %cmp18 = icmp eq i64 %n, 0 - br i1 %cmp18, label %for.end, label %for.body.preheader - -for.body.preheader: ; preds = %entry - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - -;CHECK-LABEL: for.body: -;CHECK-NEXT: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %for.body.preheader ] -;CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %y, i64 %indvar -;CHECK-NEXT: [[T1:%[0-9]+]] = load i32, ptr %arrayidx, align 4 -;CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %indvar -;CHECK-NEXT: store i32 [[T1]], ptr %arrayidx3, align 4 -;CHECK-NEXT: %indvar.next = add i64 %indvar, 1 -;CHECK-NEXT: %exitcond = icmp eq i64 %indvar, %{{[0-9]+}} -;CHECK-NEXT: br i1 %exitcond, label %for.end.loopexit, label %for.body - - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] - %arrayidx = getelementptr inbounds i32, ptr %y, i64 %indvars.iv - %0 = load i32, ptr %arrayidx, align 4 - %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %indvars.iv - store i32 %0, ptr %arrayidx3, align 4 - %1 = or disjoint i64 %indvars.iv, 1 - %arrayidx5 = getelementptr inbounds i32, ptr %y, i64 %1 - %2 = load i32, ptr %arrayidx5, align 4 - %arrayidx8 = getelementptr inbounds i32, ptr %x, i64 %1 - store i32 %2, ptr %arrayidx8, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 - %cmp = icmp ult i64 %indvars.iv.next, %n - br i1 %cmp, label %for.body, label %for.end.loopexit - -for.end.loopexit: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.end.loopexit, %entry - ret i32 0 -} - diff --git a/llvm/test/Transforms/LoopReroll/negative.ll b/llvm/test/Transforms/LoopReroll/negative.ll deleted file mode 100644 index ef850c0..0000000 --- a/llvm/test/Transforms/LoopReroll/negative.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: opt -S -passes=loop-reroll %s | FileCheck %s -target triple = "aarch64--linux-gnu" -@buf = global [16 x i8] c"\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A", align 1 - -define i32 @test1(i32 %len, ptr nocapture readonly %buf) #0 { -entry: - %cmp.13 = icmp sgt i32 %len, 1 - br i1 %cmp.13, label %while.body.lr.ph, label %while.end - -while.body.lr.ph: ; preds = %entry - 
br label %while.body - -while.body: -;CHECK-LABEL: while.body: -;CHECK-NEXT: %indvar = phi i32 [ %indvar.next, %while.body ], [ 0, %while.body.lr.ph ] -;CHECK-NEXT: %sum4.015 = phi i64 [ 0, %while.body.lr.ph ], [ %add, %while.body ] -;CHECK-NOT: %sub5 = add nsw i32 %len.addr.014, -1 -;CHECK-NOT: %sub5 = add nsw i32 %len.addr.014, -2 -;CHECK: br i1 %exitcond, label %while.cond.while.end_crit_edge, label %while.body - - %sum4.015 = phi i64 [ 0, %while.body.lr.ph ], [ %add4, %while.body ] - %len.addr.014 = phi i32 [ %len, %while.body.lr.ph ], [ %sub5, %while.body ] - %idxprom = sext i32 %len.addr.014 to i64 - %arrayidx = getelementptr inbounds i8, ptr %buf, i64 %idxprom - %0 = load i8, ptr %arrayidx, align 1 - %conv = zext i8 %0 to i64 - %add = add i64 %conv, %sum4.015 - %sub = add nsw i32 %len.addr.014, -1 - %idxprom1 = sext i32 %sub to i64 - %arrayidx2 = getelementptr inbounds i8, ptr %buf, i64 %idxprom1 - %1 = load i8, ptr %arrayidx2, align 1 - %conv3 = zext i8 %1 to i64 - %add4 = add i64 %add, %conv3 - %sub5 = add nsw i32 %len.addr.014, -2 - %cmp = icmp sgt i32 %sub5, 1 - br i1 %cmp, label %while.body, label %while.cond.while.end_crit_edge - -while.cond.while.end_crit_edge: ; preds = %while.body - %add4.lcssa = phi i64 [ %add4, %while.body ] - %phitmp = trunc i64 %add4.lcssa to i32 - br label %while.end - -while.end: ; preds = %while.cond.while.end_crit_edge, %entry - %sum4.0.lcssa = phi i32 [ %phitmp, %while.cond.while.end_crit_edge ], [ 0, %entry ] - ret i32 %sum4.0.lcssa - unreachable -} - diff --git a/llvm/test/Transforms/LoopReroll/nonconst_lb.ll b/llvm/test/Transforms/LoopReroll/nonconst_lb.ll deleted file mode 100644 index 80ea050..0000000 --- a/llvm/test/Transforms/LoopReroll/nonconst_lb.ll +++ /dev/null @@ -1,168 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=loop-reroll -S | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" -target triple = "thumbv7-none-linux" - -;void foo(int *A, int *B, int m, int n) { -; for (int i = m; i < n; i+=4) { -; A[i+0] = B[i+0] * 4; -; A[i+1] = B[i+1] * 4; -; A[i+2] = B[i+2] * 4; -; A[i+3] = B[i+3] * 4; -; } -;} -define void @foo(ptr nocapture %A, ptr nocapture readonly %B, i32 %m, i32 %n) { -; CHECK-LABEL: @foo( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP34:%.*]] = icmp slt i32 [[M:%.*]], [[N:%.*]] -; CHECK-NEXT: br i1 [[CMP34]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[M]], 4 -; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N]], i32 [[TMP0]]) -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SMAX]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], [[M]] -; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP2]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i32 [[TMP3]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 3 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[M]], [[INDVAR]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP7]], 2 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP6]] -; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX2]], align 4 -; 
CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVAR]], [[TMP5]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] -; CHECK: for.end.loopexit: -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - %cmp34 = icmp slt i32 %m, %n - br i1 %cmp34, label %for.body, label %for.end - -for.body: ; preds = %entry, %for.body - %i.035 = phi i32 [ %add18, %for.body ], [ %m, %entry ] - %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.035 - %0 = load i32, ptr %arrayidx, align 4 - %mul = shl nsw i32 %0, 2 - %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.035 - store i32 %mul, ptr %arrayidx2, align 4 - %add3 = add nsw i32 %i.035, 1 - %arrayidx4 = getelementptr inbounds i32, ptr %B, i32 %add3 - %1 = load i32, ptr %arrayidx4, align 4 - %mul5 = shl nsw i32 %1, 2 - %arrayidx7 = getelementptr inbounds i32, ptr %A, i32 %add3 - store i32 %mul5, ptr %arrayidx7, align 4 - %add8 = add nsw i32 %i.035, 2 - %arrayidx9 = getelementptr inbounds i32, ptr %B, i32 %add8 - %2 = load i32, ptr %arrayidx9, align 4 - %mul10 = shl nsw i32 %2, 2 - %arrayidx12 = getelementptr inbounds i32, ptr %A, i32 %add8 - store i32 %mul10, ptr %arrayidx12, align 4 - %add13 = add nsw i32 %i.035, 3 - %arrayidx14 = getelementptr inbounds i32, ptr %B, i32 %add13 - %3 = load i32, ptr %arrayidx14, align 4 - %mul15 = shl nsw i32 %3, 2 - %arrayidx17 = getelementptr inbounds i32, ptr %A, i32 %add13 - store i32 %mul15, ptr %arrayidx17, align 4 - %add18 = add nsw i32 %i.035, 4 - %cmp = icmp slt i32 %add18, %n - br i1 %cmp, label %for.body, label %for.end - -for.end: ; preds = %for.body, %entry - ret void -} - -;void daxpy_ur(int n,float da,ptr dx,ptr dy) -; { -; int m = n % 4; -; for (int i = m; i < n; i = i + 4) -; { -; dy[i] = dy[i] + da*dx[i]; -; dy[i+1] = dy[i+1] + da*dx[i+1]; -; dy[i+2] = dy[i+2] + da*dx[i+2]; -; dy[i+3] = dy[i+3] + da*dx[i+3]; -; } -; } -define void @daxpy_ur(i32 %n, float %da, ptr nocapture readonly %dx, ptr nocapture %dy) { -; CHECK-LABEL: @daxpy_ur( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[REM:%.*]] = srem i32 [[N:%.*]], 4 -; CHECK-NEXT: [[CMP55:%.*]] = icmp slt i32 [[REM]], [[N]] -; CHECK-NEXT: br i1 [[CMP55]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[REM]] -; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i32 [[TMP3]], 3 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[REM]], [[INDVAR]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DY:%.*]], i32 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[DX:%.*]], i32 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP7]], [[DA:%.*]] -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP6]], [[MUL]] -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVAR]], [[TMP4]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] -; CHECK: 
for.end.loopexit: -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - %rem = srem i32 %n, 4 - %cmp55 = icmp slt i32 %rem, %n - br i1 %cmp55, label %for.body, label %for.end - -for.body: ; preds = %entry, %for.body - %i.056 = phi i32 [ %add27, %for.body ], [ %rem, %entry ] - %arrayidx = getelementptr inbounds float, ptr %dy, i32 %i.056 - %0 = load float, ptr %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds float, ptr %dx, i32 %i.056 - %1 = load float, ptr %arrayidx1, align 4 - %mul = fmul float %1, %da - %add = fadd float %0, %mul - store float %add, ptr %arrayidx, align 4 - %add3 = add nsw i32 %i.056, 1 - %arrayidx4 = getelementptr inbounds float, ptr %dy, i32 %add3 - %2 = load float, ptr %arrayidx4, align 4 - %arrayidx6 = getelementptr inbounds float, ptr %dx, i32 %add3 - %3 = load float, ptr %arrayidx6, align 4 - %mul7 = fmul float %3, %da - %add8 = fadd float %2, %mul7 - store float %add8, ptr %arrayidx4, align 4 - %add11 = add nsw i32 %i.056, 2 - %arrayidx12 = getelementptr inbounds float, ptr %dy, i32 %add11 - %4 = load float, ptr %arrayidx12, align 4 - %arrayidx14 = getelementptr inbounds float, ptr %dx, i32 %add11 - %5 = load float, ptr %arrayidx14, align 4 - %mul15 = fmul float %5, %da - %add16 = fadd float %4, %mul15 - store float %add16, ptr %arrayidx12, align 4 - %add19 = add nsw i32 %i.056, 3 - %arrayidx20 = getelementptr inbounds float, ptr %dy, i32 %add19 - %6 = load float, ptr %arrayidx20, align 4 - %arrayidx22 = getelementptr inbounds float, ptr %dx, i32 %add19 - %7 = load float, ptr %arrayidx22, align 4 - %mul23 = fmul float %7, %da - %add24 = fadd float %6, %mul23 - store float %add24, ptr %arrayidx20, align 4 - %add27 = add nsw i32 %i.056, 4 - %cmp = icmp slt i32 %add27, %n - br i1 %cmp, label %for.body, label %for.end - -for.end: ; preds = %for.body, %entry - ret void -} - diff --git a/llvm/test/Transforms/LoopReroll/ptrindvar.ll b/llvm/test/Transforms/LoopReroll/ptrindvar.ll deleted file mode 100644 index 90f6353..0000000 --- a/llvm/test/Transforms/LoopReroll/ptrindvar.ll +++ /dev/null @@ -1,125 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt -S -passes=loop-reroll %s | FileCheck %s -target triple = "aarch64--linux-gnu" - -define i32 @test(ptr readonly %buf, ptr readnone %end) #0 { -; CHECK-LABEL: define i32 @test -; CHECK-SAME: (ptr readonly [[BUF:%.*]], ptr readnone [[END:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[BUF2:%.*]] = ptrtoint ptr [[BUF]] to i64 -; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 -; CHECK-NEXT: [[CMP_9:%.*]] = icmp eq ptr [[BUF]], [[END]] -; CHECK-NEXT: br i1 [[CMP_9]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] -; CHECK: while.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 -; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[BUF2]] -; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[S_011:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ undef, [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[INDVAR]], 2 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[BUF]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SCEVGEP]], align 4 -; 
CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP6]], [[S_011]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR]], [[TMP4]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ] -; CHECK-NEXT: br label [[WHILE_END]] -; CHECK: while.end: -; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[ADD2_LCSSA]], [[WHILE_END_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[S_0_LCSSA]] -; -entry: - %cmp.9 = icmp eq ptr %buf, %end - br i1 %cmp.9, label %while.end, label %while.body.preheader - -while.body.preheader: - br label %while.body - -while.body: - - %S.011 = phi i32 [ %add2, %while.body ], [ undef, %while.body.preheader ] - %buf.addr.010 = phi ptr [ %add.ptr, %while.body ], [ %buf, %while.body.preheader ] - %0 = load i32, ptr %buf.addr.010, align 4 - %add = add nsw i32 %0, %S.011 - %arrayidx1 = getelementptr inbounds i32, ptr %buf.addr.010, i64 1 - %1 = load i32, ptr %arrayidx1, align 4 - %add2 = add nsw i32 %add, %1 - %add.ptr = getelementptr inbounds i32, ptr %buf.addr.010, i64 2 - %cmp = icmp eq ptr %add.ptr, %end - br i1 %cmp, label %while.end.loopexit, label %while.body - -while.end.loopexit: - %add2.lcssa = phi i32 [ %add2, %while.body ] - br label %while.end - -while.end: - %S.0.lcssa = phi i32 [ undef, %entry ], [ %add2.lcssa, %while.end.loopexit ] - ret i32 %S.0.lcssa -} - -define i32 @test2(ptr readonly %buf, ptr readnone %end) #0 { -; CHECK-LABEL: define i32 @test2 -; CHECK-SAME: (ptr readonly [[BUF:%.*]], ptr readnone [[END:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[END2:%.*]] = ptrtoint ptr [[END]] to i64 -; CHECK-NEXT: [[BUF1:%.*]] = ptrtoint ptr [[BUF]] to i64 -; CHECK-NEXT: [[CMP_9:%.*]] = icmp eq ptr [[BUF]], [[END]] -; CHECK-NEXT: br i1 [[CMP_9]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] -; CHECK: while.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[BUF1]], -8 -; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[END2]] -; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[S_011:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ undef, [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[TMP5:%.*]] = mul nsw i64 [[INDVAR]], -4 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[BUF]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SCEVGEP]], align 4 -; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP6]], [[S_011]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR]], [[TMP4]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ] -; CHECK-NEXT: br label [[WHILE_END]] -; CHECK: while.end: -; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[ADD2_LCSSA]], [[WHILE_END_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[S_0_LCSSA]] -; -entry: - %cmp.9 = icmp eq ptr %buf, %end - br i1 %cmp.9, label %while.end, label %while.body.preheader - -while.body.preheader: - br label %while.body - -while.body: - - %S.011 = phi i32 [ %add2, %while.body ], [ undef, 
%while.body.preheader ] - %buf.addr.010 = phi ptr [ %add.ptr, %while.body ], [ %buf, %while.body.preheader ] - %0 = load i32, ptr %buf.addr.010, align 4 - %add = add nsw i32 %0, %S.011 - %arrayidx1 = getelementptr inbounds i32, ptr %buf.addr.010, i64 -1 - %1 = load i32, ptr %arrayidx1, align 4 - %add2 = add nsw i32 %add, %1 - %add.ptr = getelementptr inbounds i32, ptr %buf.addr.010, i64 -2 - %cmp = icmp eq ptr %add.ptr, %end - br i1 %cmp, label %while.end.loopexit, label %while.body - -while.end.loopexit: - %add2.lcssa = phi i32 [ %add2, %while.body ] - br label %while.end - -while.end: - %S.0.lcssa = phi i32 [ undef, %entry ], [ %add2.lcssa, %while.end.loopexit ] - ret i32 %S.0.lcssa -} diff --git a/llvm/test/Transforms/LoopReroll/reduction.ll b/llvm/test/Transforms/LoopReroll/reduction.ll deleted file mode 100644 index 94f4d53..0000000 --- a/llvm/test/Transforms/LoopReroll/reduction.ll +++ /dev/null @@ -1,132 +0,0 @@ -; RUN: opt < %s -passes=loop-reroll -S | FileCheck %s -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define i32 @foo(ptr nocapture readonly %x) #0 { -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %r.029 = phi i32 [ 0, %entry ], [ %add12, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %x, i64 %indvars.iv - %0 = load i32, ptr %arrayidx, align 4 - %add = add nsw i32 %0, %r.029 - %1 = or disjoint i64 %indvars.iv, 1 - %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %1 - %2 = load i32, ptr %arrayidx3, align 4 - %add4 = add nsw i32 %add, %2 - %3 = or disjoint i64 %indvars.iv, 2 - %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %3 - %4 = load i32, ptr %arrayidx7, align 4 - %add8 = add nsw i32 %add4, %4 - %5 = or disjoint i64 %indvars.iv, 3 - %arrayidx11 = getelementptr inbounds i32, ptr %x, i64 %5 - %6 = load i32, ptr %arrayidx11, align 4 - %add12 = add nsw i32 %add8, %6 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4 - %7 = trunc i64 %indvars.iv.next to i32 - %cmp = icmp slt i32 %7, 400 - br i1 %cmp, label %for.body, label %for.end - -; CHECK-LABEL: @foo - -; CHECK: for.body: -; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ] -; CHECK: %r.029 = phi i32 [ 0, %entry ], [ %add, %for.body ] -; CHECK: %arrayidx = getelementptr inbounds i32, ptr %x, i64 %indvar -; CHECK: %1 = load i32, ptr %arrayidx, align 4 -; CHECK: %add = add nsw i32 %1, %r.029 -; CHECK: %indvar.next = add i64 %indvar, 1 -; CHECK: %exitcond = icmp eq i32 %0, 399 -; CHECK: br i1 %exitcond, label %for.end, label %for.body - -; CHECK: ret - -for.end: ; preds = %for.body - ret i32 %add12 -} - -define float @bar(ptr nocapture readonly %x) #0 { -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %r.029 = phi float [ 0.0, %entry ], [ %add12, %for.body ] - %arrayidx = getelementptr inbounds float, ptr %x, i64 %indvars.iv - %0 = load float, ptr %arrayidx, align 4 - %add = fadd float %0, %r.029 - %1 = or disjoint i64 %indvars.iv, 1 - %arrayidx3 = getelementptr inbounds float, ptr %x, i64 %1 - %2 = load float, ptr %arrayidx3, align 4 - %add4 = fadd float %add, %2 - %3 = or disjoint i64 %indvars.iv, 2 - %arrayidx7 = getelementptr inbounds float, ptr %x, i64 %3 - %4 = load float, ptr %arrayidx7, align 4 - %add8 = fadd float %add4, %4 - 
%5 = or disjoint i64 %indvars.iv, 3 - %arrayidx11 = getelementptr inbounds float, ptr %x, i64 %5 - %6 = load float, ptr %arrayidx11, align 4 - %add12 = fadd float %add8, %6 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4 - %7 = trunc i64 %indvars.iv.next to i32 - %cmp = icmp slt i32 %7, 400 - br i1 %cmp, label %for.body, label %for.end - -; CHECK-LABEL: @bar - -; CHECK: for.body: -; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ] -; CHECK: %r.029 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] -; CHECK: %arrayidx = getelementptr inbounds float, ptr %x, i64 %indvar -; CHECK: %1 = load float, ptr %arrayidx, align 4 -; CHECK: %add = fadd float %1, %r.029 -; CHECK: %indvar.next = add i64 %indvar, 1 -; CHECK: %exitcond = icmp eq i32 %0, 399 -; CHECK: br i1 %exitcond, label %for.end, label %for.body - -; CHECK: ret - -for.end: ; preds = %for.body - ret float %add12 -} - -define i32 @foo_unusedphi(ptr nocapture readonly %x) #0 { -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %r.029 = phi i32 [ 0, %entry ], [ %add12, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %x, i64 %indvars.iv - %0 = load i32, ptr %arrayidx, align 4 - %add = add nsw i32 %0, %0 - %1 = or disjoint i64 %indvars.iv, 1 - %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %1 - %2 = load i32, ptr %arrayidx3, align 4 - %add4 = add nsw i32 %add, %2 - %3 = or disjoint i64 %indvars.iv, 2 - %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %3 - %4 = load i32, ptr %arrayidx7, align 4 - %add8 = add nsw i32 %add4, %4 - %5 = or disjoint i64 %indvars.iv, 3 - %arrayidx11 = getelementptr inbounds i32, ptr %x, i64 %5 - %6 = load i32, ptr %arrayidx11, align 4 - %add12 = add nsw i32 %add8, %6 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4 - %7 = trunc i64 %indvars.iv.next to i32 - %cmp = icmp slt i32 %7, 400 - br i1 %cmp, label %for.body, label %for.end - -; CHECK-LABEL: @foo_unusedphi -; The above is just testing for a crash - no specific output expected. 
- -; CHECK: ret - -for.end: ; preds = %for.body - ret i32 %add12 -} - -attributes #0 = { nounwind readonly uwtable } - diff --git a/llvm/test/Transforms/LoopReroll/reroll_with_dbg.ll b/llvm/test/Transforms/LoopReroll/reroll_with_dbg.ll deleted file mode 100644 index e720e76..0000000 --- a/llvm/test/Transforms/LoopReroll/reroll_with_dbg.ll +++ /dev/null @@ -1,130 +0,0 @@ -;RUN: opt < %s -passes=loop-reroll -S | FileCheck %s -;void foo(ptr restrict a, ptr restrict b, int n) { -; for(int i = 0; i < n; i+=4) { -; a[i] = b[i]; -; a[i+1] = b[i+1]; -; a[i+2] = b[i+2]; -; a[i+3] = b[i+3]; -; } -;} -target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" -target triple = "armv4t--linux-gnueabi" - -; Function Attrs: nounwind -define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i32 %n) #0 !dbg !4 { -entry: -;CHECK-LABEL: @foo - - tail call void @llvm.dbg.value(metadata ptr %a, metadata !12, metadata !22), !dbg !23 - tail call void @llvm.dbg.value(metadata ptr %b, metadata !13, metadata !22), !dbg !24 - tail call void @llvm.dbg.value(metadata i32 %n, metadata !14, metadata !22), !dbg !25 - tail call void @llvm.dbg.value(metadata i32 0, metadata !15, metadata !22), !dbg !26 - %cmp.30 = icmp sgt i32 %n, 0, !dbg !27 - br i1 %cmp.30, label %for.body.preheader, label %for.cond.cleanup, !dbg !29 - -for.body.preheader: ; preds = %entry - br label %for.body, !dbg !30 - -for.cond.cleanup.loopexit: ; preds = %for.body - br label %for.cond.cleanup, !dbg !32 - -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - ret void, !dbg !32 - -for.body: ; preds = %for.body.preheader, %for.body -;CHECK: for.body: -;CHECK: %indvar = phi i32 [ %indvar.next, %for.body ], [ 0, {{.*}} ] -;CHECK: load -;CHECK: store -;CHECK-NOT: load -;CHECK-NOT: store -;CHECK: call void @llvm.dbg.value -;CHECK: %indvar.next = add i32 %indvar, 1 -;CHECK: icmp eq i32 %indvar - %i.031 = phi i32 [ %add13, %for.body ], [ 0, %for.body.preheader ] - %arrayidx = getelementptr inbounds float, ptr %b, i32 %i.031, !dbg !30 - %0 = load i32, ptr %arrayidx, align 4, !dbg !30, !tbaa !33 - %arrayidx1 = getelementptr inbounds float, ptr %a, i32 %i.031, !dbg !37 - store i32 %0, ptr %arrayidx1, align 4, !dbg !38, !tbaa !33 - %add = or disjoint i32 %i.031, 1, !dbg !39 - %arrayidx2 = getelementptr inbounds float, ptr %b, i32 %add, !dbg !40 - %1 = load i32, ptr %arrayidx2, align 4, !dbg !40, !tbaa !33 - %arrayidx4 = getelementptr inbounds float, ptr %a, i32 %add, !dbg !41 - store i32 %1, ptr %arrayidx4, align 4, !dbg !42, !tbaa !33 - %add5 = or disjoint i32 %i.031, 2, !dbg !43 - %arrayidx6 = getelementptr inbounds float, ptr %b, i32 %add5, !dbg !44 - %2 = load i32, ptr %arrayidx6, align 4, !dbg !44, !tbaa !33 - %arrayidx8 = getelementptr inbounds float, ptr %a, i32 %add5, !dbg !45 - store i32 %2, ptr %arrayidx8, align 4, !dbg !46, !tbaa !33 - %add9 = or disjoint i32 %i.031, 3, !dbg !47 - %arrayidx10 = getelementptr inbounds float, ptr %b, i32 %add9, !dbg !48 - %3 = load i32, ptr %arrayidx10, align 4, !dbg !48, !tbaa !33 - %arrayidx12 = getelementptr inbounds float, ptr %a, i32 %add9, !dbg !49 - store i32 %3, ptr %arrayidx12, align 4, !dbg !50, !tbaa !33 - %add13 = add nuw nsw i32 %i.031, 4, !dbg !51 - tail call void @llvm.dbg.value(metadata i32 %add13, metadata !15, metadata !22), !dbg !26 - %cmp = icmp slt i32 %add13, %n, !dbg !27 - br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !dbg !29 -} - -; Function Attrs: nounwind readnone -declare void @llvm.dbg.value(metadata, metadata, metadata) #1 - 
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+strict-align" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!17, !18, !19, !20} -!llvm.ident = !{!21} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) -!1 = !DIFile(filename: "test.c", directory: "/home/weimingz/llvm-build/release/community-tip") -!2 = !{} -!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11) -!5 = !DISubroutineType(types: !6) -!6 = !{null, !7, !7, !10} -!7 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !8) -!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 32, align: 32) -!9 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float) -!10 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) -!11 = !{!12, !13, !14, !15} -!12 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 1, type: !7) -!13 = !DILocalVariable(name: "b", arg: 2, scope: !4, file: !1, line: 1, type: !7) -!14 = !DILocalVariable(name: "n", arg: 3, scope: !4, file: !1, line: 1, type: !10) -!15 = !DILocalVariable(name: "i", scope: !16, file: !1, line: 2, type: !10) -!16 = distinct !DILexicalBlock(scope: !4, file: !1, line: 2, column: 3) -!17 = !{i32 2, !"Dwarf Version", i32 4} -!18 = !{i32 2, !"Debug Info Version", i32 3} -!19 = !{i32 1, !"wchar_size", i32 4} -!20 = !{i32 1, !"min_enum_size", i32 4} -!21 = !{!"clang version 3.8.0"} -!22 = !DIExpression() -!23 = !DILocation(line: 1, column: 27, scope: !4) -!24 = !DILocation(line: 1, column: 47, scope: !4) -!25 = !DILocation(line: 1, column: 54, scope: !4) -!26 = !DILocation(line: 2, column: 11, scope: !16) -!27 = !DILocation(line: 2, column: 20, scope: !28) -!28 = distinct !DILexicalBlock(scope: !16, file: !1, line: 2, column: 3) -!29 = !DILocation(line: 2, column: 3, scope: !16) -!30 = !DILocation(line: 3, column: 12, scope: !31) -!31 = distinct !DILexicalBlock(scope: !28, file: !1, line: 2, column: 31) -!32 = !DILocation(line: 8, column: 1, scope: !4) -!33 = !{!34, !34, i64 0} -!34 = !{!"float", !35, i64 0} -!35 = !{!"omnipotent char", !36, i64 0} -!36 = !{!"Simple C/C++ TBAA"} -!37 = !DILocation(line: 3, column: 5, scope: !31) -!38 = !DILocation(line: 3, column: 10, scope: !31) -!39 = !DILocation(line: 4, column: 17, scope: !31) -!40 = !DILocation(line: 4, column: 14, scope: !31) -!41 = !DILocation(line: 4, column: 5, scope: !31) -!42 = !DILocation(line: 4, column: 12, scope: !31) -!43 = !DILocation(line: 5, column: 17, scope: !31) -!44 = !DILocation(line: 5, column: 14, scope: !31) -!45 = !DILocation(line: 5, column: 5, scope: !31) -!46 = !DILocation(line: 5, column: 12, scope: !31) -!47 = !DILocation(line: 6, column: 17, scope: !31) -!48 = !DILocation(line: 6, column: 14, scope: !31) -!49 = !DILocation(line: 6, column: 5, scope: !31) -!50 = !DILocation(line: 6, column: 12, scope: !31) -!51 = !DILocation(line: 2, column: 26, scope: !28) -- cgit v1.1 From 75798f21ca7626419ed341cf723ba51889e85500 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 9 Feb 2024 10:20:23 +0000 Subject: [gn 
build] Port ac3bd2bd5301 --- llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn index bed26df..a1c0427 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn @@ -50,7 +50,6 @@ static_library("Scalar") { "LoopLoadElimination.cpp", "LoopPassManager.cpp", "LoopPredication.cpp", - "LoopRerollPass.cpp", "LoopRotation.cpp", "LoopSimplifyCFG.cpp", "LoopSink.cpp", -- cgit v1.1 From 1198c3aaff63d4ce63b760b4effc14babc0bdd8a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 18:51:48 +0000 Subject: [X86] PromoteMaskArithmetic - use ISD::isBitwiseLogicOp wrapper. NFC. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f310010..881524f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48012,8 +48012,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG, if (Depth >= SelectionDAG::MaxRecursionDepth) return SDValue(); - if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND && - N->getOpcode() != ISD::OR) + if (!ISD::isBitwiseLogicOp(N->getOpcode())) return SDValue(); SDValue N0 = N->getOperand(0); -- cgit v1.1 From 713fe6dfd4803fba41f2102479580bed058ca0b3 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 19:00:49 +0000 Subject: [X86] PromoteMaskArithmetic - consistently use SDValue instead of underlying SDNode. NFC. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 38 ++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 881524f..7db1b8d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48006,24 +48006,24 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, // given x, y and z are of type \p VT. We can do so, if operands are either // truncates from VT types, the second operand is a vector of constants or can // be recursively promoted. -static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG, +static SDValue PromoteMaskArithmetic(SDValue N, EVT VT, SelectionDAG &DAG, unsigned Depth) { // Limit recursion to avoid excessive compile times. if (Depth >= SelectionDAG::MaxRecursionDepth) return SDValue(); - if (!ISD::isBitwiseLogicOp(N->getOpcode())) + if (!ISD::isBitwiseLogicOp(N.getOpcode())) return SDValue(); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); + SDValue N0 = N.getOperand(0); + SDValue N1 = N.getOperand(1); SDLoc DL(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT)) + if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT)) return SDValue(); - if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1)) + if (SDValue NN0 = PromoteMaskArithmetic(N0, VT, DAG, Depth + 1)) N0 = NN0; else { // The Left side has to be a trunc. 
@@ -48037,7 +48037,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
     N0 = N0.getOperand(0);
   }
 
-  if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
+  if (SDValue NN1 = PromoteMaskArithmetic(N1, VT, DAG, Depth + 1))
     N1 = NN1;
   else {
     // The right side has to be a 'trunc' or a constant vector.
@@ -48052,7 +48052,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
       N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
   }
 
-  return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
+  return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
 }
 
 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
@@ -48061,24 +48061,24 @@ static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
 // some of the transition sequences.
 // Even with AVX-512 this is still useful for removing casts around logical
 // operations on vXi1 mask types.
-static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+static SDValue PromoteMaskArithmetic(SDValue N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
-  EVT VT = N->getValueType(0);
+  EVT VT = N.getValueType();
   assert(VT.isVector() && "Expected vector type");
 
   SDLoc DL(N);
-  assert((N->getOpcode() == ISD::ANY_EXTEND ||
-          N->getOpcode() == ISD::ZERO_EXTEND ||
-          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
+  assert((N.getOpcode() == ISD::ANY_EXTEND ||
+          N.getOpcode() == ISD::ZERO_EXTEND ||
+          N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
 
-  SDValue Narrow = N->getOperand(0);
+  SDValue Narrow = N.getOperand(0);
   EVT NarrowVT = Narrow.getValueType();
 
   // Generate the wide operation.
-  SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
+  SDValue Op = PromoteMaskArithmetic(Narrow, VT, DAG, 0);
   if (!Op)
     return SDValue();
 
-  switch (N->getOpcode()) {
+  switch (N.getOpcode()) {
   default: llvm_unreachable("Unexpected opcode");
   case ISD::ANY_EXTEND:
     return Op;
@@ -52549,7 +52549,7 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
 
   // Attempt to promote any comparison mask ops before moving the
   // SIGN_EXTEND_INREG in the way.
-  if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
+  if (SDValue Promote = PromoteMaskArithmetic(N0, DAG, Subtarget))
     return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
 
   if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
@@ -52770,7 +52770,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
     return V;
 
   if (VT.isVector()) {
-    if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
+    if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DAG, Subtarget))
       return R;
 
     if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
@@ -52984,7 +52984,7 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
     return V;
 
   if (VT.isVector())
-    if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
+    if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DAG, Subtarget))
       return R;
 
   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
-- 
cgit v1.1

From 3902f9b6e2d925d50f9a4861d78e5aba07b6ef11 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 9 Feb 2024 10:23:49 +0000
Subject: [X86] PromoteMaskArithmetic - explicitly attempt to constant fold
 zext(c) instead of relying on getNode()

Don't rely on isBuildVectorOfConstantSDNodes/getNode to constant fold; this
could also help in cases where the constant is behind a bitcast.

Noticed while investigating #80668
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7db1b8d..5d8a3a9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48026,7 +48026,7 @@ static SDValue PromoteMaskArithmetic(SDValue N, EVT VT, SelectionDAG &DAG,
   if (SDValue NN0 = PromoteMaskArithmetic(N0, VT, DAG, Depth + 1))
     N0 = NN0;
   else {
-    // The Left side has to be a trunc.
+    // The left side has to be a trunc.
     if (N0.getOpcode() != ISD::TRUNCATE)
       return SDValue();
 
@@ -48040,16 +48040,16 @@ static SDValue PromoteMaskArithmetic(SDValue N, EVT VT, SelectionDAG &DAG,
   if (SDValue NN1 = PromoteMaskArithmetic(N1, VT, DAG, Depth + 1))
     N1 = NN1;
   else {
-    // The right side has to be a 'trunc' or a constant vector.
+    // The right side has to be a 'trunc' or a (foldable) constant.
     bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
                     N1.getOperand(0).getValueType() == VT;
-    if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
-      return SDValue();
-
     if (RHSTrunc)
       N1 = N1.getOperand(0);
+    else if (SDValue Cst =
+                 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
+      N1 = Cst;
     else
-      N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
+      return SDValue();
   }
 
   return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
-- 
cgit v1.1

From 2cb61a1d117e2c20e3372bc23bf12b919feaaca2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?=
Date: Fri, 9 Feb 2024 11:54:10 +0100
Subject: [clang][Interp] Fix initializing PredefinedExprs

---
 clang/lib/AST/Interp/ByteCodeExprGen.cpp | 3 +--
 clang/test/Sema/ms_predefined_expr.cpp | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 2539e75..aaa8ac8 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -1750,8 +1750,7 @@ bool ByteCodeExprGen<Emitter>::VisitPredefinedExpr(const PredefinedExpr *E) {
   if (DiscardResult)
     return true;
 
-  assert(!Initializing);
-  return this->visit(E->getFunctionName());
+  return this->delegate(E->getFunctionName());
 }
 
 template <class Emitter>
diff --git a/clang/test/Sema/ms_predefined_expr.cpp b/clang/test/Sema/ms_predefined_expr.cpp
index 9f4eb27..b42a494 100644
--- a/clang/test/Sema/ms_predefined_expr.cpp
+++ b/clang/test/Sema/ms_predefined_expr.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -fsyntax-only -Wmicrosoft -verify -fms-extensions
+// RUN: %clang_cc1 %s -fsyntax-only -Wmicrosoft -verify -fms-extensions -fexperimental-new-constant-interpreter
 
 using size_t = __SIZE_TYPE__;
-- 
cgit v1.1

From 316373abcc2abde414d4b9601f4752b6939a2133 Mon Sep 17 00:00:00 2001
From: David Spickett
Date: Fri, 9 Feb 2024 11:15:48 +0000
Subject: [llvm][AArch64] Refactor expansion of CALL_BTI and CALL_RVMARKER
 (#80419)

After a lot of churn in expandCALL_BTI, it ended up doing the exact same
thing that expandCALL_RVMARKER does.

This change factors out the common code to make that clear.
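
A minimal sketch of the factored-out shape, with plain structs standing in
for MachineInstr and MachineOperand (illustrative only; the real helper is
the createCall function in the diff below). In the diff, expandCALL_RVMARKER
passes RegMaskStartIdx = 2 because the RV target and call target precede the
regmask, while expandCALL_BTI passes 1.

    // Hypothetical stand-ins; only the operand-copying loop is the point.
    #include <cstddef>
    #include <vector>

    struct Operand {
      bool IsRegMask = false;  // true for the register-mask operand
      bool IsImplicit = false; // implicit-use flag on the final call
    };

    struct Instr {
      std::vector<Operand> Ops;
    };

    // Shared logic of both expanders: operands of the pseudo before the
    // regmask are register arguments and become implicit operands of the
    // real call; the regmask and everything after it are copied unchanged.
    // Assumes, as the real code does, that a regmask operand is present.
    void copyCallOperands(Instr &Call, const Instr &Pseudo,
                          std::size_t RegMaskStartIdx) {
      std::size_t I = RegMaskStartIdx;
      while (!Pseudo.Ops[I].IsRegMask) {
        Operand Arg = Pseudo.Ops[I++];
        Arg.IsImplicit = true; // register args are implicit uses of the call
        Call.Ops.push_back(Arg);
      }
      for (; I < Pseudo.Ops.size(); ++I) // regmask and trailing operands
        Call.Ops.push_back(Pseudo.Ops[I]);
    }
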
---
 .../Target/AArch64/AArch64ExpandPseudoInsts.cpp | 87 ++++++++++------------
 1 file changed, 40 insertions(+), 47 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 1af064b..b2c52b4 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -774,6 +774,39 @@ bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB,
   return true;
 }
 
+// Create a call to CallTarget, copying over all the operands from *MBBI,
+// starting at the regmask.
+static MachineInstr *createCall(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                const AArch64InstrInfo *TII,
+                                MachineOperand &CallTarget,
+                                unsigned RegMaskStartIdx) {
+  unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR;
+  MachineInstr *Call =
+      BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(Opc)).getInstr();
+
+  assert((CallTarget.isGlobal() || CallTarget.isReg()) &&
+         "invalid operand for regular call");
+  Call->addOperand(CallTarget);
+
+  // Register arguments are added during ISel, but cannot be added as explicit
+  // operands of the branch as it expects to be B <target> which is only one
+  // operand. Instead they are implicit operands used by the branch.
+  while (!MBBI->getOperand(RegMaskStartIdx).isRegMask()) {
+    auto MOP = MBBI->getOperand(RegMaskStartIdx);
+    assert(MOP.isReg() && "can only add register operands");
+    Call->addOperand(MachineOperand::CreateReg(
+        MOP.getReg(), /*Def=*/false, /*Implicit=*/true, /*isKill=*/false,
+        /*isDead=*/false, /*isUndef=*/MOP.isUndef()));
+    RegMaskStartIdx++;
+  }
+  for (const MachineOperand &MO :
+       llvm::drop_begin(MBBI->operands(), RegMaskStartIdx))
+    Call->addOperand(MO);
+
+  return Call;
+}
+
 bool AArch64ExpandPseudo::expandCALL_RVMARKER(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
   // Expand CALL_RVMARKER pseudo to:
@@ -782,31 +815,12 @@ bool AArch64ExpandPseudo::expandCALL_RVMARKER(
   // - another branch, to the runtime function
   // Mark the sequence as bundle, to avoid passes moving other code in between.
   MachineInstr &MI = *MBBI;
-
-  MachineInstr *OriginalCall;
   MachineOperand &RVTarget = MI.getOperand(0);
-  MachineOperand &CallTarget = MI.getOperand(1);
-  assert((CallTarget.isGlobal() || CallTarget.isReg()) &&
-         "invalid operand for regular call");
   assert(RVTarget.isGlobal() && "invalid operand for attached call");
-  unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR;
-  OriginalCall = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr();
-  OriginalCall->addOperand(CallTarget);
-
-  unsigned RegMaskStartIdx = 2;
-  // Skip register arguments. Those are added during ISel, but are not
-  // needed for the concrete branch.
-  while (!MI.getOperand(RegMaskStartIdx).isRegMask()) {
-    auto MOP = MI.getOperand(RegMaskStartIdx);
-    assert(MOP.isReg() && "can only add register operands");
-    OriginalCall->addOperand(MachineOperand::CreateReg(
-        MOP.getReg(), /*Def=*/false, /*Implicit=*/true, /*isKill=*/false,
-        /*isDead=*/false, /*isUndef=*/MOP.isUndef()));
-    RegMaskStartIdx++;
-  }
-  for (const MachineOperand &MO :
-       llvm::drop_begin(MI.operands(), RegMaskStartIdx))
-    OriginalCall->addOperand(MO);
+  MachineInstr *OriginalCall =
+      createCall(MBB, MBBI, TII, MI.getOperand(1),
+                 // Regmask starts after the RV and call targets.
+                 /*RegMaskStartIdx=*/2);
 
   BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs))
       .addReg(AArch64::FP, RegState::Define)
@@ -834,31 +848,10 @@ bool AArch64ExpandPseudo::expandCALL_BTI(MachineBasicBlock &MBB,
   // - a BTI instruction
   // Mark the sequence as a bundle, to avoid passes moving other code in
   // between.
   MachineInstr &MI = *MBBI;
-  MachineOperand &CallTarget = MI.getOperand(0);
-  assert((CallTarget.isGlobal() || CallTarget.isReg()) &&
-         "invalid operand for regular call");
-  unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR;
-  MachineInstr *Call =
-      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr();
-  Call->addOperand(CallTarget);
-
-  // 1 because we already added the branch target above.
-  unsigned RegMaskStartIdx = 1;
-  // The branch is BL <target>, so we cannot attach the arguments of the called
-  // function to it. Those must be added as implicitly used by the branch.
-  while (!MI.getOperand(RegMaskStartIdx).isRegMask()) {
-    auto MOP = MI.getOperand(RegMaskStartIdx);
-    assert(MOP.isReg() && "can only add register operands");
-    Call->addOperand(MachineOperand::CreateReg(
-        MOP.getReg(), /*Def=*/false, /*Implicit=*/true, /*isKill=*/false,
-        /*isDead=*/false, /*isUndef=*/MOP.isUndef()));
-    RegMaskStartIdx++;
-  }
-  for (const MachineOperand &MO :
-       llvm::drop_begin(MI.operands(), RegMaskStartIdx))
-    Call->addOperand(MO);
+  MachineInstr *Call = createCall(MBB, MBBI, TII, MI.getOperand(0),
+                                  // Regmask starts after the call target.
+                                  /*RegMaskStartIdx=*/1);
 
   Call->setCFIType(*MBB.getParent(), MI.getCFIType());
-- 
cgit v1.1

From b5a273a1cfe6f509f8d2541e04d9186438f33348 Mon Sep 17 00:00:00 2001
From: Stephen Tozer
Date: Fri, 9 Feb 2024 12:02:59 +0000
Subject: [Polly][DebugInfo] Use getStableDebugLoc to avoid
 intrinsic-dependent behaviour (#81246)

Polly currently uses `getDebugLoc` in a few places to produce diagnostic
output; this is correct when interacting with specific instructions, but may
be incorrect when dealing with instruction ranges if debug intrinsics are
included. As a general rule, the debug locations attached to debug intrinsics
may be misleading compared to the surrounding instructions, and are not
generally used for anything other than determining variable scope info; the
recommended approach is therefore to use `getStableDebugLoc` instead, which
skips over debug intrinsics.

This is necessary to fix test failures that occur when enabling
non-instruction debug info, which removes debug intrinsics from basic blocks
and thus alters the diagnostic output of Polly (despite causing no functional
change).
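
A minimal sketch of the distinction, with simplified stand-ins for LLVM's
Instruction and DebugLoc types (illustrative only; the real getStableDebugLoc
skips any debug intrinsics before reading the attached location). The test
updates below show the effect: the first reported source location moves, e.g.
from 1:12 to 2:8, once locations attached to debug intrinsics no longer
count.

    // Hypothetical stand-ins; the point is that the "stable" query returns
    // the same answer whether or not debug intrinsics are in the block.
    #include <optional>
    #include <vector>

    struct Inst {
      bool IsDebugIntrinsic = false; // stands in for llvm.dbg.* calls
      std::optional<unsigned> Line;  // stands in for an attached DebugLoc
    };

    // Old behaviour: first attached location, debug intrinsics included.
    std::optional<unsigned> firstLoc(const std::vector<Inst> &BB) {
      for (const Inst &I : BB)
        if (I.Line)
          return I.Line;
      return std::nullopt;
    }

    // New behaviour: skip debug intrinsics, as getStableDebugLoc does.
    std::optional<unsigned> firstStableLoc(const std::vector<Inst> &BB) {
      for (const Inst &I : BB) {
        if (I.IsDebugIntrinsic)
          continue;
        if (I.Line)
          return I.Line;
      }
      return std::nullopt;
    }
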
--- polly/lib/Analysis/ScopDetectionDiagnostic.cpp | 4 ++-- polly/lib/Support/ScopLocation.cpp | 2 +- polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll | 8 ++++---- polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/polly/lib/Analysis/ScopDetectionDiagnostic.cpp b/polly/lib/Analysis/ScopDetectionDiagnostic.cpp index 364e21a..30fbd17 100644 --- a/polly/lib/Analysis/ScopDetectionDiagnostic.cpp +++ b/polly/lib/Analysis/ScopDetectionDiagnostic.cpp @@ -122,7 +122,7 @@ void getDebugLocations(const BBPair &P, DebugLoc &Begin, DebugLoc &End) { continue; Todo.append(succ_begin(BB), succ_end(BB)); for (const Instruction &Inst : *BB) { - DebugLoc DL = Inst.getDebugLoc(); + DebugLoc DL = Inst.getStableDebugLoc(); if (!DL) continue; @@ -821,7 +821,7 @@ std::string ReportUnprofitable::getEndUserMessage() const { const DebugLoc &ReportUnprofitable::getDebugLoc() const { for (const BasicBlock *BB : R->blocks()) for (const Instruction &Inst : *BB) - if (const DebugLoc &DL = Inst.getDebugLoc()) + if (const DebugLoc &DL = Inst.getStableDebugLoc()) return DL; return R->getEntry()->getTerminator()->getDebugLoc(); diff --git a/polly/lib/Support/ScopLocation.cpp b/polly/lib/Support/ScopLocation.cpp index 01f3d68..9f9941d 100644 --- a/polly/lib/Support/ScopLocation.cpp +++ b/polly/lib/Support/ScopLocation.cpp @@ -25,7 +25,7 @@ void getDebugLocation(const Region *R, unsigned &LineBegin, unsigned &LineEnd, for (const BasicBlock *BB : R->blocks()) for (const Instruction &Inst : *BB) { - DebugLoc DL = Inst.getDebugLoc(); + DebugLoc DL = Inst.getStableDebugLoc(); if (!DL) continue; diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll index 6182371..35986b5 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll @@ -19,20 +19,20 @@ ; If we reject non-affine loops the non-affine loop bound will be reported: ; -; REJECTNONAFFINELOOPS: remark: ReportLoopBound-01.c:1:12: The following errors keep this region from being a Scop. +; REJECTNONAFFINELOOPS: remark: ReportLoopBound-01.c:2:8: The following errors keep this region from being a Scop. ; REJECTNONAFFINELOOPS: remark: ReportLoopBound-01.c:2:8: Failed to derive an affine function from the loop bounds. ; REJECTNONAFFINELOOPS: remark: ReportLoopBound-01.c:3:5: Invalid Scop candidate ends here. ; If we allow non-affine loops the non-affine access will be reported: ; -; ALLOWNONAFFINELOOPS: remark: ReportLoopBound-01.c:1:12: The following errors keep this region from being a Scop. +; ALLOWNONAFFINELOOPS: remark: ReportLoopBound-01.c:2:8: The following errors keep this region from being a Scop. ; ALLOWNONAFFINELOOPS: remark: ReportLoopBound-01.c:3:5: The array subscript of "A" is not affine ; ALLOWNONAFFINELOOPS: remark: ReportLoopBound-01.c:3:5: Invalid Scop candidate ends here. ; If we allow non-affine loops and non-affine accesses the region will be reported as not profitable: ; -; ALLOWNONAFFINEALL: remark: ReportLoopBound-01.c:1:12: The following errors keep this region from being a Scop. -; ALLOWNONAFFINEALL: remark: ReportLoopBound-01.c:1:12: No profitable polyhedral optimization found +; ALLOWNONAFFINEALL: remark: ReportLoopBound-01.c:2:8: The following errors keep this region from being a Scop. 
+; ALLOWNONAFFINEALL: remark: ReportLoopBound-01.c:2:8: No profitable polyhedral optimization found ; ALLOWNONAFFINEALL: remark: ReportLoopBound-01.c:3:5: Invalid Scop candidate ends here. target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll index 7661bd0..a0f2704 100644 --- a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll +++ b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll @@ -2,7 +2,7 @@ ; ; Derived from test-suite/MultiSource/Benchmarks/BitBench/uuencode/uuencode.c ; -; CHECK: remark: uuencode.c:75:18: The following errors keep this region from being a Scop. +; CHECK: remark: uuencode.c:76:13: The following errors keep this region from being a Scop. ; CHECK: remark: uuencode.c:83:3: Loop cannot be handled because it has multiple exits. ; CHECK: remark: uuencode.c:95:21: Invalid Scop candidate ends here. -- cgit v1.1 From 614fab49b0b47c6463fb4d9d788790345bfdb6ce Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 9 Feb 2024 11:16:34 +0000 Subject: [X86] PromoteMaskArithmetic - share the same SDLoc argument instead of recreating it over and over again. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5d8a3a9..0c2d5f8 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48006,8 +48006,8 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, // given x, y and z are of type \p VT. We can do so, if operands are either // truncates from VT types, the second operand is a vector of constants or can // be recursively promoted. -static SDValue PromoteMaskArithmetic(SDValue N, EVT VT, SelectionDAG &DAG, - unsigned Depth) { +static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, + SelectionDAG &DAG, unsigned Depth) { // Limit recursion to avoid excessive compile times. if (Depth >= SelectionDAG::MaxRecursionDepth) return SDValue(); @@ -48017,13 +48017,12 @@ static SDValue PromoteMaskArithmetic(SDValue N, EVT VT, SelectionDAG &DAG, SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); - SDLoc DL(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT)) return SDValue(); - if (SDValue NN0 = PromoteMaskArithmetic(N0, VT, DAG, Depth + 1)) + if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1)) N0 = NN0; else { // The left side has to be a trunc. @@ -48037,7 +48036,7 @@ static SDValue PromoteMaskArithmetic(SDValue N, EVT VT, SelectionDAG &DAG, N0 = N0.getOperand(0); } - if (SDValue NN1 = PromoteMaskArithmetic(N1, VT, DAG, Depth + 1)) + if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1)) N1 = NN1; else { // The right side has to be a 'trunc' or a (foldable) constant. @@ -48061,12 +48060,11 @@ static SDValue PromoteMaskArithmetic(SDValue N, EVT VT, SelectionDAG &DAG, // some of the transition sequences. // Even with AVX-512 this is still useful for removing casts around logical // operations on vXi1 mask types. 
-static SDValue PromoteMaskArithmetic(SDValue N, SelectionDAG &DAG, +static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N.getValueType(); assert(VT.isVector() && "Expected vector type"); - - SDLoc DL(N); assert((N.getOpcode() == ISD::ANY_EXTEND || N.getOpcode() == ISD::ZERO_EXTEND || N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); @@ -48075,7 +48073,7 @@ static SDValue PromoteMaskArithmetic(SDValue N, SelectionDAG &DAG, EVT NarrowVT = Narrow.getValueType(); // Generate the wide operation. - SDValue Op = PromoteMaskArithmetic(Narrow, VT, DAG, 0); + SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0); if (!Op) return SDValue(); switch (N.getOpcode()) { @@ -52549,7 +52547,7 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, // Attempt to promote any comparison mask ops before moving the // SIGN_EXTEND_INREG in the way. - if (SDValue Promote = PromoteMaskArithmetic(N0, DAG, Subtarget)) + if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget)) return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1); if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { @@ -52770,7 +52768,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, return V; if (VT.isVector()) { - if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DAG, Subtarget)) + if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget)) return R; if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) @@ -52984,7 +52982,7 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, return V; if (VT.isVector()) - if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DAG, Subtarget)) + if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget)) return R; if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) -- cgit v1.1 From 047f8321f14a53caad7b564f7f654a470fdca8a9 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 9 Feb 2024 11:32:14 +0000 Subject: [X86] ctpop-mask.ll - add 32-bit with SSE2 test coverage 32-bit targets will try to use SSE2 <2 x i64> CTPOP expansion for i64 CTPOP --- llvm/test/CodeGen/X86/ctpop-mask.ll | 306 +++++++++++++++++++++++++++--------- 1 file changed, 229 insertions(+), 77 deletions(-) diff --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll index e0a96a9..6d4fa4a4 100644 --- a/llvm/test/CodeGen/X86/ctpop-mask.ll +++ b/llvm/test/CodeGen/X86/ctpop-mask.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s -check-prefixes=X86-POPCOUNT ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s -check-prefixes=X64-POPCOUNT -; RUN: llc < %s -mtriple=i686-unknown -mattr=-popcnt | FileCheck %s -check-prefixes=X86-NO-POPCOUNT +; RUN: llc < %s -mtriple=i686-unknown -mattr=-popcnt | FileCheck %s -check-prefixes=X86-NO-POPCOUNT,X86-NO-SSE2 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 -mattr=-popcnt | FileCheck %s -check-prefixes=X86-NO-POPCOUNT,X86-SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-popcnt | FileCheck %s -check-prefixes=X64-NO-POPCOUNT declare i8 @llvm.ctpop.i8(i8) nounwind readnone @@ -28,17 +29,42 @@ define i64 @ctpop_mask2(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-POPCOUNT-LABEL: ctpop_mask2: -; X86-NO-POPCOUNT: # %bb.0: -; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-POPCOUNT-NEXT: andl 
$3, %eax -; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-POPCOUNT-NEXT: shrl $3, %eax -; X86-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: shrl $28, %eax -; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx -; X86-NO-POPCOUNT-NEXT: retl +; X86-NO-SSE2-LABEL: ctpop_mask2: +; X86-NO-SSE2: # %bb.0: +; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SSE2-NEXT: andl $3, %eax +; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-SSE2-NEXT: shrl $3, %eax +; X86-NO-SSE2-NEXT: andl $17895697, %eax # imm = 0x1111111 +; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-SSE2-NEXT: shrl $28, %eax +; X86-NO-SSE2-NEXT: xorl %edx, %edx +; X86-NO-SSE2-NEXT: retl +; +; X86-SSE2-LABEL: ctpop_mask2: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: andl $3, %eax +; X86-SSE2-NEXT: movd %eax, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl %edx, %edx +; X86-SSE2-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_mask2: ; X64-NO-POPCOUNT: # %bb.0: @@ -192,17 +218,42 @@ define i64 @ctpop_mask4(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-POPCOUNT-LABEL: ctpop_mask4: -; X86-NO-POPCOUNT: # %bb.0: -; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-POPCOUNT-NEXT: andl $15, %eax -; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-POPCOUNT-NEXT: shrl $3, %eax -; X86-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: shrl $28, %eax -; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx -; X86-NO-POPCOUNT-NEXT: retl +; X86-NO-SSE2-LABEL: ctpop_mask4: +; X86-NO-SSE2: # %bb.0: +; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SSE2-NEXT: andl $15, %eax +; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-SSE2-NEXT: shrl $3, %eax +; X86-NO-SSE2-NEXT: andl $17895697, %eax # imm = 0x1111111 +; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-SSE2-NEXT: shrl $28, %eax +; X86-NO-SSE2-NEXT: xorl %edx, %edx +; X86-NO-SSE2-NEXT: retl +; +; X86-SSE2-LABEL: ctpop_mask4: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: andl $15, %eax +; X86-SSE2-NEXT: movd %eax, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 
+; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl %edx, %edx +; X86-SSE2-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_mask4: ; X64-NO-POPCOUNT: # %bb.0: @@ -274,17 +325,42 @@ define i64 @ctpop_mask5(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-POPCOUNT-LABEL: ctpop_mask5: -; X86-NO-POPCOUNT: # %bb.0: -; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-POPCOUNT-NEXT: andl $31, %eax -; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-POPCOUNT-NEXT: shrl $3, %eax -; X86-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: shrl $28, %eax -; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx -; X86-NO-POPCOUNT-NEXT: retl +; X86-NO-SSE2-LABEL: ctpop_mask5: +; X86-NO-SSE2: # %bb.0: +; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SSE2-NEXT: andl $31, %eax +; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-SSE2-NEXT: shrl $3, %eax +; X86-NO-SSE2-NEXT: andl $286331153, %eax # imm = 0x11111111 +; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-SSE2-NEXT: shrl $28, %eax +; X86-NO-SSE2-NEXT: xorl %edx, %edx +; X86-NO-SSE2-NEXT: retl +; +; X86-SSE2-LABEL: ctpop_mask5: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: andl $31, %eax +; X86-SSE2-NEXT: movd %eax, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl %edx, %edx +; X86-SSE2-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_mask5: ; X64-NO-POPCOUNT: # %bb.0: @@ -395,18 +471,43 @@ define i64 @ctpop_shifted_mask6(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask6: -; X86-NO-POPCOUNT: # %bb.0: -; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-POPCOUNT-NEXT: shrl $9, %eax -; X86-NO-POPCOUNT-NEXT: andl $51, %eax -; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-POPCOUNT-NEXT: shrl $3, %eax -; X86-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: shrl $28, %eax -; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx -; X86-NO-POPCOUNT-NEXT: retl +; X86-NO-SSE2-LABEL: ctpop_shifted_mask6: +; X86-NO-SSE2: # %bb.0: +; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SSE2-NEXT: shrl $9, %eax +; X86-NO-SSE2-NEXT: andl $51, %eax +; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 
0x8040201 +; X86-NO-SSE2-NEXT: shrl $3, %eax +; X86-NO-SSE2-NEXT: andl $286331153, %eax # imm = 0x11111111 +; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-SSE2-NEXT: shrl $28, %eax +; X86-NO-SSE2-NEXT: xorl %edx, %edx +; X86-NO-SSE2-NEXT: retl +; +; X86-SSE2-LABEL: ctpop_shifted_mask6: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl $26112, %eax # imm = 0x6600 +; X86-SSE2-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movd %eax, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl %edx, %edx +; X86-SSE2-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask6: ; X64-NO-POPCOUNT: # %bb.0: @@ -559,16 +660,41 @@ define i64 @ctpop_shifted_mask8(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask8: -; X86-NO-POPCOUNT: # %bb.0: -; X86-NO-POPCOUNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-POPCOUNT-NEXT: shrl $3, %eax -; X86-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: shrl $28, %eax -; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx -; X86-NO-POPCOUNT-NEXT: retl +; X86-NO-SSE2-LABEL: ctpop_shifted_mask8: +; X86-NO-SSE2: # %bb.0: +; X86-NO-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-SSE2-NEXT: shrl $3, %eax +; X86-NO-SSE2-NEXT: andl $286331153, %eax # imm = 0x11111111 +; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-SSE2-NEXT: shrl $28, %eax +; X86-NO-SSE2-NEXT: xorl %edx, %edx +; X86-NO-SSE2-NEXT: retl +; +; X86-SSE2-LABEL: ctpop_shifted_mask8: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: shll $8, %eax +; X86-SSE2-NEXT: movd %eax, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl %edx, %edx +; X86-SSE2-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask8: ; X64-NO-POPCOUNT: # %bb.0: @@ -657,27 +783,53 @@ define i64 @ctpop_shifted_mask16(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntq %rax, %rax ; 
X64-POPCOUNT-NEXT: retq ; -; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask16: -; X86-NO-POPCOUNT: # %bb.0: -; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-POPCOUNT-NEXT: movl %ecx, %eax -; X86-NO-POPCOUNT-NEXT: andl $524280, %eax # imm = 0x7FFF8 -; X86-NO-POPCOUNT-NEXT: shrl %ecx -; X86-NO-POPCOUNT-NEXT: andl $87380, %ecx # imm = 0x15554 -; X86-NO-POPCOUNT-NEXT: subl %ecx, %eax -; X86-NO-POPCOUNT-NEXT: movl %eax, %ecx -; X86-NO-POPCOUNT-NEXT: andl $858993456, %ecx # imm = 0x33333330 -; X86-NO-POPCOUNT-NEXT: shrl $2, %eax -; X86-NO-POPCOUNT-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X86-NO-POPCOUNT-NEXT: addl %ecx, %eax -; X86-NO-POPCOUNT-NEXT: movl %eax, %ecx -; X86-NO-POPCOUNT-NEXT: shrl $4, %ecx -; X86-NO-POPCOUNT-NEXT: addl %eax, %ecx -; X86-NO-POPCOUNT-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; X86-NO-POPCOUNT-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; X86-NO-POPCOUNT-NEXT: shrl $24, %eax -; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx -; X86-NO-POPCOUNT-NEXT: retl +; X86-NO-SSE2-LABEL: ctpop_shifted_mask16: +; X86-NO-SSE2: # %bb.0: +; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SSE2-NEXT: movl %ecx, %eax +; X86-NO-SSE2-NEXT: andl $524280, %eax # imm = 0x7FFF8 +; X86-NO-SSE2-NEXT: shrl %ecx +; X86-NO-SSE2-NEXT: andl $87380, %ecx # imm = 0x15554 +; X86-NO-SSE2-NEXT: subl %ecx, %eax +; X86-NO-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SSE2-NEXT: andl $858993456, %ecx # imm = 0x33333330 +; X86-NO-SSE2-NEXT: shrl $2, %eax +; X86-NO-SSE2-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NO-SSE2-NEXT: addl %ecx, %eax +; X86-NO-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SSE2-NEXT: shrl $4, %ecx +; X86-NO-SSE2-NEXT: addl %eax, %ecx +; X86-NO-SSE2-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NO-SSE2-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 +; X86-NO-SSE2-NEXT: shrl $24, %eax +; X86-NO-SSE2-NEXT: xorl %edx, %edx +; X86-NO-SSE2-NEXT: retl +; +; X86-SSE2-LABEL: ctpop_shifted_mask16: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl $524280, %eax # imm = 0x7FFF8 +; X86-SSE2-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movd %eax, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl %edx, %edx +; X86-SSE2-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask16: ; X64-NO-POPCOUNT: # %bb.0: -- cgit v1.1 From 9ba265636f3310b5b5b39767715e1843a06ea603 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 9 Feb 2024 11:51:40 +0000 Subject: [X86] ReplaceNodeResults - shrink i64 CTPOP to (shifted) CTPOP i32 if 32 or less active bits to avoid SSE2 codegen 32-bit targets perform i64 CTPOP as a v2i64 CTPOP - if we can perform this as a i32 CTPOP by shifting the source bits, then do so to avoid the gpr<->xmm This also triggers on non-SSE2 capable targets, as can be seen with the minor codegen diffs in ctpop_shifted_mask16 --- 
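As a note on the premise of the transform, it can be sketched in plain C++. This is an illustrative model only, not the DAG legalizer code itself; popcnt32 and CtpopShrunk are hypothetical names, and the stated precondition mirrors the KnownBits test in the patch (countMinLeadingZeros + countMinTrailingZeros >= 32):

  #include <cstdint>

  // Kernighan loop standing in for a 32-bit population count.
  static unsigned popcnt32(uint32_t V) {
    unsigned N = 0;
    while (V) {
      V &= V - 1; // clears the lowest set bit
      ++N;
    }
    return N;
  }

  uint64_t CtpopShrunk(uint64_t X, unsigned TZ) {
    // Precondition: leading zeros + trailing zeros of X >= 32, with TZ the
    // known trailing zeros. Every set bit of X >> TZ then lands in bits
    // 0..31, so the truncation below drops only zero bits.
    return popcnt32(static_cast<uint32_t>(X >> TZ));
  }

Under that precondition the i64 count equals the i32 count of the shifted-down window, which is why the patch can emit SRL + TRUNCATE + i32 CTPOP + ZERO_EXTEND without changing the result.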
llvm/lib/Target/X86/X86ISelLowering.cpp | 14 ++ llvm/test/CodeGen/X86/ctpop-mask.ll | 308 ++++++++------------------------ 2 files changed, 93 insertions(+), 229 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 0c2d5f8..18f9871 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -32109,6 +32109,20 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::CTPOP: { assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); + // If we have at most 32 active bits, then perform as i32 CTPOP. + // TODO: Perform this in generic legalizer? + KnownBits Known = DAG.computeKnownBits(N->getOperand(0)); + unsigned LZ = Known.countMinLeadingZeros(); + unsigned TZ = Known.countMinTrailingZeros(); + if ((LZ + TZ) >= 32) { + SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0), + DAG.getShiftAmountConstant(TZ, MVT::i64, dl)); + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op); + Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op); + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op); + Results.push_back(Op); + return; + } // Use a v2i64 if possible. bool NoImplicitFloatOps = DAG.getMachineFunction().getFunction().hasFnAttribute( diff --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll index 6d4fa4a4..97c634a 100644 --- a/llvm/test/CodeGen/X86/ctpop-mask.ll +++ b/llvm/test/CodeGen/X86/ctpop-mask.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s -check-prefixes=X86-POPCOUNT ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s -check-prefixes=X64-POPCOUNT -; RUN: llc < %s -mtriple=i686-unknown -mattr=-popcnt | FileCheck %s -check-prefixes=X86-NO-POPCOUNT,X86-NO-SSE2 -; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 -mattr=-popcnt | FileCheck %s -check-prefixes=X86-NO-POPCOUNT,X86-SSE2 +; RUN: llc < %s -mtriple=i686-unknown -mattr=-popcnt | FileCheck %s -check-prefixes=X86-NO-POPCOUNT +; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 -mattr=-popcnt | FileCheck %s -check-prefixes=X86-NO-POPCOUNT ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-popcnt | FileCheck %s -check-prefixes=X64-NO-POPCOUNT declare i8 @llvm.ctpop.i8(i8) nounwind readnone @@ -29,42 +29,17 @@ define i64 @ctpop_mask2(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-SSE2-LABEL: ctpop_mask2: -; X86-NO-SSE2: # %bb.0: -; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-SSE2-NEXT: andl $3, %eax -; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-SSE2-NEXT: shrl $3, %eax -; X86-NO-SSE2-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-SSE2-NEXT: shrl $28, %eax -; X86-NO-SSE2-NEXT: xorl %edx, %edx -; X86-NO-SSE2-NEXT: retl -; -; X86-SSE2-LABEL: ctpop_mask2: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: andl $3, %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: psubb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; 
X86-SSE2-NEXT: paddb %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: paddb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: retl +; X86-NO-POPCOUNT-LABEL: ctpop_mask2: +; X86-NO-POPCOUNT: # %bb.0: +; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-POPCOUNT-NEXT: andl $3, %eax +; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-POPCOUNT-NEXT: shrl $3, %eax +; X86-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 +; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-POPCOUNT-NEXT: shrl $28, %eax +; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx +; X86-NO-POPCOUNT-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_mask2: ; X64-NO-POPCOUNT: # %bb.0: @@ -218,42 +193,17 @@ define i64 @ctpop_mask4(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-SSE2-LABEL: ctpop_mask4: -; X86-NO-SSE2: # %bb.0: -; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-SSE2-NEXT: andl $15, %eax -; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-SSE2-NEXT: shrl $3, %eax -; X86-NO-SSE2-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-SSE2-NEXT: shrl $28, %eax -; X86-NO-SSE2-NEXT: xorl %edx, %edx -; X86-NO-SSE2-NEXT: retl -; -; X86-SSE2-LABEL: ctpop_mask4: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: andl $15, %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: psubb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: paddb %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: paddb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: retl +; X86-NO-POPCOUNT-LABEL: ctpop_mask4: +; X86-NO-POPCOUNT: # %bb.0: +; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-POPCOUNT-NEXT: andl $15, %eax +; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-POPCOUNT-NEXT: shrl $3, %eax +; X86-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 +; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-POPCOUNT-NEXT: shrl $28, %eax +; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx +; X86-NO-POPCOUNT-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_mask4: ; X64-NO-POPCOUNT: # %bb.0: @@ -325,42 +275,17 @@ define i64 @ctpop_mask5(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-SSE2-LABEL: ctpop_mask5: -; X86-NO-SSE2: # %bb.0: -; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-SSE2-NEXT: andl $31, %eax -; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-SSE2-NEXT: shrl $3, %eax -; X86-NO-SSE2-NEXT: andl $286331153, %eax # imm = 0x11111111 -; 
X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-SSE2-NEXT: shrl $28, %eax -; X86-NO-SSE2-NEXT: xorl %edx, %edx -; X86-NO-SSE2-NEXT: retl -; -; X86-SSE2-LABEL: ctpop_mask5: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: andl $31, %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: psubb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: paddb %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: paddb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: retl +; X86-NO-POPCOUNT-LABEL: ctpop_mask5: +; X86-NO-POPCOUNT: # %bb.0: +; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-POPCOUNT-NEXT: andl $31, %eax +; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-POPCOUNT-NEXT: shrl $3, %eax +; X86-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111 +; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-POPCOUNT-NEXT: shrl $28, %eax +; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx +; X86-NO-POPCOUNT-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_mask5: ; X64-NO-POPCOUNT: # %bb.0: @@ -471,43 +396,18 @@ define i64 @ctpop_shifted_mask6(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-SSE2-LABEL: ctpop_shifted_mask6: -; X86-NO-SSE2: # %bb.0: -; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-SSE2-NEXT: shrl $9, %eax -; X86-NO-SSE2-NEXT: andl $51, %eax -; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-SSE2-NEXT: shrl $3, %eax -; X86-NO-SSE2-NEXT: andl $286331153, %eax # imm = 0x11111111 -; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-SSE2-NEXT: shrl $28, %eax -; X86-NO-SSE2-NEXT: xorl %edx, %edx -; X86-NO-SSE2-NEXT: retl -; -; X86-SSE2-LABEL: ctpop_shifted_mask6: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl $26112, %eax # imm = 0x6600 -; X86-SSE2-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: psubb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: paddb %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: paddb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: retl +; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask6: +; X86-NO-POPCOUNT: # %bb.0: +; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-POPCOUNT-NEXT: shrl $9, %eax +; X86-NO-POPCOUNT-NEXT: andl $51, %eax +; 
X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-POPCOUNT-NEXT: shrl $3, %eax +; X86-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111 +; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-POPCOUNT-NEXT: shrl $28, %eax +; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx +; X86-NO-POPCOUNT-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask6: ; X64-NO-POPCOUNT: # %bb.0: @@ -660,41 +560,16 @@ define i64 @ctpop_shifted_mask8(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-SSE2-LABEL: ctpop_shifted_mask8: -; X86-NO-SSE2: # %bb.0: -; X86-NO-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-SSE2-NEXT: shrl $3, %eax -; X86-NO-SSE2-NEXT: andl $286331153, %eax # imm = 0x11111111 -; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-SSE2-NEXT: shrl $28, %eax -; X86-NO-SSE2-NEXT: xorl %edx, %edx -; X86-NO-SSE2-NEXT: retl -; -; X86-SSE2-LABEL: ctpop_shifted_mask8: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: shll $8, %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: psubb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: paddb %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: paddb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: retl +; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask8: +; X86-NO-POPCOUNT: # %bb.0: +; X86-NO-POPCOUNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-POPCOUNT-NEXT: shrl $3, %eax +; X86-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111 +; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-POPCOUNT-NEXT: shrl $28, %eax +; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx +; X86-NO-POPCOUNT-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask8: ; X64-NO-POPCOUNT: # %bb.0: @@ -783,53 +658,28 @@ define i64 @ctpop_shifted_mask16(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntq %rax, %rax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-SSE2-LABEL: ctpop_shifted_mask16: -; X86-NO-SSE2: # %bb.0: -; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-SSE2-NEXT: movl %ecx, %eax -; X86-NO-SSE2-NEXT: andl $524280, %eax # imm = 0x7FFF8 -; X86-NO-SSE2-NEXT: shrl %ecx -; X86-NO-SSE2-NEXT: andl $87380, %ecx # imm = 0x15554 -; X86-NO-SSE2-NEXT: subl %ecx, %eax -; X86-NO-SSE2-NEXT: movl %eax, %ecx -; X86-NO-SSE2-NEXT: andl $858993456, %ecx # imm = 0x33333330 -; X86-NO-SSE2-NEXT: shrl $2, %eax -; X86-NO-SSE2-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X86-NO-SSE2-NEXT: addl %ecx, %eax -; X86-NO-SSE2-NEXT: movl %eax, %ecx -; X86-NO-SSE2-NEXT: shrl $4, %ecx -; X86-NO-SSE2-NEXT: addl %eax, %ecx -; X86-NO-SSE2-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; X86-NO-SSE2-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; X86-NO-SSE2-NEXT: shrl $24, %eax -; 
X86-NO-SSE2-NEXT: xorl %edx, %edx -; X86-NO-SSE2-NEXT: retl -; -; X86-SSE2-LABEL: ctpop_shifted_mask16: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl $524280, %eax # imm = 0x7FFF8 -; X86-SSE2-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: psubb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: paddb %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: paddb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: retl +; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask16: +; X86-NO-POPCOUNT: # %bb.0: +; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-POPCOUNT-NEXT: movl %ecx, %eax +; X86-NO-POPCOUNT-NEXT: andl $524280, %eax # imm = 0x7FFF8 +; X86-NO-POPCOUNT-NEXT: shrl $4, %ecx +; X86-NO-POPCOUNT-NEXT: andl $21845, %ecx # imm = 0x5555 +; X86-NO-POPCOUNT-NEXT: shrl $3, %eax +; X86-NO-POPCOUNT-NEXT: subl %ecx, %eax +; X86-NO-POPCOUNT-NEXT: movl %eax, %ecx +; X86-NO-POPCOUNT-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NO-POPCOUNT-NEXT: shrl $2, %eax +; X86-NO-POPCOUNT-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NO-POPCOUNT-NEXT: addl %ecx, %eax +; X86-NO-POPCOUNT-NEXT: movl %eax, %ecx +; X86-NO-POPCOUNT-NEXT: shrl $4, %ecx +; X86-NO-POPCOUNT-NEXT: addl %eax, %ecx +; X86-NO-POPCOUNT-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NO-POPCOUNT-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 +; X86-NO-POPCOUNT-NEXT: shrl $24, %eax +; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx +; X86-NO-POPCOUNT-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask16: ; X64-NO-POPCOUNT: # %bb.0: -- cgit v1.1 From eb9cd800b3c8c787f75c00e7d9de3ae6a2e5f876 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Fri, 9 Feb 2024 20:28:34 +0800 Subject: [Clang][TableGen] Add Features to TargetBuiltin RISCV target will use this parameter, so we need a way to specify it. 
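With the field in place, a target builtin can carry its required feature string directly in TableGen. A minimal hypothetical sketch (the def name and feature string here are illustrative; the real RISC-V definitions arrive in the follow-up patch below):

  include "clang/Basic/BuiltinsBase.td"

  def example_builtin : TargetBuiltin {
    let Spellings = ["__builtin_riscv_example"];
    let Prototype = "unsigned int(unsigned int)";
    // Emitted as the FEATURE string of the corresponding TARGET_BUILTIN entry.
    let Features = "zbb";
  }
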
Reviewers: AaronBallman, philnik777 Reviewed By: AaronBallman Pull Request: https://github.com/llvm/llvm-project/pull/80279 --- clang/include/clang/Basic/BuiltinsBase.td | 4 +++- clang/utils/TableGen/ClangBuiltinsEmitter.cpp | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsBase.td b/clang/include/clang/Basic/BuiltinsBase.td index b65b41b..bfccff5 100644 --- a/clang/include/clang/Basic/BuiltinsBase.td +++ b/clang/include/clang/Basic/BuiltinsBase.td @@ -87,7 +87,9 @@ class CustomEntry { } class AtomicBuiltin : Builtin; -class TargetBuiltin : Builtin; +class TargetBuiltin : Builtin { + string Features = ""; +} class LibBuiltin : Builtin { string Header = header; diff --git a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp index dc10fa1..48f55b8 100644 --- a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp +++ b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp @@ -219,7 +219,7 @@ void EmitBuiltinDef(llvm::raw_ostream &OS, StringRef Substitution, break; } case BuiltinType::TargetBuiltin: - OS << ", \"\""; + OS << ", \"" << Builtin->getValueAsString("Features") << "\""; break; case BuiltinType::AtomicBuiltin: case BuiltinType::Builtin: -- cgit v1.1 From a8d4a024e6bea3ae71d6187f0c040b2b25e4bf69 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Fri, 9 Feb 2024 20:27:17 +0800 Subject: [Clang][RISCV] Refactor builtins to TableGen This mechanism is introduced by #68324. This refactor makes the prototype and attributes clear. Reviewers: asb, kito-cheng, philnik777, topperc, preames Reviewed By: topperc Pull Request: https://github.com/llvm/llvm-project/pull/80280 --- clang/include/clang/Basic/BuiltinsRISCV.def | 93 ----------------- clang/include/clang/Basic/BuiltinsRISCV.td | 148 ++++++++++++++++++++++++++++ clang/include/clang/Basic/CMakeLists.txt | 4 + clang/include/clang/Basic/TargetBuiltins.h | 2 +- clang/include/module.modulemap | 1 - clang/lib/Basic/Targets/RISCV.cpp | 2 +- 6 files changed, 154 insertions(+), 96 deletions(-) delete mode 100644 clang/include/clang/Basic/BuiltinsRISCV.def create mode 100644 clang/include/clang/Basic/BuiltinsRISCV.td diff --git a/clang/include/clang/Basic/BuiltinsRISCV.def b/clang/include/clang/Basic/BuiltinsRISCV.def deleted file mode 100644 index 1528b18..0000000 --- a/clang/include/clang/Basic/BuiltinsRISCV.def +++ /dev/null @@ -1,93 +0,0 @@ -//==- BuiltinsRISCV.def - RISC-V Builtin function database -------*- C++ -*-==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the RISC-V-specific builtin function database. Users of -// this file must define the BUILTIN macro to make use of this information. 
-// -//===----------------------------------------------------------------------===// - -#if defined(BUILTIN) && !defined(TARGET_BUILTIN) -# define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS) -#endif - -// Zbb extension -TARGET_BUILTIN(__builtin_riscv_orc_b_32, "UiUi", "nc", "zbb") -TARGET_BUILTIN(__builtin_riscv_orc_b_64, "UWiUWi", "nc", "zbb,64bit") -TARGET_BUILTIN(__builtin_riscv_clz_32, "UiUi", "nc", "zbb|xtheadbb") -TARGET_BUILTIN(__builtin_riscv_clz_64, "UiUWi", "nc", "zbb|xtheadbb,64bit") -TARGET_BUILTIN(__builtin_riscv_ctz_32, "UiUi", "nc", "zbb") -TARGET_BUILTIN(__builtin_riscv_ctz_64, "UiUWi", "nc", "zbb,64bit") - -// Zbc or Zbkc extension -TARGET_BUILTIN(__builtin_riscv_clmul_32, "UiUiUi", "nc", "zbc|zbkc") -TARGET_BUILTIN(__builtin_riscv_clmul_64, "UWiUWiUWi", "nc", "zbc|zbkc,64bit") -TARGET_BUILTIN(__builtin_riscv_clmulh_32, "UiUiUi", "nc", "zbc|zbkc,32bit") -TARGET_BUILTIN(__builtin_riscv_clmulh_64, "UWiUWiUWi", "nc", "zbc|zbkc,64bit") -TARGET_BUILTIN(__builtin_riscv_clmulr_32, "UiUiUi", "nc", "zbc,32bit") -TARGET_BUILTIN(__builtin_riscv_clmulr_64, "UWiUWiUWi", "nc", "zbc,64bit") - -// Zbkx -TARGET_BUILTIN(__builtin_riscv_xperm4_32, "UiUiUi", "nc", "zbkx,32bit") -TARGET_BUILTIN(__builtin_riscv_xperm4_64, "UWiUWiUWi", "nc", "zbkx,64bit") -TARGET_BUILTIN(__builtin_riscv_xperm8_32, "UiUiUi", "nc", "zbkx,32bit") -TARGET_BUILTIN(__builtin_riscv_xperm8_64, "UWiUWiUWi", "nc", "zbkx,64bit") - -// Zbkb extension -TARGET_BUILTIN(__builtin_riscv_brev8_32, "UiUi", "nc", "zbkb") -TARGET_BUILTIN(__builtin_riscv_brev8_64, "UWiUWi", "nc", "zbkb,64bit") -TARGET_BUILTIN(__builtin_riscv_zip_32, "UiUi", "nc", "zbkb,32bit") -TARGET_BUILTIN(__builtin_riscv_unzip_32, "UiUi", "nc", "zbkb,32bit") - -// Zknd extension -TARGET_BUILTIN(__builtin_riscv_aes32dsi, "UiUiUiIUi", "nc", "zknd,32bit") -TARGET_BUILTIN(__builtin_riscv_aes32dsmi, "UiUiUiIUi", "nc", "zknd,32bit") -TARGET_BUILTIN(__builtin_riscv_aes64ds, "UWiUWiUWi", "nc", "zknd,64bit") -TARGET_BUILTIN(__builtin_riscv_aes64dsm, "UWiUWiUWi", "nc", "zknd,64bit") -TARGET_BUILTIN(__builtin_riscv_aes64im, "UWiUWi", "nc", "zknd,64bit") - -// Zknd & Zkne -TARGET_BUILTIN(__builtin_riscv_aes64ks1i, "UWiUWiIUi", "nc", "zknd|zkne,64bit") -TARGET_BUILTIN(__builtin_riscv_aes64ks2, "UWiUWiUWi", "nc", "zknd|zkne,64bit") - -// Zkne extension -TARGET_BUILTIN(__builtin_riscv_aes32esi, "UiUiUiIUi", "nc", "zkne,32bit") -TARGET_BUILTIN(__builtin_riscv_aes32esmi, "UiUiUiIUi", "nc", "zkne,32bit") -TARGET_BUILTIN(__builtin_riscv_aes64es, "UWiUWiUWi", "nc", "zkne,64bit") -TARGET_BUILTIN(__builtin_riscv_aes64esm, "UWiUWiUWi", "nc", "zkne,64bit") - -// Zknh extension -TARGET_BUILTIN(__builtin_riscv_sha256sig0, "UiUi", "nc", "zknh") -TARGET_BUILTIN(__builtin_riscv_sha256sig1, "UiUi", "nc", "zknh") -TARGET_BUILTIN(__builtin_riscv_sha256sum0, "UiUi", "nc", "zknh") -TARGET_BUILTIN(__builtin_riscv_sha256sum1, "UiUi", "nc", "zknh") - -TARGET_BUILTIN(__builtin_riscv_sha512sig0h, "UiUiUi", "nc", "zknh,32bit") -TARGET_BUILTIN(__builtin_riscv_sha512sig0l, "UiUiUi", "nc", "zknh,32bit") -TARGET_BUILTIN(__builtin_riscv_sha512sig1h, "UiUiUi", "nc", "zknh,32bit") -TARGET_BUILTIN(__builtin_riscv_sha512sig1l, "UiUiUi", "nc", "zknh,32bit") -TARGET_BUILTIN(__builtin_riscv_sha512sum0r, "UiUiUi", "nc", "zknh,32bit") -TARGET_BUILTIN(__builtin_riscv_sha512sum1r, "UiUiUi", "nc", "zknh,32bit") -TARGET_BUILTIN(__builtin_riscv_sha512sig0, "UWiUWi", "nc", "zknh,64bit") -TARGET_BUILTIN(__builtin_riscv_sha512sig1, "UWiUWi", "nc", "zknh,64bit") 
-TARGET_BUILTIN(__builtin_riscv_sha512sum0, "UWiUWi", "nc", "zknh,64bit") -TARGET_BUILTIN(__builtin_riscv_sha512sum1, "UWiUWi", "nc", "zknh,64bit") - -// Zksed extension -TARGET_BUILTIN(__builtin_riscv_sm4ed, "UiUiUiIUi", "nc", "zksed") -TARGET_BUILTIN(__builtin_riscv_sm4ks, "UiUiUiIUi", "nc", "zksed") - -// Zksh extension -TARGET_BUILTIN(__builtin_riscv_sm3p0, "UiUi", "nc", "zksh") -TARGET_BUILTIN(__builtin_riscv_sm3p1, "UiUi", "nc", "zksh") - -// Zihintntl extension -TARGET_BUILTIN(__builtin_riscv_ntl_load, "v.", "t", "zihintntl") -TARGET_BUILTIN(__builtin_riscv_ntl_store, "v.", "t", "zihintntl") - -#undef BUILTIN -#undef TARGET_BUILTIN diff --git a/clang/include/clang/Basic/BuiltinsRISCV.td b/clang/include/clang/Basic/BuiltinsRISCV.td new file mode 100644 index 0000000..4cc89a8 --- /dev/null +++ b/clang/include/clang/Basic/BuiltinsRISCV.td @@ -0,0 +1,148 @@ +//==- BuiltinsRISCV.td - RISC-V Builtin function database ---*- tablegen -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the RISC-V-specific builtin function database. +// +//===----------------------------------------------------------------------===// + +include "clang/Basic/BuiltinsBase.td" + +class RISCVBuiltin : TargetBuiltin { + let Spellings = ["__builtin_riscv_" # NAME]; + let Prototype = prototype; + let Features = features; +} + +let Attributes = [NoThrow, Const] in { +//===----------------------------------------------------------------------===// +// Zbb extension. +//===----------------------------------------------------------------------===// +def orc_b_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbb">; +def orc_b_64 : RISCVBuiltin<"uint64_t(uint64_t)", "zbb,64bit">; +def clz_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbb|xtheadbb">; +def clz_64 : RISCVBuiltin<"unsigned int(uint64_t)", "zbb|xtheadbb,64bit">; +def ctz_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbb">; +def ctz_64 : RISCVBuiltin<"unsigned int(uint64_t)", "zbb,64bit">; + +//===----------------------------------------------------------------------===// +// Zbc or Zbkc extension. +//===----------------------------------------------------------------------===// +def clmul_32 : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)", "zbc|zbkc">; +def clmul_64 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)", "zbc|zbkc,64bit">; +def clmulh_32 : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)", "zbc|zbkc,32bit">; +def clmulh_64 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)", "zbc|zbkc,64bit">; +def clmulr_32 : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)", "zbc,32bit">; +def clmulr_64 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)", "zbc,64bit">; + +//===----------------------------------------------------------------------===// +// Zbkx extension. 
+//===----------------------------------------------------------------------===// +let Features = "zbkx,32bit" in { +def xperm4_32 : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">; +def xperm8_32 : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">; +} // Features = "zbkx,32bit" + +let Features = "zbkx,64bit" in { +def xperm4_64 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">; +def xperm8_64 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">; +} // Features = "zbkx,64bit" + +//===----------------------------------------------------------------------===// +// Zbkb extension. +//===----------------------------------------------------------------------===// +def brev8_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbkb">; +def brev8_64 : RISCVBuiltin<"uint64_t(uint64_t)", "zbkb,64bit">; +def zip_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbkb,32bit">; +def unzip_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbkb,32bit">; + +//===----------------------------------------------------------------------===// +// Zknd extension. +//===----------------------------------------------------------------------===// +let Features = "zknd,32bit" in { +def aes32dsi : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int)">; +def aes32dsmi : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int)">; +} // Features = "zknd,32bit" + +let Features = "zknd,64bit" in { +def aes64ds : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">; +def aes64dsm : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">; +def aes64im : RISCVBuiltin<"uint64_t(uint64_t)">; +} // Features = "zknd,64bit" + +//===----------------------------------------------------------------------===// +// Zknd & Zkne extension. +//===----------------------------------------------------------------------===// +let Features = "zknd|zkne,64bit" in { +def aes64ks1i : RISCVBuiltin<"uint64_t(uint64_t, _Constant unsigned int)">; +def aes64ks2 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">; +} // Features = "zknd|zkne,64bit" + +//===----------------------------------------------------------------------===// +// Zkne extension. +//===----------------------------------------------------------------------===// +let Features = "zkne,32bit" in { +def aes32esi : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int)">; +def aes32esmi : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int)">; +} // Features = "zkne,32bit" + +let Features = "zkne,64bit" in { +def aes64es : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">; +def aes64esm : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">; +} // Features = "zkne,64bit" + +//===----------------------------------------------------------------------===// +// Zknh extension. 
+//===----------------------------------------------------------------------===// +let Features = "zknh" in { +def sha256sig0 : RISCVBuiltin<"unsigned int(unsigned int)">; +def sha256sig1 : RISCVBuiltin<"unsigned int(unsigned int)">; +def sha256sum0 : RISCVBuiltin<"unsigned int(unsigned int)">; +def sha256sum1 : RISCVBuiltin<"unsigned int(unsigned int)">; +} // Features = "zknh" + +let Features = "zknh,32bit" in { +def sha512sig0h : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">; +def sha512sig0l : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">; +def sha512sig1h : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">; +def sha512sig1l : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">; +def sha512sum0r : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">; +def sha512sum1r : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">; +} // Features = "zknh,32bit" + +let Features = "zknh,64bit" in { +def sha512sig0 : RISCVBuiltin<"uint64_t(uint64_t)">; +def sha512sig1 : RISCVBuiltin<"uint64_t(uint64_t)">; +def sha512sum0 : RISCVBuiltin<"uint64_t(uint64_t)">; +def sha512sum1 : RISCVBuiltin<"uint64_t(uint64_t)">; +} // Features = "zknh,64bit" + +//===----------------------------------------------------------------------===// +// Zksed extension. +//===----------------------------------------------------------------------===// +let Features = "zksed" in { +def sm4ed : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int )">; +def sm4ks : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int)">; +} // Features = "zksed" + +//===----------------------------------------------------------------------===// +// Zksh extension. +//===----------------------------------------------------------------------===// +let Features = "zksh" in { +def sm3p0 : RISCVBuiltin<"unsigned int(unsigned int)">; +def sm3p1 : RISCVBuiltin<"unsigned int(unsigned int)">; +} // Features = "zksh" + +} // Attributes = [Const, NoThrow] + +//===----------------------------------------------------------------------===// +// Zihintntl extension. 
+//===----------------------------------------------------------------------===// +let Features = "zihintntl", Attributes = [CustomTypeChecking] in { +def ntl_load : RISCVBuiltin<"void(...)">; +def ntl_store : RISCVBuiltin<"void(...)">; +} // Features = "zihintntl", Attributes = [CustomTypeChecking] diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt index 9689a0f..7785fb4 100644 --- a/clang/include/clang/Basic/CMakeLists.txt +++ b/clang/include/clang/Basic/CMakeLists.txt @@ -65,6 +65,10 @@ clang_tablegen(BuiltinsBPF.inc -gen-clang-builtins SOURCE BuiltinsBPF.td TARGET ClangBuiltinsBPF) +clang_tablegen(BuiltinsRISCV.inc -gen-clang-builtins + SOURCE BuiltinsRISCV.td + TARGET ClangBuiltinsRISCV) + # ARM NEON and MVE clang_tablegen(arm_neon.inc -gen-arm-neon-sema SOURCE arm_neon.td diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h index a4abaae..4333830 100644 --- a/clang/include/clang/Basic/TargetBuiltins.h +++ b/clang/include/clang/Basic/TargetBuiltins.h @@ -159,7 +159,7 @@ namespace clang { FirstRVVBuiltin = clang::Builtin::FirstTSBuiltin, LastRVVBuiltin = RISCVVector::FirstTSBuiltin - 1, #define BUILTIN(ID, TYPE, ATTRS) BI##ID, -#include "clang/Basic/BuiltinsRISCV.def" +#include "clang/Basic/BuiltinsRISCV.inc" LastTSBuiltin }; } // namespace RISCV diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap index 9285595..acd960c 100644 --- a/clang/include/module.modulemap +++ b/clang/include/module.modulemap @@ -54,7 +54,6 @@ module Clang_Basic { textual header "clang/Basic/BuiltinsNEON.def" textual header "clang/Basic/BuiltinsNVPTX.def" textual header "clang/Basic/BuiltinsPPC.def" - textual header "clang/Basic/BuiltinsRISCV.def" textual header "clang/Basic/BuiltinsRISCVVector.def" textual header "clang/Basic/BuiltinsSME.def" textual header "clang/Basic/BuiltinsSVE.def" diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp index 837a6e7..a6d4af2 100644 --- a/clang/lib/Basic/Targets/RISCV.cpp +++ b/clang/lib/Basic/Targets/RISCV.cpp @@ -234,7 +234,7 @@ static constexpr Builtin::Info BuiltinInfo[] = { {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, ALL_LANGUAGES}, #define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \ {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::NO_HEADER, ALL_LANGUAGES}, -#include "clang/Basic/BuiltinsRISCV.def" +#include "clang/Basic/BuiltinsRISCV.inc" }; ArrayRef RISCVTargetInfo::getTargetBuiltins() const { -- cgit v1.1 From 1f780bfac919dc34deface0f8e276d3573240291 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 9 Feb 2024 08:19:31 -0500 Subject: [gn] port a8d4a024e6bea3a (BuiltinsRISCV.td) --- llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn | 4 ++++ llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn | 1 + 2 files changed, 5 insertions(+) diff --git a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn index 4babd37..d484ff9 100644 --- a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn @@ -100,6 +100,10 @@ clang_tablegen("BuiltinsBPF") { args = [ "-gen-clang-builtins" ] } +clang_tablegen("BuiltinsRISCV") { + args = [ "-gen-clang-builtins" ] +} + # ARM CDE, MVE, and NEON. 
clang_tablegen("arm_neon") { diff --git a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn index 1486d16..bbe9373 100644 --- a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn @@ -27,6 +27,7 @@ static_library("Basic") { "//clang/include/clang/Basic:AttrSubMatchRulesList", "//clang/include/clang/Basic:Builtins", "//clang/include/clang/Basic:BuiltinsBPF", + "//clang/include/clang/Basic:BuiltinsRISCV", "//clang/include/clang/Basic:DiagnosticGroups", "//clang/include/clang/Basic:RegularKeywordAttrInfo", "//clang/include/clang/Basic:arm_cde_builtins", -- cgit v1.1 From 413e82a0875222e19993b1038ea803814e5ee48c Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 9 Feb 2024 13:33:09 +0000 Subject: [mlir][ArmSVE] Add intrinsics for the SME2 multi-vector zips (#80985) These are added to the ArmSVE dialect for consistency with LLVM, which registers SME2 intrinsics that don't require ZA under SVE. --- mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td | 25 ++++++++++++++-- mlir/test/Target/LLVMIR/arm-sve.mlir | 42 +++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td b/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td index e3f3d9e..f237f23 100644 --- a/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td +++ b/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td @@ -59,14 +59,15 @@ class ArmSVE_Op traits = []> : class ArmSVE_IntrOp traits = [], list overloadedOperands = [], - list overloadedResults = []> : + list overloadedResults = [], + int numResults = 1> : LLVM_IntrOpBase overloadedResults=*/overloadedResults, /*list overloadedOperands=*/overloadedOperands, /*list traits=*/traits, - /*int numResults=*/1>; + /*int numResults=*/numResults>; class ArmSVE_IntrBinaryOverloadedOp traits = []>: @@ -410,4 +411,24 @@ def ConvertToSvboolIntrOp : /*overloadedResults=*/[]>, Arguments<(ins SVEPredicate:$mask)>; +// Note: This multi-vector intrinsic requires SME2. +def ZipX2IntrOp : ArmSVE_IntrOp<"zip.x2", + /*traits=*/[], + /*overloadedOperands=*/[0], + /*overloadedResults=*/[], + /*numResults=*/2>, + Arguments<(ins Arg:$v1, + Arg:$v2)>; + +// Note: This multi-vector intrinsic requires SME2. 
+def ZipX4IntrOp : ArmSVE_IntrOp<"zip.x4", + /*traits=*/[], + /*overloadedOperands=*/[0], + /*overloadedResults=*/[], + /*numResults=*/4>, + Arguments<(ins Arg:$v1, + Arg:$v2, + Arg:$v3, + Arg:$v4)>; + #endif // ARMSVE_OPS diff --git a/mlir/test/Target/LLVMIR/arm-sve.mlir b/mlir/test/Target/LLVMIR/arm-sve.mlir index b63d3f0..c7cd1b7 100644 --- a/mlir/test/Target/LLVMIR/arm-sve.mlir +++ b/mlir/test/Target/LLVMIR/arm-sve.mlir @@ -314,3 +314,45 @@ llvm.func @arm_sve_convert_to_svbool( : (vector<[1]xi1>) -> vector<[16]xi1> llvm.return } + +// CHECK-LABEL: arm_sve_zip_x2( +// CHECK-SAME: %[[V1:[0-9]+]], +// CHECK-SAME: %[[V2:[0-9]+]], +// CHECK-SAME: %[[V3:[0-9]+]], +// CHECK-SAME: %[[V4:[0-9]+]]) +llvm.func @arm_sve_zip_x2(%nxv16i8: vector<[16]xi8>, %nxv8i16: vector<[8]xi16>, %nxv4i32: vector<[4]xi32>, %nxv2i64: vector<[2]xi64>) { + // CHECK: call { , } @llvm.aarch64.sve.zip.x2.nxv16i8( %[[V1]], %[[V1]]) + %0 = "arm_sve.intr.zip.x2"(%nxv16i8, %nxv16i8) : (vector<[16]xi8>, vector<[16]xi8>) + -> !llvm.struct<(vector<[16]xi8>, vector<[16]xi8>)> + // CHECK: call { , } @llvm.aarch64.sve.zip.x2.nxv8i16( %[[V2]], %[[V2]]) + %1 = "arm_sve.intr.zip.x2"(%nxv8i16, %nxv8i16) : (vector<[8]xi16>, vector<[8]xi16>) + -> !llvm.struct<(vector<[8]xi16>, vector<[8]xi16>)> + // CHECK: call { , } @llvm.aarch64.sve.zip.x2.nxv4i32( %[[V3]], %[[V3]]) + %2 = "arm_sve.intr.zip.x2"(%nxv4i32, %nxv4i32) : (vector<[4]xi32>, vector<[4]xi32>) + -> !llvm.struct<(vector<[4]xi32>, vector<[4]xi32>)> + // CHECK: call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( %[[V4]], %[[V4]]) + %3 = "arm_sve.intr.zip.x2"(%nxv2i64, %nxv2i64) : (vector<[2]xi64>, vector<[2]xi64>) + -> !llvm.struct<(vector<[2]xi64>, vector<[2]xi64>)> + llvm.return +} + +// CHECK-LABEL: arm_sve_zip_x4( +// CHECK-SAME: %[[V1:[0-9]+]], +// CHECK-SAME: %[[V2:[0-9]+]], +// CHECK-SAME: %[[V3:[0-9]+]], +// CHECK-SAME: %[[V4:[0-9]+]]) +llvm.func @arm_sve_zip_x4(%nxv16i8: vector<[16]xi8>, %nxv8i16: vector<[8]xi16>, %nxv4i32: vector<[4]xi32>, %nxv2i64: vector<[2]xi64>) { + // CHECK: call { , , , } @llvm.aarch64.sve.zip.x4.nxv16i8( %[[V1]], %[[V1]], %[[V1]], %[[V1]]) + %0 = "arm_sve.intr.zip.x4"(%nxv16i8, %nxv16i8, %nxv16i8, %nxv16i8) : (vector<[16]xi8>, vector<[16]xi8>, vector<[16]xi8>, vector<[16]xi8>) + -> !llvm.struct<(vector<[16]xi8>, vector<[16]xi8>, vector<[16]xi8>, vector<[16]xi8>)> + // CHECK: call { , , , } @llvm.aarch64.sve.zip.x4.nxv8i16( %[[V2]], %[[V2]], %[[V2]], %[[V2]]) + %1 = "arm_sve.intr.zip.x4"(%nxv8i16, %nxv8i16, %nxv8i16, %nxv8i16) : (vector<[8]xi16>, vector<[8]xi16>, vector<[8]xi16>, vector<[8]xi16>) + -> !llvm.struct<(vector<[8]xi16>, vector<[8]xi16>, vector<[8]xi16>, vector<[8]xi16>)> + // CHECK: call { , , , } @llvm.aarch64.sve.zip.x4.nxv4i32( %[[V3]], %[[V3]], %[[V3]], %[[V3]]) + %2 = "arm_sve.intr.zip.x4"(%nxv4i32, %nxv4i32, %nxv4i32, %nxv4i32) : (vector<[4]xi32>, vector<[4]xi32>, vector<[4]xi32>, vector<[4]xi32>) + -> !llvm.struct<(vector<[4]xi32>, vector<[4]xi32>, vector<[4]xi32>, vector<[4]xi32>)> + // CHECK: call { , , , } @llvm.aarch64.sve.zip.x4.nxv2i64( %[[V4]], %[[V4]], %[[V4]], %[[V4]]) + %3 = "arm_sve.intr.zip.x4"(%nxv2i64, %nxv2i64, %nxv2i64, %nxv2i64) : (vector<[2]xi64>, vector<[2]xi64>, vector<[2]xi64>, vector<[2]xi64>) + -> !llvm.struct<(vector<[2]xi64>, vector<[2]xi64>, vector<[2]xi64>, vector<[2]xi64>)> + llvm.return +} -- cgit v1.1 From a9e546cc71e72f9febda174ed1ada70c584628c2 Mon Sep 17 00:00:00 2001 From: Tomas Matheson <76168689+tmatheson-arm@users.noreply.github.com> Date: Fri, 9 Feb 2024 13:35:42 +0000 Subject: [TableGen][NFC] 
convert TreePatternNode pointers to references (#81134) Almost all uses of `*TreePatternNode` expect it to be non-null. There was the occasional check that it wasn't, which I have removed. Making them references makes it clear that they exist. This was attempted in 2018 (1b465767d6ca69f4b7201503f5f21e6125fe049a) for `TreePatternNode::getChild()` but that was reverted. --- llvm/utils/TableGen/CodeGenDAGPatterns.cpp | 364 +++++++++++----------- llvm/utils/TableGen/CodeGenDAGPatterns.h | 20 +- llvm/utils/TableGen/DAGISelEmitter.cpp | 38 +-- llvm/utils/TableGen/DAGISelMatcher.cpp | 4 +- llvm/utils/TableGen/DAGISelMatcherEmitter.cpp | 12 +- llvm/utils/TableGen/DAGISelMatcherGen.cpp | 281 +++++++++-------- llvm/utils/TableGen/DAGISelMatcherOpt.cpp | 4 +- llvm/utils/TableGen/FastISelEmitter.cpp | 98 +++--- llvm/utils/TableGen/GlobalISelEmitter.cpp | 422 +++++++++++++------------- 9 files changed, 615 insertions(+), 628 deletions(-) diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp index 62e0482..a9046e0 100644 --- a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp @@ -1406,10 +1406,10 @@ std::string TreePredicateFn::getCodeToRunOnSDNode() const { // PatternToMatch implementation // -static bool isImmAllOnesAllZerosMatch(const TreePatternNode *P) { - if (!P->isLeaf()) +static bool isImmAllOnesAllZerosMatch(const TreePatternNode &P) { + if (!P.isLeaf()) return false; - DefInit *DI = dyn_cast(P->getLeafValue()); + DefInit *DI = dyn_cast(P.getLeafValue()); if (!DI) return false; @@ -1420,15 +1420,15 @@ static bool isImmAllOnesAllZerosMatch(const TreePatternNode *P) { /// getPatternSize - Return the 'size' of this pattern. We want to match large /// patterns before small ones. This is used to determine the size of a /// pattern. -static unsigned getPatternSize(const TreePatternNode *P, +static unsigned getPatternSize(const TreePatternNode &P, const CodeGenDAGPatterns &CGP) { unsigned Size = 3; // The node itself. // If the root node is a ConstantSDNode, increases its size. // e.g. (set R32:$dst, 0). - if (P->isLeaf() && isa(P->getLeafValue())) + if (P.isLeaf() && isa(P.getLeafValue())) Size += 2; - if (const ComplexPattern *AM = P->getComplexPatternInfo(CGP)) { + if (const ComplexPattern *AM = P.getComplexPatternInfo(CGP)) { Size += AM->getComplexity(); // We don't want to count any children twice, so return early. return Size; @@ -1436,14 +1436,14 @@ static unsigned getPatternSize(const TreePatternNode *P, // If this node has some predicate function that must match, it adds to the // complexity of this node. - if (!P->getPredicateCalls().empty()) + if (!P.getPredicateCalls().empty()) ++Size; // Count children in the count if they are also nodes. - for (unsigned i = 0, e = P->getNumChildren(); i != e; ++i) { - const TreePatternNode *Child = P->getChild(i); - if (!Child->isLeaf() && Child->getNumTypes()) { - const TypeSetByHwMode &T0 = Child->getExtType(0); + for (unsigned i = 0, e = P.getNumChildren(); i != e; ++i) { + const TreePatternNode &Child = P.getChild(i); + if (!Child.isLeaf() && Child.getNumTypes()) { + const TypeSetByHwMode &T0 = Child.getExtType(0); // At this point, all variable type sets should be simple, i.e. only // have a default mode. 
if (T0.getMachineValueType() != MVT::Other) { @@ -1451,14 +1451,14 @@ static unsigned getPatternSize(const TreePatternNode *P, continue; } } - if (Child->isLeaf()) { - if (isa(Child->getLeafValue())) + if (Child.isLeaf()) { + if (isa(Child.getLeafValue())) Size += 5; // Matches a ConstantSDNode (+3) and a specific value (+2). - else if (Child->getComplexPatternInfo(CGP)) + else if (Child.getComplexPatternInfo(CGP)) Size += getPatternSize(Child, CGP); else if (isImmAllOnesAllZerosMatch(Child)) Size += 4; // Matches a build_vector(+3) and a predicate (+1). - else if (!Child->getPredicateCalls().empty()) + else if (!Child.getPredicateCalls().empty()) ++Size; } } @@ -1582,7 +1582,7 @@ SDTypeConstraint::SDTypeConstraint(Record *R, const CodeGenHwModes &CGH) { /// getOperandNum - Return the node corresponding to operand #OpNo in tree /// N, and the result number in ResNo. -static TreePatternNode *getOperandNum(unsigned OpNo, TreePatternNode *N, +static TreePatternNode &getOperandNum(unsigned OpNo, TreePatternNode &N, const SDNodeInfo &NodeInfo, unsigned &ResNo) { unsigned NumResults = NodeInfo.getNumResults(); @@ -1593,120 +1593,120 @@ static TreePatternNode *getOperandNum(unsigned OpNo, TreePatternNode *N, OpNo -= NumResults; - if (OpNo >= N->getNumChildren()) { + if (OpNo >= N.getNumChildren()) { std::string S; raw_string_ostream OS(S); OS << "Invalid operand number in type constraint " << (OpNo + NumResults) << " "; - N->print(OS); + N.print(OS); PrintFatalError(S); } - return N->getChild(OpNo); + return N.getChild(OpNo); } /// ApplyTypeConstraint - Given a node in a pattern, apply this type /// constraint to the nodes operands. This returns true if it makes a /// change, false otherwise. If a type contradiction is found, flag an error. -bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, +bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode &N, const SDNodeInfo &NodeInfo, TreePattern &TP) const { if (TP.hasError()) return false; unsigned ResNo = 0; // The result number being referenced. - TreePatternNode *NodeToApply = getOperandNum(OperandNo, N, NodeInfo, ResNo); + TreePatternNode &NodeToApply = getOperandNum(OperandNo, N, NodeInfo, ResNo); TypeInfer &TI = TP.getInfer(); switch (ConstraintType) { case SDTCisVT: // Operand must be a particular type. - return NodeToApply->UpdateNodeType(ResNo, VVT, TP); + return NodeToApply.UpdateNodeType(ResNo, VVT, TP); case SDTCisPtrTy: // Operand must be same as target pointer type. - return NodeToApply->UpdateNodeType(ResNo, MVT::iPTR, TP); + return NodeToApply.UpdateNodeType(ResNo, MVT::iPTR, TP); case SDTCisInt: // Require it to be one of the legal integer VTs. - return TI.EnforceInteger(NodeToApply->getExtType(ResNo)); + return TI.EnforceInteger(NodeToApply.getExtType(ResNo)); case SDTCisFP: // Require it to be one of the legal fp VTs. - return TI.EnforceFloatingPoint(NodeToApply->getExtType(ResNo)); + return TI.EnforceFloatingPoint(NodeToApply.getExtType(ResNo)); case SDTCisVec: // Require it to be one of the legal vector VTs. 
- return TI.EnforceVector(NodeToApply->getExtType(ResNo)); + return TI.EnforceVector(NodeToApply.getExtType(ResNo)); case SDTCisSameAs: { unsigned OResNo = 0; - TreePatternNode *OtherNode = + TreePatternNode &OtherNode = getOperandNum(x.SDTCisSameAs_Info.OtherOperandNum, N, NodeInfo, OResNo); - return (int)NodeToApply->UpdateNodeType(ResNo, - OtherNode->getExtType(OResNo), TP) | - (int)OtherNode->UpdateNodeType(OResNo, - NodeToApply->getExtType(ResNo), TP); + return (int)NodeToApply.UpdateNodeType(ResNo, OtherNode.getExtType(OResNo), + TP) | + (int)OtherNode.UpdateNodeType(OResNo, NodeToApply.getExtType(ResNo), + TP); } case SDTCisVTSmallerThanOp: { // The NodeToApply must be a leaf node that is a VT. OtherOperandNum must // have an integer type that is smaller than the VT. - if (!NodeToApply->isLeaf() || !isa(NodeToApply->getLeafValue()) || - !cast(NodeToApply->getLeafValue()) + if (!NodeToApply.isLeaf() || !isa(NodeToApply.getLeafValue()) || + !cast(NodeToApply.getLeafValue()) ->getDef() ->isSubClassOf("ValueType")) { - TP.error(N->getOperator()->getName() + " expects a VT operand!"); + TP.error(N.getOperator()->getName() + " expects a VT operand!"); return false; } - DefInit *DI = cast(NodeToApply->getLeafValue()); + DefInit *DI = cast(NodeToApply.getLeafValue()); const CodeGenTarget &T = TP.getDAGPatterns().getTargetInfo(); auto VVT = getValueTypeByHwMode(DI->getDef(), T.getHwModes()); TypeSetByHwMode TypeListTmp(VVT); unsigned OResNo = 0; - TreePatternNode *OtherNode = getOperandNum( + TreePatternNode &OtherNode = getOperandNum( x.SDTCisVTSmallerThanOp_Info.OtherOperandNum, N, NodeInfo, OResNo); - return TI.EnforceSmallerThan(TypeListTmp, OtherNode->getExtType(OResNo), + return TI.EnforceSmallerThan(TypeListTmp, OtherNode.getExtType(OResNo), /*SmallIsVT*/ true); } case SDTCisOpSmallerThanOp: { unsigned BResNo = 0; - TreePatternNode *BigOperand = getOperandNum( + TreePatternNode &BigOperand = getOperandNum( x.SDTCisOpSmallerThanOp_Info.BigOperandNum, N, NodeInfo, BResNo); - return TI.EnforceSmallerThan(NodeToApply->getExtType(ResNo), - BigOperand->getExtType(BResNo)); + return TI.EnforceSmallerThan(NodeToApply.getExtType(ResNo), + BigOperand.getExtType(BResNo)); } case SDTCisEltOfVec: { unsigned VResNo = 0; - TreePatternNode *VecOperand = getOperandNum( + TreePatternNode &VecOperand = getOperandNum( x.SDTCisEltOfVec_Info.OtherOperandNum, N, NodeInfo, VResNo); // Filter vector types out of VecOperand that don't have the right element // type. - return TI.EnforceVectorEltTypeIs(VecOperand->getExtType(VResNo), - NodeToApply->getExtType(ResNo)); + return TI.EnforceVectorEltTypeIs(VecOperand.getExtType(VResNo), + NodeToApply.getExtType(ResNo)); } case SDTCisSubVecOfVec: { unsigned VResNo = 0; - TreePatternNode *BigVecOperand = getOperandNum( + TreePatternNode &BigVecOperand = getOperandNum( x.SDTCisSubVecOfVec_Info.OtherOperandNum, N, NodeInfo, VResNo); // Filter vector types out of BigVecOperand that don't have the // right subvector type. 
- return TI.EnforceVectorSubVectorTypeIs(BigVecOperand->getExtType(VResNo), - NodeToApply->getExtType(ResNo)); + return TI.EnforceVectorSubVectorTypeIs(BigVecOperand.getExtType(VResNo), + NodeToApply.getExtType(ResNo)); } case SDTCVecEltisVT: { - return TI.EnforceVectorEltTypeIs(NodeToApply->getExtType(ResNo), VVT); + return TI.EnforceVectorEltTypeIs(NodeToApply.getExtType(ResNo), VVT); } case SDTCisSameNumEltsAs: { unsigned OResNo = 0; - TreePatternNode *OtherNode = getOperandNum( + TreePatternNode &OtherNode = getOperandNum( x.SDTCisSameNumEltsAs_Info.OtherOperandNum, N, NodeInfo, OResNo); - return TI.EnforceSameNumElts(OtherNode->getExtType(OResNo), - NodeToApply->getExtType(ResNo)); + return TI.EnforceSameNumElts(OtherNode.getExtType(OResNo), + NodeToApply.getExtType(ResNo)); } case SDTCisSameSizeAs: { unsigned OResNo = 0; - TreePatternNode *OtherNode = getOperandNum( + TreePatternNode &OtherNode = getOperandNum( x.SDTCisSameSizeAs_Info.OtherOperandNum, N, NodeInfo, OResNo); - return TI.EnforceSameSize(OtherNode->getExtType(OResNo), - NodeToApply->getExtType(ResNo)); + return TI.EnforceSameSize(OtherNode.getExtType(OResNo), + NodeToApply.getExtType(ResNo)); } } llvm_unreachable("Invalid ConstraintType!"); @@ -1751,7 +1751,7 @@ bool TreePatternNode::ContainsUnresolvedType(TreePattern &TP) const { if (!TP.getInfer().isConcrete(Types[i], true)) return true; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) - if (getChild(i)->ContainsUnresolvedType(TP)) + if (getChild(i).ContainsUnresolvedType(TP)) return true; return false; } @@ -1929,7 +1929,7 @@ void TreePatternNode::print(raw_ostream &OS) const { ListSeparator LS; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) { OS << LS; - getChild(i)->print(OS); + getChild(i).print(OS); } } OS << ")"; @@ -1958,37 +1958,37 @@ void TreePatternNode::dump() const { print(errs()); } /// the assigned name is present in the dependent variable set, then /// the assigned name is considered significant and the node is /// isomorphic if the names match. -bool TreePatternNode::isIsomorphicTo(const TreePatternNode *N, +bool TreePatternNode::isIsomorphicTo(const TreePatternNode &N, const MultipleUseVarSet &DepVars) const { - if (N == this) + if (&N == this) return true; - if (N->isLeaf() != isLeaf()) + if (N.isLeaf() != isLeaf()) return false; // Check operator of non-leaves early since it can be cheaper than checking // types. 
if (!isLeaf()) - if (N->getOperator() != getOperator() || - N->getNumChildren() != getNumChildren()) + if (N.getOperator() != getOperator() || + N.getNumChildren() != getNumChildren()) return false; - if (getExtTypes() != N->getExtTypes() || - getPredicateCalls() != N->getPredicateCalls() || - getTransformFn() != N->getTransformFn()) + if (getExtTypes() != N.getExtTypes() || + getPredicateCalls() != N.getPredicateCalls() || + getTransformFn() != N.getTransformFn()) return false; if (isLeaf()) { if (DefInit *DI = dyn_cast(getLeafValue())) { - if (DefInit *NDI = dyn_cast(N->getLeafValue())) { + if (DefInit *NDI = dyn_cast(N.getLeafValue())) { return ((DI->getDef() == NDI->getDef()) && - (!DepVars.contains(getName()) || getName() == N->getName())); + (!DepVars.contains(getName()) || getName() == N.getName())); } } - return getLeafValue() == N->getLeafValue(); + return getLeafValue() == N.getLeafValue(); } for (unsigned i = 0, e = getNumChildren(); i != e; ++i) - if (!getChild(i)->isIsomorphicTo(N->getChild(i), DepVars)) + if (!getChild(i).isIsomorphicTo(N.getChild(i), DepVars)) return false; return true; } @@ -2003,7 +2003,7 @@ TreePatternNodePtr TreePatternNode::clone() const { std::vector CChildren; CChildren.reserve(Children.size()); for (unsigned i = 0, e = getNumChildren(); i != e; ++i) - CChildren.push_back(getChild(i)->clone()); + CChildren.push_back(getChild(i).clone()); New = makeIntrusiveRefCnt( getOperator(), std::move(CChildren), getNumTypes()); } @@ -2023,7 +2023,7 @@ void TreePatternNode::RemoveAllTypes() { if (isLeaf()) return; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) - getChild(i)->RemoveAllTypes(); + getChild(i).RemoveAllTypes(); } /// SubstituteFormalArguments - Replace the formal arguments in this tree @@ -2034,24 +2034,24 @@ void TreePatternNode::SubstituteFormalArguments( return; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) { - TreePatternNode *Child = getChild(i); - if (Child->isLeaf()) { - Init *Val = Child->getLeafValue(); + TreePatternNode &Child = getChild(i); + if (Child.isLeaf()) { + Init *Val = Child.getLeafValue(); // Note that, when substituting into an output pattern, Val might be an // UnsetInit. if (isa(Val) || (isa(Val) && cast(Val)->getDef()->getName() == "node")) { // We found a use of a formal argument, replace it with its value. 
- TreePatternNodePtr NewChild = ArgMap[Child->getName()]; + TreePatternNodePtr NewChild = ArgMap[Child.getName()]; assert(NewChild && "Couldn't find formal argument!"); - assert((Child->getPredicateCalls().empty() || - NewChild->getPredicateCalls() == Child->getPredicateCalls()) && + assert((Child.getPredicateCalls().empty() || + NewChild->getPredicateCalls() == Child.getPredicateCalls()) && "Non-empty child predicate clobbered!"); setChild(i, std::move(NewChild)); } } else { - getChild(i)->SubstituteFormalArguments(ArgMap); + getChild(i).SubstituteFormalArguments(ArgMap); } } } @@ -2325,7 +2325,7 @@ TreePatternNode::getIntrinsicInfo(const CodeGenDAGPatterns &CDP) const { getOperator() != CDP.get_intrinsic_wo_chain_sdnode()) return nullptr; - unsigned IID = cast(getChild(0)->getLeafValue())->getValue(); + unsigned IID = cast(getChild(0).getLeafValue())->getValue(); return &CDP.getIntrinsicInfo(IID); } @@ -2397,7 +2397,7 @@ bool TreePatternNode::TreeHasProperty(SDNP Property, if (NodeHasProperty(Property, CGP)) return true; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) - if (getChild(i)->TreeHasProperty(Property, CGP)) + if (getChild(i).TreeHasProperty(Property, CGP)) return true; return false; } @@ -2411,11 +2411,11 @@ bool TreePatternNode::isCommutativeIntrinsic( return false; } -static bool isOperandClass(const TreePatternNode *N, StringRef Class) { - if (!N->isLeaf()) - return N->getOperator()->isSubClassOf(Class); +static bool isOperandClass(const TreePatternNode &N, StringRef Class) { + if (!N.isLeaf()) + return N.getOperator()->isSubClassOf(Class); - DefInit *DI = dyn_cast(N->getLeafValue()); + DefInit *DI = dyn_cast(N.getLeafValue()); if (DI && DI->getDef()->isSubClassOf(Class)) return true; @@ -2506,15 +2506,15 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { } // Apply type info to the intrinsic ID. - MadeChange |= getChild(0)->UpdateNodeType(0, MVT::iPTR, TP); + MadeChange |= getChild(0).UpdateNodeType(0, MVT::iPTR, TP); for (unsigned i = 0, e = getNumChildren() - 1; i != e; ++i) { - MadeChange |= getChild(i + 1)->ApplyTypeConstraints(TP, NotRegisters); + MadeChange |= getChild(i + 1).ApplyTypeConstraints(TP, NotRegisters); MVT::SimpleValueType OpVT = getValueType(Int->IS.ParamTys[i]->getValueAsDef("VT")); - assert(getChild(i + 1)->getNumTypes() == 1 && "Unhandled case"); - MadeChange |= getChild(i + 1)->UpdateNodeType(0, OpVT, TP); + assert(getChild(i + 1).getNumTypes() == 1 && "Unhandled case"); + MadeChange |= getChild(i + 1).UpdateNodeType(0, OpVT, TP); } return MadeChange; } @@ -2532,8 +2532,8 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { bool MadeChange = false; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) - MadeChange |= getChild(i)->ApplyTypeConstraints(TP, NotRegisters); - MadeChange |= NI.ApplyTypeConstraints(this, TP); + MadeChange |= getChild(i).ApplyTypeConstraints(TP, NotRegisters); + MadeChange |= NI.ApplyTypeConstraints(*this, TP); return MadeChange; } @@ -2568,9 +2568,9 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { // If this is an INSERT_SUBREG, constrain the source and destination VTs to // be the same. 
if (getOperator()->getName() == "INSERT_SUBREG") { - assert(getChild(0)->getNumTypes() == 1 && "FIXME: Unhandled"); - MadeChange |= UpdateNodeType(0, getChild(0)->getExtType(0), TP); - MadeChange |= getChild(0)->UpdateNodeType(0, getExtType(0), TP); + assert(getChild(0).getNumTypes() == 1 && "FIXME: Unhandled"); + MadeChange |= UpdateNodeType(0, getChild(0).getExtType(0), TP); + MadeChange |= getChild(0).UpdateNodeType(0, getExtType(0), TP); } else if (getOperator()->getName() == "REG_SEQUENCE") { // We need to do extra, custom typechecking for REG_SEQUENCE since it is // variadic. @@ -2592,7 +2592,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { } for (unsigned I = 1; I < NChild; I += 2) { - TreePatternNode *SubIdxChild = getChild(I + 1); + TreePatternNode &SubIdxChild = getChild(I + 1); if (!isOperandClass(SubIdxChild, "SubRegIndex")) { TP.error("REG_SEQUENCE requires a SubRegIndex for operand " + Twine(I + 1) + "!"); @@ -2637,7 +2637,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { return false; } - TreePatternNode *Child = getChild(ChildNo++); + TreePatternNode *Child = &getChild(ChildNo++); unsigned ChildResNo = 0; // Instructions always use res #0 of their op. // If the operand has sub-operands, they may be provided by distinct @@ -2660,7 +2660,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { getNumChildren()); return false; } - Child = getChild(ChildNo++); + Child = &getChild(ChildNo++); SubRec = cast(MIOpInfo->getArg(Arg))->getDef(); MadeChange |= @@ -2683,7 +2683,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { } for (unsigned i = 0, e = getNumChildren(); i != e; ++i) - MadeChange |= getChild(i)->ApplyTypeConstraints(TP, NotRegisters); + MadeChange |= getChild(i).ApplyTypeConstraints(TP, NotRegisters); return MadeChange; } @@ -2707,7 +2707,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { } for (unsigned i = 0; i < getNumChildren(); ++i) - MadeChange |= getChild(i)->ApplyTypeConstraints(TP, NotRegisters); + MadeChange |= getChild(i).ApplyTypeConstraints(TP, NotRegisters); return MadeChange; } @@ -2721,16 +2721,16 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { return false; } - bool MadeChange = getChild(0)->ApplyTypeConstraints(TP, NotRegisters); + bool MadeChange = getChild(0).ApplyTypeConstraints(TP, NotRegisters); return MadeChange; } /// OnlyOnRHSOfCommutative - Return true if this value is only allowed on the /// RHS of a commutative operation, not the on LHS. -static bool OnlyOnRHSOfCommutative(TreePatternNode *N) { - if (!N->isLeaf() && N->getOperator()->getName() == "imm") +static bool OnlyOnRHSOfCommutative(TreePatternNode &N) { + if (!N.isLeaf() && N.getOperator()->getName() == "imm") return true; - if (N->isLeaf() && isa(N->getLeafValue())) + if (N.isLeaf() && isa(N.getLeafValue())) return true; if (isImmAllOnesAllZerosMatch(N)) return true; @@ -2748,7 +2748,7 @@ bool TreePatternNode::canPatternMatch(std::string &Reason, return true; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) - if (!getChild(i)->canPatternMatch(Reason, CDP)) + if (!getChild(i).canPatternMatch(Reason, CDP)) return false; // If this is an intrinsic, handle cases that would make it not match. 
For @@ -2818,15 +2818,15 @@ void TreePattern::error(const Twine &Msg) { void TreePattern::ComputeNamedNodes() { for (TreePatternNodePtr &Tree : Trees) - ComputeNamedNodes(Tree.get()); + ComputeNamedNodes(*Tree); } -void TreePattern::ComputeNamedNodes(TreePatternNode *N) { - if (!N->getName().empty()) - NamedNodes[N->getName()].push_back(N); +void TreePattern::ComputeNamedNodes(TreePatternNode &N) { + if (!N.getName().empty()) + NamedNodes[N.getName()].push_back(&N); - for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i) - ComputeNamedNodes(N->getChild(i)); + for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i) + ComputeNamedNodes(N.getChild(i)); } TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, @@ -3031,7 +3031,7 @@ static bool SimplifyTree(TreePatternNodePtr &N) { if (N->getOperator()->getName() == "bitconvert" && N->getExtType(0).isValueTypeByHwMode(false) && !N->getExtType(0).empty() && - N->getExtType(0) == N->getChild(0)->getExtType(0) && + N->getExtType(0) == N->getChild(0).getExtType(0) && N->getName().empty()) { N = N->getChildShared(0); SimplifyTree(N); @@ -3451,11 +3451,11 @@ void CodeGenDAGPatterns::FindPatternInputsAndOutputs( if (Pat->getOperator()->getName() == "implicit") { for (unsigned i = 0, e = Pat->getNumChildren(); i != e; ++i) { - TreePatternNode *Dest = Pat->getChild(i); - if (!Dest->isLeaf()) + TreePatternNode &Dest = Pat->getChild(i); + if (!Dest.isLeaf()) I.error("implicitly defined value should be a register!"); - DefInit *Val = dyn_cast(Dest->getLeafValue()); + DefInit *Val = dyn_cast(Dest.getLeafValue()); if (!Val || !Val->getDef()->isSubClassOf("Register")) I.error("implicitly defined value should be a register!"); if (Val) @@ -3468,7 +3468,7 @@ void CodeGenDAGPatterns::FindPatternInputsAndOutputs( // If this is not a set, verify that the children nodes are not void typed, // and recurse. for (unsigned i = 0, e = Pat->getNumChildren(); i != e; ++i) { - if (Pat->getChild(i)->getNumTypes() == 0) + if (Pat->getChild(i).getNumTypes() == 0) I.error("Cannot have void nodes inside of patterns!"); FindPatternInputsAndOutputs(I, Pat->getChildShared(i), InstInputs, InstResults, InstImpResults); @@ -3550,35 +3550,35 @@ public: isBitcast(false), isVariadic(false), hasChain(false) {} void Analyze(const PatternToMatch &Pat) { - const TreePatternNode *N = Pat.getSrcPattern(); + const TreePatternNode &N = Pat.getSrcPattern(); AnalyzeNode(N); // These properties are detected only on the root node. 
isBitcast = IsNodeBitcast(N); } private: - bool IsNodeBitcast(const TreePatternNode *N) const { + bool IsNodeBitcast(const TreePatternNode &N) const { if (hasSideEffects || mayLoad || mayStore || isVariadic) return false; - if (N->isLeaf()) + if (N.isLeaf()) return false; - if (N->getNumChildren() != 1 || !N->getChild(0)->isLeaf()) + if (N.getNumChildren() != 1 || !N.getChild(0).isLeaf()) return false; - if (N->getOperator()->isSubClassOf("ComplexPattern")) + if (N.getOperator()->isSubClassOf("ComplexPattern")) return false; - const SDNodeInfo &OpInfo = CDP.getSDNodeInfo(N->getOperator()); + const SDNodeInfo &OpInfo = CDP.getSDNodeInfo(N.getOperator()); if (OpInfo.getNumResults() != 1 || OpInfo.getNumOperands() != 1) return false; return OpInfo.getEnumName() == "ISD::BITCAST"; } public: - void AnalyzeNode(const TreePatternNode *N) { - if (N->isLeaf()) { - if (DefInit *DI = dyn_cast(N->getLeafValue())) { + void AnalyzeNode(const TreePatternNode &N) { + if (N.isLeaf()) { + if (DefInit *DI = dyn_cast(N.getLeafValue())) { Record *LeafRec = DI->getDef(); // Handle ComplexPattern leaves. if (LeafRec->isSubClassOf("ComplexPattern")) { @@ -3595,22 +3595,22 @@ public: } // Analyze children. - for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i) - AnalyzeNode(N->getChild(i)); + for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i) + AnalyzeNode(N.getChild(i)); // Notice properties of the node. - if (N->NodeHasProperty(SDNPMayStore, CDP)) + if (N.NodeHasProperty(SDNPMayStore, CDP)) mayStore = true; - if (N->NodeHasProperty(SDNPMayLoad, CDP)) + if (N.NodeHasProperty(SDNPMayLoad, CDP)) mayLoad = true; - if (N->NodeHasProperty(SDNPSideEffect, CDP)) + if (N.NodeHasProperty(SDNPSideEffect, CDP)) hasSideEffects = true; - if (N->NodeHasProperty(SDNPVariadic, CDP)) + if (N.NodeHasProperty(SDNPVariadic, CDP)) isVariadic = true; - if (N->NodeHasProperty(SDNPHasChain, CDP)) + if (N.NodeHasProperty(SDNPHasChain, CDP)) hasChain = true; - if (const CodeGenIntrinsic *IntInfo = N->getIntrinsicInfo(CDP)) { + if (const CodeGenIntrinsic *IntInfo = N.getIntrinsicInfo(CDP)) { ModRefInfo MR = IntInfo->ME.getModRef(); // If this is an intrinsic, analyze it. if (isRefSet(MR)) @@ -3723,14 +3723,14 @@ static bool hasNullFragReference(ListInit *LI) { } /// Get all the instructions in a tree. -static void getInstructionsInTree(TreePatternNode *Tree, +static void getInstructionsInTree(TreePatternNode &Tree, SmallVectorImpl &Instrs) { - if (Tree->isLeaf()) + if (Tree.isLeaf()) return; - if (Tree->getOperator()->isSubClassOf("Instruction")) - Instrs.push_back(Tree->getOperator()); - for (unsigned i = 0, e = Tree->getNumChildren(); i != e; ++i) - getInstructionsInTree(Tree->getChild(i), Instrs); + if (Tree.getOperator()->isSubClassOf("Instruction")) + Instrs.push_back(Tree.getOperator()); + for (unsigned i = 0, e = Tree.getNumChildren(); i != e; ++i) + getInstructionsInTree(Tree.getChild(i), Instrs); } /// Check the class of a pattern leaf node against the instruction operand it @@ -3917,7 +3917,7 @@ void CodeGenDAGPatterns::parseInstructionPattern(CodeGenInstruction &CGI, TreePatternNodePtr Pattern = I.getTree(0); TreePatternNodePtr SrcPattern; if (Pattern->getOperator()->getName() == "set") { - SrcPattern = Pattern->getChild(Pattern->getNumChildren() - 1)->clone(); + SrcPattern = Pattern->getChild(Pattern->getNumChildren() - 1).clone(); } else { // Not a set (store or something?) 
SrcPattern = Pattern; @@ -3995,22 +3995,22 @@ void CodeGenDAGPatterns::ParseInstructions() { typedef std::pair NameRecord; -static void FindNames(TreePatternNode *P, +static void FindNames(TreePatternNode &P, std::map &Names, TreePattern *PatternTop) { - if (!P->getName().empty()) { - NameRecord &Rec = Names[P->getName()]; + if (!P.getName().empty()) { + NameRecord &Rec = Names[P.getName()]; // If this is the first instance of the name, remember the node. if (Rec.second++ == 0) - Rec.first = P; - else if (Rec.first->getExtTypes() != P->getExtTypes()) - PatternTop->error("repetition of value: $" + P->getName() + + Rec.first = &P; + else if (Rec.first->getExtTypes() != P.getExtTypes()) + PatternTop->error("repetition of value: $" + P.getName() + " where different uses have different types!"); } - if (!P->isLeaf()) { - for (unsigned i = 0, e = P->getNumChildren(); i != e; ++i) - FindNames(P->getChild(i), Names, PatternTop); + if (!P.isLeaf()) { + for (unsigned i = 0, e = P.getNumChildren(); i != e; ++i) + FindNames(P.getChild(i), Names, PatternTop); } } @@ -4018,7 +4018,7 @@ void CodeGenDAGPatterns::AddPatternToMatch(TreePattern *Pattern, PatternToMatch &&PTM) { // Do some sanity checking on the pattern we're about to match. std::string Reason; - if (!PTM.getSrcPattern()->canPatternMatch(Reason, *this)) { + if (!PTM.getSrcPattern().canPatternMatch(Reason, *this)) { PrintWarning(Pattern->getRecord()->getLoc(), Twine("Pattern can never match: ") + Reason); return; @@ -4027,7 +4027,7 @@ void CodeGenDAGPatterns::AddPatternToMatch(TreePattern *Pattern, // If the source pattern's root is a complex pattern, that complex pattern // must specify the nodes it can potentially match. if (const ComplexPattern *CP = - PTM.getSrcPattern()->getComplexPatternInfo(*this)) + PTM.getSrcPattern().getComplexPatternInfo(*this)) if (CP->getRootNodes().empty()) Pattern->error("ComplexPattern at root must specify list of opcodes it" " could match"); @@ -4189,27 +4189,27 @@ void CodeGenDAGPatterns::VerifyInstructionFlags() { /// Given a pattern result with an unresolved type, see if we can find one /// instruction with an unresolved result type. Force this result type to an /// arbitrary element if it's possible types to converge results. -static bool ForceArbitraryInstResultType(TreePatternNode *N, TreePattern &TP) { - if (N->isLeaf()) +static bool ForceArbitraryInstResultType(TreePatternNode &N, TreePattern &TP) { + if (N.isLeaf()) return false; // Analyze children. - for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i) - if (ForceArbitraryInstResultType(N->getChild(i), TP)) + for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i) + if (ForceArbitraryInstResultType(N.getChild(i), TP)) return true; - if (!N->getOperator()->isSubClassOf("Instruction")) + if (!N.getOperator()->isSubClassOf("Instruction")) return false; // If this type is already concrete or completely unknown we can't do // anything. TypeInfer &TI = TP.getInfer(); - for (unsigned i = 0, e = N->getNumTypes(); i != e; ++i) { - if (N->getExtType(i).empty() || TI.isConcrete(N->getExtType(i), false)) + for (unsigned i = 0, e = N.getNumTypes(); i != e; ++i) { + if (N.getExtType(i).empty() || TI.isConcrete(N.getExtType(i), false)) continue; // Otherwise, force its type to an arbitrary choice. - if (TI.forceArbitrary(N->getExtType(i))) + if (TI.forceArbitrary(N.getExtType(i))) return true; } @@ -4285,7 +4285,7 @@ void CodeGenDAGPatterns::ParseOnePattern( // arbitrary types to the result pattern's nodes. 
if (!IterateInference && InferredAllPatternTypes && !InferredAllResultTypes) IterateInference = - ForceArbitraryInstResultType(Result.getTree(0).get(), Result); + ForceArbitraryInstResultType(*Result.getTree(0), Result); } while (IterateInference); // Verify that we inferred enough types that we can do something with the @@ -4372,13 +4372,13 @@ void CodeGenDAGPatterns::ParsePatterns() { } } -static void collectModes(std::set &Modes, const TreePatternNode *N) { - for (const TypeSetByHwMode &VTS : N->getExtTypes()) +static void collectModes(std::set &Modes, const TreePatternNode &N) { + for (const TypeSetByHwMode &VTS : N.getExtTypes()) for (const auto &I : VTS) Modes.insert(I.first); - for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i) - collectModes(Modes, N->getChild(i)); + for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i) + collectModes(Modes, N.getChild(i)); } void CodeGenDAGPatterns::ExpandHwModeBasedTypes() { @@ -4391,8 +4391,8 @@ void CodeGenDAGPatterns::ExpandHwModeBasedTypes() { auto AppendPattern = [this](PatternToMatch &P, unsigned Mode, StringRef Check) { - TreePatternNodePtr NewSrc = P.getSrcPattern()->clone(); - TreePatternNodePtr NewDst = P.getDstPattern()->clone(); + TreePatternNodePtr NewSrc = P.getSrcPattern().clone(); + TreePatternNodePtr NewDst = P.getDstPattern().clone(); if (!NewSrc->setDefaultMode(Mode) || !NewDst->setDefaultMode(Mode)) { return; } @@ -4405,10 +4405,10 @@ void CodeGenDAGPatterns::ExpandHwModeBasedTypes() { for (PatternToMatch &P : Copy) { const TreePatternNode *SrcP = nullptr, *DstP = nullptr; - if (P.getSrcPattern()->hasProperTypeByHwMode()) - SrcP = P.getSrcPattern(); - if (P.getDstPattern()->hasProperTypeByHwMode()) - DstP = P.getDstPattern(); + if (P.getSrcPattern().hasProperTypeByHwMode()) + SrcP = &P.getSrcPattern(); + if (P.getDstPattern().hasProperTypeByHwMode()) + DstP = &P.getDstPattern(); if (!SrcP && !DstP) { PatternsToMatch.push_back(P); continue; @@ -4416,9 +4416,9 @@ void CodeGenDAGPatterns::ExpandHwModeBasedTypes() { std::set Modes; if (SrcP) - collectModes(Modes, SrcP); + collectModes(Modes, *SrcP); if (DstP) - collectModes(Modes, DstP); + collectModes(Modes, *DstP); // The predicate for the default mode needs to be constructed for each // pattern separately. @@ -4458,18 +4458,18 @@ void CodeGenDAGPatterns::ExpandHwModeBasedTypes() { /// Dependent variable map for CodeGenDAGPattern variant generation typedef StringMap DepVarMap; -static void FindDepVarsOf(TreePatternNode *N, DepVarMap &DepMap) { - if (N->isLeaf()) { - if (N->hasName() && isa(N->getLeafValue())) - DepMap[N->getName()]++; +static void FindDepVarsOf(TreePatternNode &N, DepVarMap &DepMap) { + if (N.isLeaf()) { + if (N.hasName() && isa(N.getLeafValue())) + DepMap[N.getName()]++; } else { - for (size_t i = 0, e = N->getNumChildren(); i != e; ++i) - FindDepVarsOf(N->getChild(i), DepMap); + for (size_t i = 0, e = N.getNumChildren(); i != e; ++i) + FindDepVarsOf(N.getChild(i), DepMap); } } /// Find dependent variables within child patterns -static void FindDepVars(TreePatternNode *N, MultipleUseVarSet &DepVars) { +static void FindDepVars(TreePatternNode &N, MultipleUseVarSet &DepVars) { DepVarMap depcounts; FindDepVarsOf(N, depcounts); for (const auto &Pair : depcounts) { @@ -4543,7 +4543,7 @@ static void CombineChildVariants( // which are the same pattern. Ignore the dups. 
if (R->canPatternMatch(ErrString, CDP) && none_of(OutVariants, [&](TreePatternNodePtr Variant) { - return R->isIsomorphicTo(Variant.get(), DepVars); + return R->isIsomorphicTo(*Variant, DepVars); })) OutVariants.push_back(R); @@ -4589,12 +4589,12 @@ GatherChildrenOfAssociativeOpcode(TreePatternNodePtr N, return; } - if (N->getChild(0)->isLeaf() || N->getChild(0)->getOperator() != Operator) + if (N->getChild(0).isLeaf() || N->getChild(0).getOperator() != Operator) Children.push_back(N->getChildShared(0)); else GatherChildrenOfAssociativeOpcode(N->getChildShared(0), Children); - if (N->getChild(1)->isLeaf() || N->getChild(1)->getOperator() != Operator) + if (N->getChild(1).isLeaf() || N->getChild(1).getOperator() != Operator) Children.push_back(N->getChildShared(1)); else GatherChildrenOfAssociativeOpcode(N->getChildShared(1), Children); @@ -4688,9 +4688,9 @@ static void GenerateVariantsOf(TreePatternNodePtr N, unsigned i = 0 + Skip; unsigned e = 2 + Skip; for (; i != e; ++i) { - TreePatternNode *Child = N->getChild(i); - if (Child->isLeaf()) - if (DefInit *DI = dyn_cast(Child->getLeafValue())) { + TreePatternNode &Child = N->getChild(i); + if (Child.isLeaf()) + if (DefInit *DI = dyn_cast(Child.getLeafValue())) { Record *RR = DI->getDef(); if (RR->isSubClassOf("Register")) NoRegisters = false; @@ -4738,7 +4738,7 @@ void CodeGenDAGPatterns::GenerateVariants() { continue; LLVM_DEBUG(errs() << "FOUND VARIANTS OF: "; - PatternsToMatch[i].getSrcPattern()->dump(); errs() << "\n"); + PatternsToMatch[i].getSrcPattern().dump(); errs() << "\n"); for (unsigned v = 0, e = Variants.size(); v != e; ++v) { TreePatternNodePtr Variant = Variants[v]; diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.h b/llvm/utils/TableGen/CodeGenDAGPatterns.h index ea6219c..823c40c 100644 --- a/llvm/utils/TableGen/CodeGenDAGPatterns.h +++ b/llvm/utils/TableGen/CodeGenDAGPatterns.h @@ -406,7 +406,7 @@ struct SDTypeConstraint { /// constraint to the nodes operands. This returns true if it makes a /// change, false otherwise. If a type contradiction is found, an error /// is flagged. - bool ApplyTypeConstraint(TreePatternNode *N, const SDNodeInfo &NodeInfo, + bool ApplyTypeConstraint(TreePatternNode &N, const SDNodeInfo &NodeInfo, TreePattern &TP) const; }; @@ -474,7 +474,7 @@ public: /// constraints for this node to the operands of the node. This returns /// true if it makes a change, false otherwise. If a type contradiction is /// found, an error is flagged. - bool ApplyTypeConstraints(TreePatternNode *N, TreePattern &TP) const; + bool ApplyTypeConstraints(TreePatternNode &N, TreePattern &TP) const; }; /// TreePredicateFn - This is an abstraction that represents the predicates on @@ -722,10 +722,10 @@ public: } unsigned getNumChildren() const { return Children.size(); } - const TreePatternNode *getChild(unsigned N) const { - return Children[N].get(); + const TreePatternNode &getChild(unsigned N) const { + return *Children[N].get(); } - TreePatternNode *getChild(unsigned N) { return Children[N].get(); } + TreePatternNode &getChild(unsigned N) { return *Children[N].get(); } const TreePatternNodePtr &getChildShared(unsigned N) const { return Children[N]; } @@ -812,7 +812,7 @@ public: // Higher level manipulation routines. /// the specified node. For this comparison, all of the state of the node /// is considered, except for the assigned name. Nodes with differing names /// that are otherwise identical are considered isomorphic. 
- bool isIsomorphicTo(const TreePatternNode *N, + bool isIsomorphicTo(const TreePatternNode &N, const MultipleUseVarSet &DepVars) const; /// SubstituteFormalArguments - Replace the formal arguments in this tree @@ -974,7 +974,7 @@ public: private: TreePatternNodePtr ParseTreePattern(Init *DI, StringRef OpName); void ComputeNamedNodes(); - void ComputeNamedNodes(TreePatternNode *N); + void ComputeNamedNodes(TreePatternNode &N); }; inline bool TreePatternNode::UpdateNodeType(unsigned ResNo, @@ -1071,9 +1071,9 @@ public: Record *getSrcRecord() const { return SrcRecord; } ListInit *getPredicates() const { return Predicates; } - TreePatternNode *getSrcPattern() const { return SrcPattern.get(); } + TreePatternNode &getSrcPattern() const { return *SrcPattern; } TreePatternNodePtr getSrcPatternShared() const { return SrcPattern; } - TreePatternNode *getDstPattern() const { return DstPattern.get(); } + TreePatternNode &getDstPattern() const { return *DstPattern; } TreePatternNodePtr getDstPatternShared() const { return DstPattern; } const std::vector &getDstRegs() const { return Dstregs; } StringRef getHwModeFeatures() const { return HwModeFeatures; } @@ -1250,7 +1250,7 @@ private: std::vector &InstImpResults); }; -inline bool SDNodeInfo::ApplyTypeConstraints(TreePatternNode *N, +inline bool SDNodeInfo::ApplyTypeConstraints(TreePatternNode &N, TreePattern &TP) const { bool MadeChange = false; for (unsigned i = 0, e = TypeConstraints.size(); i != e; ++i) diff --git a/llvm/utils/TableGen/DAGISelEmitter.cpp b/llvm/utils/TableGen/DAGISelEmitter.cpp index 32b2746..336cee0 100644 --- a/llvm/utils/TableGen/DAGISelEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelEmitter.cpp @@ -38,41 +38,41 @@ public: // DAGISelEmitter Helper methods // -/// getResultPatternCost - Compute the number of instructions for this pattern. +/// Compute the number of instructions for this pattern. /// This is a temporary hack. We should really include the instruction /// latencies in this calculation. -static unsigned getResultPatternCost(TreePatternNode *P, - CodeGenDAGPatterns &CGP) { - if (P->isLeaf()) +static unsigned getResultPatternCost(TreePatternNode &P, + const CodeGenDAGPatterns &CGP) { + if (P.isLeaf()) return 0; unsigned Cost = 0; - Record *Op = P->getOperator(); + Record *Op = P.getOperator(); if (Op->isSubClassOf("Instruction")) { Cost++; CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(Op); if (II.usesCustomInserter) Cost += 10; } - for (unsigned i = 0, e = P->getNumChildren(); i != e; ++i) - Cost += getResultPatternCost(P->getChild(i), CGP); + for (unsigned i = 0, e = P.getNumChildren(); i != e; ++i) + Cost += getResultPatternCost(P.getChild(i), CGP); return Cost; } /// getResultPatternCodeSize - Compute the code size of instructions for this /// pattern. 
-static unsigned getResultPatternSize(TreePatternNode *P,
-                                     CodeGenDAGPatterns &CGP) {
-  if (P->isLeaf())
+static unsigned getResultPatternSize(TreePatternNode &P,
+                                     const CodeGenDAGPatterns &CGP) {
+  if (P.isLeaf())
     return 0;

   unsigned Cost = 0;
-  Record *Op = P->getOperator();
+  Record *Op = P.getOperator();
   if (Op->isSubClassOf("Instruction")) {
     Cost += Op->getValueAsInt("CodeSize");
   }
-  for (unsigned i = 0, e = P->getNumChildren(); i != e; ++i)
-    Cost += getResultPatternSize(P->getChild(i), CGP);
+  for (unsigned i = 0, e = P.getNumChildren(); i != e; ++i)
+    Cost += getResultPatternSize(P.getChild(i), CGP);
   return Cost;
 }

@@ -85,11 +85,11 @@ struct PatternSortingPredicate {
   CodeGenDAGPatterns &CGP;

   bool operator()(const PatternToMatch *LHS, const PatternToMatch *RHS) {
-    const TreePatternNode *LT = LHS->getSrcPattern();
-    const TreePatternNode *RT = RHS->getSrcPattern();
+    const TreePatternNode &LT = LHS->getSrcPattern();
+    const TreePatternNode &RT = RHS->getSrcPattern();

-    MVT LHSVT = LT->getNumTypes() != 0 ? LT->getSimpleType(0) : MVT::Other;
-    MVT RHSVT = RT->getNumTypes() != 0 ? RT->getSimpleType(0) : MVT::Other;
+    MVT LHSVT = LT.getNumTypes() != 0 ? LT.getSimpleType(0) : MVT::Other;
+    MVT RHSVT = RT.getNumTypes() != 0 ? RT.getSimpleType(0) : MVT::Other;
     if (LHSVT.isVector() != RHSVT.isVector())
       return RHSVT.isVector();

@@ -156,9 +156,9 @@ void DAGISelEmitter::run(raw_ostream &OS) {
                                E = CGP.ptm_end();
        I != E; ++I) {
     errs() << "PATTERN: ";
-    I->getSrcPattern()->dump();
+    I->getSrcPattern().dump();
     errs() << "\nRESULT: ";
-    I->getDstPattern()->dump();
+    I->getDstPattern().dump();
     errs() << "\n";
   });

diff --git a/llvm/utils/TableGen/DAGISelMatcher.cpp b/llvm/utils/TableGen/DAGISelMatcher.cpp
index 5461481..3298965 100644
--- a/llvm/utils/TableGen/DAGISelMatcher.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcher.cpp
@@ -302,8 +302,8 @@ void EmitNodeMatcherCommon::printImpl(raw_ostream &OS, unsigned indent) const {

 void CompleteMatchMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
   OS.indent(indent) << "CompleteMatch <todo args>\n";
-  OS.indent(indent) << "Src = " << *Pattern.getSrcPattern() << "\n";
-  OS.indent(indent) << "Dst = " << *Pattern.getDstPattern() << "\n";
+  OS.indent(indent) << "Src = " << Pattern.getSrcPattern() << "\n";
+  OS.indent(indent) << "Dst = " << Pattern.getDstPattern() << "\n";
 }

 bool CheckOpcodeMatcher::isEqualImpl(const Matcher *M) const {
diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
index 8d002e5..b475c98 100644
--- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -214,10 +214,10 @@ private:
 };
 } // end anonymous namespace.
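The hunks above and below are one mechanical migration: TreePatternNode accessors such as getChild(), getSrcPattern(), and getDstPattern() now hand out references instead of pointers, because the nodes they return are owned and never null, so every caller switches from `->` to `.` and drops its null handling. A minimal self-contained sketch of the before/after calling convention follows; the names are illustrative only, not code from this patch:

#include <memory>
#include <vector>

// Stand-in for TreePatternNode. Children are owned and never null, which is
// what makes returning a reference (rather than a pointer) safe.
struct Node {
  std::vector<std::unique_ptr<Node>> Children;
  bool isLeaf() const { return Children.empty(); }
  unsigned getNumChildren() const { return (unsigned)Children.size(); }
  // was: const Node *getChild(unsigned I) const;
  const Node &getChild(unsigned I) const { return *Children[I]; }
};

// was: unsigned countLeaves(const Node *N) { ... N->isLeaf() ... }
unsigned countLeaves(const Node &N) {
  if (N.isLeaf())
    return 1;
  unsigned Count = 0;
  for (unsigned I = 0, E = N.getNumChildren(); I != E; ++I)
    Count += countLeaves(N.getChild(I)); // '.' instead of '->', no null checks
  return Count;
}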
-static std::string GetPatFromTreePatternNode(const TreePatternNode *N) {
+static std::string GetPatFromTreePatternNode(const TreePatternNode &N) {
   std::string str;
   raw_string_ostream Stream(str);
-  Stream << *N;
+  Stream << N;
   return str;
 }

@@ -983,11 +983,11 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
       if (const MorphNodeToMatcher *SNT = dyn_cast<MorphNodeToMatcher>(N)) {
         OS.indent(FullIndexWidth + Indent)
-            << "// Src: " << *SNT->getPattern().getSrcPattern()
+            << "// Src: " << SNT->getPattern().getSrcPattern()
             << " - Complexity = " << SNT->getPattern().getPatternComplexity(CGP)
             << '\n';
         OS.indent(FullIndexWidth + Indent)
-            << "// Dst: " << *SNT->getPattern().getDstPattern() << '\n';
+            << "// Dst: " << SNT->getPattern().getDstPattern() << '\n';
       }
     } else
       OS << '\n';
@@ -1019,11 +1019,11 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
     OS << '\n';
     if (!OmitComments) {
       OS.indent(FullIndexWidth + Indent)
-          << " // Src: " << *CM->getPattern().getSrcPattern()
+          << " // Src: " << CM->getPattern().getSrcPattern()
           << " - Complexity = " << CM->getPattern().getPatternComplexity(CGP)
           << '\n';
       OS.indent(FullIndexWidth + Indent)
-          << " // Dst: " << *CM->getPattern().getDstPattern();
+          << " // Dst: " << CM->getPattern().getDstPattern();
     }
     OS << '\n';
     return 2 + NumResultBytes + NumCoveredBytes;
diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
index 8ca7aae..956cb5e 100644
--- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
@@ -113,10 +113,10 @@ private:
   void InferPossibleTypes();

   // Matcher Generation.
-  void EmitMatchCode(const TreePatternNode *N, TreePatternNode *NodeNoTypes);
-  void EmitLeafMatchCode(const TreePatternNode *N);
-  void EmitOperatorMatchCode(const TreePatternNode *N,
-                             TreePatternNode *NodeNoTypes);
+  void EmitMatchCode(const TreePatternNode &N, TreePatternNode &NodeNoTypes);
+  void EmitLeafMatchCode(const TreePatternNode &N);
+  void EmitOperatorMatchCode(const TreePatternNode &N,
+                             TreePatternNode &NodeNoTypes);

   /// If this is the first time a node with unique identifier Name has been
   /// seen, record it. Otherwise, emit a check to make sure this is the same
@@ -131,15 +131,15 @@ private:
     return VarMapEntry - 1;
   }

-  void EmitResultOperand(const TreePatternNode *N,
+  void EmitResultOperand(const TreePatternNode &N,
                          SmallVectorImpl<unsigned> &ResultOps);
-  void EmitResultOfNamedOperand(const TreePatternNode *N,
+  void EmitResultOfNamedOperand(const TreePatternNode &N,
                                 SmallVectorImpl<unsigned> &ResultOps);
-  void EmitResultLeafAsOperand(const TreePatternNode *N,
+  void EmitResultLeafAsOperand(const TreePatternNode &N,
                                SmallVectorImpl<unsigned> &ResultOps);
-  void EmitResultInstructionAsOperand(const TreePatternNode *N,
+  void EmitResultInstructionAsOperand(const TreePatternNode &N,
                                       SmallVectorImpl<unsigned> &ResultOps);
-  void EmitResultSDNodeXFormAsOperand(const TreePatternNode *N,
+  void EmitResultSDNodeXFormAsOperand(const TreePatternNode &N,
                                       SmallVectorImpl<unsigned> &ResultOps);
 };

@@ -162,7 +162,7 @@ MatcherGen::MatcherGen(const PatternToMatch &pattern,
   // apply the type to the tree, then rerun type inference. Iterate until all
   // types are resolved.
   //
-  PatWithNoTypes = Pattern.getSrcPattern()->clone();
+  PatWithNoTypes = Pattern.getSrcPattern().clone();
   PatWithNoTypes->RemoveAllTypes();

   // If there are types that are manifestly known, infer them.
@@ -198,15 +198,15 @@ void MatcherGen::AddMatcher(Matcher *NewNode) { //===----------------------------------------------------------------------===// /// EmitLeafMatchCode - Generate matching code for leaf nodes. -void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) { - assert(N->isLeaf() && "Not a leaf?"); +void MatcherGen::EmitLeafMatchCode(const TreePatternNode &N) { + assert(N.isLeaf() && "Not a leaf?"); // Direct match against an integer constant. - if (IntInit *II = dyn_cast(N->getLeafValue())) { + if (IntInit *II = dyn_cast(N.getLeafValue())) { // If this is the root of the dag we're matching, we emit a redundant opcode // check to ensure that this gets folded into the normal top-level // OpcodeSwitch. - if (N == Pattern.getSrcPattern()) { + if (&N == &Pattern.getSrcPattern()) { const SDNodeInfo &NI = CGP.getSDNodeInfo(CGP.getSDNodeNamed("imm")); AddMatcher(new CheckOpcodeMatcher(NI)); } @@ -215,14 +215,14 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) { } // An UnsetInit represents a named node without any constraints. - if (isa(N->getLeafValue())) { - assert(N->hasName() && "Unnamed ? leaf"); + if (isa(N.getLeafValue())) { + assert(N.hasName() && "Unnamed ? leaf"); return; } - DefInit *DI = dyn_cast(N->getLeafValue()); + DefInit *DI = dyn_cast(N.getLeafValue()); if (!DI) { - errs() << "Unknown leaf kind: " << *N << "\n"; + errs() << "Unknown leaf kind: " << N << "\n"; abort(); } @@ -232,7 +232,7 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) { // unnamed. if (LeafRec->isSubClassOf("ValueType")) { // A named ValueType leaf always matches: (add i32:$a, i32:$b). - if (N->hasName()) + if (N.hasName()) return; // An unnamed ValueType as in (sext_inreg GPR:$foo, i8). return AddMatcher(new CheckValueTypeMatcher(LeafRec->getName())); @@ -262,17 +262,17 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) { if (LeafRec->isSubClassOf("ComplexPattern")) { // We can't model ComplexPattern uses that don't have their name taken yet. // The OPC_CheckComplexPattern operation implicitly records the results. - if (N->getName().empty()) { + if (N.getName().empty()) { std::string S; raw_string_ostream OS(S); - OS << "We expect complex pattern uses to have names: " << *N; + OS << "We expect complex pattern uses to have names: " << N; PrintFatalError(S); } // Remember this ComplexPattern so that we can emit it after all the other // structural matches are done. - unsigned InputOperand = VariableMap[N->getName()] - 1; - MatchedComplexPatterns.push_back(std::make_pair(N, InputOperand)); + unsigned InputOperand = VariableMap[N.getName()] - 1; + MatchedComplexPatterns.push_back(std::make_pair(&N, InputOperand)); return; } @@ -281,8 +281,8 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) { // If this is the root of the dag we're matching, we emit a redundant opcode // check to ensure that this gets folded into the normal top-level // OpcodeSwitch. - if (N == Pattern.getSrcPattern()) { - MVT VT = N->getSimpleType(0); + if (&N == &Pattern.getSrcPattern()) { + MVT VT = N.getSimpleType(0); StringRef Name = VT.isScalableVector() ? 
"splat_vector" : "build_vector"; const SDNodeInfo &NI = CGP.getSDNodeInfo(CGP.getSDNodeNamed(Name)); AddMatcher(new CheckOpcodeMatcher(NI)); @@ -294,33 +294,33 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) { return; } - errs() << "Unknown leaf kind: " << *N << "\n"; + errs() << "Unknown leaf kind: " << N << "\n"; abort(); } -void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, - TreePatternNode *NodeNoTypes) { - assert(!N->isLeaf() && "Not an operator?"); +void MatcherGen::EmitOperatorMatchCode(const TreePatternNode &N, + TreePatternNode &NodeNoTypes) { + assert(!N.isLeaf() && "Not an operator?"); - if (N->getOperator()->isSubClassOf("ComplexPattern")) { + if (N.getOperator()->isSubClassOf("ComplexPattern")) { // The "name" of a non-leaf complex pattern (MY_PAT $op1, $op2) is // "MY_PAT:op1:op2". We should already have validated that the uses are // consistent. - std::string PatternName = std::string(N->getOperator()->getName()); - for (unsigned i = 0; i < N->getNumChildren(); ++i) { + std::string PatternName = std::string(N.getOperator()->getName()); + for (unsigned i = 0; i < N.getNumChildren(); ++i) { PatternName += ":"; - PatternName += N->getChild(i)->getName(); + PatternName += N.getChild(i).getName(); } if (recordUniqueNode(PatternName)) { - auto NodeAndOpNum = std::make_pair(N, NextRecordedOperandNo - 1); + auto NodeAndOpNum = std::make_pair(&N, NextRecordedOperandNo - 1); MatchedComplexPatterns.push_back(NodeAndOpNum); } return; } - const SDNodeInfo &CInfo = CGP.getSDNodeInfo(N->getOperator()); + const SDNodeInfo &CInfo = CGP.getSDNodeInfo(N.getOperator()); // If this is an 'and R, 1234' where the operation is AND/OR and the RHS is // a constant without a predicate fn that has more than one bit set, handle @@ -332,28 +332,28 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, // them from the mask in the dag. For example, it might turn 'AND X, 255' // into 'AND X, 254' if it knows the low bit is set. Emit code that checks // to handle this. - if ((N->getOperator()->getName() == "and" || - N->getOperator()->getName() == "or") && - N->getChild(1)->isLeaf() && N->getChild(1)->getPredicateCalls().empty() && - N->getPredicateCalls().empty()) { - if (IntInit *II = dyn_cast(N->getChild(1)->getLeafValue())) { + if ((N.getOperator()->getName() == "and" || + N.getOperator()->getName() == "or") && + N.getChild(1).isLeaf() && N.getChild(1).getPredicateCalls().empty() && + N.getPredicateCalls().empty()) { + if (IntInit *II = dyn_cast(N.getChild(1).getLeafValue())) { if (!llvm::has_single_bit( II->getValue())) { // Don't bother with single bits. // If this is at the root of the pattern, we emit a redundant // CheckOpcode so that the following checks get factored properly under // a single opcode check. - if (N == Pattern.getSrcPattern()) + if (&N == &Pattern.getSrcPattern()) AddMatcher(new CheckOpcodeMatcher(CInfo)); // Emit the CheckAndImm/CheckOrImm node. - if (N->getOperator()->getName() == "and") + if (N.getOperator()->getName() == "and") AddMatcher(new CheckAndImmMatcher(II->getValue())); else AddMatcher(new CheckOrImmMatcher(II->getValue())); // Match the LHS of the AND as appropriate. AddMatcher(new MoveChildMatcher(0)); - EmitMatchCode(N->getChild(0), NodeNoTypes->getChild(0)); + EmitMatchCode(N.getChild(0), NodeNoTypes.getChild(0)); AddMatcher(new MoveParentMatcher()); return; } @@ -365,15 +365,15 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, // If this node has memory references (i.e. 
is a load or store), tell the // interpreter to capture them in the memref array. - if (N->NodeHasProperty(SDNPMemOperand, CGP)) + if (N.NodeHasProperty(SDNPMemOperand, CGP)) AddMatcher(new RecordMemRefMatcher()); // If this node has a chain, then the chain is operand #0 is the SDNode, and // the child numbers of the node are all offset by one. unsigned OpNo = 0; - if (N->NodeHasProperty(SDNPHasChain, CGP)) { + if (N.NodeHasProperty(SDNPHasChain, CGP)) { // Record the node and remember it in our chained nodes list. - AddMatcher(new RecordMatcher("'" + N->getOperator()->getName().str() + + AddMatcher(new RecordMatcher("'" + N.getOperator()->getName().str() + "' chained node", NextRecordedOperandNo)); // Remember all of the input chains our pattern will match. @@ -404,22 +404,22 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, // To prevent this, we emit a dynamic check for legality before allowing // this to be folded. // - const TreePatternNode *Root = Pattern.getSrcPattern(); - if (N != Root) { // Not the root of the pattern. + const TreePatternNode &Root = Pattern.getSrcPattern(); + if (&N != &Root) { // Not the root of the pattern. // If there is a node between the root and this node, then we definitely // need to emit the check. - bool NeedCheck = !Root->hasChild(N); + bool NeedCheck = !Root.hasChild(&N); // If it *is* an immediate child of the root, we can still need a check if // the root SDNode has multiple inputs. For us, this means that it is an // intrinsic, has multiple operands, or has other inputs like chain or // glue). if (!NeedCheck) { - const SDNodeInfo &PInfo = CGP.getSDNodeInfo(Root->getOperator()); + const SDNodeInfo &PInfo = CGP.getSDNodeInfo(Root.getOperator()); NeedCheck = - Root->getOperator() == CGP.get_intrinsic_void_sdnode() || - Root->getOperator() == CGP.get_intrinsic_w_chain_sdnode() || - Root->getOperator() == CGP.get_intrinsic_wo_chain_sdnode() || + Root.getOperator() == CGP.get_intrinsic_void_sdnode() || + Root.getOperator() == CGP.get_intrinsic_w_chain_sdnode() || + Root.getOperator() == CGP.get_intrinsic_wo_chain_sdnode() || PInfo.getNumOperands() > 1 || PInfo.hasProperty(SDNPHasChain) || PInfo.hasProperty(SDNPInGlue) || PInfo.hasProperty(SDNPOptInGlue); } @@ -430,26 +430,26 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, } // If this node has an output glue and isn't the root, remember it. - if (N->NodeHasProperty(SDNPOutGlue, CGP) && N != Pattern.getSrcPattern()) { + if (N.NodeHasProperty(SDNPOutGlue, CGP) && &N != &Pattern.getSrcPattern()) { // TODO: This redundantly records nodes with both glues and chains. // Record the node and remember it in our chained nodes list. - AddMatcher(new RecordMatcher("'" + N->getOperator()->getName().str() + + AddMatcher(new RecordMatcher("'" + N.getOperator()->getName().str() + "' glue output node", NextRecordedOperandNo)); } // If this node is known to have an input glue or if it *might* have an input // glue, capture it as the glue input of the pattern. - if (N->NodeHasProperty(SDNPOptInGlue, CGP) || - N->NodeHasProperty(SDNPInGlue, CGP)) + if (N.NodeHasProperty(SDNPOptInGlue, CGP) || + N.NodeHasProperty(SDNPInGlue, CGP)) AddMatcher(new CaptureGlueInputMatcher()); - for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i, ++OpNo) { + for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i, ++OpNo) { // Get the code suitable for matching this child. Move to the child, check // it then move back to the parent. 
AddMatcher(new MoveChildMatcher(OpNo)); - EmitMatchCode(N->getChild(i), NodeNoTypes->getChild(i)); + EmitMatchCode(N.getChild(i), NodeNoTypes.getChild(i)); AddMatcher(new MoveParentMatcher()); } } @@ -489,17 +489,17 @@ bool MatcherGen::recordUniqueNode(ArrayRef Names) { return NewRecord; } -void MatcherGen::EmitMatchCode(const TreePatternNode *N, - TreePatternNode *NodeNoTypes) { +void MatcherGen::EmitMatchCode(const TreePatternNode &N, + TreePatternNode &NodeNoTypes) { // If N and NodeNoTypes don't agree on a type, then this is a case where we // need to do a type check. Emit the check, apply the type to NodeNoTypes and // reinfer any correlated types. SmallVector ResultsToTypeCheck; - for (unsigned i = 0, e = NodeNoTypes->getNumTypes(); i != e; ++i) { - if (NodeNoTypes->getExtType(i) == N->getExtType(i)) + for (unsigned i = 0, e = NodeNoTypes.getNumTypes(); i != e; ++i) { + if (NodeNoTypes.getExtType(i) == N.getExtType(i)) continue; - NodeNoTypes->setType(i, N->getExtType(i)); + NodeNoTypes.setType(i, N.getExtType(i)); InferPossibleTypes(); ResultsToTypeCheck.push_back(i); } @@ -507,10 +507,10 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N, // If this node has a name associated with it, capture it in VariableMap. If // we already saw this in the pattern, emit code to verify dagness. SmallVector Names; - if (!N->getName().empty()) - Names.push_back(N->getName()); + if (!N.getName().empty()) + Names.push_back(N.getName()); - for (const ScopedName &Name : N->getNamesAsPredicateArg()) { + for (const ScopedName &Name : N.getNamesAsPredicateArg()) { Names.push_back( ("pred:" + Twine(Name.getScope()) + ":" + Name.getIdentifier()).str()); } @@ -520,14 +520,14 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N, return; } - if (N->isLeaf()) + if (N.isLeaf()) EmitLeafMatchCode(N); else EmitOperatorMatchCode(N, NodeNoTypes); // If there are node predicates for this node, generate their checks. - for (unsigned i = 0, e = N->getPredicateCalls().size(); i != e; ++i) { - const TreePredicateCall &Pred = N->getPredicateCalls()[i]; + for (unsigned i = 0, e = N.getPredicateCalls().size(); i != e; ++i) { + const TreePredicateCall &Pred = N.getPredicateCalls()[i]; SmallVector Operands; if (Pred.Fn.usesOperands()) { TreePattern *TP = Pred.Fn.getOrigPatFragRecord(); @@ -541,7 +541,7 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N, } for (unsigned i = 0, e = ResultsToTypeCheck.size(); i != e; ++i) - AddMatcher(new CheckTypeMatcher(N->getSimpleType(ResultsToTypeCheck[i]), + AddMatcher(new CheckTypeMatcher(N.getSimpleType(ResultsToTypeCheck[i]), ResultsToTypeCheck[i])); } @@ -554,7 +554,7 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) { // Depending on which variant we're generating code for, emit the root opcode // check. if (const ComplexPattern *CP = - Pattern.getSrcPattern()->getComplexPatternInfo(CGP)) { + Pattern.getSrcPattern().getComplexPatternInfo(CGP)) { const std::vector &OpNodes = CP->getRootNodes(); assert(!OpNodes.empty() && "Complex Pattern must specify what it can match"); @@ -568,7 +568,7 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) { } // Emit the matcher for the pattern structure and types. - EmitMatchCode(Pattern.getSrcPattern(), PatWithNoTypes.get()); + EmitMatchCode(Pattern.getSrcPattern(), *PatWithNoTypes); // If the pattern has a predicate on it (e.g. only enabled when a subtarget // feature is around, do the check). 
@@ -581,28 +581,28 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) { // because they are generally more expensive to evaluate and more difficult to // factor. for (unsigned i = 0, e = MatchedComplexPatterns.size(); i != e; ++i) { - auto N = MatchedComplexPatterns[i].first; + auto &N = *MatchedComplexPatterns[i].first; // Remember where the results of this match get stuck. - if (N->isLeaf()) { - NamedComplexPatternOperands[N->getName()] = NextRecordedOperandNo + 1; + if (N.isLeaf()) { + NamedComplexPatternOperands[N.getName()] = NextRecordedOperandNo + 1; } else { unsigned CurOp = NextRecordedOperandNo; - for (unsigned i = 0; i < N->getNumChildren(); ++i) { - NamedComplexPatternOperands[N->getChild(i)->getName()] = CurOp + 1; - CurOp += N->getChild(i)->getNumMIResults(CGP); + for (unsigned i = 0; i < N.getNumChildren(); ++i) { + NamedComplexPatternOperands[N.getChild(i).getName()] = CurOp + 1; + CurOp += N.getChild(i).getNumMIResults(CGP); } } // Get the slot we recorded the value in from the name on the node. unsigned RecNodeEntry = MatchedComplexPatterns[i].second; - const ComplexPattern *CP = N->getComplexPatternInfo(CGP); + const ComplexPattern *CP = N.getComplexPatternInfo(CGP); assert(CP && "Not a valid ComplexPattern!"); // Emit a CheckComplexPat operation, which does the match (aborting if it // fails) and pushes the matched operands onto the recorded nodes list. - AddMatcher(new CheckComplexPatMatcher(*CP, RecNodeEntry, N->getName(), + AddMatcher(new CheckComplexPatMatcher(*CP, RecNodeEntry, N.getName(), NextRecordedOperandNo)); // Record the right number of operands. @@ -631,25 +631,25 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) { //===----------------------------------------------------------------------===// void MatcherGen::EmitResultOfNamedOperand( - const TreePatternNode *N, SmallVectorImpl &ResultOps) { - assert(!N->getName().empty() && "Operand not named!"); + const TreePatternNode &N, SmallVectorImpl &ResultOps) { + assert(!N.getName().empty() && "Operand not named!"); - if (unsigned SlotNo = NamedComplexPatternOperands[N->getName()]) { + if (unsigned SlotNo = NamedComplexPatternOperands[N.getName()]) { // Complex operands have already been completely selected, just find the // right slot ant add the arguments directly. - for (unsigned i = 0; i < N->getNumMIResults(CGP); ++i) + for (unsigned i = 0; i < N.getNumMIResults(CGP); ++i) ResultOps.push_back(SlotNo - 1 + i); return; } - unsigned SlotNo = getNamedArgumentSlot(N->getName()); + unsigned SlotNo = getNamedArgumentSlot(N.getName()); // If this is an 'imm' or 'fpimm' node, make sure to convert it to the target // version of the immediate so that it doesn't get selected due to some other // node use. 
- if (!N->isLeaf()) { - StringRef OperatorName = N->getOperator()->getName(); + if (!N.isLeaf()) { + StringRef OperatorName = N.getOperator()->getName(); if (OperatorName == "imm" || OperatorName == "fpimm") { AddMatcher(new EmitConvertToTargetMatcher(SlotNo)); ResultOps.push_back(NextRecordedOperandNo++); @@ -657,38 +657,38 @@ void MatcherGen::EmitResultOfNamedOperand( } } - for (unsigned i = 0; i < N->getNumMIResults(CGP); ++i) + for (unsigned i = 0; i < N.getNumMIResults(CGP); ++i) ResultOps.push_back(SlotNo + i); } -void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode *N, +void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode &N, SmallVectorImpl &ResultOps) { - assert(N->isLeaf() && "Must be a leaf"); + assert(N.isLeaf() && "Must be a leaf"); - if (IntInit *II = dyn_cast(N->getLeafValue())) { - AddMatcher(new EmitIntegerMatcher(II->getValue(), N->getSimpleType(0))); + if (IntInit *II = dyn_cast(N.getLeafValue())) { + AddMatcher(new EmitIntegerMatcher(II->getValue(), N.getSimpleType(0))); ResultOps.push_back(NextRecordedOperandNo++); return; } // If this is an explicit register reference, handle it. - if (DefInit *DI = dyn_cast(N->getLeafValue())) { + if (DefInit *DI = dyn_cast(N.getLeafValue())) { Record *Def = DI->getDef(); if (Def->isSubClassOf("Register")) { const CodeGenRegister *Reg = CGP.getTargetInfo().getRegBank().getReg(Def); - AddMatcher(new EmitRegisterMatcher(Reg, N->getSimpleType(0))); + AddMatcher(new EmitRegisterMatcher(Reg, N.getSimpleType(0))); ResultOps.push_back(NextRecordedOperandNo++); return; } if (Def->getName() == "zero_reg") { - AddMatcher(new EmitRegisterMatcher(nullptr, N->getSimpleType(0))); + AddMatcher(new EmitRegisterMatcher(nullptr, N.getSimpleType(0))); ResultOps.push_back(NextRecordedOperandNo++); return; } if (Def->getName() == "undef_tied_input") { - MVT::SimpleValueType ResultVT = N->getSimpleType(0); + MVT::SimpleValueType ResultVT = N.getSimpleType(0); auto IDOperandNo = NextRecordedOperandNo++; Record *ImpDef = Def->getRecords().getDef("IMPLICIT_DEF"); CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(ImpDef); @@ -741,23 +741,23 @@ void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode *N, } errs() << "unhandled leaf node:\n"; - N->dump(); + N.dump(); } -static bool mayInstNodeLoadOrStore(const TreePatternNode *N, +static bool mayInstNodeLoadOrStore(const TreePatternNode &N, const CodeGenDAGPatterns &CGP) { - Record *Op = N->getOperator(); + Record *Op = N.getOperator(); const CodeGenTarget &CGT = CGP.getTargetInfo(); CodeGenInstruction &II = CGT.getInstruction(Op); return II.mayLoad || II.mayStore; } -static unsigned numNodesThatMayLoadOrStore(const TreePatternNode *N, +static unsigned numNodesThatMayLoadOrStore(const TreePatternNode &N, const CodeGenDAGPatterns &CGP) { - if (N->isLeaf()) + if (N.isLeaf()) return 0; - Record *OpRec = N->getOperator(); + Record *OpRec = N.getOperator(); if (!OpRec->isSubClassOf("Instruction")) return 0; @@ -765,31 +765,31 @@ static unsigned numNodesThatMayLoadOrStore(const TreePatternNode *N, if (mayInstNodeLoadOrStore(N, CGP)) ++Count; - for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i) - Count += numNodesThatMayLoadOrStore(N->getChild(i), CGP); + for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i) + Count += numNodesThatMayLoadOrStore(N.getChild(i), CGP); return Count; } void MatcherGen::EmitResultInstructionAsOperand( - const TreePatternNode *N, SmallVectorImpl &OutputOps) { - Record *Op = N->getOperator(); + const TreePatternNode &N, SmallVectorImpl 
&OutputOps) { + Record *Op = N.getOperator(); const CodeGenTarget &CGT = CGP.getTargetInfo(); CodeGenInstruction &II = CGT.getInstruction(Op); const DAGInstruction &Inst = CGP.getInstruction(Op); - bool isRoot = N == Pattern.getDstPattern(); + bool isRoot = &N == &Pattern.getDstPattern(); // TreeHasOutGlue - True if this tree has glue. bool TreeHasInGlue = false, TreeHasOutGlue = false; if (isRoot) { - const TreePatternNode *SrcPat = Pattern.getSrcPattern(); - TreeHasInGlue = SrcPat->TreeHasProperty(SDNPOptInGlue, CGP) || - SrcPat->TreeHasProperty(SDNPInGlue, CGP); + const TreePatternNode &SrcPat = Pattern.getSrcPattern(); + TreeHasInGlue = SrcPat.TreeHasProperty(SDNPOptInGlue, CGP) || + SrcPat.TreeHasProperty(SDNPInGlue, CGP); // FIXME2: this is checking the entire pattern, not just the node in // question, doing this just for the root seems like a total hack. - TreeHasOutGlue = SrcPat->TreeHasProperty(SDNPOutGlue, CGP); + TreeHasOutGlue = SrcPat.TreeHasProperty(SDNPOutGlue, CGP); } // NumResults - This is the number of results produced by the instruction in @@ -826,13 +826,13 @@ void MatcherGen::EmitResultInstructionAsOperand( // Determine what to emit for this operand. Record *OperandNode = II.Operands[InstOpNo].Rec; if (CGP.operandHasDefault(OperandNode) && - (InstOpNo < NonOverridableOperands || ChildNo >= N->getNumChildren())) { + (InstOpNo < NonOverridableOperands || ChildNo >= N.getNumChildren())) { // This is a predicate or optional def operand which the pattern has not // overridden, or which we aren't letting it override; emit the 'default // ops' operands. const DAGDefaultOperand &DefaultOp = CGP.getDefaultOperand(OperandNode); for (unsigned i = 0, e = DefaultOp.DefaultOps.size(); i != e; ++i) - EmitResultOperand(DefaultOp.DefaultOps[i].get(), InstOps); + EmitResultOperand(*DefaultOp.DefaultOps[i], InstOps); continue; } @@ -851,14 +851,14 @@ void MatcherGen::EmitResultInstructionAsOperand( unsigned FinalNumOps = InstOps.size() + NumSubOps; while (InstOps.size() < FinalNumOps) { - const TreePatternNode *Child = N->getChild(ChildNo); + const TreePatternNode &Child = N.getChild(ChildNo); unsigned BeforeAddingNumOps = InstOps.size(); EmitResultOperand(Child, InstOps); assert(InstOps.size() > BeforeAddingNumOps && "Didn't add any operands"); // If the operand is an instruction and it produced multiple results, just // take the first one. - if (!Child->isLeaf() && Child->getOperator()->isSubClassOf("Instruction")) + if (!Child.isLeaf() && Child.getOperator()->isSubClassOf("Instruction")) InstOps.resize(BeforeAddingNumOps + 1); ++ChildNo; @@ -871,8 +871,8 @@ void MatcherGen::EmitResultInstructionAsOperand( // above. Emit the remaining instructions implicitly added by the use for // variable_ops. if (II.Operands.isVariadic) { - for (unsigned I = ChildNo, E = N->getNumChildren(); I < E; ++I) - EmitResultOperand(N->getChild(I), InstOps); + for (unsigned I = ChildNo, E = N.getNumChildren(); I < E; ++I) + EmitResultOperand(N.getChild(I), InstOps); } // If this node has input glue or explicitly specified input physregs, we @@ -896,8 +896,8 @@ void MatcherGen::EmitResultInstructionAsOperand( // Determine the result types. SmallVector ResultVTs; - for (unsigned i = 0, e = N->getNumTypes(); i != e; ++i) - ResultVTs.push_back(N->getSimpleType(i)); + for (unsigned i = 0, e = N.getNumTypes(); i != e; ++i) + ResultVTs.push_back(N.getSimpleType(i)); // If this is the root instruction of a pattern that has physical registers in // its result pattern, add output VTs for them. 
For example, X86 has: @@ -922,8 +922,8 @@ void MatcherGen::EmitResultInstructionAsOperand( // a node that is variadic, mark the generated node as variadic so that it // gets the excess operands from the input DAG. int NumFixedArityOperands = -1; - if (isRoot && Pattern.getSrcPattern()->NodeHasProperty(SDNPVariadic, CGP)) - NumFixedArityOperands = Pattern.getSrcPattern()->getNumChildren(); + if (isRoot && Pattern.getSrcPattern().NodeHasProperty(SDNPVariadic, CGP)) + NumFixedArityOperands = Pattern.getSrcPattern().getNumChildren(); // If this is the root node and multiple matched nodes in the input pattern // have MemRefs in them, have the interpreter collect them and plop them onto @@ -933,7 +933,7 @@ void MatcherGen::EmitResultInstructionAsOperand( // FIXME3: This is actively incorrect for result patterns with multiple // memory-referencing instructions. bool PatternHasMemOperands = - Pattern.getSrcPattern()->TreeHasProperty(SDNPMemOperand, CGP); + Pattern.getSrcPattern().TreeHasProperty(SDNPMemOperand, CGP); bool NodeHasMemRefs = false; if (PatternHasMemOperands) { @@ -948,7 +948,7 @@ void MatcherGen::EmitResultInstructionAsOperand( // Determine whether we need to attach a chain to this node. bool NodeHasChain = false; - if (Pattern.getSrcPattern()->TreeHasProperty(SDNPHasChain, CGP)) { + if (Pattern.getSrcPattern().TreeHasProperty(SDNPHasChain, CGP)) { // For some instructions, we were able to infer from the pattern whether // they should have a chain. Otherwise, attach the chain to the root. // @@ -982,8 +982,8 @@ void MatcherGen::EmitResultInstructionAsOperand( } void MatcherGen::EmitResultSDNodeXFormAsOperand( - const TreePatternNode *N, SmallVectorImpl &ResultOps) { - assert(N->getOperator()->isSubClassOf("SDNodeXForm") && "Not SDNodeXForm?"); + const TreePatternNode &N, SmallVectorImpl &ResultOps) { + assert(N.getOperator()->isSubClassOf("SDNodeXForm") && "Not SDNodeXForm?"); // Emit the operand. SmallVector InputOps; @@ -991,31 +991,31 @@ void MatcherGen::EmitResultSDNodeXFormAsOperand( // FIXME2: Could easily generalize this to support multiple inputs and outputs // to the SDNodeXForm. For now we just support one input and one output like // the old instruction selector. - assert(N->getNumChildren() == 1); - EmitResultOperand(N->getChild(0), InputOps); + assert(N.getNumChildren() == 1); + EmitResultOperand(N.getChild(0), InputOps); // The input currently must have produced exactly one result. assert(InputOps.size() == 1 && "Unexpected input to SDNodeXForm"); - AddMatcher(new EmitNodeXFormMatcher(InputOps[0], N->getOperator())); + AddMatcher(new EmitNodeXFormMatcher(InputOps[0], N.getOperator())); ResultOps.push_back(NextRecordedOperandNo++); } -void MatcherGen::EmitResultOperand(const TreePatternNode *N, +void MatcherGen::EmitResultOperand(const TreePatternNode &N, SmallVectorImpl &ResultOps) { // This is something selected from the pattern we matched. 
- if (!N->getName().empty()) + if (!N.getName().empty()) return EmitResultOfNamedOperand(N, ResultOps); - if (N->isLeaf()) + if (N.isLeaf()) return EmitResultLeafAsOperand(N, ResultOps); - Record *OpRec = N->getOperator(); + Record *OpRec = N.getOperator(); if (OpRec->isSubClassOf("Instruction")) return EmitResultInstructionAsOperand(N, ResultOps); if (OpRec->isSubClassOf("SDNodeXForm")) return EmitResultSDNodeXFormAsOperand(N, ResultOps); - errs() << "Unknown result node to emit code for: " << *N << '\n'; + errs() << "Unknown result node to emit code for: " << N << '\n'; PrintFatalError("Unknown node in result pattern!"); } @@ -1036,18 +1036,17 @@ void MatcherGen::EmitResultCode() { // just lop them off. This doesn't need to worry about glue or chains, just // explicit results. // - unsigned NumSrcResults = Pattern.getSrcPattern()->getNumTypes(); + unsigned NumSrcResults = Pattern.getSrcPattern().getNumTypes(); // If the pattern also has (implicit) results, count them as well. if (!Pattern.getDstRegs().empty()) { // If the root came from an implicit def in the instruction handling stuff, // don't re-add it. Record *HandledReg = nullptr; - const TreePatternNode *DstPat = Pattern.getDstPattern(); - if (!DstPat->isLeaf() && - DstPat->getOperator()->isSubClassOf("Instruction")) { + const TreePatternNode &DstPat = Pattern.getDstPattern(); + if (!DstPat.isLeaf() && DstPat.getOperator()->isSubClassOf("Instruction")) { const CodeGenTarget &CGT = CGP.getTargetInfo(); - CodeGenInstruction &II = CGT.getInstruction(DstPat->getOperator()); + CodeGenInstruction &II = CGT.getInstruction(DstPat.getOperator()); if (II.HasOneImplicitDefWithKnownVT(CGT) != MVT::Other) HandledReg = II.ImplicitDefs[0]; @@ -1063,9 +1062,9 @@ void MatcherGen::EmitResultCode() { SmallVector Results(Ops); // Apply result permutation. - for (unsigned ResNo = 0; ResNo < Pattern.getDstPattern()->getNumResults(); + for (unsigned ResNo = 0; ResNo < Pattern.getDstPattern().getNumResults(); ++ResNo) { - Results[ResNo] = Ops[Pattern.getDstPattern()->getResultIndex(ResNo)]; + Results[ResNo] = Ops[Pattern.getDstPattern().getResultIndex(ResNo)]; } Results.resize(NumSrcResults); diff --git a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp index b137492..f786d41 100644 --- a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp @@ -99,7 +99,7 @@ static void ContractNodes(std::unique_ptr &MatcherPtr, const PatternToMatch &Pattern = CM->getPattern(); if (!EN->hasChain() && - Pattern.getSrcPattern()->NodeHasProperty(SDNPHasChain, CGP)) + Pattern.getSrcPattern().NodeHasProperty(SDNPHasChain, CGP)) ResultsMatch = false; // If the matched node has glue and the output root doesn't, we can't @@ -109,7 +109,7 @@ static void ContractNodes(std::unique_ptr &MatcherPtr, // because the code in the pattern generator doesn't handle it right. We // do it anyway for thoroughness. if (!EN->hasOutGlue() && - Pattern.getSrcPattern()->NodeHasProperty(SDNPOutGlue, CGP)) + Pattern.getSrcPattern().NodeHasProperty(SDNPOutGlue, CGP)) ResultsMatch = false; #if 0 diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp index dff6503..00a1650 100644 --- a/llvm/utils/TableGen/FastISelEmitter.cpp +++ b/llvm/utils/TableGen/FastISelEmitter.cpp @@ -200,36 +200,36 @@ struct OperandsSignature { /// of the Operands array accordingly. Return true if all the operands /// are supported, false otherwise. 
/// - bool initialize(TreePatternNode *InstPatNode, const CodeGenTarget &Target, + bool initialize(TreePatternNode &InstPatNode, const CodeGenTarget &Target, MVT::SimpleValueType VT, ImmPredicateSet &ImmediatePredicates, const CodeGenRegisterClass *OrigDstRC) { - if (InstPatNode->isLeaf()) + if (InstPatNode.isLeaf()) return false; - if (InstPatNode->getOperator()->getName() == "imm") { + if (InstPatNode.getOperator()->getName() == "imm") { Operands.push_back(OpKind::getImm(0)); return true; } - if (InstPatNode->getOperator()->getName() == "fpimm") { + if (InstPatNode.getOperator()->getName() == "fpimm") { Operands.push_back(OpKind::getFP()); return true; } const CodeGenRegisterClass *DstRC = nullptr; - for (unsigned i = 0, e = InstPatNode->getNumChildren(); i != e; ++i) { - TreePatternNode *Op = InstPatNode->getChild(i); + for (unsigned i = 0, e = InstPatNode.getNumChildren(); i != e; ++i) { + TreePatternNode &Op = InstPatNode.getChild(i); // Handle imm operands specially. - if (!Op->isLeaf() && Op->getOperator()->getName() == "imm") { + if (!Op.isLeaf() && Op.getOperator()->getName() == "imm") { unsigned PredNo = 0; - if (!Op->getPredicateCalls().empty()) { - TreePredicateFn PredFn = Op->getPredicateCalls()[0].Fn; + if (!Op.getPredicateCalls().empty()) { + TreePredicateFn PredFn = Op.getPredicateCalls()[0].Fn; // If there is more than one predicate weighing in on this operand // then we don't handle it. This doesn't typically happen for // immediates anyway. - if (Op->getPredicateCalls().size() > 1 || + if (Op.getPredicateCalls().size() > 1 || !PredFn.isImmediatePattern() || PredFn.usesOperands()) return false; // Ignore any instruction with 'FastIselShouldIgnore', these are @@ -249,11 +249,11 @@ struct OperandsSignature { // For now, filter out any operand with a predicate. // For now, filter out any operand with multiple values. - if (!Op->getPredicateCalls().empty() || Op->getNumTypes() != 1) + if (!Op.getPredicateCalls().empty() || Op.getNumTypes() != 1) return false; - if (!Op->isLeaf()) { - if (Op->getOperator()->getName() == "fpimm") { + if (!Op.isLeaf()) { + if (Op.getOperator()->getName() == "fpimm") { Operands.push_back(OpKind::getFP()); continue; } @@ -261,15 +261,15 @@ struct OperandsSignature { return false; } - assert(Op->hasConcreteType(0) && "Type infererence not done?"); + assert(Op.hasConcreteType(0) && "Type infererence not done?"); // For now, all the operands must have the same type (if they aren't // immediates). Note that this causes us to reject variable sized shifts // on X86. - if (Op->getSimpleType(0) != VT) + if (Op.getSimpleType(0) != VT) return false; - DefInit *OpDI = dyn_cast(Op->getLeafValue()); + DefInit *OpDI = dyn_cast(Op.getLeafValue()); if (!OpDI) return false; Record *OpLeafRec = OpDI->getDef(); @@ -430,14 +430,14 @@ static std::string getLegalCName(std::string OpName) { FastISelMap::FastISelMap(StringRef instns) : InstNS(instns) {} -static std::string PhyRegForNode(TreePatternNode *Op, +static std::string PhyRegForNode(TreePatternNode &Op, const CodeGenTarget &Target) { std::string PhysReg; - if (!Op->isLeaf()) + if (!Op.isLeaf()) return PhysReg; - Record *OpLeafRec = cast(Op->getLeafValue())->getDef(); + Record *OpLeafRec = cast(Op.getLeafValue())->getDef(); if (!OpLeafRec->isSubClassOf("Register")) return PhysReg; @@ -458,10 +458,10 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { // For now, just look at Instructions, so that we don't have to worry // about emitting multiple instructions for a pattern. 
- TreePatternNode *Dst = Pattern.getDstPattern(); - if (Dst->isLeaf()) + TreePatternNode &Dst = Pattern.getDstPattern(); + if (Dst.isLeaf()) continue; - Record *Op = Dst->getOperator(); + Record *Op = Dst.getOperator(); if (!Op->isSubClassOf("Instruction")) continue; CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(Op); @@ -477,11 +477,11 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { // For now, ignore multi-instruction patterns. bool MultiInsts = false; - for (unsigned i = 0, e = Dst->getNumChildren(); i != e; ++i) { - TreePatternNode *ChildOp = Dst->getChild(i); - if (ChildOp->isLeaf()) + for (unsigned i = 0, e = Dst.getNumChildren(); i != e; ++i) { + TreePatternNode &ChildOp = Dst.getChild(i); + if (ChildOp.isLeaf()) continue; - if (ChildOp->getOperator()->isSubClassOf("Instruction")) { + if (ChildOp.getOperator()->isSubClassOf("Instruction")) { MultiInsts = true; break; } @@ -505,40 +505,38 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { } else { // If this isn't a leaf, then continue since the register classes are // a bit too complicated for now. - if (!Dst->getChild(1)->isLeaf()) + if (!Dst.getChild(1).isLeaf()) continue; - DefInit *SR = dyn_cast(Dst->getChild(1)->getLeafValue()); + DefInit *SR = dyn_cast(Dst.getChild(1).getLeafValue()); if (SR) SubRegNo = getQualifiedName(SR->getDef()); else - SubRegNo = Dst->getChild(1)->getLeafValue()->getAsString(); + SubRegNo = Dst.getChild(1).getLeafValue()->getAsString(); } // Inspect the pattern. - TreePatternNode *InstPatNode = Pattern.getSrcPattern(); - if (!InstPatNode) - continue; - if (InstPatNode->isLeaf()) + TreePatternNode &InstPatNode = Pattern.getSrcPattern(); + if (InstPatNode.isLeaf()) continue; // Ignore multiple result nodes for now. - if (InstPatNode->getNumTypes() > 1) + if (InstPatNode.getNumTypes() > 1) continue; - Record *InstPatOp = InstPatNode->getOperator(); + Record *InstPatOp = InstPatNode.getOperator(); std::string OpcodeName = getOpcodeName(InstPatOp, CGP); MVT::SimpleValueType RetVT = MVT::isVoid; - if (InstPatNode->getNumTypes()) - RetVT = InstPatNode->getSimpleType(0); + if (InstPatNode.getNumTypes()) + RetVT = InstPatNode.getSimpleType(0); MVT::SimpleValueType VT = RetVT; - if (InstPatNode->getNumChildren()) { - assert(InstPatNode->getChild(0)->getNumTypes() == 1); - VT = InstPatNode->getChild(0)->getSimpleType(0); + if (InstPatNode.getNumChildren()) { + assert(InstPatNode.getChild(0).getNumTypes() == 1); + VT = InstPatNode.getChild(0).getSimpleType(0); } // For now, filter out any instructions with predicates. - if (!InstPatNode->getPredicateCalls().empty()) + if (!InstPatNode.getPredicateCalls().empty()) continue; // Check all the operands. @@ -548,20 +546,20 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { continue; std::vector PhysRegInputs; - if (InstPatNode->getOperator()->getName() == "imm" || - InstPatNode->getOperator()->getName() == "fpimm") + if (InstPatNode.getOperator()->getName() == "imm" || + InstPatNode.getOperator()->getName() == "fpimm") PhysRegInputs.push_back(""); else { // Compute the PhysRegs used by the given pattern, and check that // the mapping from the src to dst patterns is simple. 
bool FoundNonSimplePattern = false; unsigned DstIndex = 0; - for (unsigned i = 0, e = InstPatNode->getNumChildren(); i != e; ++i) { - std::string PhysReg = PhyRegForNode(InstPatNode->getChild(i), Target); + for (unsigned i = 0, e = InstPatNode.getNumChildren(); i != e; ++i) { + std::string PhysReg = PhyRegForNode(InstPatNode.getChild(i), Target); if (PhysReg.empty()) { - if (DstIndex >= Dst->getNumChildren() || - Dst->getChild(DstIndex)->getName() != - InstPatNode->getChild(i)->getName()) { + if (DstIndex >= Dst.getNumChildren() || + Dst.getChild(DstIndex).getName() != + InstPatNode.getChild(i).getName()) { FoundNonSimplePattern = true; break; } @@ -571,7 +569,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { PhysRegInputs.push_back(PhysReg); } - if (Op->getName() != "EXTRACT_SUBREG" && DstIndex < Dst->getNumChildren()) + if (Op->getName() != "EXTRACT_SUBREG" && DstIndex < Dst.getNumChildren()) FoundNonSimplePattern = true; if (FoundNonSimplePattern) @@ -591,7 +589,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { std::string PredicateCheck = Pattern.getPredicateCheck(); // Ok, we found a pattern that we can handle. Remember it. - InstructionMemo Memo(Pattern.getDstPattern()->getOperator()->getName(), + InstructionMemo Memo(Pattern.getDstPattern().getOperator()->getName(), DstRC, SubRegNo, PhysRegInputs, PredicateCheck); int complexity = Pattern.getPatternComplexity(CGP); diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 22e7785..13f2384 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -90,10 +90,10 @@ static cl::opt OptimizeMatchTable( namespace { -static std::string explainPredicates(const TreePatternNode *N) { +static std::string explainPredicates(const TreePatternNode &N) { std::string Explanation; StringRef Separator = ""; - for (const TreePredicateCall &Call : N->getPredicateCalls()) { + for (const TreePredicateCall &Call : N.getPredicateCalls()) { const TreePredicateFn &P = Call.Fn; Explanation += (Separator + P.getOrigPatFragRecord()->getRecord()->getName()).str(); @@ -194,12 +194,12 @@ static Error failedImport(const Twine &Reason) { return make_error(Reason, inconvertibleErrorCode()); } -static Error isTrivialOperatorNode(const TreePatternNode *N) { +static Error isTrivialOperatorNode(const TreePatternNode &N) { std::string Explanation; std::string Separator; bool HasUnsupportedPredicate = false; - for (const TreePredicateCall &Call : N->getPredicateCalls()) { + for (const TreePredicateCall &Call : N.getPredicateCalls()) { const TreePredicateFn &Predicate = Call.Fn; if (Predicate.isAlwaysTrue()) @@ -288,8 +288,8 @@ static std::string getMangledRootDefName(StringRef DefOperandName) { //===- GlobalISelEmitter class --------------------------------------------===// -static Expected getInstResultType(const TreePatternNode *Dst) { - ArrayRef ChildTypes = Dst->getExtTypes(); +static Expected getInstResultType(const TreePatternNode &Dst) { + ArrayRef ChildTypes = Dst.getExtTypes(); if (ChildTypes.size() != 1) return failedImport("Dst pattern child has multiple results"); @@ -372,40 +372,40 @@ private: Record *findNodeEquiv(Record *N) const; const CodeGenInstruction *getEquivNode(Record &Equiv, - const TreePatternNode *N) const; + const TreePatternNode &N) const; Error importRulePredicates(RuleMatcher &M, ArrayRef Predicates); Expected createAndImportSelDAGMatcher(RuleMatcher &Rule, InstructionMatcher &InsnMatcher, - const TreePatternNode *Src, 
unsigned &TempOpIdx); + const TreePatternNode &Src, unsigned &TempOpIdx); Error importComplexPatternOperandMatcher(OperandMatcher &OM, Record *R, unsigned &TempOpIdx) const; Error importChildMatcher(RuleMatcher &Rule, InstructionMatcher &InsnMatcher, - const TreePatternNode *SrcChild, + const TreePatternNode &SrcChild, bool OperandIsAPointer, bool OperandIsImmArg, unsigned OpIdx, unsigned &TempOpIdx); Expected createAndImportInstructionRenderer( RuleMatcher &M, InstructionMatcher &InsnMatcher, - const TreePatternNode *Src, const TreePatternNode *Dst); + const TreePatternNode &Src, const TreePatternNode &Dst); Expected createAndImportSubInstructionRenderer( - action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst, - const TreePatternNode *Src, unsigned TempReg); + action_iterator InsertPt, RuleMatcher &M, const TreePatternNode &Dst, + const TreePatternNode &Src, unsigned TempReg); Expected createInstructionRenderer(action_iterator InsertPt, RuleMatcher &M, - const TreePatternNode *Dst); + const TreePatternNode &Dst); Expected importExplicitDefRenderers( action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, - const TreePatternNode *Src, const TreePatternNode *Dst); + const TreePatternNode &Src, const TreePatternNode &Dst); Expected importExplicitUseRenderers( action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, - const llvm::TreePatternNode *Dst, const TreePatternNode *Src); + const llvm::TreePatternNode &Dst, const TreePatternNode &Src); Expected importExplicitUseRenderer( action_iterator InsertPt, RuleMatcher &Rule, BuildMIAction &DstMIBuilder, - const TreePatternNode *DstChild, const TreePatternNode *Src); + const TreePatternNode &DstChild, const TreePatternNode &Src); Error importDefaultOperandRenderers(action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, const DAGDefaultOperand &DefaultOp) const; @@ -430,25 +430,25 @@ private: /// If no register class is found, return std::nullopt. std::optional inferSuperRegisterClassForNode(const TypeSetByHwMode &Ty, - const TreePatternNode *SuperRegNode, - const TreePatternNode *SubRegIdxNode); + const TreePatternNode &SuperRegNode, + const TreePatternNode &SubRegIdxNode); std::optional - inferSubRegIndexForNode(const TreePatternNode *SubRegIdxNode); + inferSubRegIndexForNode(const TreePatternNode &SubRegIdxNode); /// Infer a CodeGenRegisterClass which suppoorts \p Ty and \p SubRegIdxNode. /// Return std::nullopt if no such class exists. std::optional inferSuperRegisterClass(const TypeSetByHwMode &Ty, - const TreePatternNode *SubRegIdxNode); + const TreePatternNode &SubRegIdxNode); /// Return the CodeGenRegisterClass associated with \p Leaf if it has one. std::optional - getRegClassFromLeaf(const TreePatternNode *Leaf); + getRegClassFromLeaf(const TreePatternNode &Leaf); /// Return a CodeGenRegisterClass for \p N if one can be found. Return /// std::nullopt otherwise. std::optional - inferRegClassFromPattern(const TreePatternNode *N); + inferRegClassFromPattern(const TreePatternNode &N); /// Return the size of the MemoryVT in this predicate, if possible. std::optional @@ -498,19 +498,19 @@ Record *GlobalISelEmitter::findNodeEquiv(Record *N) const { } const CodeGenInstruction * -GlobalISelEmitter::getEquivNode(Record &Equiv, const TreePatternNode *N) const { - if (N->getNumChildren() >= 1) { +GlobalISelEmitter::getEquivNode(Record &Equiv, const TreePatternNode &N) const { + if (N.getNumChildren() >= 1) { // setcc operation maps to two different G_* instructions based on the type. 
if (!Equiv.isValueUnset("IfFloatingPoint") && - MVT(N->getChild(0)->getSimpleType(0)).isFloatingPoint()) + MVT(N.getChild(0).getSimpleType(0)).isFloatingPoint()) return &Target.getInstruction(Equiv.getValueAsDef("IfFloatingPoint")); } if (!Equiv.isValueUnset("IfConvergent") && - N->getIntrinsicInfo(CGP)->isConvergent) + N.getIntrinsicInfo(CGP)->isConvergent) return &Target.getInstruction(Equiv.getValueAsDef("IfConvergent")); - for (const TreePredicateCall &Call : N->getPredicateCalls()) { + for (const TreePredicateCall &Call : N.getPredicateCalls()) { const TreePredicateFn &Predicate = Call.Fn; if (!Equiv.isValueUnset("IfSignExtend") && (Predicate.isLoad() || Predicate.isAtomic()) && @@ -707,15 +707,15 @@ Expected GlobalISelEmitter::addBuiltinPredicates( Expected GlobalISelEmitter::createAndImportSelDAGMatcher( RuleMatcher &Rule, InstructionMatcher &InsnMatcher, - const TreePatternNode *Src, unsigned &TempOpIdx) { - const auto SavedFlags = Rule.setGISelFlags(Src->getGISelFlagsRecord()); + const TreePatternNode &Src, unsigned &TempOpIdx) { + const auto SavedFlags = Rule.setGISelFlags(Src.getGISelFlagsRecord()); Record *SrcGIEquivOrNull = nullptr; const CodeGenInstruction *SrcGIOrNull = nullptr; // Start with the defined operands (i.e., the results of the root operator). - if (Src->isLeaf()) { - Init *SrcInit = Src->getLeafValue(); + if (Src.isLeaf()) { + Init *SrcInit = Src.getLeafValue(); if (isa(SrcInit)) { InsnMatcher.addPredicate( &Target.getInstruction(RK.getDef("G_CONSTANT"))); @@ -723,10 +723,10 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( return failedImport( "Unable to deduce gMIR opcode to handle Src (which is a leaf)"); } else { - SrcGIEquivOrNull = findNodeEquiv(Src->getOperator()); + SrcGIEquivOrNull = findNodeEquiv(Src.getOperator()); if (!SrcGIEquivOrNull) return failedImport("Pattern operator lacks an equivalent Instruction" + - explainOperator(Src->getOperator())); + explainOperator(Src.getOperator())); SrcGIOrNull = getEquivNode(*SrcGIEquivOrNull, Src); // The operators look good: match the opcode @@ -734,7 +734,7 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( } unsigned OpIdx = 0; - for (const TypeSetByHwMode &VTy : Src->getExtTypes()) { + for (const TypeSetByHwMode &VTy : Src.getExtTypes()) { // Results don't have a name unless they are the root node. The caller will // set the name if appropriate. const bool OperandIsAPointer = @@ -745,7 +745,7 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( " for result of Src pattern operator"); } - for (const TreePredicateCall &Call : Src->getPredicateCalls()) { + for (const TreePredicateCall &Call : Src.getPredicateCalls()) { const TreePredicateFn &Predicate = Call.Fn; bool HasAddedBuiltinMatcher = true; if (Predicate.isAlwaysTrue()) @@ -800,11 +800,11 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( "Unordered", AtomicOrderingMMOPredicateMatcher::AO_OrStronger); } - if (Src->isLeaf()) { - Init *SrcInit = Src->getLeafValue(); + if (Src.isLeaf()) { + Init *SrcInit = Src.getLeafValue(); if (IntInit *SrcIntInit = dyn_cast(SrcInit)) { OperandMatcher &OM = - InsnMatcher.addOperand(OpIdx++, Src->getName(), TempOpIdx); + InsnMatcher.addOperand(OpIdx++, Src.getName(), TempOpIdx); OM.addPredicate(SrcIntInit->getValue()); } else return failedImport( @@ -825,19 +825,19 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( // predicate operand needs to be swapped from the last operand to the first // source. 
- unsigned NumChildren = Src->getNumChildren(); + unsigned NumChildren = Src.getNumChildren(); bool IsFCmp = SrcGIOrNull->TheDef->getName() == "G_FCMP"; if (IsFCmp || SrcGIOrNull->TheDef->getName() == "G_ICMP") { - const TreePatternNode *SrcChild = Src->getChild(NumChildren - 1); - if (SrcChild->isLeaf()) { - DefInit *DI = dyn_cast(SrcChild->getLeafValue()); + const TreePatternNode &SrcChild = Src.getChild(NumChildren - 1); + if (SrcChild.isLeaf()) { + DefInit *DI = dyn_cast(SrcChild.getLeafValue()); Record *CCDef = DI ? DI->getDef() : nullptr; if (!CCDef || !CCDef->isSubClassOf("CondCode")) return failedImport("Unable to handle CondCode"); OperandMatcher &OM = - InsnMatcher.addOperand(OpIdx++, SrcChild->getName(), TempOpIdx); + InsnMatcher.addOperand(OpIdx++, SrcChild.getName(), TempOpIdx); StringRef PredType = IsFCmp ? CCDef->getValueAsString("FCmpPredicate") : CCDef->getValueAsString("ICmpPredicate"); @@ -856,12 +856,12 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( SrcGIOrNull->TheDef->getName() == "G_INTRINSIC_CONVERGENT" || SrcGIOrNull->TheDef->getName() == "G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS"; - const CodeGenIntrinsic *II = Src->getIntrinsicInfo(CGP); + const CodeGenIntrinsic *II = Src.getIntrinsicInfo(CGP); if (IsIntrinsic && !II) return failedImport("Expected IntInit containing intrinsic ID)"); for (unsigned i = 0; i != NumChildren; ++i) { - const TreePatternNode *SrcChild = Src->getChild(i); + const TreePatternNode &SrcChild = Src.getChild(i); // We need to determine the meaning of a literal integer based on the // context. If this is a field required to be an immediate (such as an @@ -884,7 +884,7 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( // following the defs is an intrinsic ID. if (i == 0) { OperandMatcher &OM = - InsnMatcher.addOperand(OpIdx++, SrcChild->getName(), TempOpIdx); + InsnMatcher.addOperand(OpIdx++, SrcChild.getName(), TempOpIdx); OM.addPredicate(II); continue; } @@ -921,11 +921,11 @@ Error GlobalISelEmitter::importComplexPatternOperandMatcher( // Get the name to use for a pattern operand. For an anonymous physical register // input, this should use the register name. -static StringRef getSrcChildName(const TreePatternNode *SrcChild, +static StringRef getSrcChildName(const TreePatternNode &SrcChild, Record *&PhysReg) { - StringRef SrcChildName = SrcChild->getName(); - if (SrcChildName.empty() && SrcChild->isLeaf()) { - if (auto *ChildDefInit = dyn_cast(SrcChild->getLeafValue())) { + StringRef SrcChildName = SrcChild.getName(); + if (SrcChildName.empty() && SrcChild.isLeaf()) { + if (auto *ChildDefInit = dyn_cast(SrcChild.getLeafValue())) { auto *ChildRec = ChildDefInit->getDef(); if (ChildRec->isSubClassOf("Register")) { SrcChildName = ChildRec->getName(); @@ -939,19 +939,19 @@ static StringRef getSrcChildName(const TreePatternNode *SrcChild, Error GlobalISelEmitter::importChildMatcher( RuleMatcher &Rule, InstructionMatcher &InsnMatcher, - const TreePatternNode *SrcChild, bool OperandIsAPointer, + const TreePatternNode &SrcChild, bool OperandIsAPointer, bool OperandIsImmArg, unsigned OpIdx, unsigned &TempOpIdx) { Record *PhysReg = nullptr; std::string SrcChildName = std::string(getSrcChildName(SrcChild, PhysReg)); - if (!SrcChild->isLeaf() && - SrcChild->getOperator()->isSubClassOf("ComplexPattern")) { + if (!SrcChild.isLeaf() && + SrcChild.getOperator()->isSubClassOf("ComplexPattern")) { // The "name" of a non-leaf complex pattern (MY_PAT $op1, $op2) is // "MY_PAT:op1:op2" and the ones with same "name" represent same operand. 
- std::string PatternName = std::string(SrcChild->getOperator()->getName()); - for (unsigned i = 0; i < SrcChild->getNumChildren(); ++i) { + std::string PatternName = std::string(SrcChild.getOperator()->getName()); + for (unsigned i = 0; i < SrcChild.getNumChildren(); ++i) { PatternName += ":"; - PatternName += SrcChild->getChild(i)->getName(); + PatternName += SrcChild.getChild(i).getName(); } SrcChildName = PatternName; } @@ -962,23 +962,23 @@ Error GlobalISelEmitter::importChildMatcher( if (OM.isSameAsAnotherOperand()) return Error::success(); - ArrayRef ChildTypes = SrcChild->getExtTypes(); + ArrayRef ChildTypes = SrcChild.getExtTypes(); if (ChildTypes.size() != 1) return failedImport("Src pattern child has multiple results"); // Check MBB's before the type check since they are not a known type. - if (!SrcChild->isLeaf()) { - if (SrcChild->getOperator()->isSubClassOf("SDNode")) { - auto &ChildSDNI = CGP.getSDNodeInfo(SrcChild->getOperator()); + if (!SrcChild.isLeaf()) { + if (SrcChild.getOperator()->isSubClassOf("SDNode")) { + auto &ChildSDNI = CGP.getSDNodeInfo(SrcChild.getOperator()); if (ChildSDNI.getSDClassName() == "BasicBlockSDNode") { OM.addPredicate(); return Error::success(); } - if (SrcChild->getOperator()->getName() == "timm") { + if (SrcChild.getOperator()->getName() == "timm") { OM.addPredicate(); // Add predicates, if any - for (const TreePredicateCall &Call : SrcChild->getPredicateCalls()) { + for (const TreePredicateCall &Call : SrcChild.getPredicateCalls()) { const TreePredicateFn &Predicate = Call.Fn; // Only handle immediate patterns for now @@ -998,12 +998,12 @@ Error GlobalISelEmitter::importChildMatcher( if (auto Error = OM.addTypeCheckPredicate(ChildTypes.front(), OperandIsAPointer)) return failedImport(toString(std::move(Error)) + " for Src operand (" + - to_string(*SrcChild) + ")"); + to_string(SrcChild) + ")"); } // Try look up SrcChild for a (named) predicate operand if there is any. if (WaitingForNamedOperands) { - auto &ScopedNames = SrcChild->getNamesAsPredicateArg(); + auto &ScopedNames = SrcChild.getNamesAsPredicateArg(); if (!ScopedNames.empty()) { auto PA = ScopedNames.begin(); std::string Name = getScopedName(PA->getScope(), PA->getIdentifier()); @@ -1013,22 +1013,22 @@ Error GlobalISelEmitter::importChildMatcher( } // Check for nested instructions. - if (!SrcChild->isLeaf()) { - if (SrcChild->getOperator()->isSubClassOf("ComplexPattern")) { + if (!SrcChild.isLeaf()) { + if (SrcChild.getOperator()->isSubClassOf("ComplexPattern")) { // When a ComplexPattern is used as an operator, it should do the same // thing as when used as a leaf. However, the children of the operator // name the sub-operands that make up the complex operand and we must // prepare to reference them in the renderer too. 
unsigned RendererID = TempOpIdx; if (auto Error = importComplexPatternOperandMatcher( - OM, SrcChild->getOperator(), TempOpIdx)) + OM, SrcChild.getOperator(), TempOpIdx)) return Error; - for (unsigned i = 0, e = SrcChild->getNumChildren(); i != e; ++i) { - auto *SubOperand = SrcChild->getChild(i); - if (!SubOperand->getName().empty()) { + for (unsigned i = 0, e = SrcChild.getNumChildren(); i != e; ++i) { + auto &SubOperand = SrcChild.getChild(i); + if (!SubOperand.getName().empty()) { if (auto Error = Rule.defineComplexSubOperand( - SubOperand->getName(), SrcChild->getOperator(), RendererID, i, + SubOperand.getName(), SrcChild.getOperator(), RendererID, i, SrcChildName)) return Error; } @@ -1038,7 +1038,7 @@ Error GlobalISelEmitter::importChildMatcher( } auto MaybeInsnOperand = OM.addPredicate( - InsnMatcher.getRuleMatcher(), SrcChild->getName()); + InsnMatcher.getRuleMatcher(), SrcChild.getName()); if (!MaybeInsnOperand) { // This isn't strictly true. If the user were to provide exactly the same // matchers as the original operand then we could allow it. However, it's @@ -1057,11 +1057,11 @@ Error GlobalISelEmitter::importChildMatcher( return Error::success(); } - if (SrcChild->hasAnyPredicate()) + if (SrcChild.hasAnyPredicate()) return failedImport("Src pattern child has unsupported predicate"); // Check for constant immediates. - if (auto *ChildInt = dyn_cast(SrcChild->getLeafValue())) { + if (auto *ChildInt = dyn_cast(SrcChild.getLeafValue())) { if (OperandIsImmArg) { // Checks for argument directly in operand list OM.addPredicate(ChildInt->getValue()); @@ -1073,7 +1073,7 @@ Error GlobalISelEmitter::importChildMatcher( } // Check for def's like register classes or ComplexPattern's. - if (auto *ChildDefInit = dyn_cast(SrcChild->getLeafValue())) { + if (auto *ChildDefInit = dyn_cast(SrcChild.getLeafValue())) { auto *ChildRec = ChildDefInit->getDef(); // Check for register classes. 
@@ -1121,7 +1121,7 @@ Error GlobalISelEmitter::importChildMatcher( const bool ImmAllOnesV = ChildRec->getName() == "immAllOnesV"; if (ImmAllOnesV || ChildRec->getName() == "immAllZerosV") { auto MaybeInsnOperand = OM.addPredicate( - InsnMatcher.getRuleMatcher(), SrcChild->getName(), false); + InsnMatcher.getRuleMatcher(), SrcChild.getName(), false); InstructionOperandMatcher &InsnOperand = **MaybeInsnOperand; ValueTypeByHwMode VTy = ChildTypes.front().getValueTypeByHwMode(); @@ -1161,45 +1161,44 @@ Error GlobalISelEmitter::importChildMatcher( Expected GlobalISelEmitter::importExplicitUseRenderer( action_iterator InsertPt, RuleMatcher &Rule, BuildMIAction &DstMIBuilder, - const TreePatternNode *DstChild, const TreePatternNode *Src) { + const TreePatternNode &DstChild, const TreePatternNode &Src) { - const auto &SubOperand = Rule.getComplexSubOperand(DstChild->getName()); + const auto &SubOperand = Rule.getComplexSubOperand(DstChild.getName()); if (SubOperand) { DstMIBuilder.addRenderer( - *std::get<0>(*SubOperand), DstChild->getName(), - std::get<1>(*SubOperand), std::get<2>(*SubOperand)); + *std::get<0>(*SubOperand), DstChild.getName(), std::get<1>(*SubOperand), + std::get<2>(*SubOperand)); return InsertPt; } - if (!DstChild->isLeaf()) { - if (DstChild->getOperator()->isSubClassOf("SDNodeXForm")) { - auto Child = DstChild->getChild(0); - auto I = SDNodeXFormEquivs.find(DstChild->getOperator()); + if (!DstChild.isLeaf()) { + if (DstChild.getOperator()->isSubClassOf("SDNodeXForm")) { + auto &Child = DstChild.getChild(0); + auto I = SDNodeXFormEquivs.find(DstChild.getOperator()); if (I != SDNodeXFormEquivs.end()) { - Record *XFormOpc = DstChild->getOperator()->getValueAsDef("Opcode"); + Record *XFormOpc = DstChild.getOperator()->getValueAsDef("Opcode"); if (XFormOpc->getName() == "timm") { // If this is a TargetConstant, there won't be a corresponding // instruction to transform. Instead, this will refer directly to an // operand in an instruction's operand list. DstMIBuilder.addRenderer(*I->second, - Child->getName()); + Child.getName()); } else { - DstMIBuilder.addRenderer(*I->second, - Child->getName()); + DstMIBuilder.addRenderer(*I->second, Child.getName()); } return InsertPt; } - return failedImport("SDNodeXForm " + Child->getName() + + return failedImport("SDNodeXForm " + Child.getName() + " has no custom renderer"); } // We accept 'bb' here. It's an operator because BasicBlockSDNode isn't // inline, but in MI it's just another operand. - if (DstChild->getOperator()->isSubClassOf("SDNode")) { - auto &ChildSDNI = CGP.getSDNodeInfo(DstChild->getOperator()); + if (DstChild.getOperator()->isSubClassOf("SDNode")) { + auto &ChildSDNI = CGP.getSDNodeInfo(DstChild.getOperator()); if (ChildSDNI.getSDClassName() == "BasicBlockSDNode") { - DstMIBuilder.addRenderer(DstChild->getName()); + DstMIBuilder.addRenderer(DstChild.getName()); return InsertPt; } } @@ -1208,19 +1207,19 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( // rendered as operands. // FIXME: The target should be able to choose sign-extended when appropriate // (e.g. on Mips). 
- if (DstChild->getOperator()->getName() == "timm") { - DstMIBuilder.addRenderer(DstChild->getName()); + if (DstChild.getOperator()->getName() == "timm") { + DstMIBuilder.addRenderer(DstChild.getName()); return InsertPt; - } else if (DstChild->getOperator()->getName() == "imm") { - DstMIBuilder.addRenderer(DstChild->getName()); + } else if (DstChild.getOperator()->getName() == "imm") { + DstMIBuilder.addRenderer(DstChild.getName()); return InsertPt; - } else if (DstChild->getOperator()->getName() == "fpimm") { + } else if (DstChild.getOperator()->getName() == "fpimm") { DstMIBuilder.addRenderer( - DstChild->getName()); + DstChild.getName()); return InsertPt; } - if (DstChild->getOperator()->isSubClassOf("Instruction")) { + if (DstChild.getOperator()->isSubClassOf("Instruction")) { auto OpTy = getInstResultType(DstChild); if (!OpTy) return OpTy.takeError(); @@ -1238,22 +1237,22 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( } return failedImport("Dst pattern child isn't a leaf node or an MBB" + - llvm::to_string(*DstChild)); + llvm::to_string(DstChild)); } // It could be a specific immediate in which case we should just check for // that immediate. if (const IntInit *ChildIntInit = - dyn_cast(DstChild->getLeafValue())) { + dyn_cast(DstChild.getLeafValue())) { DstMIBuilder.addRenderer(ChildIntInit->getValue()); return InsertPt; } // Otherwise, we're looking for a bog-standard RegisterClass operand. - if (auto *ChildDefInit = dyn_cast(DstChild->getLeafValue())) { + if (auto *ChildDefInit = dyn_cast(DstChild.getLeafValue())) { auto *ChildRec = ChildDefInit->getDef(); - ArrayRef ChildTypes = DstChild->getExtTypes(); + ArrayRef ChildTypes = DstChild.getExtTypes(); if (ChildTypes.size() != 1) return failedImport("Dst pattern child has multiple results"); @@ -1274,11 +1273,11 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( if (ChildRec->isSubClassOf("RegisterOperand") && !ChildRec->isValueUnset("GIZeroRegister")) { DstMIBuilder.addRenderer( - DstChild->getName(), ChildRec->getValueAsDef("GIZeroRegister")); + DstChild.getName(), ChildRec->getValueAsDef("GIZeroRegister")); return InsertPt; } - DstMIBuilder.addRenderer(DstChild->getName()); + DstMIBuilder.addRenderer(DstChild.getName()); return InsertPt; } @@ -1294,9 +1293,9 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( return failedImport( "SelectionDAG ComplexPattern not mapped to GlobalISel"); - const OperandMatcher &OM = Rule.getOperandMatcher(DstChild->getName()); + const OperandMatcher &OM = Rule.getOperandMatcher(DstChild.getName()); DstMIBuilder.addRenderer( - *ComplexPattern->second, DstChild->getName(), + *ComplexPattern->second, DstChild.getName(), OM.getAllocatedTemporariesBaseID()); return InsertPt; } @@ -1307,10 +1306,10 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( // Handle the case where the MVT/register class is omitted in the dest pattern // but MVT exists in the source pattern. 
- if (isa(DstChild->getLeafValue())) { - for (unsigned NumOp = 0; NumOp < Src->getNumChildren(); NumOp++) - if (Src->getChild(NumOp)->getName() == DstChild->getName()) { - DstMIBuilder.addRenderer(Src->getChild(NumOp)->getName()); + if (isa(DstChild.getLeafValue())) { + for (unsigned NumOp = 0; NumOp < Src.getNumChildren(); NumOp++) + if (Src.getChild(NumOp).getName() == DstChild.getName()) { + DstMIBuilder.addRenderer(Src.getChild(NumOp).getName()); return InsertPt; } } @@ -1318,8 +1317,8 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( } Expected GlobalISelEmitter::createAndImportInstructionRenderer( - RuleMatcher &M, InstructionMatcher &InsnMatcher, const TreePatternNode *Src, - const TreePatternNode *Dst) { + RuleMatcher &M, InstructionMatcher &InsnMatcher, const TreePatternNode &Src, + const TreePatternNode &Dst) { auto InsertPtOrError = createInstructionRenderer(M.actions_end(), M, Dst); if (auto Error = InsertPtOrError.takeError()) return std::move(Error); @@ -1353,8 +1352,8 @@ Expected GlobalISelEmitter::createAndImportInstructionRenderer( Expected GlobalISelEmitter::createAndImportSubInstructionRenderer( - const action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst, - const TreePatternNode *Src, unsigned TempRegID) { + const action_iterator InsertPt, RuleMatcher &M, const TreePatternNode &Dst, + const TreePatternNode &Src, unsigned TempRegID) { auto InsertPtOrError = createInstructionRenderer(InsertPt, M, Dst); // TODO: Assert there's exactly one result. @@ -1376,15 +1375,15 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer( // We need to make sure that when we import an INSERT_SUBREG as a // subinstruction that it ends up being constrained to the correct super // register and subregister classes. - auto OpName = Target.getInstruction(Dst->getOperator()).TheDef->getName(); + auto OpName = Target.getInstruction(Dst.getOperator()).TheDef->getName(); if (OpName == "INSERT_SUBREG") { - auto SubClass = inferRegClassFromPattern(Dst->getChild(1)); + auto SubClass = inferRegClassFromPattern(Dst.getChild(1)); if (!SubClass) return failedImport( "Cannot infer register class from INSERT_SUBREG operand #1"); std::optional SuperClass = - inferSuperRegisterClassForNode(Dst->getExtType(0), Dst->getChild(0), - Dst->getChild(2)); + inferSuperRegisterClassForNode(Dst.getExtType(0), Dst.getChild(0), + Dst.getChild(2)); if (!SuperClass) return failedImport( "Cannot infer register class for INSERT_SUBREG operand #0"); @@ -1404,12 +1403,12 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer( // instructions, the result register class is controlled by the // subregisters of the operand. As a result, we must constrain the result // class rather than check that it's already the right one. - auto SuperClass = inferRegClassFromPattern(Dst->getChild(0)); + auto SuperClass = inferRegClassFromPattern(Dst.getChild(0)); if (!SuperClass) return failedImport( "Cannot infer register class from EXTRACT_SUBREG operand #0"); - auto SubIdx = inferSubRegIndexForNode(Dst->getChild(1)); + auto SubIdx = inferSubRegIndexForNode(Dst.getChild(1)); if (!SubIdx) return failedImport("EXTRACT_SUBREG child #1 is not a subreg index"); @@ -1429,12 +1428,12 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer( // Similar to INSERT_SUBREG, we also have to handle SUBREG_TO_REG as a // subinstruction. 
if (OpName == "SUBREG_TO_REG") { - auto SubClass = inferRegClassFromPattern(Dst->getChild(1)); + auto SubClass = inferRegClassFromPattern(Dst.getChild(1)); if (!SubClass) return failedImport( "Cannot infer register class from SUBREG_TO_REG child #1"); auto SuperClass = - inferSuperRegisterClass(Dst->getExtType(0), Dst->getChild(2)); + inferSuperRegisterClass(Dst.getExtType(0), Dst.getChild(2)); if (!SuperClass) return failedImport( "Cannot infer register class for SUBREG_TO_REG operand #0"); @@ -1446,13 +1445,13 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer( } if (OpName == "REG_SEQUENCE") { - auto SuperClass = inferRegClassFromPattern(Dst->getChild(0)); + auto SuperClass = inferRegClassFromPattern(Dst.getChild(0)); M.insertAction( InsertPt, DstMIBuilder.getInsnID(), 0, **SuperClass); - unsigned Num = Dst->getNumChildren(); + unsigned Num = Dst.getNumChildren(); for (unsigned I = 1; I != Num; I += 2) { - const TreePatternNode *SubRegChild = Dst->getChild(I + 1); + const TreePatternNode &SubRegChild = Dst.getChild(I + 1); auto SubIdx = inferSubRegIndexForNode(SubRegChild); if (!SubIdx) @@ -1474,8 +1473,8 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer( } Expected GlobalISelEmitter::createInstructionRenderer( - action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst) { - Record *DstOp = Dst->getOperator(); + action_iterator InsertPt, RuleMatcher &M, const TreePatternNode &Dst) { + Record *DstOp = Dst.getOperator(); if (!DstOp->isSubClassOf("Instruction")) { if (DstOp->isSubClassOf("ValueType")) return failedImport( @@ -1496,9 +1495,9 @@ Expected GlobalISelEmitter::createInstructionRenderer( Expected GlobalISelEmitter::importExplicitDefRenderers( action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, - const TreePatternNode *Src, const TreePatternNode *Dst) { + const TreePatternNode &Src, const TreePatternNode &Dst) { const CodeGenInstruction *DstI = DstMIBuilder.getCGI(); - const unsigned SrcNumDefs = Src->getExtTypes().size(); + const unsigned SrcNumDefs = Src.getExtTypes().size(); const unsigned DstNumDefs = DstI->Operands.NumDefs; if (DstNumDefs == 0) return InsertPt; @@ -1513,11 +1512,11 @@ Expected GlobalISelEmitter::importExplicitDefRenderers( // Some instructions have multiple defs, but are missing a type entry // (e.g. s_cc_out operands). - if (Dst->getExtTypes().size() < DstNumDefs) + if (Dst.getExtTypes().size() < DstNumDefs) return failedImport("unhandled discarded def"); for (unsigned I = SrcNumDefs; I < DstNumDefs; ++I) { - const TypeSetByHwMode &ExtTy = Dst->getExtType(I); + const TypeSetByHwMode &ExtTy = Dst.getExtType(I); if (!ExtTy.isMachineValueType()) return failedImport("unsupported typeset"); @@ -1536,24 +1535,24 @@ Expected GlobalISelEmitter::importExplicitDefRenderers( Expected GlobalISelEmitter::importExplicitUseRenderers( action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, - const llvm::TreePatternNode *Dst, const llvm::TreePatternNode *Src) { + const llvm::TreePatternNode &Dst, const llvm::TreePatternNode &Src) { const CodeGenInstruction *DstI = DstMIBuilder.getCGI(); - CodeGenInstruction *OrigDstI = &Target.getInstruction(Dst->getOperator()); + CodeGenInstruction *OrigDstI = &Target.getInstruction(Dst.getOperator()); StringRef Name = OrigDstI->TheDef->getName(); - unsigned ExpectedDstINumUses = Dst->getNumChildren(); + unsigned ExpectedDstINumUses = Dst.getNumChildren(); // EXTRACT_SUBREG needs to use a subregister COPY. 
if (Name == "EXTRACT_SUBREG") { - if (!Dst->getChild(1)->isLeaf()) + if (!Dst.getChild(1).isLeaf()) return failedImport("EXTRACT_SUBREG child #1 is not a leaf"); - DefInit *SubRegInit = dyn_cast(Dst->getChild(1)->getLeafValue()); + DefInit *SubRegInit = dyn_cast(Dst.getChild(1).getLeafValue()); if (!SubRegInit) return failedImport("EXTRACT_SUBREG child #1 is not a subreg index"); CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef()); - const TreePatternNode *ValChild = Dst->getChild(0); - if (!ValChild->isLeaf()) { + const TreePatternNode &ValChild = Dst.getChild(0); + if (!ValChild.isLeaf()) { // We really have to handle the source instruction, and then insert a // copy from the subregister. auto ExtractSrcTy = getInstResultType(ValChild); @@ -1574,7 +1573,7 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( } // If this is a source operand, this is just a subregister copy. - Record *RCDef = getInitValueAsRegClass(ValChild->getLeafValue()); + Record *RCDef = getInitValueAsRegClass(ValChild.getLeafValue()); if (!RCDef) return failedImport("EXTRACT_SUBREG child #0 could not " "be coerced to a register class"); @@ -1589,7 +1588,7 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( return failedImport("EXTRACT_SUBREG requires an additional COPY"); } - StringRef RegOperandName = Dst->getChild(0)->getName(); + StringRef RegOperandName = Dst.getChild(0).getName(); if (const auto &SubOperand = M.getComplexSubOperand(RegOperandName)) { DstMIBuilder.addRenderer( *std::get<0>(*SubOperand), RegOperandName, std::get<1>(*SubOperand), @@ -1602,10 +1601,10 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( } if (Name == "REG_SEQUENCE") { - if (!Dst->getChild(0)->isLeaf()) + if (!Dst.getChild(0).isLeaf()) return failedImport("REG_SEQUENCE child #0 is not a leaf"); - Record *RCDef = getInitValueAsRegClass(Dst->getChild(0)->getLeafValue()); + Record *RCDef = getInitValueAsRegClass(Dst.getChild(0).getLeafValue()); if (!RCDef) return failedImport("REG_SEQUENCE child #0 could not " "be coerced to a register class"); @@ -1614,11 +1613,10 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( return failedImport("Malformed REG_SEQUENCE"); for (unsigned I = 1; I != ExpectedDstINumUses; I += 2) { - const TreePatternNode *ValChild = Dst->getChild(I); - const TreePatternNode *SubRegChild = Dst->getChild(I + 1); + const TreePatternNode &ValChild = Dst.getChild(I); + const TreePatternNode &SubRegChild = Dst.getChild(I + 1); - if (DefInit *SubRegInit = - dyn_cast(SubRegChild->getLeafValue())) { + if (DefInit *SubRegInit = dyn_cast(SubRegChild.getLeafValue())) { CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef()); auto InsertPtOrError = @@ -1676,7 +1674,7 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( // If the operand has default values, introduce them now. if (CGP.operandHasDefault(OperandNode) && - (InstOpNo < NonOverridableOperands || Child >= Dst->getNumChildren())) { + (InstOpNo < NonOverridableOperands || Child >= Dst.getNumChildren())) { // This is a predicate or optional def operand which the pattern has not // overridden, or which we aren't letting it override; emit the 'default // ops' operands. 
@@ -1691,7 +1689,7 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( } auto InsertPtOrError = importExplicitUseRenderer(InsertPt, M, DstMIBuilder, - Dst->getChild(Child), Src); + Dst.getChild(Child), Src); if (auto Error = InsertPtOrError.takeError()) return std::move(Error); InsertPt = InsertPtOrError.get(); @@ -1712,14 +1710,14 @@ Error GlobalISelEmitter::importDefaultOperandRenderers( action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, const DAGDefaultOperand &DefaultOp) const { for (const auto &Op : DefaultOp.DefaultOps) { - const auto *N = Op.get(); - if (!N->isLeaf()) + const auto &N = *Op; + if (!N.isLeaf()) return failedImport("Could not add default op"); - const auto *DefaultOp = N->getLeafValue(); + const auto *DefaultOp = N.getLeafValue(); if (const DefInit *DefaultDefOp = dyn_cast(DefaultOp)) { - std::optional OpTyOrNone = MVTToLLT(N->getSimpleType(0)); + std::optional OpTyOrNone = MVTToLLT(N.getSimpleType(0)); auto Def = DefaultDefOp->getDef(); if (Def->getName() == "undef_tied_input") { unsigned TempRegID = M.allocateTempRegID(); @@ -1758,10 +1756,9 @@ Error GlobalISelEmitter::importImplicitDefRenderers( } std::optional -GlobalISelEmitter::getRegClassFromLeaf(const TreePatternNode *Leaf) { - assert(Leaf && "Expected node?"); - assert(Leaf->isLeaf() && "Expected leaf?"); - Record *RCRec = getInitValueAsRegClass(Leaf->getLeafValue()); +GlobalISelEmitter::getRegClassFromLeaf(const TreePatternNode &Leaf) { + assert(Leaf.isLeaf() && "Expected leaf?"); + Record *RCRec = getInitValueAsRegClass(Leaf.getLeafValue()); if (!RCRec) return std::nullopt; CodeGenRegisterClass *RC = CGRegs.getRegClass(RCRec); @@ -1771,20 +1768,17 @@ GlobalISelEmitter::getRegClassFromLeaf(const TreePatternNode *Leaf) { } std::optional -GlobalISelEmitter::inferRegClassFromPattern(const TreePatternNode *N) { - if (!N) - return std::nullopt; - - if (N->isLeaf()) +GlobalISelEmitter::inferRegClassFromPattern(const TreePatternNode &N) { + if (N.isLeaf()) return getRegClassFromLeaf(N); // We don't have a leaf node, so we have to try and infer something. Check // that we have an instruction that we an infer something from. // Only handle things that produce a single type. - if (N->getNumTypes() != 1) + if (N.getNumTypes() != 1) return std::nullopt; - Record *OpRec = N->getOperator(); + Record *OpRec = N.getOperator(); // We only want instructions. if (!OpRec->isSubClassOf("Instruction")) @@ -1803,21 +1797,21 @@ GlobalISelEmitter::inferRegClassFromPattern(const TreePatternNode *N) { if (IsRegSequence || InstName == "COPY_TO_REGCLASS") { // If we have a COPY_TO_REGCLASS, then we need to handle it specially. It // has the desired register class as the first child. - const TreePatternNode *RCChild = N->getChild(IsRegSequence ? 0 : 1); - if (!RCChild->isLeaf()) + const TreePatternNode &RCChild = N.getChild(IsRegSequence ? 
0 : 1); + if (!RCChild.isLeaf()) return std::nullopt; return getRegClassFromLeaf(RCChild); } if (InstName == "INSERT_SUBREG") { - const TreePatternNode *Child0 = N->getChild(0); - assert(Child0->getNumTypes() == 1 && "Unexpected number of types!"); - const TypeSetByHwMode &VTy = Child0->getExtType(0); - return inferSuperRegisterClassForNode(VTy, Child0, N->getChild(2)); + const TreePatternNode &Child0 = N.getChild(0); + assert(Child0.getNumTypes() == 1 && "Unexpected number of types!"); + const TypeSetByHwMode &VTy = Child0.getExtType(0); + return inferSuperRegisterClassForNode(VTy, Child0, N.getChild(2)); } if (InstName == "EXTRACT_SUBREG") { - assert(N->getNumTypes() == 1 && "Unexpected number of types!"); - const TypeSetByHwMode &VTy = N->getExtType(0); - return inferSuperRegisterClass(VTy, N->getChild(1)); + assert(N.getNumTypes() == 1 && "Unexpected number of types!"); + const TypeSetByHwMode &VTy = N.getExtType(0); + return inferSuperRegisterClass(VTy, N.getChild(1)); } // Handle destination record types that we can safely infer a register class @@ -1840,14 +1834,13 @@ GlobalISelEmitter::inferRegClassFromPattern(const TreePatternNode *N) { std::optional GlobalISelEmitter::inferSuperRegisterClass( - const TypeSetByHwMode &Ty, const TreePatternNode *SubRegIdxNode) { - assert(SubRegIdxNode && "Expected subregister index node!"); + const TypeSetByHwMode &Ty, const TreePatternNode &SubRegIdxNode) { // We need a ValueTypeByHwMode for getSuperRegForSubReg. if (!Ty.isValueTypeByHwMode(false)) return std::nullopt; - if (!SubRegIdxNode->isLeaf()) + if (!SubRegIdxNode.isLeaf()) return std::nullopt; - DefInit *SubRegInit = dyn_cast(SubRegIdxNode->getLeafValue()); + DefInit *SubRegInit = dyn_cast(SubRegIdxNode.getLeafValue()); if (!SubRegInit) return std::nullopt; CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef()); @@ -1864,9 +1857,8 @@ GlobalISelEmitter::inferSuperRegisterClass( std::optional GlobalISelEmitter::inferSuperRegisterClassForNode( - const TypeSetByHwMode &Ty, const TreePatternNode *SuperRegNode, - const TreePatternNode *SubRegIdxNode) { - assert(SuperRegNode && "Expected super register node!"); + const TypeSetByHwMode &Ty, const TreePatternNode &SuperRegNode, + const TreePatternNode &SubRegIdxNode) { // Check if we already have a defined register class for the super register // node. If we do, then we should preserve that rather than inferring anything // from the subregister index node. 
We can assume that whoever wrote the @@ -1879,11 +1871,11 @@ GlobalISelEmitter::inferSuperRegisterClassForNode( } std::optional GlobalISelEmitter::inferSubRegIndexForNode( - const TreePatternNode *SubRegIdxNode) { - if (!SubRegIdxNode->isLeaf()) + const TreePatternNode &SubRegIdxNode) { + if (!SubRegIdxNode.isLeaf()) return std::nullopt; - DefInit *SubRegInit = dyn_cast(SubRegIdxNode->getLeafValue()); + DefInit *SubRegInit = dyn_cast(SubRegIdxNode.getLeafValue()); if (!SubRegInit) return std::nullopt; return CGRegs.getSubRegIdx(SubRegInit->getDef()); @@ -1894,9 +1886,9 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { int Score = P.getPatternComplexity(CGP); RuleMatcher M(P.getSrcRecord()->getLoc()); RuleMatcherScores[M.getRuleID()] = Score; - M.addAction(llvm::to_string(*P.getSrcPattern()) + + M.addAction(llvm::to_string(P.getSrcPattern()) + " => " + - llvm::to_string(*P.getDstPattern())); + llvm::to_string(P.getDstPattern())); SmallVector Predicates; P.getPredicateRecords(Predicates); @@ -1907,8 +1899,8 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { M.addHwModeIdx(declareHwModeCheck(P.getHwModeFeatures())); // Next, analyze the pattern operators. - TreePatternNode *Src = P.getSrcPattern(); - TreePatternNode *Dst = P.getDstPattern(); + TreePatternNode &Src = P.getSrcPattern(); + TreePatternNode &Dst = P.getDstPattern(); // If the root of either pattern isn't a simple operator, ignore it. if (auto Err = isTrivialOperatorNode(Dst)) @@ -1939,7 +1931,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { // the capture accross rules. The downside is that it would // introduce a dependency between predicates (captures must happen // before their first use.) - InstructionMatcher &InsnMatcherTemp = M.addInstructionMatcher(Src->getName()); + InstructionMatcher &InsnMatcherTemp = M.addInstructionMatcher(Src.getName()); unsigned TempOpIdx = 0; const auto SavedFlags = M.setGISelFlags(P.getSrcRecord()); @@ -1950,8 +1942,8 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { return std::move(Error); InstructionMatcher &InsnMatcher = InsnMatcherOrError.get(); - if (Dst->isLeaf()) { - Record *RCDef = getInitValueAsRegClass(Dst->getLeafValue()); + if (Dst.isLeaf()) { + Record *RCDef = getInitValueAsRegClass(Dst.getLeafValue()); if (RCDef) { const CodeGenRegisterClass &RC = Target.getRegisterClass(RCDef); @@ -1969,7 +1961,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { auto &DstMIBuilder = M.addAction(M.allocateOutputInsnID(), &DstI); DstMIBuilder.addRenderer(DstIOperand.Name); - DstMIBuilder.addRenderer(Dst->getName()); + DstMIBuilder.addRenderer(Dst.getName()); M.addAction(0, 0, RC); // Erase the root. @@ -1986,7 +1978,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { } // Start with the defined operands (i.e., the results of the root operator). 
- Record *DstOp = Dst->getOperator(); + Record *DstOp = Dst.getOperator(); if (!DstOp->isSubClassOf("Instruction")) return failedImport("Pattern operator isn't an instruction"); @@ -1994,7 +1986,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { StringRef DstIName = DstI.TheDef->getName(); unsigned DstNumDefs = DstI.Operands.NumDefs, - SrcNumDefs = Src->getExtTypes().size(); + SrcNumDefs = Src.getExtTypes().size(); if (DstNumDefs < SrcNumDefs) { if (DstNumDefs != 0) return failedImport("Src pattern result has more defs than dst MI (" + @@ -2017,23 +2009,23 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { unsigned OpIdx = 0; unsigned N = std::min(DstNumDefs, SrcNumDefs); for (unsigned I = 0; I < N; ++I) { - const TypeSetByHwMode &VTy = Src->getExtType(I); + const TypeSetByHwMode &VTy = Src.getExtType(I); const auto &DstIOperand = DstI.Operands[OpIdx]; PointerUnion MatchedRC = DstIOperand.Rec; if (DstIName == "COPY_TO_REGCLASS") { - MatchedRC = getInitValueAsRegClass(Dst->getChild(1)->getLeafValue()); + MatchedRC = getInitValueAsRegClass(Dst.getChild(1).getLeafValue()); if (MatchedRC.isNull()) return failedImport( "COPY_TO_REGCLASS operand #1 isn't a register class"); } else if (DstIName == "REG_SEQUENCE") { - MatchedRC = getInitValueAsRegClass(Dst->getChild(0)->getLeafValue()); + MatchedRC = getInitValueAsRegClass(Dst.getChild(0).getLeafValue()); if (MatchedRC.isNull()) return failedImport("REG_SEQUENCE operand #0 isn't a register class"); } else if (DstIName == "EXTRACT_SUBREG") { - auto InferredClass = inferRegClassFromPattern(Dst->getChild(0)); + auto InferredClass = inferRegClassFromPattern(Dst.getChild(0)); if (!InferredClass) return failedImport( "Could not infer class for EXTRACT_SUBREG operand #0"); @@ -2042,8 +2034,8 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { // register. MatchedRC = (*InferredClass)->getDef(); } else if (DstIName == "INSERT_SUBREG") { - auto MaybeSuperClass = inferSuperRegisterClassForNode( - VTy, Dst->getChild(0), Dst->getChild(2)); + auto MaybeSuperClass = + inferSuperRegisterClassForNode(VTy, Dst.getChild(0), Dst.getChild(2)); if (!MaybeSuperClass) return failedImport( "Cannot infer register class for INSERT_SUBREG operand #0"); @@ -2052,7 +2044,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { // set DstIOpRec using this. MatchedRC = *MaybeSuperClass; } else if (DstIName == "SUBREG_TO_REG") { - auto MaybeRegClass = inferSuperRegisterClass(VTy, Dst->getChild(2)); + auto MaybeRegClass = inferSuperRegisterClass(VTy, Dst.getChild(2)); if (!MaybeRegClass) return failedImport( "Cannot infer register class for SUBREG_TO_REG operand #0"); @@ -2060,8 +2052,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { } else if (MatchedRC.get()->isSubClassOf("RegisterOperand")) MatchedRC = MatchedRC.get()->getValueAsDef("RegClass"); else if (!MatchedRC.get()->isSubClassOf("RegisterClass")) - return failedImport("Dst MI def isn't a register class" + - to_string(*Dst)); + return failedImport("Dst MI def isn't a register class" + to_string(Dst)); OperandMatcher &OM = InsnMatcher.getOperand(OpIdx); // The operand names declared in the DstI instruction are unrelated to @@ -2095,8 +2086,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { if (DstIName == "COPY_TO_REGCLASS") { // COPY_TO_REGCLASS does not provide operand constraints itself but the // result is constrained to the class given by the second child. 
- Record *DstIOpRec = - getInitValueAsRegClass(Dst->getChild(1)->getLeafValue()); + Record *DstIOpRec = getInitValueAsRegClass(Dst.getChild(1).getLeafValue()); if (DstIOpRec == nullptr) return failedImport("COPY_TO_REGCLASS operand #1 isn't a register class"); @@ -2104,12 +2094,12 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { M.addAction( 0, 0, Target.getRegisterClass(DstIOpRec)); } else if (DstIName == "EXTRACT_SUBREG") { - auto SuperClass = inferRegClassFromPattern(Dst->getChild(0)); + auto SuperClass = inferRegClassFromPattern(Dst.getChild(0)); if (!SuperClass) return failedImport( "Cannot infer register class from EXTRACT_SUBREG operand #0"); - auto SubIdx = inferSubRegIndexForNode(Dst->getChild(1)); + auto SubIdx = inferSubRegIndexForNode(Dst.getChild(1)); if (!SubIdx) return failedImport("EXTRACT_SUBREG child #1 is not a subreg index"); @@ -2119,7 +2109,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { // // FIXME: This may introduce an extra copy if the chosen class doesn't // actually contain the subregisters. - assert(Src->getExtTypes().size() == 1 && + assert(Src.getExtTypes().size() == 1 && "Expected Src of EXTRACT_SUBREG to have one result type"); const auto SrcRCDstRCPair = @@ -2134,16 +2124,16 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { *SrcRCDstRCPair->second); M.addAction(0, 1, *SrcRCDstRCPair->first); } else if (DstIName == "INSERT_SUBREG") { - assert(Src->getExtTypes().size() == 1 && + assert(Src.getExtTypes().size() == 1 && "Expected Src of INSERT_SUBREG to have one result type"); // We need to constrain the destination, a super regsister source, and a // subregister source. - auto SubClass = inferRegClassFromPattern(Dst->getChild(1)); + auto SubClass = inferRegClassFromPattern(Dst.getChild(1)); if (!SubClass) return failedImport( "Cannot infer register class from INSERT_SUBREG operand #1"); auto SuperClass = inferSuperRegisterClassForNode( - Src->getExtType(0), Dst->getChild(0), Dst->getChild(2)); + Src.getExtType(0), Dst.getChild(0), Dst.getChild(2)); if (!SuperClass) return failedImport( "Cannot infer register class for INSERT_SUBREG operand #0"); @@ -2152,32 +2142,32 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { M.addAction(0, 2, **SubClass); } else if (DstIName == "SUBREG_TO_REG") { // We need to constrain the destination and subregister source. - assert(Src->getExtTypes().size() == 1 && + assert(Src.getExtTypes().size() == 1 && "Expected Src of SUBREG_TO_REG to have one result type"); // Attempt to infer the subregister source from the first child. If it has // an explicitly given register class, we'll use that. Otherwise, we will // fail. - auto SubClass = inferRegClassFromPattern(Dst->getChild(1)); + auto SubClass = inferRegClassFromPattern(Dst.getChild(1)); if (!SubClass) return failedImport( "Cannot infer register class from SUBREG_TO_REG child #1"); // We don't have a child to look at that might have a super register node. 
auto SuperClass = - inferSuperRegisterClass(Src->getExtType(0), Dst->getChild(2)); + inferSuperRegisterClass(Src.getExtType(0), Dst.getChild(2)); if (!SuperClass) return failedImport( "Cannot infer register class for SUBREG_TO_REG operand #0"); M.addAction(0, 0, **SuperClass); M.addAction(0, 2, **SubClass); } else if (DstIName == "REG_SEQUENCE") { - auto SuperClass = inferRegClassFromPattern(Dst->getChild(0)); + auto SuperClass = inferRegClassFromPattern(Dst.getChild(0)); M.addAction(0, 0, **SuperClass); - unsigned Num = Dst->getNumChildren(); + unsigned Num = Dst.getNumChildren(); for (unsigned I = 1; I != Num; I += 2) { - TreePatternNode *SubRegChild = Dst->getChild(I + 1); + TreePatternNode &SubRegChild = Dst.getChild(I + 1); auto SubIdx = inferSubRegIndexForNode(SubRegChild); if (!SubIdx) -- cgit v1.1 From 0de859c8f22669ab7a816afdf975c7b012e511b9 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 9 Feb 2024 14:16:48 +0000 Subject: [MC] Fix operator++ for various MCRegister iterators (#81250) Return *this from operator++. NFC, this just allows using ++Iter in an expression in future patches. --- llvm/include/llvm/MC/MCRegisterInfo.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h index e52f0a4..fb4d11e 100644 --- a/llvm/include/llvm/MC/MCRegisterInfo.h +++ b/llvm/include/llvm/MC/MCRegisterInfo.h @@ -572,9 +572,10 @@ public: bool isValid() const { return SRIter.isValid(); } /// Moves to the next position. - void operator++() { + MCSubRegIndexIterator &operator++() { ++SRIter; ++SRIndex; + return *this; } }; @@ -688,9 +689,10 @@ public: bool isValid() const { return RUIter.isValid(); } /// Moves to the next position. - void operator++() { + MCRegUnitMaskIterator &operator++() { ++MaskListIter; ++RUIter; + return *this; } }; @@ -728,10 +730,11 @@ public: } /// Preincrement to move to the next root register. - void operator++() { + MCRegUnitRootIterator &operator++() { assert(isValid() && "Cannot move off the end of the list."); Reg0 = Reg1; Reg1 = 0; + return *this; } }; @@ -788,10 +791,11 @@ public: } } - void operator++() { + MCRegAliasIterator &operator++() { assert(isValid() && "Cannot move off the end of the list."); do advance(); while (!IncludeSelf && isValid() && *SI == Reg); + return *this; } }; -- cgit v1.1 From b5abaea3c0de605c8145035b21a5ee492883ebd7 Mon Sep 17 00:00:00 2001 From: stephenpeckham <118857872+stephenpeckham@users.noreply.github.com> Date: Fri, 9 Feb 2024 08:20:21 -0600 Subject: [yaml2obj][XCOFF] Update yaml2obj for XCOFF to create valid XCOFF files in more cases. (#77620) yaml2obj creates invalid object files even when the input was created by obj2yaml using a valid object file. On the other hand, yaml2obj is used to intentionally create invalid object files for testing purposes. This update balances using specified input values when provided and computing file offsets and sizes if necessary.
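The reconciliation rule this change applies uniformly (to relocation offsets, section-data offsets, and the symbol table offset) can be sketched in standalone C++ roughly as follows. This is an illustrative sketch only, not the emitter's actual code; reconcileOffset and its message text are invented for the example:

    #include <cstdint>
    #include <cstdio>

    // Sketch of the offset-reconciliation rule: a specified value of 0 means
    // "not given in the YAML input", so the computed offset is kept; a nonzero
    // value is honored unless honoring it would overwrite bytes already laid
    // out, which is reported as an error.
    static bool reconcileOffset(uint64_t &CurrentOffset, uint64_t Specified,
                                const char *FieldName) {
      if (Specified == 0)
        return true; // nothing specified; keep the computed offset
      if (CurrentOffset > Specified) {
        std::fprintf(stderr,
                     "current file offset (%llu) is bigger than the specified "
                     "%s (%llu)\n",
                     (unsigned long long)CurrentOffset, FieldName,
                     (unsigned long long)Specified);
        return false; // honoring it would clobber data already written
      }
      CurrentOffset = Specified; // honor the YAML-provided offset
      return true;
    }

The init* routines in the diff below follow this shape for their respective fields, then advance CurrentOffset past the bytes they lay out and check the result against MaxRawDataSize.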
--- llvm/lib/ObjectYAML/XCOFFEmitter.cpp | 232 +++++++++++++-------- .../tools/llvm-objcopy/XCOFF/invalid-read.test | 6 +- .../XCOFF/disassemble-traceback-table.test | 2 +- .../tools/llvm-objdump/XCOFF/section-headers.test | 2 +- .../test/tools/llvm-readobj/XCOFF/file-header.test | 3 +- llvm/test/tools/llvm-readobj/XCOFF/sections.test | 20 +- llvm/test/tools/obj2yaml/XCOFF/aix.yaml | 4 +- .../test/tools/obj2yaml/XCOFF/invalid-section.yaml | 3 +- .../tools/yaml2obj/XCOFF/aux-hdr-defaults.yaml | 24 +-- llvm/test/tools/yaml2obj/XCOFF/basic-doc.yaml | 4 +- llvm/test/tools/yaml2obj/XCOFF/offset-check.yaml | 91 ++++++++ 11 files changed, 269 insertions(+), 122 deletions(-) create mode 100644 llvm/test/tools/yaml2obj/XCOFF/offset-check.yaml diff --git a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp index 5b244ff..f68c571 100644 --- a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp +++ b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp @@ -41,17 +41,19 @@ public: bool writeXCOFF(); private: + void reportOverwrite(uint64_t currentOffset, uint64_t specifiedOffset, + const Twine &fieldName); bool nameShouldBeInStringTable(StringRef SymbolName); bool initFileHeader(uint64_t CurrentOffset); void initAuxFileHeader(); - bool initSectionHeader(uint64_t &CurrentOffset); + bool initSectionHeaders(uint64_t &CurrentOffset); bool initRelocations(uint64_t &CurrentOffset); bool initStringTable(); bool assignAddressesAndIndices(); void writeFileHeader(); void writeAuxFileHeader(); - void writeSectionHeader(); + void writeSectionHeaders(); bool writeSectionData(); bool writeRelocations(); bool writeSymbols(); @@ -91,6 +93,14 @@ static void writeName(StringRef StrName, support::endian::Writer W) { W.write(NameRef); } +void XCOFFWriter::reportOverwrite(uint64_t CurrentOffset, + uint64_t specifiedOffset, + const Twine &fieldName) { + ErrHandler("current file offset (" + Twine(CurrentOffset) + + ") is bigger than the specified " + fieldName + " (" + + Twine(specifiedOffset) + ") "); +} + bool XCOFFWriter::nameShouldBeInStringTable(StringRef SymbolName) { // For XCOFF64: The symbol name is always in the string table. return (SymbolName.size() > XCOFF::NameSize) || Is64Bit; @@ -99,14 +109,31 @@ bool XCOFFWriter::nameShouldBeInStringTable(StringRef SymbolName) { bool XCOFFWriter::initRelocations(uint64_t &CurrentOffset) { for (XCOFFYAML::Section &InitSection : InitSections) { if (!InitSection.Relocations.empty()) { - InitSection.NumberOfRelocations = InitSection.Relocations.size(); - InitSection.FileOffsetToRelocations = CurrentOffset; uint64_t RelSize = Is64Bit ? XCOFF::RelocationSerializationSize64 : XCOFF::RelocationSerializationSize32; - CurrentOffset += InitSection.NumberOfRelocations * RelSize; + uint64_t UsedSize = RelSize * InitSection.Relocations.size(); + + // If NumberOfRelocations was specified, we use it, even if it's + // not consistent with the number of provided relocations. + if (!InitSection.NumberOfRelocations) + InitSection.NumberOfRelocations = InitSection.Relocations.size(); + + // If the YAML file specified an offset to relocations, we use it. 
+ if (InitSection.FileOffsetToRelocations) { + if (CurrentOffset > InitSection.FileOffsetToRelocations) { + reportOverwrite(CurrentOffset, InitSection.FileOffsetToRelocations, + "FileOffsetToRelocations for the " + + InitSection.SectionName + " section"); + return false; + } + CurrentOffset = InitSection.FileOffsetToRelocations; + } else + InitSection.FileOffsetToRelocations = CurrentOffset; + CurrentOffset += UsedSize; if (CurrentOffset > MaxRawDataSize) { - ErrHandler("maximum object size of" + Twine(MaxRawDataSize) + - "exceeded when writing relocation data"); + ErrHandler("maximum object size (" + Twine(MaxRawDataSize) + + ") exceeded when writing relocation data for section " + + Twine(InitSection.SectionName)); return false; } } @@ -114,15 +141,10 @@ bool XCOFFWriter::initRelocations(uint64_t &CurrentOffset) { return true; } -bool XCOFFWriter::initSectionHeader(uint64_t &CurrentOffset) { - uint64_t CurrentSecAddr = 0; +bool XCOFFWriter::initSectionHeaders(uint64_t &CurrentOffset) { + uint64_t CurrentEndDataAddr = 0; + uint64_t CurrentEndTDataAddr = 0; for (uint16_t I = 0, E = InitSections.size(); I < E; ++I) { - if (CurrentOffset > MaxRawDataSize) { - ErrHandler("maximum object size of" + Twine(MaxRawDataSize) + - "exceeded when writing section data"); - return false; - } - // Assign indices for sections. if (InitSections[I].SectionName.size() && !SectionIndexMap[InitSections[I].SectionName]) { @@ -135,23 +157,58 @@ bool XCOFFWriter::initSectionHeader(uint64_t &CurrentOffset) { } } - // Calculate the physical/virtual address. This field should contain 0 for - // all sections except the text, data and bss sections. - if (InitSections[I].Flags != XCOFF::STYP_TEXT && - InitSections[I].Flags != XCOFF::STYP_DATA && - InitSections[I].Flags != XCOFF::STYP_BSS) - InitSections[I].Address = 0; - else - InitSections[I].Address = CurrentSecAddr; + if (!InitSections[I].Size) + InitSections[I].Size = InitSections[I].SectionData.binary_size(); + + // Section data addresses (physical/virtual) are related to symbol + // addresses and alignments. Furthermore, it is possible to specify the + // same starting addresses for the .text, .data, and .tdata sections. + // Without examining all the symbols and their addresses and alignments, + // it is not possible to compute valid section addresses. The only + // condition required by XCOFF is that the .bss section immediately + // follows the .data section, and the .tbss section immediately follows + // the .tdata section. Therefore, we only assign addresses to the .bss + // and .tbss sections if they do not already have non-zero addresses. + // (If the YAML file is being used to generate a valid object file, we + // expect all section addresses to be specified explicitly.) + switch (InitSections[I].Flags) { + case XCOFF::STYP_DATA: + CurrentEndDataAddr = InitSections[I].Address + InitSections[I].Size; + break; + case XCOFF::STYP_BSS: + if (!InitSections[I].Address) + InitSections[I].Address = CurrentEndDataAddr; + break; + case XCOFF::STYP_TDATA: + CurrentEndTDataAddr = InitSections[I].Address + InitSections[I].Size; + break; + case XCOFF::STYP_TBSS: + if (!InitSections[I].Address) + InitSections[I].Address = CurrentEndTDataAddr; + break; + } - // Calculate the FileOffsetToData and data size for sections. if (InitSections[I].SectionData.binary_size()) { - InitSections[I].FileOffsetToData = CurrentOffset; + if (InitSections[I].FileOffsetToData) { + // Use the provided FileOffsetToData.
+ if (CurrentOffset > InitSections[I].FileOffsetToData) { + reportOverwrite(CurrentOffset, InitSections[I].FileOffsetToData, + "FileOffsetToData for the " + + InitSections[I].SectionName + " section"); + return false; + } + CurrentOffset = InitSections[I].FileOffsetToData; + } else { + CurrentOffset = alignTo(CurrentOffset, DefaultSectionAlign); + InitSections[I].FileOffsetToData = CurrentOffset; + } CurrentOffset += InitSections[I].SectionData.binary_size(); - // Ensure the offset is aligned to DefaultSectionAlign. - CurrentOffset = alignTo(CurrentOffset, DefaultSectionAlign); - InitSections[I].Size = CurrentOffset - InitSections[I].FileOffsetToData; - CurrentSecAddr += InitSections[I].Size; + if (CurrentOffset > MaxRawDataSize) { + ErrHandler("maximum object size (" + Twine(MaxRawDataSize) + + ") exceeded when writing data for section " + Twine(I + 1) + + " (" + Twine(InitSections[I].SectionName) + ")"); + return false; + } } } return initRelocations(CurrentOffset); @@ -255,12 +312,20 @@ bool XCOFFWriter::initFileHeader(uint64_t CurrentOffset) { // Calculate SymbolTableOffset for the file header. if (InitFileHdr.NumberOfSymTableEntries) { + if (Obj.Header.SymbolTableOffset) { + if (CurrentOffset > Obj.Header.SymbolTableOffset) { + reportOverwrite(CurrentOffset, Obj.Header.SymbolTableOffset, + "SymbolTableOffset"); + return false; + } + CurrentOffset = Obj.Header.SymbolTableOffset; + } InitFileHdr.SymbolTableOffset = CurrentOffset; CurrentOffset += InitFileHdr.NumberOfSymTableEntries * XCOFF::SymbolTableEntrySize; if (CurrentOffset > MaxRawDataSize) { - ErrHandler("maximum object size of" + Twine(MaxRawDataSize) + - "exceeded when writing symbols"); + ErrHandler("maximum object size of " + Twine(MaxRawDataSize) + + " exceeded when writing symbols"); return false; } } @@ -269,7 +334,8 @@ bool XCOFFWriter::initFileHeader(uint64_t CurrentOffset) { } void XCOFFWriter::initAuxFileHeader() { - InitAuxFileHdr = *Obj.AuxHeader; + if (Obj.AuxHeader) + InitAuxFileHdr = *Obj.AuxHeader; // In general, an object file might contain multiple sections of a given type, // but in a loadable module, there must be exactly one .text, .data, .bss, and // .loader section. A loadable object might also have one .tdata section and @@ -323,28 +389,32 @@ void XCOFFWriter::initAuxFileHeader() { bool XCOFFWriter::assignAddressesAndIndices() { uint64_t FileHdrSize = Is64Bit ? XCOFF::FileHeaderSize64 : XCOFF::FileHeaderSize32; + + // If AuxHeaderSize is specified in the YAML file, we construct + // an auxiliary header. uint64_t AuxFileHdrSize = 0; - if (Obj.AuxHeader) - AuxFileHdrSize = Obj.Header.AuxHeaderSize - ? Obj.Header.AuxHeaderSize - : (Is64Bit ? XCOFF::AuxFileHeaderSize64 - : XCOFF::AuxFileHeaderSize32); + + if (Obj.Header.AuxHeaderSize) + AuxFileHdrSize = Obj.Header.AuxHeaderSize; + else if (Obj.AuxHeader) + AuxFileHdrSize = + (Is64Bit ? XCOFF::AuxFileHeaderSize64 : XCOFF::AuxFileHeaderSize32); uint64_t SecHdrSize = Is64Bit ? XCOFF::SectionHeaderSize64 : XCOFF::SectionHeaderSize32; uint64_t CurrentOffset = FileHdrSize + AuxFileHdrSize + InitSections.size() * SecHdrSize; // Calculate section header info. - if (!initSectionHeader(CurrentOffset)) + if (!initSectionHeaders(CurrentOffset)) return false; - InitFileHdr.AuxHeaderSize = AuxFileHdrSize; // Calculate file header info. if (!initFileHeader(CurrentOffset)) return false; + InitFileHdr.AuxHeaderSize = AuxFileHdrSize; // Initialize the auxiliary file header. - if (Obj.AuxHeader) + if (AuxFileHdrSize) initAuxFileHeader(); // Initialize the string table. 
@@ -357,18 +427,14 @@ void XCOFFWriter::writeFileHeader() { : InitFileHdr.NumberOfSections); W.write(Obj.Header.TimeStamp); if (Is64Bit) { - W.write(Obj.Header.SymbolTableOffset - ? Obj.Header.SymbolTableOffset - : InitFileHdr.SymbolTableOffset); + W.write(InitFileHdr.SymbolTableOffset); W.write(InitFileHdr.AuxHeaderSize); W.write(Obj.Header.Flags); W.write(Obj.Header.NumberOfSymTableEntries ? Obj.Header.NumberOfSymTableEntries : InitFileHdr.NumberOfSymTableEntries); } else { - W.write(Obj.Header.SymbolTableOffset - ? Obj.Header.SymbolTableOffset - : InitFileHdr.SymbolTableOffset); + W.write(InitFileHdr.SymbolTableOffset); W.write(Obj.Header.NumberOfSymTableEntries ? Obj.Header.NumberOfSymTableEntries : InitFileHdr.NumberOfSymTableEntries); @@ -392,6 +458,9 @@ void XCOFFWriter::writeAuxFileHeader() { W.write(InitAuxFileHdr.EntryPointAddr.value_or(yaml::Hex64(0))); W.write(InitAuxFileHdr.TextStartAddr.value_or(yaml::Hex64(0))); W.write(InitAuxFileHdr.DataStartAddr.value_or(yaml::Hex64(0))); + // A short 32-bit auxiliary header ends here. + if (InitFileHdr.AuxHeaderSize == XCOFF::AuxFileHeaderSizeShort) + return; W.write(InitAuxFileHdr.TOCAnchorAddr.value_or(yaml::Hex64(0))); } W.write(InitAuxFileHdr.SecNumOfEntryPoint.value_or(0)); @@ -434,50 +503,39 @@ void XCOFFWriter::writeAuxFileHeader() { InitAuxFileHdr.Flag.value_or(yaml::Hex16(XCOFF::SHR_SYMTAB))); if (InitFileHdr.AuxHeaderSize > XCOFF::AuxFileHeaderSize64) W.OS.write_zeros(InitFileHdr.AuxHeaderSize - XCOFF::AuxFileHeaderSize64); - } else if (InitFileHdr.AuxHeaderSize > XCOFF::AuxFileHeaderSize32) { - W.OS.write_zeros(InitFileHdr.AuxHeaderSize - XCOFF::AuxFileHeaderSize32); + } else { + if (InitFileHdr.AuxHeaderSize > XCOFF::AuxFileHeaderSize32) + W.OS.write_zeros(InitFileHdr.AuxHeaderSize - XCOFF::AuxFileHeaderSize32); } } -void XCOFFWriter::writeSectionHeader() { +void XCOFFWriter::writeSectionHeaders() { for (uint16_t I = 0, E = Obj.Sections.size(); I < E; ++I) { - XCOFFYAML::Section YamlSec = Obj.Sections[I]; XCOFFYAML::Section DerivedSec = InitSections[I]; - writeName(YamlSec.SectionName, W); - // Virtual address is the same as physical address. - uint64_t SectionAddress = - YamlSec.Address ? YamlSec.Address : DerivedSec.Address; + writeName(DerivedSec.SectionName, W); if (Is64Bit) { - W.write(SectionAddress); // Physical address - W.write(SectionAddress); // Virtual address - W.write(YamlSec.Size ? YamlSec.Size : DerivedSec.Size); - W.write(YamlSec.FileOffsetToData ? YamlSec.FileOffsetToData - : DerivedSec.FileOffsetToData); - W.write(YamlSec.FileOffsetToRelocations - ? YamlSec.FileOffsetToRelocations - : DerivedSec.FileOffsetToRelocations); - W.write(YamlSec.FileOffsetToLineNumbers); - W.write(YamlSec.NumberOfRelocations - ? YamlSec.NumberOfRelocations - : DerivedSec.NumberOfRelocations); - W.write(YamlSec.NumberOfLineNumbers); - W.write(YamlSec.Flags); + // Virtual address is the same as physical address. + W.write(DerivedSec.Address); // Physical address + W.write(DerivedSec.Address); // Virtual address + W.write(DerivedSec.Size); + W.write(DerivedSec.FileOffsetToData); + W.write(DerivedSec.FileOffsetToRelocations); + W.write(DerivedSec.FileOffsetToLineNumbers); + W.write(DerivedSec.NumberOfRelocations); + W.write(DerivedSec.NumberOfLineNumbers); + W.write(DerivedSec.Flags); W.OS.write_zeros(4); } else { - W.write(SectionAddress); // Physical address - W.write(SectionAddress); // Virtual address - W.write(YamlSec.Size ? YamlSec.Size : DerivedSec.Size); - W.write(YamlSec.FileOffsetToData ? 
YamlSec.FileOffsetToData - : DerivedSec.FileOffsetToData); - W.write(YamlSec.FileOffsetToRelocations - ? YamlSec.FileOffsetToRelocations - : DerivedSec.FileOffsetToRelocations); - W.write(YamlSec.FileOffsetToLineNumbers); - W.write(YamlSec.NumberOfRelocations - ? YamlSec.NumberOfRelocations - : DerivedSec.NumberOfRelocations); - W.write(YamlSec.NumberOfLineNumbers); - W.write(YamlSec.Flags); + // Virtual address is the same as physical address. + W.write(DerivedSec.Address); // Physical address + W.write(DerivedSec.Address); // Virtual address + W.write(DerivedSec.Size); + W.write(DerivedSec.FileOffsetToData); + W.write(DerivedSec.FileOffsetToRelocations); + W.write(DerivedSec.FileOffsetToLineNumbers); + W.write(DerivedSec.NumberOfRelocations); + W.write(DerivedSec.NumberOfLineNumbers); + W.write(DerivedSec.Flags); } } } @@ -487,8 +545,8 @@ bool XCOFFWriter::writeSectionData() { XCOFFYAML::Section YamlSec = Obj.Sections[I]; if (YamlSec.SectionData.binary_size()) { // Fill the padding size with zeros. - int64_t PaddingSize = - InitSections[I].FileOffsetToData - (W.OS.tell() - StartOffset); + int64_t PaddingSize = (uint64_t)InitSections[I].FileOffsetToData - + (W.OS.tell() - StartOffset); if (PaddingSize < 0) { ErrHandler("redundant data was written before section data"); return false; @@ -685,7 +743,7 @@ bool XCOFFWriter::writeAuxSymbol( bool XCOFFWriter::writeSymbols() { int64_t PaddingSize = - (uint64_t)InitFileHdr.SymbolTableOffset - (W.OS.tell() - StartOffset); + InitFileHdr.SymbolTableOffset - (W.OS.tell() - StartOffset); if (PaddingSize < 0) { ErrHandler("redundant data was written before symbols"); return false; @@ -797,10 +855,10 @@ bool XCOFFWriter::writeXCOFF() { return false; StartOffset = W.OS.tell(); writeFileHeader(); - if (Obj.AuxHeader) + if (InitFileHdr.AuxHeaderSize) writeAuxFileHeader(); if (!Obj.Sections.empty()) { - writeSectionHeader(); + writeSectionHeaders(); if (!writeSectionData()) return false; if (!writeRelocations()) diff --git a/llvm/test/tools/llvm-objcopy/XCOFF/invalid-read.test b/llvm/test/tools/llvm-objcopy/XCOFF/invalid-read.test index 1df6340..96dcd72 100644 --- a/llvm/test/tools/llvm-objcopy/XCOFF/invalid-read.test +++ b/llvm/test/tools/llvm-objcopy/XCOFF/invalid-read.test @@ -5,7 +5,7 @@ # RUN: yaml2obj %s --docnum=1 -o %t1 # RUN: not llvm-objcopy %t1 %t1.out 2>&1 | FileCheck %s -DFILE=%t1 --check-prefix=ERROR1 -# ERROR1: error: '[[FILE]]': The end of the file was unexpectedly encountered: section data with offset 0x70 and size 0x4 goes past the end of the file +# ERROR1: error: '[[FILE]]': The end of the file was unexpectedly encountered: section data with offset 0x70 and size 0x20 goes past the end of the file --- !XCOFF FileHeader: @@ -13,6 +13,7 @@ FileHeader: Sections: - SectionData: '00007400' FileOffsetToData: 0x70 + Size: 0x20 ## Failed to read relocations. 
# RUN: yaml2obj %s --docnum=2 -o %t2 @@ -35,12 +36,13 @@ Sections: # RUN: yaml2obj %s --docnum=3 -o %t3 # RUN: not llvm-objcopy %t3 %t3.out 2>&1 | FileCheck %s -DFILE=%t3 --check-prefix=ERROR3 -# ERROR3: error: '[[FILE]]': The end of the file was unexpectedly encountered: symbol table with offset 0x15 and size 0x24 goes past the end of the file +# ERROR3: error: '[[FILE]]': The end of the file was unexpectedly encountered: symbol table with offset 0x15 and size 0x36 goes past the end of the file --- !XCOFF FileHeader: MagicNumber: 0x01DF OffsetToSymbolTable: 0x15 + EntriesInSymbolTable: 3 Symbols: - Name: foo AuxEntries: diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-traceback-table.test b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-traceback-table.test index 91354f5..96cac6b 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-traceback-table.test +++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-traceback-table.test @@ -112,4 +112,4 @@ Symbols: # CHECK-NEXT: 70: 00 00 00 00 # CHECK-NEXT: ... # CHECK-NEXT: 7c: 00 12 34 00 -# CHECK-NEXT: 80: 00 00 00 00 +# CHECK-NEXT: 80: 00 00 00 diff --git a/llvm/test/tools/llvm-objdump/XCOFF/section-headers.test b/llvm/test/tools/llvm-objdump/XCOFF/section-headers.test index e80d5f6..1a110fb 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/section-headers.test +++ b/llvm/test/tools/llvm-objdump/XCOFF/section-headers.test @@ -10,7 +10,7 @@ # CHECK-NEXT: 1 .data 00000004 00000000 DATA # CHECK-NEXT: 2 .bss 00000000 00000010 BSS # CHECK-NEXT: 3 .tdata 00000004 00000000 DATA -# CHECK-NEXT: 4 .tbss 00000000 00000000 BSS +# CHECK-NEXT: 4 .tbss 00000000 00000004 BSS # CHECK-NEXT: 5 .dwline 00000046 00000000 DEBUG # CHECK-NEXT: 6 .debug 00000046 00000000 DEBUG diff --git a/llvm/test/tools/llvm-readobj/XCOFF/file-header.test b/llvm/test/tools/llvm-readobj/XCOFF/file-header.test index 8cbd847..2407aef 100644 --- a/llvm/test/tools/llvm-readobj/XCOFF/file-header.test +++ b/llvm/test/tools/llvm-readobj/XCOFF/file-header.test @@ -23,7 +23,6 @@ FileHeader: CreationTime: [[CREATTIME=1]] EntriesInSymbolTable: [[SYMBOLCOUNT=1]] NumberOfSections: 1 - OffsetToSymbolTable: 0x3C AuxiliaryHeaderSize: 0 Flags: 0x12 Sections: @@ -42,7 +41,7 @@ Symbols: # FILEHEADER64-NEXT: Magic: 0x1F7 # FILEHEADER64-NEXT: NumberOfSections: 1 # FILEHEADER64-NEXT: TimeStamp: None (0x0) -# FILEHEADER64-NEXT: SymbolTableOffset: 0x3C +# FILEHEADER64-NEXT: SymbolTableOffset: 0x60 # FILEHEADER64-NEXT: SymbolTableEntries: 1 # FILEHEADER64-NEXT: OptionalHeaderSize: 0x0 # FILEHEADER64-NEXT: Flags: 0x12 diff --git a/llvm/test/tools/llvm-readobj/XCOFF/sections.test b/llvm/test/tools/llvm-readobj/XCOFF/sections.test index be09893..36e85d6 100644 --- a/llvm/test/tools/llvm-readobj/XCOFF/sections.test +++ b/llvm/test/tools/llvm-readobj/XCOFF/sections.test @@ -13,7 +13,7 @@ # SEC32-NEXT: Name: .text # SEC32-NEXT: PhysicalAddress: 0x0 # SEC32-NEXT: VirtualAddress: 0x0 -# SEC32-NEXT: Size: 0x4 +# SEC32-NEXT: Size: 0x2 # SEC32-NEXT: RawDataOffset: 0x64 # SEC32-NEXT: RelocationPointer: 0x0 # SEC32-NEXT: LineNumberPointer: 0x0 @@ -24,11 +24,11 @@ # SEC32-NEXT: Section { # SEC32-NEXT: Index: 2 # SEC32-NEXT: Name: .data -# SEC32-NEXT: PhysicalAddress: 0x4 -# SEC32-NEXT: VirtualAddress: 0x4 -# SEC32-NEXT: Size: 0x4 +# SEC32-NEXT: PhysicalAddress: 0x0 +# SEC32-NEXT: VirtualAddress: 0x0 +# SEC32-NEXT: Size: 0x2 # SEC32-NEXT: RawDataOffset: 0x68 -# SEC32-NEXT: RelocationPointer: 0x6C +# SEC32-NEXT: RelocationPointer: 0x6A # SEC32-NEXT: LineNumberPointer: 0x0 # SEC32-NEXT: NumberOfRelocations: 1 # 
SEC32-NEXT: NumberOfLineNumbers: 0 @@ -65,7 +65,7 @@ Sections: # SEC64-NEXT: Name: .text # SEC64-NEXT: PhysicalAddress: 0x0 # SEC64-NEXT: VirtualAddress: 0x0 -# SEC64-NEXT: Size: 0x4 +# SEC64-NEXT: Size: 0x2 # SEC64-NEXT: RawDataOffset: 0xA8 # SEC64-NEXT: RelocationPointer: 0x0 # SEC64-NEXT: LineNumberPointer: 0x0 @@ -76,11 +76,11 @@ Sections: # SEC64-NEXT: Section { # SEC64-NEXT: Index: 2 # SEC64-NEXT: Name: .data -# SEC64-NEXT: PhysicalAddress: 0x4 -# SEC64-NEXT: VirtualAddress: 0x4 -# SEC64-NEXT: Size: 0x4 +# SEC64-NEXT: PhysicalAddress: 0x0 +# SEC64-NEXT: VirtualAddress: 0x0 +# SEC64-NEXT: Size: 0x2 # SEC64-NEXT: RawDataOffset: 0xAC -# SEC64-NEXT: RelocationPointer: 0xB0 +# SEC64-NEXT: RelocationPointer: 0xAE # SEC64-NEXT: LineNumberPointer: 0x0 # SEC64-NEXT: NumberOfRelocations: 1 # SEC64-NEXT: NumberOfLineNumbers: 0 diff --git a/llvm/test/tools/obj2yaml/XCOFF/aix.yaml b/llvm/test/tools/obj2yaml/XCOFF/aix.yaml index 9f2f68b..12f44d0 100644 --- a/llvm/test/tools/obj2yaml/XCOFF/aix.yaml +++ b/llvm/test/tools/obj2yaml/XCOFF/aix.yaml @@ -31,7 +31,7 @@ # CHECK32-NEXT: Info: 0xF # CHECK32-NEXT: Type: 0x3 # CHECK32-NEXT: - Name: .data -# CHECK32-NEXT: Address: 0x4 +# CHECK32-NEXT: Address: 0x0 # CHECK32-NEXT: Size: 0x4 # CHECK32-NEXT: FileOffsetToData: 0x68 # CHECK32-NEXT: FileOffsetToRelocations: 0x76 @@ -105,7 +105,7 @@ # CHECK64-NEXT: Info: 0xF # CHECK64-NEXT: Type: 0x3 # CHECK64-NEXT: - Name: .data -# CHECK64-NEXT: Address: 0x4 +# CHECK64-NEXT: Address: 0x0 # CHECK64-NEXT: Size: 0x4 # CHECK64-NEXT: FileOffsetToData: 0xAC # CHECK64-NEXT: FileOffsetToRelocations: 0xBE diff --git a/llvm/test/tools/obj2yaml/XCOFF/invalid-section.yaml b/llvm/test/tools/obj2yaml/XCOFF/invalid-section.yaml index 1e16c5f..0e16a47 100644 --- a/llvm/test/tools/obj2yaml/XCOFF/invalid-section.yaml +++ b/llvm/test/tools/obj2yaml/XCOFF/invalid-section.yaml @@ -5,7 +5,7 @@ # RUN: yaml2obj %s --docnum=1 -o %t1 # RUN: not obj2yaml %t1 2>&1 | FileCheck %s -DFILE=%t1 --check-prefix=ERROR1 -# ERROR1: The end of the file was unexpectedly encountered: section data with offset 0x70 and size 0x4 goes past the end of the file +# ERROR1: The end of the file was unexpectedly encountered: section data with offset 0x70 and size 0x20 goes past the end of the file --- !XCOFF FileHeader: @@ -13,6 +13,7 @@ FileHeader: Sections: - SectionData: '00007400' FileOffsetToData: 0x70 + Size: 0x20 ## Error2: failed to get relocations. 
# RUN: yaml2obj %s --docnum=2 -o %t2 diff --git a/llvm/test/tools/yaml2obj/XCOFF/aux-hdr-defaults.yaml b/llvm/test/tools/yaml2obj/XCOFF/aux-hdr-defaults.yaml index f6d6193..a93123b 100644 --- a/llvm/test/tools/yaml2obj/XCOFF/aux-hdr-defaults.yaml +++ b/llvm/test/tools/yaml2obj/XCOFF/aux-hdr-defaults.yaml @@ -10,12 +10,12 @@ # CASE1: AuxiliaryHeader { # CASE1-NEXT: Magic: 0x10B # CASE1-NEXT: Version: 0x1 -# CASE1-NEXT: Size of .text section: 0x8 -# CASE1-NEXT: Size of .data section: 0x8 -# CASE1-NEXT: Size of .bss section: 0x8 +# CASE1-NEXT: Size of .text section: 0x5 +# CASE1-NEXT: Size of .data section: 0x5 +# CASE1-NEXT: Size of .bss section: 0x5 # CASE1-NEXT: Entry point address: 0x0 -# CASE1-NEXT: .text section start address: 0x4 -# CASE1-NEXT: .data section start address: 0x10 +# CASE1-NEXT: .text section start address: 0x0 +# CASE1-NEXT: .data section start address: 0x0 # CASE1-NEXT: TOC anchor address: 0x0 # CASE1-NEXT: Section number of entryPoint: 0 # CASE1-NEXT: Section number of .text: 2 @@ -79,16 +79,12 @@ Sections: # RUN: yaml2obj %s --docnum=1 -DMAGIC=0x1F7 -o %t2 # RUN: llvm-readobj --auxiliary-header %t2 | FileCheck %s --check-prefix=CASE2 -## Case2: same as case1, except it is 64-bit. -# RUN: yaml2obj %s --docnum=1 -DMAGIC=0x1F7 -o %t2 -# RUN: llvm-readobj --auxiliary-header %t2 | FileCheck %s --check-prefix=CASE2 - # CASE2: AuxiliaryHeader { # CASE2-NEXT: Magic: 0x10B # CASE2-NEXT: Version: 0x1 # CASE2-NEXT: Reserved for debugger: 0x0 -# CASE2-NEXT: .text section start address: 0x2 -# CASE2-NEXT: .data section start address: 0xE +# CASE2-NEXT: .text section start address: 0x0 +# CASE2-NEXT: .data section start address: 0x0 # CASE2-NEXT: TOC anchor address: 0x0 # CASE2-NEXT: Section number of entryPoint: 0 # CASE2-NEXT: Section number of .text: 2 @@ -106,9 +102,9 @@ Sections: # CASE2-NEXT: Stack page size: 0x0 # CASE2-NEXT: Flag: 0x80 # CASE2-NEXT: Alignment of thread-local storage: 0x0 -# CASE2-NEXT: Size of .text section: 0x8 -# CASE2-NEXT: Size of .data section: 0x8 -# CASE2-NEXT: Size of .bss section: 0x8 +# CASE2-NEXT: Size of .text section: 0x5 +# CASE2-NEXT: Size of .data section: 0x5 +# CASE2-NEXT: Size of .bss section: 0x5 # CASE2-NEXT: Entry point address: 0x0 # CASE2-NEXT: Maximum stack size: 0x0 # CASE2-NEXT: Maximum data size: 0x0 diff --git a/llvm/test/tools/yaml2obj/XCOFF/basic-doc.yaml b/llvm/test/tools/yaml2obj/XCOFF/basic-doc.yaml index ed85bc6..8c3d77d 100644 --- a/llvm/test/tools/yaml2obj/XCOFF/basic-doc.yaml +++ b/llvm/test/tools/yaml2obj/XCOFF/basic-doc.yaml @@ -65,8 +65,8 @@ Symbols: # CHECK-NEXT: Section { # CHECK-NEXT: Index: 2 # CHECK-NEXT: Name: .data -# CHECK-NEXT: PhysicalAddress: 0x8 -# CHECK-NEXT: VirtualAddress: 0x8 +# CHECK-NEXT: PhysicalAddress: 0x0 +# CHECK-NEXT: VirtualAddress: 0x0 # CHECK-NEXT: Size: 0x8 # CHECK-NEXT: RawDataOffset: 0xE4 # CHECK-NEXT: RelocationPointer: 0xF0 diff --git a/llvm/test/tools/yaml2obj/XCOFF/offset-check.yaml b/llvm/test/tools/yaml2obj/XCOFF/offset-check.yaml new file mode 100644 index 0000000..ee23a16 --- /dev/null +++ b/llvm/test/tools/yaml2obj/XCOFF/offset-check.yaml @@ -0,0 +1,91 @@ +## Report errors when specified file offsets are invalid. 
+ +# RUN: not yaml2obj %s -DTEXTRAWDATAOFFSET=0xFFFFFFF0 -o %t 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERROR1 +# ERROR1: current file offset (4294967288) is bigger than the specified FileOffsetToData for the .data section (108) + +# RUN: not yaml2obj %s -DDATARAWDATAOFFSET=0xFFFFFFF0 -o %t 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERROR2 +# ERROR2: current file offset (4294967284) is bigger than the specified FileOffsetToRelocations for the .text section (112) + +# RUN: not yaml2obj %s -DRELOCOFFSET=0xFFFFFFF0 -o %t 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERROR3 +# ERROR3: current file offset (4294967290) is bigger than the specified SymbolTableOffset (122) + +# RUN: not yaml2obj %s -DSYMTABOFFSET=0x100000000 -o %t 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERROR4 +# ERROR4: maximum object size of 4294967295 exceeded when writing symbols + +# RUN: not yaml2obj %s -DRELOCOFFSET=0x100000000 -o %t 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERROR5 +# ERROR5: maximum object size (4294967295) exceeded when writing relocation data for section .text + +# RUN: not yaml2obj %s -DDATARAWDATAOFFSET=0x100000000 -o %t 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERROR6 +# ERROR6: maximum object size (4294967295) exceeded when writing data for section 2 (.data) + +--- !XCOFF +FileHeader: + MagicNumber: 0x1DF + NumberOfSections: 2 + OffsetToSymbolTable: [[SYMTABOFFSET=0x7A]] + EntriesInSymbolTable: 6 + AuxiliaryHeaderSize: 0 + Flags: 0x0 +Sections: + - Name: .text + Address: 0x0 + Size: 0x8 + FileOffsetToData: [[TEXTRAWDATAOFFSET=0x64]] + FileOffsetToRelocations: [[RELOCOFFSET=0x70]] + NumberOfRelocations: 0x1 + Flags: [ STYP_TEXT ] + SectionData: 386000004BFFFFFC + Relocations: + - Address: 0x4 + Symbol: 0x2 + Info: 0x99 + Type: 0x1A + - Name: .data + Address: 0x0 + Size: 0x4 + FileOffsetToData: [[DATARAWDATAOFFSET=0x6C]] + FileOffsetToRelocations: 0 + Flags: [ STYP_DATA ] + SectionData: 3210ABCD +Symbols: + - Name: .bar + Value: 0x0 + Section: N_UNDEF + Type: 0x0 + StorageClass: C_EXT + NumberOfAuxEntries: 1 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0 + StorageMappingClass: XMC_PR + SectionOrLength: 0 + - Name: '' + Value: 0x0 + Section: .text + Type: 0x0 + StorageClass: C_HIDEXT + NumberOfAuxEntries: 1 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 17 + StorageMappingClass: XMC_PR + SectionOrLength: 8 + - Name: .foo + Value: 0x0 + Section: .text + Type: 0x0 + StorageClass: C_EXT + NumberOfAuxEntries: 1 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 2 + StorageMappingClass: XMC_PR + SectionOrLength: 2 +StringTable: {} +... -- cgit v1.1 From b1b8a383fcdab007ccd1a5daa08cb33ce7cbc6c0 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 9 Feb 2024 15:25:24 +0100 Subject: [InstCombine] Remove one-use restriction on icmp of gep fold (#76730) The fold for icmp (gep (p, i1), gep (p, i2)) to icmp (i1, i2) is currently limited to one of the GEPs either having one use or a constant offset. I believe this is to avoid duplicating complex arithmetic both in the GEP and the offset comparison. This patch instead does the same thing that the indexed compare fold does, which is to rewrite the GEP into i8 form if necessary, so that the offset arithmetic is not repeated after the transform. I ran into this problem in a case where there are multiple conditions on the same pointer, which prevents them from getting folded. 
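A C-level analogue of the motivating case may help (an illustrative sketch, not code from the patch; use() and precedes() are invented names standing in for whatever keeps the pointers alive):

    extern void use(void *);

    // Both pointers below have extra uses, so the old one-use restriction
    // blocked the fold. With this patch the GEPs are rewritten through byte
    // offsets, and the pointer compare becomes an integer compare of those
    // offsets, without duplicating the index arithmetic.
    bool precedes(int *foo, long i, long j) {
      int *gep1 = foo + i;            // byte offset: i << 2
      short *gep2 = (short *)foo + j; // byte offset: j << 1
      use(gep1);
      use(gep2);
      return (void *)gep1 < (void *)gep2; // folds to (i << 2) < (j << 1)
    }

This is the shape of the updated CHECK lines in test60_extra_use below: the inbounds GEPs are re-emitted as getelementptr inbounds i8 over shl nsw offsets, and the icmp ult ptr becomes an icmp slt i64 on those offsets.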
--- .../Transforms/InstCombine/InstCombineCompares.cpp | 30 +++++++++++++++++----- llvm/test/Transforms/InstCombine/icmp-custom-dl.ll | 4 +-- llvm/test/Transforms/InstCombine/icmp-gep.ll | 17 +++++++----- 3 files changed, 35 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index cbb6988..280c4d7 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -813,14 +813,30 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, } } - // Only lower this if the icmp is the only user of the GEP or if we expect - // the result to fold to a constant! - if ((GEPsInBounds || CmpInst::isEquality(Cond)) && - (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) && - (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse())) { + if (GEPsInBounds || CmpInst::isEquality(Cond)) { + auto EmitGEPOffsetAndRewrite = [&](GEPOperator *GEP) { + IRBuilderBase::InsertPointGuard Guard(Builder); + auto *Inst = dyn_cast(GEP); + if (Inst) + Builder.SetInsertPoint(Inst); + + Value *Offset = EmitGEPOffset(GEP); + // If a non-trivial GEP has other uses, rewrite it to avoid duplicating + // the offset arithmetic. + if (Inst && !GEP->hasOneUse() && !GEP->hasAllConstantIndices() && + !GEP->getSourceElementType()->isIntegerTy(8)) { + replaceInstUsesWith(*Inst, + Builder.CreateGEP(Builder.getInt8Ty(), + GEP->getPointerOperand(), + Offset, "", GEPsInBounds)); + eraseInstFromFunction(*Inst); + } + return Offset; + }; + // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2) - Value *L = EmitGEPOffset(GEPLHS); - Value *R = EmitGEPOffset(GEPRHS); + Value *L = EmitGEPOffsetAndRewrite(GEPLHS); + Value *R = EmitGEPOffsetAndRewrite(GEPRHS); return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R); } } diff --git a/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll b/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll index 491f214..a595ddb 100644 --- a/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll +++ b/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll @@ -40,8 +40,8 @@ define i1 @test59_as1(ptr addrspace(1) %foo) { define i1 @test60(ptr %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test60( ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[I:%.*]] to i32 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i32 ; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i32 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[GEP1_IDX]], [[TMP2]] ; CHECK-NEXT: ret i1 [[CMP]] ; @@ -54,8 +54,8 @@ define i1 @test60(ptr %foo, i64 %i, i64 %j) { define i1 @test60_as1(ptr addrspace(1) %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test60_as1( ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[I:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i16 ; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i16 [[TMP1]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i16 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[GEP1_IDX]], [[TMP2]] ; CHECK-NEXT: ret i1 [[CMP]] ; diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll index d912f96..a0e03a5 100644 --- a/llvm/test/Transforms/InstCombine/icmp-gep.ll +++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll @@ -313,8 +313,8 @@ define i1 @test_gep_eq_no_inbounds(ptr %foo, i64 %i, i64 %j) { define i1 @test60_as1(ptr addrspace(1) %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test60_as1( ; CHECK-NEXT: [[TMP1:%.*]] = 
trunc i64 [[I:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i16 ; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i16 [[TMP1]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i16 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[GEP1_IDX]], [[TMP2]] ; CHECK-NEXT: ret i1 [[CMP]] ; @@ -400,11 +400,13 @@ define i1 @test61_as1(ptr addrspace(1) %foo, i16 %i, i16 %j) { define i1 @test60_extra_use(ptr %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test60_extra_use( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[FOO:%.*]], i64 [[I:%.*]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i16, ptr [[FOO]], i64 [[J:%.*]] +; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i64 [[I:%.*]], 2 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i64 [[GEP1_IDX]] +; CHECK-NEXT: [[GEP2_IDX:%.*]] = shl nsw i64 [[J:%.*]], 1 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 [[GEP2_IDX]] ; CHECK-NEXT: call void @use(ptr [[GEP1]]) ; CHECK-NEXT: call void @use(ptr [[GEP2]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ult ptr [[GEP1]], [[GEP2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[GEP1_IDX]], [[GEP2_IDX]] ; CHECK-NEXT: ret i1 [[CMP]] ; %gep1 = getelementptr inbounds i32, ptr %foo, i64 %i @@ -446,13 +448,14 @@ define i1 @test60_extra_use_const_operands_no_inbounds(ptr %foo, i64 %i, i64 %j) define void @test60_extra_use_fold(ptr %foo, i64 %start.idx, i64 %end.offset) { ; CHECK-LABEL: @test60_extra_use_fold( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[FOO:%.*]], i64 [[START_IDX:%.*]] +; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i64 [[START_IDX:%.*]], 2 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i64 [[GEP1_IDX]] ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 [[END_OFFSET:%.*]] ; CHECK-NEXT: call void @use(ptr [[GEP1]]) ; CHECK-NEXT: call void @use(ptr [[GEP2]]) -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[GEP1]], [[GEP2]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 [[GEP1_IDX]], [[END_OFFSET]] ; CHECK-NEXT: call void @use.i1(i1 [[CMP1]]) -; CHECK-NEXT: [[CMP2:%.*]] = icmp ult ptr [[GEP1]], [[GEP2]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i64 [[GEP1_IDX]], [[END_OFFSET]] ; CHECK-NEXT: call void @use.i1(i1 [[CMP2]]) ; CHECK-NEXT: ret void ; -- cgit v1.1 From fcb59203c8b883aa39d22cf9788c48dbbb734932 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 9 Feb 2024 10:05:26 -0500 Subject: [AMDGPU][DOC] Add MI200 Names to AMDGPUUsage Doc (#81252) --- llvm/docs/AMDGPUUsage.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 6b24171..f463e83 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -357,12 +357,12 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following Add product names. - ``gfx90a`` ``amdgcn`` dGPU - sramecc - Absolute - *rocm-amdhsa* *TBA* - - tgsplit flat - - xnack scratch .. TODO:: + ``gfx90a`` ``amdgcn`` dGPU - sramecc - Absolute - *rocm-amdhsa* - AMD Instinct MI210 Accelerator + - tgsplit flat - *rocm-amdhsa* - AMD Instinct MI250 Accelerator + - xnack scratch - *rocm-amdhsa* - AMD Instinct MI250X Accelerator - kernarg preload - Packed - work-item Add product - IDs names. 
+ work-item + IDs ``gfx90c`` ``amdgcn`` APU - xnack - Absolute - *pal-amdpal* - Ryzen 7 4700G flat - Ryzen 7 4700GE -- cgit v1.1 From 50c5107f42a88a1d2ab66dc6cd1f2cfee6707f7d Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Fri, 9 Feb 2024 19:20:10 +0400 Subject: [clang] Add tests for DRs about inheriting constructors (#79981) Covers CWG issues [1150](https://cplusplus.github.io/CWG/issues/1150.html), [1487](https://cplusplus.github.io/CWG/issues/1487.html), [1567](https://cplusplus.github.io/CWG/issues/1567.html), [1738](https://cplusplus.github.io/CWG/issues/1738.html), [2273](https://cplusplus.github.io/CWG/issues/2273.html), [2277](https://cplusplus.github.io/CWG/issues/2277.html), [2356](https://cplusplus.github.io/CWG/issues/2356.html), [2504](https://cplusplus.github.io/CWG/issues/2504.html). On top of the wording in proposed resolutions, [P0136R1](https://wg21.link/p0136r1) "Rewording inheriting constructors (core issue 1941 et al)" is a very relevant paper. Note that status for 1738 `sup P0136R1` is not officially recognized by CWG, but saying `yes` or `no` seems even more confusing to me. Official resolution is to reject certain code, but Clang is the only implementation that still rejects it to this day: https://godbolt.org/z/b1W8jc1o5. GCC rejected it until 9, now it's accepted: https://godbolt.org/z/of6oh4sdT --- clang/test/CXX/drs/dr11xx.cpp | 2 ++ clang/test/CXX/drs/dr14xx.cpp | 24 ++++++++++++++++++++++++ clang/test/CXX/drs/dr15xx.cpp | 39 +++++++++++++++++++++++++++++++++++++++ clang/test/CXX/drs/dr17xx.cpp | 17 +++++++++++++++++ clang/test/CXX/drs/dr22xx.cpp | 41 +++++++++++++++++++++++++++++++++++++++++ clang/test/CXX/drs/dr23xx.cpp | 25 +++++++++++++++++++++++++ clang/test/CXX/drs/dr2504.cpp | 37 +++++++++++++++++++++++++++++++++++++ clang/test/CXX/drs/dr25xx.cpp | 2 ++ clang/www/cxx_dr_status.html | 16 ++++++++-------- 9 files changed, 195 insertions(+), 8 deletions(-) create mode 100644 clang/test/CXX/drs/dr2504.cpp diff --git a/clang/test/CXX/drs/dr11xx.cpp b/clang/test/CXX/drs/dr11xx.cpp index 86e726a..a71a105 100644 --- a/clang/test/CXX/drs/dr11xx.cpp +++ b/clang/test/CXX/drs/dr11xx.cpp @@ -70,3 +70,5 @@ namespace dr1113 { // dr1113: partial } void g() { f(); } } + +// dr1150: na diff --git a/clang/test/CXX/drs/dr14xx.cpp b/clang/test/CXX/drs/dr14xx.cpp index d262f6f..58a2b3a 100644 --- a/clang/test/CXX/drs/dr14xx.cpp +++ b/clang/test/CXX/drs/dr14xx.cpp @@ -614,6 +614,30 @@ enum E2 : S::I { e }; #endif } // namespace dr1482 +namespace dr1487 { // dr1487: 3.3 +#if __cplusplus >= 201103L +struct A { // #dr1482-A + struct B { + using A::A; + // since-cxx11-error@-1 {{using declaration refers into 'A::', which is not a base class of 'B'}} + }; + + struct C : A { + // since-cxx11-error@-1 {{base class has incomplete type}} + // since-cxx11-note@#dr1482-A {{definition of 'dr1487::A' is not complete until the closing '}'}} + using A::A; + // since-cxx11-error@-1 {{using declaration refers into 'A::', which is not a base class of 'C'}} + }; + + struct D; +}; + +struct D : A { + using A::A; +}; +#endif +} // namespace dr1487 + namespace dr1490 { // dr1490: 3.7 c++11 #if __cplusplus >= 201103L // List-initialization from a string literal diff --git a/clang/test/CXX/drs/dr15xx.cpp b/clang/test/CXX/drs/dr15xx.cpp index 3d4050a..ac503db 100644 --- a/clang/test/CXX/drs/dr15xx.cpp +++ b/clang/test/CXX/drs/dr15xx.cpp @@ -360,6 +360,45 @@ namespace dr1563 { // dr1563: yes #endif } +namespace dr1567 { // dr1567: 3.3 +#if __cplusplus >= 201103L +struct B; +struct A { + A(const 
A&); + A(const B&) = delete; + A(A&&); + A(B&&) = delete; + A(int); // #dr1567-A-int +}; + +struct B: A { // #dr1567-B + using A::A; // #dr1567-using-A + B(double); // #dr1567-B-double +}; + +A a{0}; +B b{1.0}; +// Good, deleted converting ctors are not inherited as copy/move ctors +B b2{b}; +B b3{B{1.0}}; +// Good, copy/move ctors are not inherited +B b4{a}; +// since-cxx11-error@-1 {{no matching constructor for initialization of 'B'}} +// since-cxx11-note@#dr1567-A-int {{candidate inherited constructor not viable: no known conversion from 'A' to 'int' for 1st argument}} +// since-cxx11-note@#dr1567-using-A {{constructor from base class 'A' inherited here}} +// since-cxx11-note@#dr1567-B {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'A' to 'const B' for 1st argument}} +// since-cxx11-note@#dr1567-B {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'A' to 'B' for 1st argument}} +// since-cxx11-note@#dr1567-B-double {{candidate constructor not viable: no known conversion from 'A' to 'double' for 1st argument}} +B b5{A{0}}; +// since-cxx11-error@-1 {{no matching constructor for initialization of 'B'}} +// since-cxx11-note@#dr1567-A-int {{candidate inherited constructor not viable: no known conversion from 'A' to 'int' for 1st argument}} +// since-cxx11-note@#dr1567-using-A {{constructor from base class 'A' inherited here}} +// since-cxx11-note@#dr1567-B {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'A' to 'const B' for 1st argument}} +// since-cxx11-note@#dr1567-B {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'A' to 'B' for 1st argument}} +// since-cxx11-note@#dr1567-B-double {{candidate constructor not viable: no known conversion from 'A' to 'double' for 1st argument}} +#endif +} + namespace dr1573 { // dr1573: 3.9 #if __cplusplus >= 201103L // ellipsis is inherited (p0136r1 supersedes this part). 
diff --git a/clang/test/CXX/drs/dr17xx.cpp b/clang/test/CXX/drs/dr17xx.cpp index 885ed00..2f7e62d 100644 --- a/clang/test/CXX/drs/dr17xx.cpp +++ b/clang/test/CXX/drs/dr17xx.cpp @@ -89,6 +89,23 @@ S s(q); // #dr1736-s #endif } +namespace dr1738 { // dr1738: sup P0136R1 +#if __cplusplus >= 201103L +struct A { + template + A(int, T) {} +}; + +struct B : A { + using A::A; +}; + +// FIXME: this is well-formed since P0136R1 +template B::B(int, double); +// since-cxx11-error@-1 {{explicit instantiation of 'B' does not refer to a function template, variable template, member function, member class, or static data member}} +#endif +} + // dr1748 is in dr1748.cpp namespace dr1753 { // dr1753: 11 diff --git a/clang/test/CXX/drs/dr22xx.cpp b/clang/test/CXX/drs/dr22xx.cpp index 1951824..3a13cb0 100644 --- a/clang/test/CXX/drs/dr22xx.cpp +++ b/clang/test/CXX/drs/dr22xx.cpp @@ -154,6 +154,47 @@ const D &d3(c); // FIXME ill-formed #endif } +namespace dr2273 { // dr2273: 3.3 +#if __cplusplus >= 201103L +struct A { + A(int = 0) = delete; // #dr2273-A +}; + +struct B : A { // #dr2273-B + using A::A; +}; + +B b; +// since-cxx11-error@-1 {{call to implicitly-deleted default constructor of 'B'}} +// since-cxx11-note@#dr2273-B {{default constructor of 'B' is implicitly deleted because base class 'A' has a deleted default constructor}} +// since-cxx11-note@#dr2273-A {{'A' has been explicitly marked deleted here}} +#endif +} + +namespace dr2277 { // dr2277: partial +#if __cplusplus >= 201103L +struct A { + A(int, int = 0); + void f(int, int = 0); // #dr2277-A-f +}; +struct B : A { + B(int); + using A::A; + + void f(int); // #dr2277-B-f + using A::f; +}; + +void g() { + B b{0}; + b.f(0); // FIXME: this is well-formed for the same reason as initialization of 'b' above + // since-cxx11-error@-1 {{call to member function 'f' is ambiguous}} + // since-cxx11-note@#dr2277-A-f {{candidate function}} + // since-cxx11-note@#dr2277-B-f {{candidate function}} +} +#endif +} + namespace dr2292 { // dr2292: 9 #if __cplusplus >= 201103L template using id = T; diff --git a/clang/test/CXX/drs/dr23xx.cpp b/clang/test/CXX/drs/dr23xx.cpp index 3f8c476..c046373 100644 --- a/clang/test/CXX/drs/dr23xx.cpp +++ b/clang/test/CXX/drs/dr23xx.cpp @@ -147,6 +147,31 @@ enum struct alignas(64) B {}; #endif } // namespace dr2354 +namespace dr2356 { // dr2356: 4 +#if __cplusplus >= 201103L +struct A { + A(); + A(A &&); // #1 + template A(T &&); // #2 +}; +struct B : A { + using A::A; + B(const B &); // #3 + B(B &&) = default; // #4, implicitly deleted + // since-cxx11-warning@-1 {{explicitly defaulted move constructor is implicitly deleted}} + // since-cxx11-note@#dr2356-X {{move constructor of 'B' is implicitly deleted because field 'x' has a deleted move constructor}} + // since-cxx11-note@#dr2356-X {{'X' has been explicitly marked deleted here}} + // since-cxx11-note@-4 {{replace 'default' with 'delete'}} + + struct X { X(X &&) = delete; } x; // #dr2356-X +}; +extern B b1; +B b2 = static_cast(b1); // calls #3: #1, #2, and #4 are not viable +struct C { operator B&&(); }; +B b3 = C(); // calls #3 +#endif +} + #if __cplusplus >= 201402L namespace dr2358 { // dr2358: 16 void f2() { diff --git a/clang/test/CXX/drs/dr2504.cpp b/clang/test/CXX/drs/dr2504.cpp new file mode 100644 index 0000000..686ea73 --- /dev/null +++ b/clang/test/CXX/drs/dr2504.cpp @@ -0,0 +1,37 @@ +// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK +// RUN: 
%clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 +// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 +// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 +// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 +// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 +// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 + +namespace dr2504 { // dr2504: no +#if __cplusplus >= 201103L +struct V { V() = default; V(int); }; +struct Q { Q(); }; +struct A : virtual V, Q { + using V::V; + A() = delete; +}; +int bar() { return 42; } +struct B : A { + B() : A(bar()) {} // ok +}; +struct C : B {}; +void foo() { C c; } // bar is not invoked, because the V subobject is not initialized as part of B +#endif +} + +// FIXME: As specified in the comment above (which comes from an example in the Standard), +// we are not supposed to unconditionally call `bar()` and call a constructor +// inherited from `V`. + +// SINCE-CXX11-LABEL: define linkonce_odr void @dr2504::B::B() +// SINCE-CXX11-NOT: br +// SINCE-CXX11: call noundef i32 @dr2504::bar() +// SINCE-CXX11-NOT: br +// SINCE-CXX11: call void @dr2504::A::A(int) +// SINCE-CXX11-LABEL: } + +// CHECK: {{.*}} diff --git a/clang/test/CXX/drs/dr25xx.cpp b/clang/test/CXX/drs/dr25xx.cpp index 502f032..b1e5480 100644 --- a/clang/test/CXX/drs/dr25xx.cpp +++ b/clang/test/CXX/drs/dr25xx.cpp @@ -10,6 +10,8 @@ // expected-no-diagnostics #endif +// dr2504 is in dr2504.cpp + namespace dr2516 { // dr2516: 3.0 // NB: reusing 1482 test #if __cplusplus >= 201103L diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 3e13a4d..4ce5c43 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -6708,7 +6708,7 @@ and POD class 1150 NAD Inheriting constructors have not been implemented - Unknown + N/A 1151 @@ -8730,7 +8730,7 @@ and POD class 1487 CD3 When are inheriting constructors declared? 
- Unknown + Clang 3.3 1488 @@ -9210,7 +9210,7 @@ and POD class 1567 C++14 Inheriting constructors and copy/move constructors - Unknown + Clang 3.3 1568 @@ -10236,7 +10236,7 @@ and POD class 1738 C++14 Explicit instantiation/specialization of inheriting constructor templates - Unknown + Superseded by P0136R1 1739 @@ -13446,7 +13446,7 @@ and POD class 2273 CD5 Inheriting constructors vs implicit default constructor - Unknown + Clang 3.3 2274 @@ -13470,7 +13470,7 @@ and POD class 2277 CD5 Ambiguity inheriting constructors with default arguments - Unknown + Partial 2278 @@ -13944,7 +13944,7 @@ and POD class 2356 CD5 Base class copy and move constructors should not be inherited - Unknown + Clang 4 2357 @@ -14832,7 +14832,7 @@ and POD class 2504 DR Inheriting constructors from virtual base classes - Unknown + No 2505 -- cgit v1.1 From d05483288465a87e75cfab51792801cfee43914c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 9 Feb 2024 15:39:36 +0100 Subject: [clang][Interp] Handle dummy pointers in ArrayElemPtr{,Pop} differently Instead of returning false, just ignore the operation and return true; This gives us the desired diagnostic behavior in the added test case. --- clang/lib/AST/Interp/Interp.h | 8 +++++--- clang/test/AST/Interp/c.c | 5 +++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index bcabd93..290edc0 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -1856,7 +1856,7 @@ inline bool ArrayElemPtr(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.peek(); if (!CheckDummy(S, OpPC, Ptr)) - return false; + return true; if (!OffsetHelper(S, OpPC, Offset, Ptr)) return false; @@ -1869,8 +1869,10 @@ inline bool ArrayElemPtrPop(InterpState &S, CodePtr OpPC) { const T &Offset = S.Stk.pop(); const Pointer &Ptr = S.Stk.pop(); - if (!CheckDummy(S, OpPC, Ptr)) - return false; + if (!CheckDummy(S, OpPC, Ptr)) { + S.Stk.push(Ptr); + return true; + } if (!OffsetHelper(S, OpPC, Offset, Ptr)) return false; diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c index 9ab271a..3605462 100644 --- a/clang/test/AST/Interp/c.c +++ b/clang/test/AST/Interp/c.c @@ -129,3 +129,8 @@ _Static_assert(sizeof(name2) == 0, ""); // expected-error {{failed}} \ // expected-note {{evaluates to}} \ // pedantic-expected-error {{failed}} \ // pedantic-expected-note {{evaluates to}} + +void *PR28739d = &(&PR28739d)[(__int128)(unsigned long)-1]; // expected-warning {{refers past the last possible element}} \ + // pedantic-expected-warning {{refers past the last possible element}} \ + // ref-warning {{refers past the last possible element}} \ + // pedantic-ref-warning {{refers past the last possible element}} -- cgit v1.1 From 356fdc31edd1734ef8dc8f010d5f805345157c49 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Fri, 9 Feb 2024 16:23:43 +0100 Subject: [bazel][clang] Fix BUILD after a8d4a024e6bea3ae71d6187f0c040b2b25e4bf69. 
--- utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index dda6d94..b8b3fcb 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -321,6 +321,20 @@ gentbl( ) gentbl( + name = "basic_builtins_riscv_gen", + tbl_outs = [( + "-gen-clang-builtins", + "include/clang/Basic/BuiltinsRISCV.inc", + )], + tblgen = ":clang-tblgen", + td_file = "include/clang/Basic/BuiltinsRISCV.td", + td_srcs = [ + "include/clang/Basic/BuiltinsRISCV.td", + "include/clang/Basic/BuiltinsBase.td", + ], +) + +gentbl( name = "basic_builtins_gen", tbl_outs = [( "-gen-clang-builtins", @@ -656,6 +670,7 @@ cc_library( ":basic_attr_gen", ":basic_builtins_bpf_gen", ":basic_builtins_gen", + ":basic_builtins_riscv_gen", ":basic_internal_headers", ":basic_riscv_sifive_vector_builtins_gen", ":basic_riscv_vector_builtin_cg_gen", -- cgit v1.1 From a9700904765590ca2fbf08c0cc36d0da1107d3a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 9 Feb 2024 16:45:58 +0100 Subject: [clang][Interp][NFC] Convert test case to verify=expected,all style --- clang/test/AST/Interp/c.c | 50 ++++++++++++----------------------------------- 1 file changed, 13 insertions(+), 37 deletions(-) diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c index 3605462..337a7cf 100644 --- a/clang/test/AST/Interp/c.c +++ b/clang/test/AST/Interp/c.c @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify -std=c11 %s -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -pedantic -verify=pedantic-expected -std=c11 %s -// RUN: %clang_cc1 -verify=ref -std=c11 %s -// RUN: %clang_cc1 -pedantic -verify=pedantic-ref -std=c11 %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,all -std=c11 %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -pedantic -verify=pedantic-expected,all -std=c11 %s +// RUN: %clang_cc1 -verify=ref,all -std=c11 %s +// RUN: %clang_cc1 -pedantic -verify=pedantic-ref,all -std=c11 %s typedef __INTPTR_TYPE__ intptr_t; typedef __PTRDIFF_TYPE__ ptrdiff_t; @@ -22,10 +22,7 @@ _Static_assert(!!1.0, ""); // pedantic-ref-warning {{not an integer constant exp _Static_assert(!!1, ""); int a = (1 == 1 ? 5 : 3); -_Static_assert(a == 5, ""); // ref-error {{not an integral constant expression}} \ - // pedantic-ref-error {{not an integral constant expression}} \ - // expected-error {{not an integral constant expression}} \ - // pedantic-expected-error {{not an integral constant expression}} +_Static_assert(a == 5, ""); // all-error {{not an integral constant expression}} const int b = 3; @@ -67,25 +64,17 @@ _Static_assert((&a - 100) != 0, ""); // pedantic-ref-warning {{is a GNU extensio /// extern variable of a composite type. /// FIXME: The 'cast from void*' note is missing in the new interpreter. 
extern struct Test50S Test50; -_Static_assert(&Test50 != (void*)0, ""); // ref-warning {{always true}} \ - // pedantic-ref-warning {{always true}} \ +_Static_assert(&Test50 != (void*)0, ""); // all-warning {{always true}} \ // pedantic-ref-warning {{is a GNU extension}} \ // pedantic-ref-note {{cast from 'void *' is not allowed}} \ - // expected-warning {{always true}} \ - // pedantic-expected-warning {{always true}} \ // pedantic-expected-warning {{is a GNU extension}} struct y {int x,y;}; -int a2[(intptr_t)&((struct y*)0)->y]; // expected-warning {{folded to constant array}} \ - // pedantic-expected-warning {{folded to constant array}} \ - // ref-warning {{folded to constant array}} \ - // pedantic-ref-warning {{folded to constant array}} +int a2[(intptr_t)&((struct y*)0)->y]; // all-warning {{folded to constant array}} const struct y *yy = (struct y*)0; -const intptr_t L = (intptr_t)(&(yy->y)); // expected-error {{not a compile-time constant}} \ - // pedantic-expected-error {{not a compile-time constant}} \ - // ref-error {{not a compile-time constant}} \ - // pedantic-ref-error {{not a compile-time constant}} +const intptr_t L = (intptr_t)(&(yy->y)); // all-error {{not a compile-time constant}} + const ptrdiff_t m = &m + 137 - &m; _Static_assert(m == 137, ""); // pedantic-ref-warning {{GNU extension}} \ // pedantic-expected-warning {{GNU extension}} @@ -93,10 +82,7 @@ _Static_assert(m == 137, ""); // pedantic-ref-warning {{GNU extension}} \ /// from test/Sema/switch.c, used to cause an assertion failure. void f (int z) { while (z) { - default: z--; // expected-error {{'default' statement not in switch}} \ - // pedantic-expected-error {{'default' statement not in switch}} \ - // ref-error {{'default' statement not in switch}} \ - // pedantic-ref-error {{'default' statement not in switch}} + default: z--; // all-error {{'default' statement not in switch}} } } @@ -104,15 +90,8 @@ int expr; int chooseexpr[__builtin_choose_expr(1, 1, expr)]; int somefunc(int i) { - return (i, 65537) * 65537; // expected-warning {{left operand of comma operator has no effect}} \ - // expected-warning {{overflow in expression; result is 131073}} \ - // pedantic-expected-warning {{left operand of comma operator has no effect}} \ - // pedantic-expected-warning {{overflow in expression; result is 131073}} \ - // ref-warning {{left operand of comma operator has no effect}} \ - // ref-warning {{overflow in expression; result is 131073}} \ - // pedantic-ref-warning {{left operand of comma operator has no effect}} \ - // pedantic-ref-warning {{overflow in expression; result is 131073}} - + return (i, 65537) * 65537; // all-warning {{left operand of comma operator has no effect}} \ + // all-warning {{overflow in expression; result is 131073}} } /// FIXME: The following test is incorrect in the new interpreter. 
@@ -130,7 +109,4 @@ _Static_assert(sizeof(name2) == 0, ""); // expected-error {{failed}} \ // pedantic-expected-error {{failed}} \ // pedantic-expected-note {{evaluates to}} -void *PR28739d = &(&PR28739d)[(__int128)(unsigned long)-1]; // expected-warning {{refers past the last possible element}} \ - // pedantic-expected-warning {{refers past the last possible element}} \ - // ref-warning {{refers past the last possible element}} \ - // pedantic-ref-warning {{refers past the last possible element}} +void *PR28739d = &(&PR28739d)[(__int128)(unsigned long)-1]; // all-warning {{refers past the last possible element}} -- cgit v1.1 From a0635edc5980218ad210da25a5c9afe346110ccb Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 9 Feb 2024 15:48:01 +0000 Subject: [PhaseOrdering] Add tests showing missed simplifications. Add tests showing missed simplifications due to phase ordering. --- .../AArch64/extra-unroll-simplifications.ll | 82 ++++++++++++++++++ .../PhaseOrdering/AArch64/hoist-runtime-checks.ll | 98 ++++++++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll new file mode 100644 index 0000000..6132c35 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll @@ -0,0 +1,82 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes='default' -S %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-macosx11.0.0" + +define void @partial_unroll_forced(i32 %N, ptr %src, ptr noalias %dst) { +; CHECK-LABEL: define void @partial_unroll_forced( +; CHECK-SAME: i32 [[N:%.*]], ptr nocapture readonly [[SRC:%.*]], ptr noalias nocapture writeonly [[DST:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP141:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP141]], label [[LOOP_LATCH_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: loop.latch.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[N]], 1 +; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT_LOOPEXIT_UNR_LCSSA:%.*]], label [[LOOP_LATCH_PREHEADER_NEW:%.*]] +; CHECK: loop.latch.preheader.new: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483646 +; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] +; CHECK: loop.latch: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[LOOP_LATCH]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[LOOP_LATCH]] ] +; CHECK-NEXT: [[SRC_IDX:%.*]] = getelementptr <8 x half>, ptr [[SRC]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[L:%.*]] = load <8 x half>, ptr [[SRC_IDX]], align 16 +; CHECK-NEXT: [[DST_IDX:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[ADD:%.*]] = fadd <8 x half> [[L]], [[L]] +; CHECK-NEXT: store <8 x half> [[ADD]], ptr [[DST_IDX]], align 16 +; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = or disjoint i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[SRC_IDX_1:%.*]] = getelementptr <8 x half>, ptr [[SRC]], i64 
[[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load <8 x half>, ptr [[SRC_IDX_1]], align 16 +; CHECK-NEXT: [[DST_IDX_1:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd <8 x half> [[L_1]], [[L_1]] +; CHECK-NEXT: store <8 x half> [[ADD_1]], ptr [[DST_IDX_1]], align 16 +; CHECK-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2 +; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]] +; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[EXIT_LOOPEXIT_UNR_LCSSA]], label [[LOOP_LATCH]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: exit.loopexit.unr-lcssa: +; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1]], [[LOOP_LATCH]] ] +; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label [[EXIT]], label [[LOOP_LATCH_EPIL:%.*]] +; CHECK: loop.latch.epil: +; CHECK-NEXT: [[SRC_IDX_EPIL:%.*]] = getelementptr <8 x half>, ptr [[SRC]], i64 [[INDVARS_IV_UNR]] +; CHECK-NEXT: [[L_EPIL:%.*]] = load <8 x half>, ptr [[SRC_IDX_EPIL]], align 16 +; CHECK-NEXT: [[DST_IDX_EPIL:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV_UNR]] +; CHECK-NEXT: [[ADD_EPIL:%.*]] = fadd <8 x half> [[L_EPIL]], [[L_EPIL]] +; CHECK-NEXT: store <8 x half> [[ADD_EPIL]], ptr [[DST_IDX_EPIL]], align 16 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %cmp14 = icmp slt i32 %iv, %N + br i1 %cmp14, label %loop.latch, label %exit + +loop.latch: + %iv.ext = zext i32 %iv to i64 + %src.idx = getelementptr <8 x half>, ptr %src, i64 %iv.ext + %l = load <8 x half>, ptr %src.idx, align 16 + %dst.idx = getelementptr <8 x half>, ptr %dst, i64 %iv.ext + %add = fadd <8 x half> %l, %l + store <8 x half> %add, ptr %dst.idx, align 16 + %iv.next = add i32 %iv, 1 + br label %loop.header, !llvm.loop !0 + +exit: + ret void +} + +!0 = distinct !{!0, !1, !2} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.unroll.count", i32 2} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.disable"} +;. 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll new file mode 100644 index 0000000..c6c9a52 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes='default' -S %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-macosx11.0.0" + +define i32 @read_only_loop_with_runtime_check(ptr noundef %array, i32 noundef %count, i32 noundef %n) { +; CHECK-LABEL: define i32 @read_only_loop_with_runtime_check( +; CHECK-SAME: ptr nocapture noundef readonly [[ARRAY:%.*]], i32 noundef [[COUNT:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP6_NOT:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP6_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ult i32 [[TMP1]], [[COUNT]] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD]], [[IF_END]] ] +; CHECK-NEXT: br i1 [[DOTNOT_NOT]], label [[IF_END]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @llvm.trap() +; CHECK-NEXT: unreachable +; CHECK: if.end: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP2]], [[SUM_07]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP0]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; +entry: + %array.addr = alloca ptr, align 8 + %count.addr = alloca i32, align 4 + %n.addr = alloca i32, align 4 + %sum = alloca i32, align 4 + %i = alloca i32, align 4 + store ptr %array, ptr %array.addr, align 8 + store i32 %count, ptr %count.addr, align 4 + store i32 %n, ptr %n.addr, align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr %sum) #3 + store i32 0, ptr %sum, align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #3 + store i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4 + %1 = load i32, ptr %n.addr, align 4 + %cmp = icmp ult i32 %0, %1 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #3 + br label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, ptr %i, align 4 + %3 = load i32, ptr %count.addr, align 4 + %cmp1 = icmp uge i32 %2, %3 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + call void @llvm.trap() + br label %if.end + +if.end: ; preds = %if.then, %for.body + %4 = load ptr, ptr %array.addr, align 8 + %5 = load i32, ptr %i, align 4 + %idxprom = zext i32 %5 to i64 + %arrayidx = 
getelementptr inbounds i32, ptr %4, i64 %idxprom
+  %6 = load i32, ptr %arrayidx, align 4
+  %7 = load i32, ptr %sum, align 4
+  %add = add nsw i32 %7, %6
+  store i32 %add, ptr %sum, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %8 = load i32, ptr %i, align 4
+  %inc = add i32 %8, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond.cleanup
+  %9 = load i32, ptr %sum, align 4
+  call void @llvm.lifetime.end.p0(i64 4, ptr %sum)
+  ret i32 %9
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+
+declare void @llvm.trap()
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
-- 
cgit v1.1

From fdb16e6fd81b38835795f22730b39b30ddd90f07 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?=
Date: Fri, 9 Feb 2024 16:51:53 +0100
Subject: [clang][Interp] Only use __int128 in test case if supported

---
 clang/test/AST/Interp/c.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index 337a7cf..bb2c7cf 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -109,4 +109,6 @@ _Static_assert(sizeof(name2) == 0, ""); // expected-error {{failed}} \
 // pedantic-expected-error {{failed}} \
 // pedantic-expected-note {{evaluates to}}
 
+#ifdef __SIZEOF_INT128__
 void *PR28739d = &(&PR28739d)[(__int128)(unsigned long)-1]; // all-warning {{refers past the last possible element}}
+#endif
-- 
cgit v1.1

From b081e9d4cafe2563c513ed7b5ae3ced6d177b657 Mon Sep 17 00:00:00 2001
From: Daniel Chen
Date: Fri, 9 Feb 2024 10:56:57 -0500
Subject: [Flang] Fix NULLIFY statement that returns too early for multiple
 procedure pointer objects. (#81164)

The current code that handles the NULLIFY statement for procedure pointers
returns after the first object. This PR removes the `return` so that it can
nullify multiple procedure pointer objects.
---
 flang/lib/Lower/Bridge.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 579f94b..7577c49 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -3115,10 +3115,10 @@ private:
         hlfir::Entity nullBoxProc(
             fir::factory::createNullBoxProc(*builder, loc, boxTy));
         builder->createStoreWithConvert(loc, nullBoxProc, pptr);
-        return;
+      } else {
+        fir::MutableBoxValue box = genExprMutableBox(loc, *expr);
+        fir::factory::disassociateMutableBox(*builder, loc, box);
       }
-      fir::MutableBoxValue box = genExprMutableBox(loc, *expr);
-      fir::factory::disassociateMutableBox(*builder, loc, box);
     }
   }
-- 
cgit v1.1

From 935f7d633374f7073fec14927922a2d534c8795f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?=
Date: Fri, 9 Feb 2024 17:04:50 +0100
Subject: [clang][Interp][NFC] We do support complex bitint now

Remove a stale FIXME comment and improve the test.
---
 clang/test/AST/Interp/complex.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/clang/test/AST/Interp/complex.cpp b/clang/test/AST/Interp/complex.cpp
index 7d625ab..9fdaabd 100644
--- a/clang/test/AST/Interp/complex.cpp
+++ b/clang/test/AST/Interp/complex.cpp
@@ -98,8 +98,9 @@ constexpr _Complex int I3 = {15};
 static_assert(__real(I3) == 15, "");
 static_assert(__imag(I3) == 0, "");
 
-/// FIXME: This should work in the new interpreter as well.
-// constexpr _Complex _BitInt(8) A = 0;// = {4}; +constexpr _Complex _BitInt(8) A = {4}; +static_assert(__real(A) == 4, ""); +static_assert(__imag(A) == 0, ""); constexpr _Complex double Doubles[4] = {{1.0, 2.0}}; -- cgit v1.1 From 99d743320c5dddb780f1fb2f49414b10e6a52a05 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Fri, 9 Feb 2024 08:13:15 -0800 Subject: [clang][fmv] Drop .ifunc from target_version's entrypoint's mangling (#81194) Fixes: https://github.com/llvm/llvm-project/issues/81043 --- clang/include/clang/AST/Decl.h | 4 + clang/include/clang/Basic/AttrDocs.td | 8 ++ clang/lib/AST/Decl.cpp | 4 + clang/lib/CodeGen/CodeGenModule.cpp | 16 +++- clang/test/CodeGen/attr-target-version.c | 129 ++++++++++++++------------ clang/test/CodeGenCXX/attr-target-version.cpp | 87 ++++++++--------- 6 files changed, 143 insertions(+), 105 deletions(-) diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index f26fb5a..42fdf2b 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -2619,6 +2619,10 @@ public: /// the target-clones functionality. bool isTargetClonesMultiVersion() const; + /// True if this function is a multiversioned dispatch function as a part of + /// the target-version functionality. + bool isTargetVersionMultiVersion() const; + /// \brief Get the associated-constraints of this function declaration. /// Currently, this will either be a vector of size 1 containing the /// trailing-requires-clause or an empty vector. diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 041786f..19a98a0 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -2517,6 +2517,14 @@ function it instructs compiler to emit multiple function versions based on priority and target features availability. One of the versions is always ( implicitly or explicitly ) the ``default`` (fallback). Attribute strings can contain dependent features names joined by the "+" sign. + +For targets that support the GNU indirect function (IFUNC) feature, dispatch +is performed by emitting an indirect function that is resolved to the appropriate +target clone at load time. The indirect function is given the name the +multiversioned function would have if it had been declared without the attribute. +For backward compatibility with earlier Clang releases, a function alias with an +``.ifunc`` suffix is also emitted. The ``.ifunc`` suffixed symbol is a deprecated +feature and support for it may be removed in the future. 
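For illustration, a minimal sketch of the source pattern this documentation describes; the snippet is not part of the patch, and the feature strings are examples only:

  __attribute__((target_version("fp16"))) int fmv(void) { return 1; }
  __attribute__((target_version("default"))) int fmv(void) { return 0; }

  int caller(void) {
    /* After this change the call binds to the IFUNC named `fmv`, resolved to
       the best clone at load time; `fmv.ifunc` remains only as a deprecated
       compatibility alias. */
    return fmv();
  }

This matches the updated CHECK lines in the tests below, where `@fmv` becomes the ifunc and `@fmv.ifunc` becomes an alias to it.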
}]; } diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 26fdfa0..40e2903 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3541,6 +3541,10 @@ bool FunctionDecl::isTargetClonesMultiVersion() const { return isMultiVersion() && hasAttr(); } +bool FunctionDecl::isTargetVersionMultiVersion() const { + return isMultiVersion() && hasAttr(); +} + void FunctionDecl::setPreviousDeclaration(FunctionDecl *PrevDecl) { redeclarable_base::setPreviousDecl(PrevDecl); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 36b63d7..2f923d5 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -30,6 +30,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/ASTLambda.h" #include "clang/AST/CharUnits.h" +#include "clang/AST/Decl.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/DeclObjC.h" #include "clang/AST/DeclTemplate.h" @@ -4212,7 +4213,8 @@ void CodeGenModule::emitMultiVersionFunctions() { llvm::Constant *ResolverConstant = GetOrCreateMultiVersionResolver(GD); if (auto *IFunc = dyn_cast(ResolverConstant)) { ResolverConstant = IFunc->getResolver(); - if (FD->isTargetClonesMultiVersion()) { + if (FD->isTargetClonesMultiVersion() || + FD->isTargetVersionMultiVersion()) { const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD); llvm::FunctionType *DeclTy = getTypes().GetFunctionType(FI); std::string MangledName = getMangledNameImpl( @@ -4393,8 +4395,18 @@ llvm::Constant *CodeGenModule::GetOrCreateMultiVersionResolver(GlobalDecl GD) { // a separate resolver). std::string ResolverName = MangledName; if (getTarget().supportsIFunc()) { - if (!FD->isTargetClonesMultiVersion()) + switch (FD->getMultiVersionKind()) { + case MultiVersionKind::None: + llvm_unreachable("unexpected MultiVersionKind::None for resolver"); + case MultiVersionKind::Target: + case MultiVersionKind::CPUSpecific: + case MultiVersionKind::CPUDispatch: ResolverName += ".ifunc"; + break; + case MultiVersionKind::TargetClones: + case MultiVersionKind::TargetVersion: + break; + } } else if (FD->isTargetMultiVersion()) { ResolverName += ".resolver"; } diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c index 2a96697..c27d48f 100644 --- a/clang/test/CodeGen/attr-target-version.c +++ b/clang/test/CodeGen/attr-target-version.c @@ -90,13 +90,20 @@ int hoo(void) { //. 
// CHECK: @__aarch64_cpu_features = external dso_local global { i64 } -// CHECK: @fmv.ifunc = weak_odr ifunc i32 (), ptr @fmv.resolver -// CHECK: @fmv_one.ifunc = weak_odr ifunc i32 (), ptr @fmv_one.resolver -// CHECK: @fmv_two.ifunc = weak_odr ifunc i32 (), ptr @fmv_two.resolver -// CHECK: @fmv_e.ifunc = weak_odr ifunc i32 (), ptr @fmv_e.resolver -// CHECK: @fmv_c.ifunc = weak_odr ifunc void (), ptr @fmv_c.resolver -// CHECK: @fmv_inline.ifunc = weak_odr ifunc i32 (), ptr @fmv_inline.resolver -// CHECK: @fmv_d.ifunc = internal ifunc i32 (), ptr @fmv_d.resolver +// CHECK: @fmv.ifunc = weak_odr alias i32 (), ptr @fmv +// CHECK: @fmv_one.ifunc = weak_odr alias i32 (), ptr @fmv_one +// CHECK: @fmv_two.ifunc = weak_odr alias i32 (), ptr @fmv_two +// CHECK: @fmv_e.ifunc = weak_odr alias i32 (), ptr @fmv_e +// CHECK: @fmv_inline.ifunc = weak_odr alias i32 (), ptr @fmv_inline +// CHECK: @fmv_d.ifunc = internal alias i32 (), ptr @fmv_d +// CHECK: @fmv_c.ifunc = weak_odr alias void (), ptr @fmv_c +// CHECK: @fmv = weak_odr ifunc i32 (), ptr @fmv.resolver +// CHECK: @fmv_one = weak_odr ifunc i32 (), ptr @fmv_one.resolver +// CHECK: @fmv_two = weak_odr ifunc i32 (), ptr @fmv_two.resolver +// CHECK: @fmv_e = weak_odr ifunc i32 (), ptr @fmv_e.resolver +// CHECK: @fmv_inline = weak_odr ifunc i32 (), ptr @fmv_inline.resolver +// CHECK: @fmv_d = internal ifunc i32 (), ptr @fmv_d.resolver +// CHECK: @fmv_c = weak_odr ifunc void (), ptr @fmv_c.resolver //. // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv._MrngMflagmMfp16fml @@ -105,6 +112,32 @@ int hoo(void) { // CHECK-NEXT: ret i32 1 // // +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_one._MsimdMls64 +// CHECK-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 1 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_two._Mfp +// CHECK-SAME: () #[[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 1 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@foo +// CHECK-SAME: () #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv() +// CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_one() +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] +// CHECK-NEXT: [[CALL2:%.*]] = call i32 @fmv_two() +// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]] +// CHECK-NEXT: ret i32 [[ADD3]] +// +// // CHECK-LABEL: define {{[^@]+}}@fmv.resolver() comdat { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() @@ -183,42 +216,16 @@ int hoo(void) { // CHECK-NEXT: ret ptr @fmv.default // // -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_one._MsimdMls64 -// CHECK-SAME: () #[[ATTR1:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// // CHECK-LABEL: define {{[^@]+}}@fmv_one.resolver() comdat { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: ret ptr @fmv_one._MsimdMls64 // // -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_two._Mfp -// CHECK-SAME: () #[[ATTR1]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// // CHECK-LABEL: define {{[^@]+}}@fmv_two.resolver() comdat { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: ret ptr @fmv_two._MsimdMfp16 // // -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@foo -// CHECK-SAME: () #[[ATTR2:[0-9]+]] { -// CHECK-NEXT: entry: 
-// CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv.ifunc() -// CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_one.ifunc() -// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] -// CHECK-NEXT: [[CALL2:%.*]] = call i32 @fmv_two.ifunc() -// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]] -// CHECK-NEXT: ret i32 [[ADD3]] -// -// // CHECK-LABEL: define {{[^@]+}}@fmv_e.resolver() comdat { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: ret ptr @fmv_e._Mls64 @@ -238,28 +245,14 @@ int hoo(void) { // CHECK-NEXT: ret void // // -// CHECK-LABEL: define {{[^@]+}}@fmv_c.resolver() comdat { -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 281474976710656 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 281474976710656 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @fmv_c._Mssbs -// CHECK: resolver_else: -// CHECK-NEXT: ret ptr @fmv_c.default -// -// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@goo // CHECK-SAME: () #[[ATTR2]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv_inline.ifunc() -// CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_e.ifunc() -// CHECK-NEXT: [[CALL2:%.*]] = call i32 @fmv_d.ifunc() -// CHECK-NEXT: call void @fmv_c.ifunc() +// CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv_inline() +// CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_e() +// CHECK-NEXT: [[CALL2:%.*]] = call i32 @fmv_d() +// CHECK-NEXT: call void @fmv_c() // CHECK-NEXT: [[CALL3:%.*]] = call i32 @fmv_default() // CHECK-NEXT: ret i32 [[CALL3]] // @@ -412,6 +405,20 @@ int hoo(void) { // CHECK-NEXT: ret ptr @fmv_d.default // // +// CHECK-LABEL: define {{[^@]+}}@fmv_c.resolver() comdat { +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 281474976710656 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 281474976710656 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @fmv_c._Mssbs +// CHECK: resolver_else: +// CHECK-NEXT: ret ptr @fmv_c.default +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@recur // CHECK-SAME: () #[[ATTR2]] { @@ -437,9 +444,9 @@ int hoo(void) { // CHECK-NEXT: entry: // CHECK-NEXT: [[FP1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[FP2:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: call void @f(ptr noundef @fmv.ifunc) -// CHECK-NEXT: store ptr @fmv.ifunc, ptr [[FP1]], align 8 -// CHECK-NEXT: store ptr @fmv.ifunc, ptr [[FP2]], align 8 +// CHECK-NEXT: call void @f(ptr noundef @fmv) +// CHECK-NEXT: store ptr @fmv, ptr [[FP1]], align 8 +// CHECK-NEXT: store ptr @fmv, ptr [[FP2]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[FP1]], align 8 // CHECK-NEXT: [[CALL:%.*]] = call i32 [[TMP0]]() // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[FP2]], align 8 @@ -561,13 +568,6 @@ int hoo(void) { // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_c.default -// CHECK-SAME: () #[[ATTR2]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret void -// -// -// CHECK: 
Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msha1MpmullMf64mm // CHECK-SAME: () #[[ATTR12:[0-9]+]] { // CHECK-NEXT: entry: @@ -700,6 +700,13 @@ int hoo(void) { // CHECK-NEXT: ret i32 1 // // +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_c.default +// CHECK-SAME: () #[[ATTR2]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +// // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv // CHECK-NOFMV-SAME: () #[[ATTR0:[0-9]+]] { diff --git a/clang/test/CodeGenCXX/attr-target-version.cpp b/clang/test/CodeGenCXX/attr-target-version.cpp index 68dd7be..b63815d 100644 --- a/clang/test/CodeGenCXX/attr-target-version.cpp +++ b/clang/test/CodeGenCXX/attr-target-version.cpp @@ -26,9 +26,12 @@ int bar() { //. // CHECK: @__aarch64_cpu_features = external dso_local global { i64 } -// CHECK: @_Z3fooi.ifunc = weak_odr ifunc i32 (i32), ptr @_Z3fooi.resolver -// CHECK: @_Z3foov.ifunc = weak_odr ifunc i32 (), ptr @_Z3foov.resolver -// CHECK: @_ZN7MyClass3gooEi.ifunc = weak_odr ifunc i32 (ptr, i32), ptr @_ZN7MyClass3gooEi.resolver +// CHECK: @_ZN7MyClass3gooEi.ifunc = weak_odr alias i32 (ptr, i32), ptr @_ZN7MyClass3gooEi +// CHECK: @_Z3fooi.ifunc = weak_odr alias i32 (i32), ptr @_Z3fooi +// CHECK: @_Z3foov.ifunc = weak_odr alias i32 (), ptr @_Z3foov +// CHECK: @_ZN7MyClass3gooEi = weak_odr ifunc i32 (ptr, i32), ptr @_ZN7MyClass3gooEi.resolver +// CHECK: @_Z3fooi = weak_odr ifunc i32 (i32), ptr @_Z3fooi.resolver +// CHECK: @_Z3foov = weak_odr ifunc i32 (), ptr @_Z3foov.resolver //. // CHECK-LABEL: @_Z3fooi._Mbf16Msme-f64f64( // CHECK-NEXT: entry: @@ -37,39 +40,11 @@ int bar() { // CHECK-NEXT: ret i32 1 // // -// CHECK-LABEL: @_Z3fooi.resolver( -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36028797153181696 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36028797153181696 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_Z3fooi._Mbf16Msme-f64f64 -// CHECK: resolver_else: -// CHECK-NEXT: ret ptr @_Z3fooi.default -// -// // CHECK-LABEL: @_Z3foov._Msm4Mebf16( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 // // -// CHECK-LABEL: @_Z3foov.resolver( -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 268435488 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 268435488 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_Z3foov._Msm4Mebf16 -// CHECK: resolver_else: -// CHECK-NEXT: ret ptr @_Z3foov.default -// -// // CHECK-LABEL: @_ZN7MyClass3gooEi.resolver( // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() @@ -95,24 +70,40 @@ int bar() { // CHECK-LABEL: @_Z3barv( // CHECK-NEXT: entry: // CHECK-NEXT: [[M:%.*]] = alloca [[STRUCT_MYCLASS:%.*]], align 1 -// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN7MyClass3gooEi.ifunc(ptr noundef nonnull align 1 dereferenceable(1) [[M]], i32 noundef 1) -// CHECK-NEXT: [[CALL1:%.*]] = 
call noundef i32 @_Z3fooi.ifunc(i32 noundef 1) +// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN7MyClass3gooEi(ptr noundef nonnull align 1 dereferenceable(1) [[M]], i32 noundef 1) +// CHECK-NEXT: [[CALL1:%.*]] = call noundef i32 @_Z3fooi(i32 noundef 1) // CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] -// CHECK-NEXT: [[CALL2:%.*]] = call noundef i32 @_Z3foov.ifunc() +// CHECK-NEXT: [[CALL2:%.*]] = call noundef i32 @_Z3foov() // CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]] // CHECK-NEXT: ret i32 [[ADD3]] // // -// CHECK-LABEL: @_Z3fooi.default( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 -// CHECK-NEXT: ret i32 2 +// CHECK-LABEL: @_Z3fooi.resolver( +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36028797153181696 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36028797153181696 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @_Z3fooi._Mbf16Msme-f64f64 +// CHECK: resolver_else: +// CHECK-NEXT: ret ptr @_Z3fooi.default // // -// CHECK-LABEL: @_Z3foov.default( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 4 +// CHECK-LABEL: @_Z3foov.resolver( +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 268435488 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 268435488 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @_Z3foov._Msm4Mebf16 +// CHECK: resolver_else: +// CHECK-NEXT: ret ptr @_Z3foov.default // // // CHECK-LABEL: @_ZN7MyClass3gooEi._Mdotprod( @@ -144,6 +135,18 @@ int bar() { // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 1 // +// +// CHECK-LABEL: @_Z3fooi.default( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 +// CHECK-NEXT: ret i32 2 +// +// +// CHECK-LABEL: @_Z3foov.default( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 4 +// //. 
// CHECK: attributes #[[ATTR0:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme-f64f64" }
// CHECK: attributes #[[ATTR1:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+neon,+sm4" }
-- 
cgit v1.1

From 99446df3f5357b327b388bbbb4adf6465999ea60 Mon Sep 17 00:00:00 2001
From: Adrian Prantl
Date: Fri, 9 Feb 2024 08:12:58 -0800
Subject: Bump the minimum LLVM version for chrono data formatter tests

---
 .../data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py
index a90fb82..c306315 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py
@@ -11,7 +11,7 @@ from lldbsuite.test import lldbutil
 
 class LibcxxChronoDataFormatterTestCase(TestBase):
     @add_test_categories(["libc++"])
-    @skipIf(compiler="clang", compiler_version=["<", "11.0"])
+    @skipIf(compiler="clang", compiler_version=["<", "17.0"])
     def test_with_run_command(self):
         """Test that file and class static variables display correctly."""
         self.build()
-- 
cgit v1.1

From 2095655f8e2324971f11be61b88ef1644d5796b8 Mon Sep 17 00:00:00 2001
From: Jon Roelofs
Date: Fri, 9 Feb 2024 08:14:09 -0800
Subject: [clang][sema] Fix -Wunused-function on target_version'd file-scope
 Fn's (#81167)

We should only warn if the default version is the one that is unused.

Fixes: https://github.com/llvm/llvm-project/issues/80227
---
 clang/include/clang/AST/Decl.h                | 4 ++++
 clang/lib/AST/Decl.cpp                        | 5 +++++
 clang/lib/Sema/Sema.cpp                       | 3 ++-
 clang/test/SemaCXX/warn-unused-filescoped.cpp | 16 ++++++++++++++++
 4 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index 42fdf2b..61117cc 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -2615,6 +2615,10 @@ public:
   /// the target functionality.
   bool isTargetMultiVersion() const;
 
+  /// True if this function is the default version of a multiversioned dispatch
+  /// function as a part of the target functionality.
+  bool isTargetMultiVersionDefault() const;
+
   /// True if this function is a multiversioned dispatch function as a part of
   /// the target-clones functionality.
bool isTargetClonesMultiVersion() const; diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 40e2903..e281f2d 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3537,6 +3537,11 @@ bool FunctionDecl::isTargetMultiVersion() const { (hasAttr() || hasAttr()); } +bool FunctionDecl::isTargetMultiVersionDefault() const { + return isMultiVersion() && hasAttr() && + getAttr()->isDefaultVersion(); +} + bool FunctionDecl::isTargetClonesMultiVersion() const { return isMultiVersion() && hasAttr(); } diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 2d4e6d1..cfb653e6 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -1393,7 +1393,8 @@ void Sema::ActOnEndOfTranslationUnit() { Diag(DiagD->getLocation(), diag::warn_unneeded_internal_decl) << /*function=*/0 << DiagD << DiagRange; } - } else { + } else if (!FD->isTargetMultiVersion() || + FD->isTargetMultiVersionDefault()) { if (FD->getDescribedFunctionTemplate()) Diag(DiagD->getLocation(), diag::warn_unused_template) << /*function=*/0 << DiagD << DiagRange; diff --git a/clang/test/SemaCXX/warn-unused-filescoped.cpp b/clang/test/SemaCXX/warn-unused-filescoped.cpp index be8d350..0c347e9 100644 --- a/clang/test/SemaCXX/warn-unused-filescoped.cpp +++ b/clang/test/SemaCXX/warn-unused-filescoped.cpp @@ -236,4 +236,20 @@ constexpr int constexpr4() { return 2; } #endif } +__attribute__((target_version("fp16"))) +static int not_used_fmv(void) { return 1; } +__attribute__((target_version("fp16fml"))) +static int not_used_fmv(void) { return 2; } +__attribute__((target_version("default"))) +static int not_used_fmv(void) { return 0; } // expected-warning {{unused function 'not_used_fmv'}} + + +__attribute__((target_version("fp16"))) +static int definitely_used_fmv(void) { return 1; } +__attribute__((target_version("fp16fml"))) +static int definitely_used_fmv(void) { return 2; } +__attribute__((target_version("default"))) +static int definitely_used_fmv(void) { return 0; } +int definite_user(void) { return definitely_used_fmv(); } + #endif -- cgit v1.1 From 7ddc32052546abd41656d2e670f3902b1bf805a7 Mon Sep 17 00:00:00 2001 From: quic-areg Date: Fri, 9 Feb 2024 10:15:23 -0600 Subject: [llvm-objcopy] Support SREC output format (#75874) Adds a new output target "srec" to write SREC files from ELF inputs. https://en.wikipedia.org/wiki/SREC_(file_format) --- llvm/docs/CommandGuide/llvm-objcopy.rst | 9 +- llvm/include/llvm/ObjCopy/CommonConfig.h | 7 +- llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp | 4 +- llvm/lib/ObjCopy/ELF/ELFObject.cpp | 280 ++++++++++++++++++---- llvm/lib/ObjCopy/ELF/ELFObject.h | 134 ++++++++++- llvm/test/tools/llvm-objcopy/ELF/ihex-writer.test | 6 +- llvm/test/tools/llvm-objcopy/ELF/srec-writer.test | 196 +++++++++++++++ llvm/tools/llvm-objcopy/ObjcopyOptions.cpp | 1 + llvm/tools/llvm-objcopy/llvm-objcopy.cpp | 1 + 9 files changed, 576 insertions(+), 62 deletions(-) create mode 100644 llvm/test/tools/llvm-objcopy/ELF/srec-writer.test diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst index 42d11fa..b823be9 100644 --- a/llvm/docs/CommandGuide/llvm-objcopy.rst +++ b/llvm/docs/CommandGuide/llvm-objcopy.rst @@ -544,8 +544,13 @@ options. For GNU :program:`objcopy` compatibility, the values are all bfdnames. - `elf32-sparc` - `elf32-sparcel` -Additionally, all targets except `binary` and `ihex` can have `-freebsd` as a -suffix. 
+The following formats are supported by :program:`llvm-objcopy` for the
+:option:`--output-target` only:
+
+- `srec`
+
+Additionally, all targets except `binary`, `ihex`, and `srec` can have
+`-freebsd` as a suffix.
 
 BINARY INPUT AND OUTPUT
 -----------------------
diff --git a/llvm/include/llvm/ObjCopy/CommonConfig.h b/llvm/include/llvm/ObjCopy/CommonConfig.h
index 0d9320e..3833959 100644
--- a/llvm/include/llvm/ObjCopy/CommonConfig.h
+++ b/llvm/include/llvm/ObjCopy/CommonConfig.h
@@ -27,12 +27,7 @@
 namespace llvm {
 namespace objcopy {
 
-enum class FileFormat {
-  Unspecified,
-  ELF,
-  Binary,
-  IHex,
-};
+enum class FileFormat { Unspecified, ELF, Binary, IHex, SREC };
 
 // This type keeps track of the machine info for various architectures. This
 // lets us map architecture names to ELF types and the e_machine value of the
diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
index 36f7994..1b3a582 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -182,7 +182,9 @@ static std::unique_ptr<Writer> createWriter(const CommonConfig &Config,
   case FileFormat::Binary:
     return std::make_unique<BinaryWriter>(Obj, Out, Config);
   case FileFormat::IHex:
-    return std::make_unique<IHexWriter>(Obj, Out);
+    return std::make_unique<IHexWriter>(Obj, Out, Config.OutputFilename);
+  case FileFormat::SREC:
+    return std::make_unique<SRECWriter>(Obj, Out, Config.OutputFilename);
   default:
     return createELFWriter(Config, Obj, Out, OutputElfType);
   }
diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.cpp b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
index c8b66d6..c2de456 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObject.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
@@ -2707,10 +2707,52 @@ Error BinaryWriter::finalize() {
   return Error::success();
 }
 
-bool IHexWriter::SectionCompare::operator()(const SectionBase *Lhs,
-                                            const SectionBase *Rhs) const {
-  return (sectionPhysicalAddr(Lhs) & 0xFFFFFFFFU) <
-         (sectionPhysicalAddr(Rhs) & 0xFFFFFFFFU);
+Error ASCIIHexWriter::checkSection(const SectionBase &S) const {
+  if (addressOverflows32bit(S.Addr) ||
+      addressOverflows32bit(S.Addr + S.Size - 1))
+    return createStringError(
+        errc::invalid_argument,
+        "section '%s' address range [0x%llx, 0x%llx] is not 32 bit",
+        S.Name.c_str(), S.Addr, S.Addr + S.Size - 1);
+  return Error::success();
+}
+
+Error ASCIIHexWriter::finalize() {
+  // We can't write 64-bit addresses.
+ if (addressOverflows32bit(Obj.Entry)) + return createStringError(errc::invalid_argument, + "entry point address 0x%llx overflows 32 bits", + Obj.Entry); + + for (const SectionBase &S : Obj.sections()) { + if ((S.Flags & ELF::SHF_ALLOC) && S.Type != ELF::SHT_NOBITS && S.Size > 0) { + if (Error E = checkSection(S)) + return E; + Sections.push_back(&S); + } + } + + llvm::sort(Sections, [](const SectionBase *A, const SectionBase *B) { + return sectionPhysicalAddr(A) < sectionPhysicalAddr(B); + }); + + std::unique_ptr EmptyBuffer = + WritableMemoryBuffer::getNewMemBuffer(0); + if (!EmptyBuffer) + return createStringError(errc::not_enough_memory, + "failed to allocate memory buffer of 0 bytes"); + + Expected ExpTotalSize = getTotalSize(*EmptyBuffer); + if (!ExpTotalSize) + return ExpTotalSize.takeError(); + TotalSize = *ExpTotalSize; + + Buf = WritableMemoryBuffer::getNewMemBuffer(TotalSize); + if (!Buf) + return createStringError(errc::not_enough_memory, + "failed to allocate memory buffer of 0x" + + Twine::utohexstr(TotalSize) + " bytes"); + return Error::success(); } uint64_t IHexWriter::writeEntryPointRecord(uint8_t *Buf) { @@ -2740,6 +2782,20 @@ uint64_t IHexWriter::writeEndOfFileRecord(uint8_t *Buf) { return HexData.size(); } +Expected +IHexWriter::getTotalSize(WritableMemoryBuffer &EmptyBuffer) const { + IHexSectionWriterBase LengthCalc(EmptyBuffer); + for (const SectionBase *Sec : Sections) + if (Error Err = Sec->accept(LengthCalc)) + return Err; + + // We need space to write section records + StartAddress record + // (if start adress is not zero) + EndOfFile record. + return LengthCalc.getBufferOffset() + + (Obj.Entry ? IHexRecord::getLineLength(4) : 0) + + IHexRecord::getLineLength(0); +} + Error IHexWriter::write() { IHexSectionWriter Writer(*Buf); // Write sections. @@ -2762,54 +2818,196 @@ Error IHexWriter::write() { return Error::success(); } -Error IHexWriter::checkSection(const SectionBase &Sec) { - uint64_t Addr = sectionPhysicalAddr(&Sec); - if (addressOverflows32bit(Addr) || addressOverflows32bit(Addr + Sec.Size - 1)) - return createStringError( - errc::invalid_argument, - "Section '%s' address range [0x%llx, 0x%llx] is not 32 bit", - Sec.Name.c_str(), Addr, Addr + Sec.Size - 1); +Error SRECSectionWriterBase::visit(const StringTableSection &Sec) { + // Check that the sizer has already done its work. + assert(Sec.Size == Sec.StrTabBuilder.getSize() && + "Expected section size to have been finalized"); + // We don't need to write anything here because the real writer has already + // done it. return Error::success(); } -Error IHexWriter::finalize() { - // We can't write 64-bit addresses. 
- if (addressOverflows32bit(Obj.Entry)) - return createStringError(errc::invalid_argument, - "Entry point address 0x%llx overflows 32 bits", - Obj.Entry); +Error SRECSectionWriterBase::visit(const Section &Sec) { + writeSection(Sec, Sec.Contents); + return Error::success(); +} - for (const SectionBase &Sec : Obj.sections()) - if ((Sec.Flags & ELF::SHF_ALLOC) && Sec.Type != ELF::SHT_NOBITS && - Sec.Size > 0) { - if (Error E = checkSection(Sec)) - return E; - Sections.insert(&Sec); - } +Error SRECSectionWriterBase::visit(const OwnedDataSection &Sec) { + writeSection(Sec, Sec.Data); + return Error::success(); +} - std::unique_ptr EmptyBuffer = - WritableMemoryBuffer::getNewMemBuffer(0); - if (!EmptyBuffer) - return createStringError(errc::not_enough_memory, - "failed to allocate memory buffer of 0 bytes"); +Error SRECSectionWriterBase::visit(const DynamicRelocationSection &Sec) { + writeSection(Sec, Sec.Contents); + return Error::success(); +} + +void SRECSectionWriter::writeRecord(SRecord &Record, uint64_t Off) { + SRecLineData Data = Record.toString(); + memcpy(Out.getBufferStart() + Off, Data.data(), Data.size()); +} - IHexSectionWriterBase LengthCalc(*EmptyBuffer); +void SRECSectionWriterBase::writeRecords(uint32_t Entry) { + // The ELF header could contain an entry point outside of the sections we have + // seen that does not fit the current record Type. + Type = std::max(Type, SRecord::getType(Entry)); + uint64_t Off = HeaderSize; + for (SRecord &Record : Records) { + Record.Type = Type; + writeRecord(Record, Off); + Off += Record.getSize(); + } + Offset = Off; +} + +void SRECSectionWriterBase::writeSection(const SectionBase &S, + ArrayRef Data) { + const uint32_t ChunkSize = 16; + uint32_t Address = sectionPhysicalAddr(&S); + uint32_t EndAddr = Address + S.Size - 1; + Type = std::max(SRecord::getType(EndAddr), Type); + while (!Data.empty()) { + uint64_t DataSize = std::min(Data.size(), ChunkSize); + SRecord Record{Type, Address, Data.take_front(DataSize)}; + Records.push_back(Record); + Data = Data.drop_front(DataSize); + Address += DataSize; + } +} + +Error SRECSectionWriter::visit(const StringTableSection &Sec) { + assert(Sec.Size == Sec.StrTabBuilder.getSize() && + "Section size does not match the section's string table builder size"); + std::vector Data(Sec.Size); + Sec.StrTabBuilder.write(Data.data()); + writeSection(Sec, Data); + return Error::success(); +} + +SRecLineData SRecord::toString() const { + SRecLineData Line(getSize()); + auto *Iter = Line.begin(); + *Iter++ = 'S'; + *Iter++ = '0' + Type; + // Write 1 byte (2 hex characters) record count. + Iter = toHexStr(getCount(), Iter, 2); + // Write the address field with length depending on record type. + Iter = toHexStr(Address, Iter, getAddressSize()); + // Write data byte by byte. + for (uint8_t X : Data) + Iter = toHexStr(X, Iter, 2); + // Write the 1 byte checksum. + Iter = toHexStr(getChecksum(), Iter, 2); + *Iter++ = '\r'; + *Iter++ = '\n'; + assert(Iter == Line.end()); + return Line; +} + +uint8_t SRecord::getChecksum() const { + uint32_t Sum = getCount(); + Sum += (Address >> 24) & 0xFF; + Sum += (Address >> 16) & 0xFF; + Sum += (Address >> 8) & 0xFF; + Sum += Address & 0xFF; + for (uint8_t Byte : Data) + Sum += Byte; + return 0xFF - (Sum & 0xFF); +} + +size_t SRecord::getSize() const { + // Type, Count, Checksum, and CRLF are two characters each. 
+ return 2 + 2 + getAddressSize() + Data.size() * 2 + 2 + 2; +} + +uint8_t SRecord::getAddressSize() const { + switch (Type) { + case Type::S2: + return 6; + case Type::S3: + return 8; + case Type::S7: + return 8; + case Type::S8: + return 6; + default: + return 4; + } +} + +uint8_t SRecord::getCount() const { + uint8_t DataSize = Data.size(); + uint8_t ChecksumSize = 1; + return getAddressSize() / 2 + DataSize + ChecksumSize; +} + +uint8_t SRecord::getType(uint32_t Address) { + if (isUInt<16>(Address)) + return SRecord::S1; + if (isUInt<24>(Address)) + return SRecord::S2; + return SRecord::S3; +} + +SRecord SRecord::getHeader(StringRef FileName) { + // Header is a record with Type S0, Address 0, and Data that is a + // vendor-specific text comment. For the comment we will use the output file + // name truncated to 40 characters to match the behavior of GNU objcopy. + StringRef HeaderContents = FileName.slice(0, 40); + ArrayRef Data( + reinterpret_cast(HeaderContents.data()), + HeaderContents.size()); + return {SRecord::S0, 0, Data}; +} + +size_t SRECWriter::writeHeader(uint8_t *Buf) { + SRecLineData Record = SRecord::getHeader(OutputFileName).toString(); + memcpy(Buf, Record.data(), Record.size()); + return Record.size(); +} + +size_t SRECWriter::writeTerminator(uint8_t *Buf, uint8_t Type) { + assert(Type >= SRecord::S7 && Type <= SRecord::S9 && + "Invalid record type for terminator"); + uint32_t Entry = Obj.Entry; + SRecLineData Data = SRecord{Type, Entry, {}}.toString(); + memcpy(Buf, Data.data(), Data.size()); + return Data.size(); +} + +Expected +SRECWriter::getTotalSize(WritableMemoryBuffer &EmptyBuffer) const { + SRECSizeCalculator SizeCalc(EmptyBuffer, 0); for (const SectionBase *Sec : Sections) - if (Error Err = Sec->accept(LengthCalc)) + if (Error Err = Sec->accept(SizeCalc)) return Err; - // We need space to write section records + StartAddress record - // (if start adress is not zero) + EndOfFile record. - TotalSize = LengthCalc.getBufferOffset() + - (Obj.Entry ? IHexRecord::getLineLength(4) : 0) + - IHexRecord::getLineLength(0); + SizeCalc.writeRecords(Obj.Entry); + // We need to add the size of the Header and Terminator records. + SRecord Header = SRecord::getHeader(OutputFileName); + uint8_t TerminatorType = 10 - SizeCalc.getType(); + SRecord Terminator = {TerminatorType, static_cast(Obj.Entry), {}}; + return Header.getSize() + SizeCalc.getBufferOffset() + Terminator.getSize(); +} - Buf = WritableMemoryBuffer::getNewMemBuffer(TotalSize); - if (!Buf) - return createStringError(errc::not_enough_memory, - "failed to allocate memory buffer of " + - Twine::utohexstr(TotalSize) + " bytes"); +Error SRECWriter::write() { + uint32_t HeaderSize = + writeHeader(reinterpret_cast(Buf->getBufferStart())); + SRECSectionWriter Writer(*Buf, HeaderSize); + for (const SectionBase *S : Sections) { + if (Error E = S->accept(Writer)) + return E; + } + Writer.writeRecords(Obj.Entry); + uint64_t Offset = Writer.getBufferOffset(); + // An S1 record terminates with an S9 record, S2 with S8, and S3 with S7. 
+ uint8_t TerminatorType = 10 - Writer.getType(); + Offset += writeTerminator( + reinterpret_cast(Buf->getBufferStart() + Offset), + TerminatorType); + assert(Offset == TotalSize); + Out.write(Buf->getBufferStart(), Buf->getBufferSize()); return Error::success(); } diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h index 95bea09..7a2e20d 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObject.h +++ b/llvm/lib/ObjCopy/ELF/ELFObject.h @@ -172,6 +172,9 @@ public: friend class SectionWriter; \ friend class IHexSectionWriterBase; \ friend class IHexSectionWriter; \ + friend class SRECSectionWriter; \ + friend class SRECSectionWriterBase; \ + friend class SRECSizeCalculator; \ template friend class ELFSectionWriter; \ template friend class ELFSectionSizer; @@ -371,23 +374,136 @@ public: : Writer(Obj, Out), GapFill(Config.GapFill), PadTo(Config.PadTo) {} }; -class IHexWriter : public Writer { - struct SectionCompare { - bool operator()(const SectionBase *Lhs, const SectionBase *Rhs) const; - }; +// A base class for writing ascii hex formats such as srec and ihex. +class ASCIIHexWriter : public Writer { +public: + ASCIIHexWriter(Object &Obj, raw_ostream &OS, StringRef OutputFile) + : Writer(Obj, OS), OutputFileName(OutputFile) {} + Error finalize() override; - std::set Sections; +protected: + StringRef OutputFileName; size_t TotalSize = 0; + std::vector Sections; + + Error checkSection(const SectionBase &S) const; + virtual Expected + getTotalSize(WritableMemoryBuffer &EmptyBuffer) const = 0; +}; + +class IHexWriter : public ASCIIHexWriter { +public: + Error write() override; + IHexWriter(Object &Obj, raw_ostream &Out, StringRef OutputFile) + : ASCIIHexWriter(Obj, Out, OutputFile) {} - Error checkSection(const SectionBase &Sec); +private: uint64_t writeEntryPointRecord(uint8_t *Buf); uint64_t writeEndOfFileRecord(uint8_t *Buf); + Expected + getTotalSize(WritableMemoryBuffer &EmptyBuffer) const override; +}; +class SRECWriter : public ASCIIHexWriter { public: - ~IHexWriter() {} - Error finalize() override; + SRECWriter(Object &Obj, raw_ostream &OS, StringRef OutputFile) + : ASCIIHexWriter(Obj, OS, OutputFile) {} Error write() override; - IHexWriter(Object &Obj, raw_ostream &Out) : Writer(Obj, Out) {} + +private: + size_t writeHeader(uint8_t *Buf); + size_t writeTerminator(uint8_t *Buf, uint8_t Type); + Expected + getTotalSize(WritableMemoryBuffer &EmptyBuffer) const override; +}; + +using SRecLineData = SmallVector; +struct SRecord { + uint8_t Type; + uint32_t Address; + ArrayRef Data; + SRecLineData toString() const; + uint8_t getCount() const; + // Get address size in characters. + uint8_t getAddressSize() const; + uint8_t getChecksum() const; + size_t getSize() const; + static SRecord getHeader(StringRef FileName); + static uint8_t getType(uint32_t Address); + + enum Type : uint8_t { + // Vendor specific text comment. + S0 = 0, + // Data that starts at a 16 bit address. + S1 = 1, + // Data that starts at a 24 bit address. + S2 = 2, + // Data that starts at a 32 bit address. + S3 = 3, + // Reserved. + S4 = 4, + // 16 bit count of S1/S2/S3 records (optional). + S5 = 5, + // 32 bit count of S1/S2/S3 records (optional). + S6 = 6, + // Terminates a series of S3 records. + S7 = 7, + // Terminates a series of S2 records. + S8 = 8, + // Terminates a series of S1 records. 
+    S9 = 9
+  };
+};
+
+class SRECSectionWriterBase : public BinarySectionWriter {
+public:
+  explicit SRECSectionWriterBase(WritableMemoryBuffer &Buf,
+                                 uint64_t StartOffset)
+      : BinarySectionWriter(Buf), Offset(StartOffset), HeaderSize(StartOffset) {
+  }
+
+  using BinarySectionWriter::visit;
+
+  void writeRecords(uint32_t Entry);
+  uint64_t getBufferOffset() const { return Offset; }
+  Error visit(const Section &S) override;
+  Error visit(const OwnedDataSection &S) override;
+  Error visit(const StringTableSection &S) override;
+  Error visit(const DynamicRelocationSection &S) override;
+  uint8_t getType() const { return Type; };
+
+protected:
+  // Offset in the output buffer.
+  uint64_t Offset;
+  // Sections start after the header.
+  uint64_t HeaderSize;
+  // Type of records to write.
+  uint8_t Type = SRecord::S1;
+  std::vector<SRecord> Records;
+
+  void writeSection(const SectionBase &S, ArrayRef<uint8_t> Data);
+  virtual void writeRecord(SRecord &Record, uint64_t Off) = 0;
+};
+
+// An SRECSectionWriterBase that visits sections but does not write anything.
+// This class is only used to calculate the size of the output file.
+class SRECSizeCalculator : public SRECSectionWriterBase {
+public:
+  SRECSizeCalculator(WritableMemoryBuffer &EmptyBuffer, uint64_t Offset)
+      : SRECSectionWriterBase(EmptyBuffer, Offset) {}
+
+protected:
+  void writeRecord(SRecord &Record, uint64_t Off) override {}
+};
+
+class SRECSectionWriter : public SRECSectionWriterBase {
+public:
+  SRECSectionWriter(WritableMemoryBuffer &Buf, uint64_t Offset)
+      : SRECSectionWriterBase(Buf, Offset) {}
+  Error visit(const StringTableSection &Sec) override;
+
+protected:
+  void writeRecord(SRecord &Record, uint64_t Off) override;
 };
 
 class SectionBase {
diff --git a/llvm/test/tools/llvm-objcopy/ELF/ihex-writer.test b/llvm/test/tools/llvm-objcopy/ELF/ihex-writer.test
index 09ff8ae..6c07f9f 100644
--- a/llvm/test/tools/llvm-objcopy/ELF/ihex-writer.test
+++ b/llvm/test/tools/llvm-objcopy/ELF/ihex-writer.test
@@ -70,8 +70,8 @@
 # SIGN_EXTENDED-NEXT: :051000000001020304E1
 # SIGN_EXTENDED-NEXT: :00000001FF
 
-# BAD-ADDR: error: {{.*}}: Section '.text2' address range [0x{{.*}}, 0x{{.*}}] is not 32 bit
-# BAD-ADDR2: error: {{.*}}: Section '.text3' address range [0x{{.*}}, 0x{{.*}}] is not 32 bit
+# BAD-ADDR: error: {{.*}}: section '.text2' address range [0x{{.*}}, 0x{{.*}}] is not 32 bit
+# BAD-ADDR2: error: {{.*}}: section '.text3' address range [0x{{.*}}, 0x{{.*}}] is not 32 bit
 
 # There shouldn't be 'ExtendedAddr' nor 'Data' records
 # ZERO_SIZE_SEC-NOT: :02000004
@@ -81,4 +81,4 @@
 # START1: :040000030000FFFFFB
 # START2: :0400000500100000E7
 # START3: :040000058000100067
-# BAD-START: error: {{.*}}: Entry point address 0x{{.*}} overflows 32 bits
+# BAD-START: error: {{.*}}: entry point address 0x{{.*}} overflows 32 bits
diff --git a/llvm/test/tools/llvm-objcopy/ELF/srec-writer.test b/llvm/test/tools/llvm-objcopy/ELF/srec-writer.test
new file mode 100644
index 0000000..e96b87b
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/srec-writer.test
@@ -0,0 +1,196 @@
+## Check for basic functionality using an input file with
+## various section types, addresses, data, and no segments.
+# RUN: yaml2obj %s --docnum=1 -o %t
+# RUN: llvm-objcopy -O srec %t - | \
+# RUN:   FileCheck --match-full-lines --strict-whitespace %s --check-prefix=SREC
+
+## The record type for the header should be S0 with a 2 byte address
+## of 0. For an output file named "-" the header data field should contain "2D".
+## The byte count field should therefore have a value of 4: 2 bytes for address, +## 1 byte for output file and 1 byte for checksum. + # SREC:S00400002DCE +# SREC-NEXT:S31500001000000102030405060708090A0B0C0D0E0F62 +# SREC-NEXT:S30A0000101010111213147B +# SREC-NEXT:S30F00EFFFFF1111111111111111111159 +# SREC-NEXT:S31000FFFFF83031323334353637383940AC +# SREC-NEXT:S30A8000100000010203045B +# SREC-NEXT:S70500000000FA + +## Terminator should contain the entry point. +# RUN: llvm-objcopy -O srec --set-start=0xF0000000 %t --only-section=.dummy - 2>&1 | \ +# RUN: FileCheck --match-full-lines --strict-whitespace %s --check-prefix=ENTRY +## Sign-extended entry point is OK. +# RUN: llvm-objcopy -O srec --set-start=0xFFFFFFFFF0000000 %t --only-section=.dummy - 2>&1 | \ +# RUN: FileCheck --match-full-lines --strict-whitespace %s --check-prefix=ENTRY + + # ENTRY:S00400002DCE +# ENTRY-NEXT:S705F00000000A + +## Start address which exceeds 32 bit range triggers an error. +# RUN: not llvm-objcopy -O srec --set-start=0xF00000000 %t - 2>&1 | \ +# RUN: FileCheck %s --check-prefix=BAD_START + +# BAD_START: entry point address 0xf00000000 overflows 32 bits + +## Sign-extended start address which exceeds 32 bit range triggers an error. +# RUN: not llvm-objcopy -O srec --set-start=0xFFFFFFFF0F000000 %t - 2>&1 | \ +# RUN: FileCheck %s --check-prefix=BAD_EXTENDED_START + +# BAD_EXTENDED_START: entry point address 0xffffffff0f000000 overflows 32 bits + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .data1 +## Records for this section should come last. + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Content: "11111111111111111111" + Address: 0xEFFFFF + - Name: .data2 +## This section overlaps 24-bit address boundary, so we expect +## its record type to be S3. + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Content: "3031323334353637383940" + Address: 0xFFFFF8 +## Sign-extended addresses are OK. + - Name: .data3 + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0xFFFFFFFF80001000 + Content: "0001020304" + - Name: .text +## This section's contents exceed default line length of 16 bytes +## so we expect two lines created for it. Records for this section +## should appear before records for the previous section. + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x1000 + Content: "000102030405060708090A0B0C0D0E0F1011121314" + - Name: .bss +## NOBITS sections are not written. + Type: SHT_NOBITS + Flags: [ SHF_ALLOC ] + Address: 0x10100 + Size: 0x1000 + - Name: .dummy +## Non-allocatable sections are not written. + Type: SHT_PROGBITS + Flags: [ ] + Address: 0x20FFF8 + Size: 65536 + +## Check for various error cases. + +## Check that section address range overlapping 32 bit range +## triggers an error. +# RUN: yaml2obj %s --docnum=2 -o %t.err +# RUN: not llvm-objcopy -O srec --only-section=.text1 %t.err - 2>&1 | \ +# RUN: FileCheck %s --check-prefix=BAD-ADDR +# RUN: not llvm-objcopy -O srec --only-section=.text2 %t.err - 2>&1 | \ +# RUN: FileCheck %s --check-prefix=BAD-ADDR2 + +# BAD-ADDR: section '.text1' address range [0xfffffff8, 0x100000000] is not 32 bit +# BAD-ADDR2: section '.text2' address range [0xffffffff0, 0xffffffff4] is not 32 bit + +## Check that zero length section is not written. +# RUN: llvm-objcopy -O srec --only-section=.text %t.err - | \ +# RUN: FileCheck --match-full-lines --strict-whitespace --implicit-check-not={{.}} %s --check-prefix=ZERO_SIZE_SEC + +## There should be no records besides header and terminator. 
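+## As a cross-check, a record's checksum is the ones' complement of the least
+## significant byte of the sum of its count, address, and data bytes: for the
+## header S00400002DCE, 0x04 + 0x00 + 0x00 + 0x2D = 0x31 and ~0x31 & 0xFF = 0xCE;
+## for the S9 terminator S9030000FC, ~0x03 & 0xFF = 0xFC.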
+# ZERO_SIZE_SEC:S00400002DCE
+# ZERO_SIZE_SEC-NEXT:S9030000FC
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+Sections:
+  - Name:    .text1
+## Part of section data is in 32-bit address range and part isn't.
+    Type:    SHT_PROGBITS
+    Flags:   [ SHF_ALLOC ]
+    Address: 0xFFFFFFF8
+    Content: "000102030405060708"
+  - Name:    .text2
+## Entire section is outside of 32-bit range.
+    Type:    SHT_PROGBITS
+    Flags:   [ SHF_ALLOC ]
+    Address: 0xFFFFFFFF0
+    Content: "0001020304"
+
+## This tests an input file with segments and expects
+## physical addresses instead of virtual addresses.
+# RUN: yaml2obj %s --docnum=3 -o %t.seg
+# RUN: llvm-objcopy -O srec %t.seg - | \
+# RUN:   FileCheck --match-full-lines --strict-whitespace %s --check-prefix=PADDR
+
+     # PADDR:S00400002DCE
+# PADDR-NEXT:S214100000000102030405060708090A0B0C0D0E0F63
+# PADDR-NEXT:S20910001010111213147B
+# PADDR-NEXT:S20F10001530313233343536373839407E
+# PADDR-NEXT:S20810002040414243C1
+# PADDR-NEXT:S20F10002450515253545556575859600F
+# PADDR-NEXT:S20720FFF8000000E1
+# PADDR-NEXT:S804100000EB
+
+--- !ELF
+## This file has a non-contiguous section layout with large gaps.
+## These sections are all tightly packed into one PT_LOAD segment
+## starting at physical address 0x100000. Records should use physical addresses.
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+  Entry:   0x100000
+Sections:
+  - Name:    .text
+    Type:    SHT_PROGBITS
+    Flags:   [ SHF_ALLOC ]
+    Address: 0x0
+    Content: "000102030405060708090A0B0C0D0E0F1011121314"
+  - Name:    .data1
+    Type:    SHT_PROGBITS
+    Flags:   [ SHF_ALLOC ]
+    Content: "3031323334353637383940"
+    Address: 0xFFF8
+  - Name:    .data2
+    Type:    SHT_PROGBITS
+    Flags:   [ SHF_ALLOC ]
+    Content: "40414243"
+    Address: 0x10100
+  - Name:    .data3
+    Type:    SHT_PROGBITS
+    Flags:   [ SHF_ALLOC ]
+    Content: "5051525354555657585960"
+    Address: 0x10FFF8
+  - Name:    .bss
+    Type:    SHT_NOBITS
+    Flags:   [ SHF_ALLOC ]
+    Address: 0x10100
+    Size:    0x1000
+  - Name:    .dummy
+    Type:    SHT_PROGBITS
+    Flags:   [ SHF_ALLOC ]
+    Address: 0x20FFF8
+    Size:    3
+  - Name:    .nonalloc
+    Type:    SHT_PROGBITS
+    Flags:   [ ]
+    Address: 0x300000
+    Size:    1
+ProgramHeaders:
+  - Type:     PT_LOAD
+    Flags:    [ PF_X, PF_R ]
+    VAddr:    0xF00000000
+    PAddr:    0x100000
+    FirstSec: .text
+    LastSec:  .bss
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index 394eaca..9a9b631 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -687,6 +687,7 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
   Config.OutputFormat = StringSwitch<FileFormat>(OutputFormat)
                             .Case("binary", FileFormat::Binary)
                             .Case("ihex", FileFormat::IHex)
+                            .Case("srec", FileFormat::SREC)
                             .Default(FileFormat::Unspecified);
   if (Config.OutputFormat == FileFormat::Unspecified) {
     if (OutputFormat.empty()) {
diff --git a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
index 730f423..ad3e604 100644
--- a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -121,6 +121,7 @@ static Error executeObjcopyOnRawBinary(ConfigManager &ConfigMgr,
   case FileFormat::Binary:
   case FileFormat::IHex:
   case FileFormat::Unspecified:
+  case FileFormat::SREC:
     Expected<const ELFConfig &> ELFConfig = ConfigMgr.getELFConfig();
     if (!ELFConfig)
       return ELFConfig.takeError();
-- 
cgit v1.1


From 1f20bc2cd273dd21459b9007a10c6aa67e5da1e2 Mon Sep 17 00:00:00 2001
From: lntue <35648136+lntue@users.noreply.github.com>
Date: Fri, 9 Feb 2024 11:21:04 -0500
Subject: [libc][math] Add C23 math function fdimf128. (#81074) --- libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/riscv/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/math/index.rst | 2 ++ libc/spec/stdc.td | 1 + libc/src/math/CMakeLists.txt | 1 + libc/src/math/fdimf128.h | 20 +++++++++++++++++++ libc/src/math/generic/CMakeLists.txt | 32 +++++++++++++++++++++++++----- libc/src/math/generic/fdimf128.cpp | 19 ++++++++++++++++++ libc/test/src/math/smoke/CMakeLists.txt | 17 +++++++++++++--- libc/test/src/math/smoke/FDimTest.h | 21 +++++++++++++++----- libc/test/src/math/smoke/fdim_test.cpp | 22 +------------------- libc/test/src/math/smoke/fdimf128_test.cpp | 13 ++++++++++++ libc/test/src/math/smoke/fdimf_test.cpp | 24 +--------------------- libc/test/src/math/smoke/fdiml_test.cpp | 24 +--------------------- 15 files changed, 119 insertions(+), 80 deletions(-) create mode 100644 libc/src/math/fdimf128.h create mode 100644 libc/src/math/generic/fdimf128.cpp create mode 100644 libc/test/src/math/smoke/fdimf128_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 5b03080..f75b267 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -382,6 +382,7 @@ if(LIBC_COMPILER_HAS_FLOAT128) libc.src.math.ceilf128 libc.src.math.copysignf128 libc.src.math.fabsf128 + libc.src.math.fdimf128 libc.src.math.floorf128 libc.src.math.fmaxf128 libc.src.math.fminf128 diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 5e98538..762beb9 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -391,6 +391,7 @@ if(LIBC_COMPILER_HAS_FLOAT128) libc.src.math.ceilf128 libc.src.math.copysignf128 libc.src.math.fabsf128 + libc.src.math.fdimf128 libc.src.math.floorf128 libc.src.math.fmaxf128 libc.src.math.fminf128 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index b35fc9f..52a3ce0 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -410,6 +410,7 @@ if(LIBC_COMPILER_HAS_FLOAT128) libc.src.math.ceilf128 libc.src.math.copysignf128 libc.src.math.fabsf128 + libc.src.math.fdimf128 libc.src.math.floorf128 libc.src.math.fmaxf128 libc.src.math.fminf128 diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 3af7e10..2758b42 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -138,6 +138,8 @@ Basic Operations +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | fdiml | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ +| fdimf128 | |check| | |check| | | |check| | | | | | | | | | ++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | floor | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | floorf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td 
index e37f95a..9c8b5e5 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -374,6 +374,7 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"fdim", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
           FunctionSpec<"fdimf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
           FunctionSpec<"fdiml", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
+          GuardedFunctionSpec<"fdimf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_COMPILER_HAS_FLOAT128">,
 
           FunctionSpec<"floor", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
           FunctionSpec<"floorf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index d4dbeeb..8cdd84a 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -111,6 +111,7 @@ add_math_entrypoint_object(fabsf128)
 add_math_entrypoint_object(fdim)
 add_math_entrypoint_object(fdimf)
 add_math_entrypoint_object(fdiml)
+add_math_entrypoint_object(fdimf128)
 
 add_math_entrypoint_object(floor)
 add_math_entrypoint_object(floorf)
diff --git a/libc/src/math/fdimf128.h b/libc/src/math/fdimf128.h
new file mode 100644
index 0000000..c6f488a
--- /dev/null
+++ b/libc/src/math/fdimf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for fdimf128 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FDIMF128_H
+#define LLVM_LIBC_SRC_MATH_FDIMF128_H
+
+#include "src/__support/macros/properties/float.h"
+
+namespace LIBC_NAMESPACE {
+
+float128 fdimf128(float128 x, float128 y);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_FDIMF128_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 05b70be..3216ec3 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -43,6 +43,7 @@ add_entrypoint_object(
   COMPILE_OPTIONS
     -O3
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.nearest_integer_operations
 )
 
@@ -215,6 +216,7 @@ add_entrypoint_object(
   HDRS
     ../fabsf128.h
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O3
@@ -265,6 +267,7 @@ add_entrypoint_object(
   COMPILE_OPTIONS
     -O3
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.nearest_integer_operations
 )
 
@@ -313,6 +316,7 @@ add_entrypoint_object(
   COMPILE_OPTIONS
     -O3
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.nearest_integer_operations
 )
 
@@ -361,6 +365,7 @@ add_entrypoint_object(
   COMPILE_OPTIONS
     -O3
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.nearest_integer_operations
 )
 
@@ -899,6 +904,7 @@ add_entrypoint_object(
   HDRS
     ../copysignf128.h
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
@@ -1298,6 +1304,7 @@ add_entrypoint_object(
   HDRS
     ../fminf128.h
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O3
@@ -1346,6 +1353,7 @@ add_entrypoint_object(
   HDRS
     ../fmaxf128.h
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O3
@@ -1394,6 +1402,7 @@ add_entrypoint_object(
   HDRS
     ../sqrtf128.h
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.sqrt
   COMPILE_OPTIONS
     -O3
@@ -1491,10 +1500,10 @@ add_entrypoint_object(
     fdim.cpp
   HDRS
     ../fdim.h
+  COMPILE_OPTIONS
+    -O3
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -1503,10 +1512,10 @@ add_entrypoint_object(
     fdimf.cpp
   HDRS
     ../fdimf.h
+  COMPILE_OPTIONS
+    -O3
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -1515,10 +1524,23 @@ add_entrypoint_object(
     fdiml.cpp
   HDRS
     ../fdiml.h
+  COMPILE_OPTIONS
+    -O3
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
+)
+
+add_entrypoint_object(
+  fdimf128
+  SRCS
+    fdimf128.cpp
+  HDRS
+    ../fdimf128.h
   COMPILE_OPTIONS
-    -O2
+    -O3
+  DEPENDS
+    libc.src.__support.macros.properties.float
+    libc.src.__support.FPUtil.basic_operations
 )
 
 add_entrypoint_object(
diff --git a/libc/src/math/generic/fdimf128.cpp b/libc/src/math/generic/fdimf128.cpp
new file mode 100644
index 0000000..a3ea9e5
--- /dev/null
+++ b/libc/src/math/generic/fdimf128.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of fdimf128 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fdimf128.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float128, fdimf128, (float128 x, float128 y)) {
+  return fputil::fdim(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 4ee81ec..93ce0b7 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -1007,7 +1007,6 @@ add_fp_unittest(
   HDRS
     FDimTest.h
   DEPENDS
-    libc.include.math
     libc.src.math.fdimf
     libc.src.__support.FPUtil.basic_operations
     libc.src.__support.FPUtil.fp_bits
@@ -1022,7 +1021,6 @@ add_fp_unittest(
   HDRS
     FDimTest.h
   DEPENDS
-    libc.include.math
     libc.src.math.fdim
     libc.src.__support.FPUtil.basic_operations
     libc.src.__support.FPUtil.fp_bits
@@ -1037,12 +1035,25 @@ add_fp_unittest(
   HDRS
     FDimTest.h
   DEPENDS
-    libc.include.math
     libc.src.math.fdiml
     libc.src.__support.FPUtil.basic_operations
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  fdimf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    fdimf128_test.cpp
+  HDRS
+    FDimTest.h
+  DEPENDS
+    libc.src.math.fdimf128
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.__support.FPUtil.fp_bits
+)
+
 # FIXME: These tests are currently broken on the GPU.
 if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
   add_fp_unittest(
diff --git a/libc/test/src/math/smoke/FDimTest.h b/libc/test/src/math/smoke/FDimTest.h
index e00b4fd..5cb3dd1 100644
--- a/libc/test/src/math/smoke/FDimTest.h
+++ b/libc/test/src/math/smoke/FDimTest.h
@@ -10,7 +10,6 @@
 #include "src/__support/FPUtil/FPBits.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 template <typename T>
 class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test {
@@ -26,7 +25,7 @@ public:
   const T neg_zero = FPBits::zero(Sign::NEG).get_val();
   const T nan = FPBits::quiet_nan().get_val();
 
-  void test_na_n_arg(FuncPtr func) {
+  void test_nan_arg(FuncPtr func) {
     EXPECT_FP_EQ(nan, func(nan, inf));
     EXPECT_FP_EQ(nan, func(neg_inf, nan));
     EXPECT_FP_EQ(nan, func(nan, zero));
@@ -66,12 +65,15 @@ public:
     constexpr StorageType STEP = STORAGE_MAX / COUNT;
     for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT;
          ++i, v += STEP, w -= STEP) {
-      T x = FPBits(v).get_val(), y = FPBits(w).get_val();
-      if (isnan(x) || isinf(x))
+      FPBits xbits(v), ybits(w);
+      if (xbits.is_inf_or_nan())
         continue;
-      if (isnan(y) || isinf(y))
+      if (ybits.is_inf_or_nan())
         continue;
 
+      T x = xbits.get_val();
+      T y = ybits.get_val();
+
       if (x > y) {
         EXPECT_FP_EQ(x - y, func(x, y));
       } else {
@@ -80,3 +82,12 @@ public:
     }
   }
 };
+
+#define LIST_FDIM_TESTS(T, func)                                               \
+  using LlvmLibcFDimTest = FDimTestTemplate<T>;                                \
+  TEST_F(LlvmLibcFDimTest, NaNArg) { test_nan_arg(&func); }                    \
+  TEST_F(LlvmLibcFDimTest, InfArg) { test_inf_arg(&func); }                    \
+  TEST_F(LlvmLibcFDimTest, NegInfArg) { test_neg_inf_arg(&func); }             \
+  TEST_F(LlvmLibcFDimTest, BothZero) { test_both_zero(&func); }                \
+  TEST_F(LlvmLibcFDimTest, InFloatRange) { test_in_range(&func); }             \
+  static_assert(true, "Require semicolon.")
diff --git a/libc/test/src/math/smoke/fdim_test.cpp b/libc/test/src/math/smoke/fdim_test.cpp
index 2f00a30..e1c150d 100644
--- a/libc/test/src/math/smoke/fdim_test.cpp
+++ b/libc/test/src/math/smoke/fdim_test.cpp
@@ -8,26 +8,6 @@
 
 #include "FDimTest.h"
 
-#include "src/__support/FPUtil/FPBits.h"
 #include "src/math/fdim.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-#include <math.h>
 
-using LlvmLibcFDimTest = FDimTestTemplate<double>;
-
-TEST_F(LlvmLibcFDimTest, NaNArg_fdim) { test_na_n_arg(&LIBC_NAMESPACE::fdim); }
-
-TEST_F(LlvmLibcFDimTest, InfArg_fdim) { test_inf_arg(&LIBC_NAMESPACE::fdim); }
-
-TEST_F(LlvmLibcFDimTest, NegInfArg_fdim) {
-  test_neg_inf_arg(&LIBC_NAMESPACE::fdim);
-}
-
-TEST_F(LlvmLibcFDimTest, BothZero_fdim) {
-  test_both_zero(&LIBC_NAMESPACE::fdim);
-}
-
-TEST_F(LlvmLibcFDimTest, InDoubleRange_fdim) {
-  test_in_range(&LIBC_NAMESPACE::fdim);
-}
+LIST_FDIM_TESTS(double, LIBC_NAMESPACE::fdim);
diff --git a/libc/test/src/math/smoke/fdimf128_test.cpp b/libc/test/src/math/smoke/fdimf128_test.cpp
new file mode 100644
index 0000000..8e65c2b
--- /dev/null
+++ b/libc/test/src/math/smoke/fdimf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fdimf128 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FDimTest.h"
+
+#include "src/math/fdimf128.h"
+
+LIST_FDIM_TESTS(float128, LIBC_NAMESPACE::fdimf128);
diff --git a/libc/test/src/math/smoke/fdimf_test.cpp b/libc/test/src/math/smoke/fdimf_test.cpp
index 27511ba..9c27c1d 100644
--- a/libc/test/src/math/smoke/fdimf_test.cpp
+++ b/libc/test/src/math/smoke/fdimf_test.cpp
@@ -8,28 +8,6 @@
 
 #include "FDimTest.h"
 
-#include "src/__support/FPUtil/FPBits.h"
 #include "src/math/fdimf.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-#include <math.h>
 
-using LlvmLibcFDimTest = FDimTestTemplate<float>;
-
-TEST_F(LlvmLibcFDimTest, NaNArg_fdimf) {
-  test_na_n_arg(&LIBC_NAMESPACE::fdimf);
-}
-
-TEST_F(LlvmLibcFDimTest, InfArg_fdimf) { test_inf_arg(&LIBC_NAMESPACE::fdimf); }
-
-TEST_F(LlvmLibcFDimTest, NegInfArg_fdimf) {
-  test_neg_inf_arg(&LIBC_NAMESPACE::fdimf);
-}
-
-TEST_F(LlvmLibcFDimTest, BothZero_fdimf) {
-  test_both_zero(&LIBC_NAMESPACE::fdimf);
-}
-
-TEST_F(LlvmLibcFDimTest, InFloatRange_fdimf) {
-  test_in_range(&LIBC_NAMESPACE::fdimf);
-}
+LIST_FDIM_TESTS(float, LIBC_NAMESPACE::fdimf);
diff --git a/libc/test/src/math/smoke/fdiml_test.cpp b/libc/test/src/math/smoke/fdiml_test.cpp
index 45aedb0..ed448a6 100644
--- a/libc/test/src/math/smoke/fdiml_test.cpp
+++ b/libc/test/src/math/smoke/fdiml_test.cpp
@@ -8,28 +8,6 @@
 
 #include "FDimTest.h"
 
-#include "src/__support/FPUtil/FPBits.h"
 #include "src/math/fdiml.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-#include <math.h>
 
-using LlvmLibcFDimTest = FDimTestTemplate<long double>;
-
-TEST_F(LlvmLibcFDimTest, NaNArg_fdiml) {
-  test_na_n_arg(&LIBC_NAMESPACE::fdiml);
-}
-
-TEST_F(LlvmLibcFDimTest, InfArg_fdiml) { test_inf_arg(&LIBC_NAMESPACE::fdiml); }
-
-TEST_F(LlvmLibcFDimTest, NegInfArg_fdiml) {
-  test_neg_inf_arg(&LIBC_NAMESPACE::fdiml);
-}
-
-TEST_F(LlvmLibcFDimTest, BothZero_fdiml) {
-  test_both_zero(&LIBC_NAMESPACE::fdiml);
-}
-
-TEST_F(LlvmLibcFDimTest, InLongDoubleRange_fdiml) {
-  test_in_range(&LIBC_NAMESPACE::fdiml);
-}
+LIST_FDIM_TESTS(long double, LIBC_NAMESPACE::fdiml);
-- 
cgit v1.1


From 088773b0d1c1ee99d78f0b68bf50029637fbead7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= 
Date: Fri, 9 Feb 2024 17:22:40 +0100
Subject: [clang][Interp] Specify triple in C test

This is what test/Sema/const-eval.c does as well; without specifying it,
some Windows builders are broken:

https://lab.llvm.org/buildbot/#/builders/265/builds/2453
---
 clang/test/AST/Interp/c.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index bb2c7cf..afbc518 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,all -std=c11 %s
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -pedantic -verify=pedantic-expected,all -std=c11 %s
-// RUN: %clang_cc1 -verify=ref,all -std=c11 %s
-// RUN: %clang_cc1 -pedantic -verify=pedantic-ref,all -std=c11 %s
+// RUN: %clang_cc1 -triple x86_64-linux -fexperimental-new-constant-interpreter -verify=expected,all -std=c11 %s
+// RUN: %clang_cc1 -triple x86_64-linux -fexperimental-new-constant-interpreter -pedantic -verify=pedantic-expected,all -std=c11 %s
+// RUN: %clang_cc1 -triple x86_64-linux -verify=ref,all -std=c11 %s
+// RUN: %clang_cc1 -triple x86_64-linux -pedantic -verify=pedantic-ref,all -std=c11 %s
 
 typedef __INTPTR_TYPE__ intptr_t;
 typedef __PTRDIFF_TYPE__ ptrdiff_t;
-- 
cgit v1.1


From e973ab150a802a9503ca34753589d29863df30cc Mon Sep 17 00:00:00 2001
From: lntue <35648136+lntue@users.noreply.github.com>
Date: Fri, 9 Feb 2024 11:23:39 -0500
Subject: [libc][NFC] Fix few warnings in tests. (#81262)

```
/usr/local/google/home/lntue/experiment/llvm/llvm-project/libc/test/src/__support/FPUtil/fpbits_test.cpp:268:2: warning: extra ';' outside of a function is incompatible with C++98 [-Wc++98-compat-extra-semi]
};
^
1 warning generated.
```

```
In file included from /usr/local/google/home/lntue/experiment/llvm/llvm-project/libc/test/src/sys/socket/linux/bind_test.cpp:17:
/usr/local/google/home/lntue/experiment/llvm/llvm-project/libc/test/UnitTest/Test.h:17:9: warning: 'libc_make_test_file_path' macro redefined [-Wmacro-redefined]
#define libc_make_test_file_path(file_name) (file_name)
        ^
/usr/local/google/home/lntue/experiment/llvm/llvm-project/libc/test/UnitTest/LibcTest.h:20:9: note: previous definition is here
#define libc_make_test_file_path(file_name)                                    \
        ^
1 warning generated.
```
---
 libc/test/src/__support/FPUtil/fpbits_test.cpp | 2 +-
 libc/test/src/sys/socket/linux/bind_test.cpp   | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/libc/test/src/__support/FPUtil/fpbits_test.cpp b/libc/test/src/__support/FPUtil/fpbits_test.cpp
index 4504a4f..b1c4b66 100644
--- a/libc/test/src/__support/FPUtil/fpbits_test.cpp
+++ b/libc/test/src/__support/FPUtil/fpbits_test.cpp
@@ -265,7 +265,7 @@ template <typename T> constexpr auto make(Sign sign, FP fp) {
   case FP::QUIET_NAN:
     return T::quiet_nan(sign);
   }
-};
+}
 
 // Tests all properties for all types of float.
 TYPED_TEST(LlvmLibcFPBitsTest, Properties, FPTypes) {
diff --git a/libc/test/src/sys/socket/linux/bind_test.cpp b/libc/test/src/sys/socket/linux/bind_test.cpp
index 305e4889..e70cbd5 100644
--- a/libc/test/src/sys/socket/linux/bind_test.cpp
+++ b/libc/test/src/sys/socket/linux/bind_test.cpp
@@ -13,7 +13,6 @@
 #include "src/unistd/close.h"
 
 #include "src/errno/libc_errno.h"
-#include "test/UnitTest/LibcTest.h"
 #include "test/UnitTest/Test.h"
 
 #include <sys/socket.h> // For AF_UNIX and SOCK_DGRAM
-- 
cgit v1.1


From 4f423e4989056316f9d807abb92c14b188490e30 Mon Sep 17 00:00:00 2001
From: Mark de Wever 
Date: Fri, 9 Feb 2024 17:26:16 +0100
Subject: [libc++][test] Adds backdeployment shorthands. (#78204)

Some changes in libc++ affect the dylib. These changes are not present
on systems that use the system dylib. Currently these are the Apple
backdeployment targets.

Figuring out which macOS versions to target is not trivial for
non-Apple engineers. These shorthands make it easier to select the
proper feature to make a test UNSUPPORTED or XFAIL.

During the design discussion with Louis we considered whether or not to
add preprocessor definitions to allow partial disabling of a test. This
would be useful when an existing feature is changed by modifying the
dylib. In the end we decided not to add this feature to avoid additional
complexity in the tests. Instead the test will be disabled for that
target.
---
 libcxx/utils/libcxx/test/features.py | 101 ++++++++++++++++++++++++++++++++---
 1 file changed, 95 insertions(+), 6 deletions(-)

diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index ae719a1..a9fb64a 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -526,12 +526,94 @@ DEFAULT_FEATURES += [
 # target that doesn't support it will fail at compile time, not at runtime. This can
 # be achieved by creating a `.verify.cpp` test that checks for the right errors, and
 # mark that test as requiring `stdlib=-libc++ && target=`.
+#
+# Since it is not always known which deployment target to pick, there are
+# short-hands based on the LLVM version like using-built-library-before-llvm-xx.
+# These short-hands make it easy for libc++ developers to select the proper
+# version the feature will be available in and allow vendors to set the proper
+# target information.
 DEFAULT_FEATURES += [
+    # Backdeployment short-hands
+    Feature(
+        name="using-built-library-before-llvm-11",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0)(.0)?}}",
+            cfg.available_features,
+        ),
+    ),
+
+    Feature(
+        name="using-built-library-before-llvm-12",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "using-built-library-before-llvm-11 || (stdlib=apple-libc++ && target={{.+}}-apple-macosx12.{{(0|1|2)}}.0)",
+            cfg.available_features,
+        ),
+    ),
+
+    Feature(
+        name="using-built-library-before-llvm-13",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "using-built-library-before-llvm-12 || (stdlib=apple-libc++ && target={{.+}}-apple-macosx{{((12.(3|4|5|6|7))|(13.(0|1|2|3)))}}.0)",
+            cfg.available_features,
+        ),
+    ),
+
+    Feature(
+        name="using-built-library-before-llvm-14",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "using-built-library-before-llvm-13",
+            cfg.available_features,
+        ),
+    ),
+
+    Feature(
+        name="using-built-library-before-llvm-15",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "using-built-library-before-llvm-14 || (stdlib=apple-libc++ && target={{.+}}-apple-macosx13.{{(4|5|6)}}.0)",
+            cfg.available_features,
+        ),
+    ),
+
+    Feature(
+        name="using-built-library-before-llvm-16",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "using-built-library-before-llvm-15 || (stdlib=apple-libc++ && target={{.+}}-apple-macosx14.{{(0|1|2|3)}}.0)",
+            cfg.available_features,
+        ),
+    ),
+
+    Feature(
+        name="using-built-library-before-llvm-17",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "using-built-library-before-llvm-16",
+            cfg.available_features,
+        ),
+    ),
+
+    Feature(
+        name="using-built-library-before-llvm-18",
+        when=lambda cfg: BooleanExpression.evaluate(
+            # For now, no released version of macOS contains LLVM 18
+            # TODO(ldionne) Please provide the correct value.
+            "using-built-library-before-llvm-17 || stdlib=apple-libc++ && target={{.+}}-apple-macosx{{.+}}",
+            cfg.available_features,
+        ),
+    ),
+
+    Feature(
+        name="using-built-library-before-llvm-19",
+        when=lambda cfg: BooleanExpression.evaluate(
+            # For now, no released version of macOS contains LLVM 19
+            # TODO(ldionne) Please provide the correct value.
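+            # These short-hands are consumed by lit directives in tests; for
+            # instance, a test could be marked with
+            #   UNSUPPORTED: using-built-library-before-llvm-19
+            # and, because each short-hand ORs in its predecessor, the test is
+            # then disabled on every older system dylib as well.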
+ "using-built-library-before-llvm-18 || stdlib=apple-libc++ && target={{.+}}-apple-macosx{{.+}}", + cfg.available_features, + ), + ), + # Tests that require std::to_chars(floating-point) in the built library Feature( name="availability-fp_to_chars-missing", when=lambda cfg: BooleanExpression.evaluate( - "stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.13|10.14|10.15|11.0|12.0|13.0)(.0)?}}", + "using-built-library-before-llvm-13", cfg.available_features, ), ), @@ -539,7 +621,7 @@ DEFAULT_FEATURES += [ Feature( name="availability-char8_t_support-missing", when=lambda cfg: BooleanExpression.evaluate( - "stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.13|10.14|10.15|11.0)(.0)?}}", + "using-built-library-before-llvm-11", cfg.available_features, ), ), @@ -547,7 +629,7 @@ DEFAULT_FEATURES += [ Feature( name="availability-verbose_abort-missing", when=lambda cfg: BooleanExpression.evaluate( - "stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.13|10.14|10.15|11.0|12.0|13.0)(.0)?}}", + "using-built-library-before-llvm-13", cfg.available_features, ), ), @@ -555,7 +637,7 @@ DEFAULT_FEATURES += [ Feature( name="availability-pmr-missing", when=lambda cfg: BooleanExpression.evaluate( - "stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.13|10.14|10.15|11.0|12.0|13.0)(.0)?}}", + "using-built-library-before-llvm-13", cfg.available_features, ), ), @@ -579,8 +661,15 @@ DEFAULT_FEATURES += [ Feature( name="availability-tzdb-missing", when=lambda cfg: BooleanExpression.evaluate( - # TODO(ldionne) Please provide the correct value. - "(stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.13|10.14|10.15|11.0|12.0|13.0)(.0)?}})", + "using-built-library-before-llvm-19", + cfg.available_features, + ), + ), + # Tests that require support for and std::print in in the built library. + Feature( + name="availability-print-missing", + when=lambda cfg: BooleanExpression.evaluate( + "using-built-library-before-llvm-18", cfg.available_features, ), ), -- cgit v1.1 From a5cc1dc82d61c156f75edc72eccacdb6776bf3f1 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Fri, 9 Feb 2024 17:29:02 +0100 Subject: [NFC][libc++] Removes obsolete compiler support. (#80481) These work-arounds were slated for removal in LLVM-18, but missed the deadline. --- .../header_exportable_declarations.cpp | 32 ---------------------- 1 file changed, 32 deletions(-) diff --git a/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp b/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp index 5157a45..0a48f85 100644 --- a/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp +++ b/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp @@ -69,50 +69,18 @@ header_exportable_declarations::header_exportable_declarations( } std::optional list = Options.get("SkipDeclarations"); - // TODO(LLVM-17) Remove clang 15 work-around. 
-#if defined(__clang_major__) && __clang_major__ < 16 - if (list) { - std::string_view s = *list; - auto b = s.begin(); - auto e = std::find(b, s.end(), ' '); - while (b != e) { - skip_decls_.emplace(b, e); - if (e == s.end()) - break; - b = e + 1; - e = std::find(b, s.end(), ' '); - } - } -#else // defined(__clang_major__) && __clang_major__ < 16 if (list) for (auto decl : std::views::split(*list, ' ')) { std::string s; std::ranges::copy(decl, std::back_inserter(s)); // use range based constructor skip_decls_.emplace(std::move(s)); } -#endif // defined(__clang_major__) && __clang_major__ < 16 decls_ = skip_decls_; list = Options.get("ExtraDeclarations"); - // TODO(LLVM-17) Remove clang 15 work-around. -#if defined(__clang_major__) && __clang_major__ < 16 - if (list) { - std::string_view s = *list; - auto b = s.begin(); - auto e = std::find(b, s.end(), ' '); - while (b != e) { - std::cout << "using ::" << std::string_view{b, e} << ";\n"; - if (e == s.end()) - break; - b = e + 1; - e = std::find(b, s.end(), ' '); - } - } -#else // defined(__clang_major__) && __clang_major__ < 16 if (list) for (auto decl : std::views::split(*list, ' ')) std::cout << "using ::" << std::string_view{decl.data(), decl.size()} << ";\n"; -#endif // defined(__clang_major__) && __clang_major__ < 16 } header_exportable_declarations::~header_exportable_declarations() { -- cgit v1.1 From 7291761669dd63624ccaab30887aca7e9c7d3273 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Fri, 9 Feb 2024 17:30:30 +0100 Subject: [libc++] Fixes charconv operator bool tests. (#80598) This was spotted by @philnik. --- .../charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp | 4 ++-- .../charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp index b628a2c..a6aa590 100644 --- a/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp +++ b/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp @@ -28,13 +28,13 @@ constexpr bool test() { { std::from_chars_result value{nullptr, std::errc{}}; assert(bool(value) == true); - static_assert(noexcept(bool(true)) == true); + static_assert(noexcept(bool(value)) == true); } // False { std::from_chars_result value{nullptr, std::errc::value_too_large}; assert(bool(value) == false); - static_assert(noexcept(bool(true)) == true); + static_assert(noexcept(bool(value)) == true); } return true; diff --git a/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp index ef9364d..621eb8a 100644 --- a/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp +++ b/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp @@ -28,13 +28,13 @@ constexpr bool test() { { std::to_chars_result value{nullptr, std::errc{}}; assert(bool(value) == true); - static_assert(noexcept(bool(true)) == true); + static_assert(noexcept(bool(value)) == true); } // False { std::to_chars_result value{nullptr, std::errc::value_too_large}; assert(bool(value) == false); - static_assert(noexcept(bool(true)) == true); + static_assert(noexcept(bool(value)) == true); } return true; -- cgit v1.1 From 
b33b91a21788d439f49d6db4e7224c20f740f1a7 Mon Sep 17 00:00:00 2001
From: "Oleksandr \"Alex\" Zinenko" 
Date: Fri, 9 Feb 2024 17:35:14 +0100
Subject: [mlir] update transform dialect tutorials (#81199)

Use the "main" transform-interpreter pass instead of the test pass.
This, along with the previously introduced debug extension, now allows
tutorials to no longer depend on test passes and extensions.
---
 mlir/docs/Tutorials/transform/Ch1.md               | 347 +++++++++++----------
 mlir/docs/Tutorials/transform/Ch2.md               | 202 ++++++------
 mlir/docs/Tutorials/transform/Ch3.md               |  12 +-
 mlir/docs/Tutorials/transform/Ch4.md               |   2 +-
 .../transform/Ch2/transform-opt/transform-opt.cpp  |  22 +-
 .../transform/Ch3/transform-opt/transform-opt.cpp  |  26 +-
 .../transform/Ch4/transform-opt/transform-opt.cpp  |  12 -
 .../mlir/Dialect/Transform/Transforms/Passes.td    |   4 +
 .../Transforms/TransformInterpreterUtils.h         |   5 +
 .../mlir/Dialect/Transform/Utils/RaggedArray.h     |   3 +
 .../Transform/Transforms/InterpreterPass.cpp       |  24 +-
 .../Transforms/TransformInterpreterUtils.cpp       |  36 ++-
 .../Examples/transform/Ch1/invalidation-1.mlir     |  75 ++---
 .../Examples/transform/Ch1/invalidation-2.mlir     |  18 +-
 mlir/test/Examples/transform/Ch1/sequence.mlir     | 105 ++++---
 mlir/test/Examples/transform/Ch2/invalid.mlir      |  10 +-
 mlir/test/Examples/transform/Ch2/ops.mlir          |  15 +-
 mlir/test/Examples/transform/Ch2/sequence.mlir     |  99 +++---
 mlir/test/Examples/transform/Ch3/invalid.mlir      |  10 +-
 mlir/test/Examples/transform/Ch3/ops.mlir          |  28 +-
 mlir/test/Examples/transform/Ch3/sequence.mlir     | 113 +++----
 mlir/test/Examples/transform/ChH/full.mlir         |   6 +-
 22 files changed, 615 insertions(+), 559 deletions(-)

diff --git a/mlir/docs/Tutorials/transform/Ch1.md b/mlir/docs/Tutorials/transform/Ch1.md
index 7a299a4..b0fdf08 100644
--- a/mlir/docs/Tutorials/transform/Ch1.md
+++ b/mlir/docs/Tutorials/transform/Ch1.md
@@ -6,7 +6,7 @@ The Transform dialect allows one to precisely target transformations at specific
 
 Transform IR operations operate on values that may be associated with payload IR operations, values or attributes. We call the first two kinds of values operation and value handles, respectively. We call the last kind of values parameters.
 
-The application of transform IR always starts from one top-level operation. In the C++ API, this operation is passed to the `applyTransforms` function. This top-level operation specifies if other transformations should be performed and how. The most common top-level operation merely applies other transform operations listed in its body one after the other.
+The application of transform IR always starts from one top-level operation. In the C++ API, this operation is passed to the `applyTransforms` function. This top-level operation specifies if other transformations should be performed and how. The most common top-level operation, `transform.named_sequence`, merely applies other transform operations listed in its body one after the other, similarly to a function or a macro.
 
 Let us illustrate this with a simple sequence of transformations on the common “fully connected + bias + ReLU” ML layer, which boils down to performing a matrix multiplication, followed by an (elementwise) matrix addition and taking an elementwise maximum with 0. This can be expressed using the following IR:
This can be expressed using the following IR: @@ -14,7 +14,7 @@ Let us illustrate this with a simple sequence of transformations on the common func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>, %bias: tensor<512x512xf32>, %output: tensor<512x512xf32>) -> tensor<512x512xf32> { - // Matrix-matrix multiplication. + // Matrix-matrix multiplication. %matmul = linalg.matmul ins(%lhs, %rhs: tensor<512x512xf32>, tensor<512x512xf32>) outs(%output: tensor<512x512xf32>) -> tensor<512x512xf32> @@ -22,7 +22,7 @@ func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>, %biased = linalg.elemwise_binary { fun = #linalg.binary_fn } ins(%matmul, %bias : tensor<512x512xf32>, tensor<512x512xf32>) outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32> - + // Elementwise max with 0 (ReLU). %c0f = arith.constant 0.0 : f32 %relued = linalg.elemwise_binary { fun = #linalg.binary_fn } @@ -37,30 +37,34 @@ func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>, For performance reasons, we would like to tile and fuse these operations to exploit cache locality. This is a sequence of transformations that need to be performed one after another, so we naturally start with the corresponding top-level transform operation. ```mlir -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">): + transform.yield + } } ``` There are several aspects worth noticing in this operation. -The first entry block argument is mandatory for top-level transform operations and is associated with the top-level payload operation that sequence is applied to, for example, a module or a function. This operation is specified when calling `applyTransforms`. +Its special name, `@__transform_main` and the first argument are mandated by the interpreter pass, similarly to how the entry point of C programs needs to be called `main` and may have the `int (int argc, char** argv)` signature. This argument will be associated with the top-level payload operation, most often the operation that the pass is applied to. Note that none of this is required when applying the transformation _programmatically_ via `applyTransforms` or `applyNamedSequence`. The remaining entry block arguments are optional and can be associated with payload attributes, operations or values that are useful in the sequence. These are also specified when calling `applyTransforms`. In our case, we are interested in the matrix multiplication and elementwise operations that we are going to tile and fuse. All value handles have Transform dialect types. These types specify certain properties of the payload IR entities associated with them. In this example, `transform.any_op` indicates that the handle is associated with arbitrary payload operations. On the contrary, `transform.op<"X">` indicates that the handle is associated _only_ with payload operations of kind `X`. These constraints are verified when the handle/payload association is created. For entry block arguments of top-level transform operations, this happens early in the `applyTransforms` function. If the constraints are not satisfied, the transform application fails and produces diagnostics for the user. 
+Finally, the operation is wrapped in a module with the `transform.with_named_sequence` attribute that triggers all necessary verifications if multiple named sequences exist. + ## Failure Propagation -Speaking about diagnostics, the `sequence` operation itself has a mandatory attribute specifying the failure propagation mode. There are two options: +The Transform dialect infrastructure has a particular mechanism for handling diagnostics that supports recoverable errors. It is best understood by considering the (unnamed) sequence operation that has a mandatory attribute specifying the failure propagation mode. There are two options: * “propagate” makes the sequence transformation fail if any of the nested transformation fails; * “suppress” makes the sequence succeed even if one of the nested transformations fails, but without attempting to perform the transformations following the failed one in the sequence. -This latter allows the transformation to continue despite (recoverable) errors. As we are only building the transformation, it is preferable to propagate failures so we know when something did not apply. +This latter allows the transformation script surrounding the sequence to continue despite errors within the sequence, assuming they are recoverable. As we are only building the transformation script, it is preferable to propagate failures so we know when something did not apply. To check or debug a transform sequence, it is possible to print various entities associated with the transform IR values. For example, we can print the operations associated with the handles: @@ -83,27 +87,26 @@ Since we don’t want to recompile the compiler every time we change a transform ```sh -$ mlir-opt matmul.mlir --pass-pipeline=" - builtin.module(test-transform-dialect-interpreter{ - bind-first-extra-to-ops=linalg.matmul - bind-second-extra-to-ops=linalg.elemwise_binary})" +$ mlir-opt sequence.mlir --pass-pipeline=" + builtin.module(transform-interpreter{ + debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary})" ``` -The `matmul.mlir` file contains _both_ the payload IR function _and_ the transform IR sequence nested in the same module. The transform interpreter will find the first top-level transform operation in the root operation of the pass (the module in our case) and apply it to that root operation. In our case, we also asked the interpreter pass to associate the two extra arguments of the top-level sequence with all `linalg.matmul` and `linalg.elemwise_binary` payload operations through the respective pass options. Running this pass results in the expected remarks: +The `sequence.mlir` file contains _both_ the payload IR function _and_ the transform IR sequence nested in the same module. The transform interpreter pass will apply the `@__transform_main` named sequence to the anchor operation of the pass. In our case, we also asked the interpreter pass to associate the two extra arguments of the top-level sequence with all `linalg.matmul` and `linalg.elemwise_binary` payload operations through the respective pass options. 
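+For the programmatic route, the interpreter essentially boils down to a call to `applyTransforms`. The sketch below is illustrative rather than normative: the local variable names are invented, and the authoritative signature is the one declared in the Transform dialect interface headers.
+
+```cpp
+// Hedged sketch: apply the named sequence `transformEntryPoint` to
+// `payloadRoot`, binding the trailing sequence arguments to `extraMappings`.
+mlir::transform::TransformOptions options; // expensive checks are on by default
+if (failed(mlir::transform::applyTransforms(payloadRoot, transformEntryPoint,
+                                            extraMappings, options)))
+  return failure();
+```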
Running this pass results in the expected remarks: ```sh -matmul.mlir:7:13: remark: matmul +sequence.mlir:7:13: remark: matmul %matmul = linalg.matmul ins(%lhs, %rhs: tensor<512x512xf32>, tensor<512x512xf32>) ^ -matmul.mlir:7:13: note: see current operation: %0 = linalg.matmul ins(%arg0, %arg1 : tensor<512x512xf32>, tensor<512x512xf32>) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32> -matmul.mlir:10:13: remark: elemwise_binaries +sequence.mlir:7:13: note: see current operation: %0 = linalg.matmul ins(%arg0, %arg1 : tensor<512x512xf32>, tensor<512x512xf32>) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32> +sequence.mlir:10:13: remark: elemwise_binaries %biased = linalg.elemwise_binary { fun = #linalg.binary_fn } ^ -matmul.mlir:10:13: note: see current operation: %1 = linalg.elemwise_binary {fun = #linalg.binary_fn} ins(%0, %arg2 : tensor<512x512xf32>, tensor<512x512xf32>) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32> -matmul.mlir:14:13: remark: elemwise_binaries +sequence.mlir:10:13: note: see current operation: %1 = linalg.elemwise_binary {fun = #linalg.binary_fn} ins(%0, %arg2 : tensor<512x512xf32>, tensor<512x512xf32>) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32> +sequence.mlir:14:13: remark: elemwise_binaries %relued = linalg.elemwise_binary { fun = #linalg.binary_fn } ^ -matmul.mlir:14:13: note: see current operation: %2 = linalg.elemwise_binary {fun = #linalg.binary_fn} ins(%1, %cst : tensor<512x512xf32>, f32) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32> +sequence.mlir:14:13: note: see current operation: %2 = linalg.elemwise_binary {fun = #linalg.binary_fn} ins(%1, %cst : tensor<512x512xf32>, f32) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32> ``` Note that `%arg2` is associated with both elementwise payload operations. Any handle is associated with a list of entities. Individual transformations may or may not care about the order of elements in that list. @@ -114,26 +117,33 @@ Note that `%arg2` is associated with both elementwise payload operations. Any ha Now that we have handles to the operations we want to transform, we are ready to apply the transformations. Let us first try tiling the matmul operation itself. ```mlir -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // The actual tiling transformation takes tile sizes as attributes. - %loop, %tiled = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] - : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // The actual tiling transformation takes tile sizes as attributes. 
+ %loop, %tiled = transform.structured.tile_using_forall %arg1 + tile_sizes [4, 32] + : (!transform.op<"linalg.matmul">) + -> (!transform.any_op, !transform.any_op) + transform.yield + } } ``` -The transformation returns two handles, as indicated in its [documentation](https://mlir.llvm.org/docs/Dialects/Transform/#transformstructuredtile_using_forall-transformtiletoforallop): +The transformation returns two handles, as indicated in its [documentation](https://mlir.llvm.org/docs/Dialects/Transform/#transformstructuredtile_using_forall-transformtileusingforallop): -* A handle to the `scf.forall` “multi-for” loop around tensors. * A handle to `linalg.generic` operating on the subset of the original data. +* A handle to the `scf.forall` “multi-for” loop around tensors. Running this transformation with the same command as above expectedly produces the tiled code. ```mlir -func.func @fc_relu(%arg0: tensor<512x512xf32>, %arg1: tensor<512x512xf32>, %arg2: tensor<512x512xf32>, %arg3: tensor<512x512xf32>) -> tensor<512x512xf32> { +func.func @fc_relu(%arg0: tensor<512x512xf32>, + %arg1: tensor<512x512xf32>, + %arg2: tensor<512x512xf32>, + %arg3: tensor<512x512xf32>) -> tensor<512x512xf32> { %cst = arith.constant 0.000000e+00 : f32 %0 = scf.forall (%arg4, %arg5) in (128, 16) shared_outs(%arg6 = %arg3) -> (tensor<512x512xf32>) { %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) @@ -144,7 +154,7 @@ func.func @fc_relu(%arg0: tensor<512x512xf32>, %arg1: tensor<512x512xf32>, %arg2 : tensor<512x512xf32> to tensor<512x32xf32> %extracted_slice_1 = tensor.extract_slice %arg6[%3, %4] [4, 32] [1, 1] : tensor<512x512xf32> to tensor<4x32xf32> - %5 = linalg.matmul + %5 = linalg.matmul ins(%extracted_slice, %extracted_slice_0 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%extracted_slice_1 : tensor<4x32xf32>) -> tensor<4x32xf32> @@ -168,78 +178,79 @@ Besides producing new handles, the tiling transform operation _consumes_ the ope ## Handle Invalidation and Expensive Checks Mode -Undefined behavior is difficult to grapple with when it does happen, so the Transform dialect interpreter provides a set of additional expensive checks that detect most undefined behavior in the transform IR. For example, if we wanted to use the `%arg1` handle after it is consumed, it would cause undefined behavior that manifests as an assertion in the debug build, and likely as a segmentation fault in the release mode. +Undefined behavior is difficult to grapple with when it does happen, so the Transform dialect interpreter defaults to performing a set of additional, potentially expensive, checks that detect most undefined behavior in the transform IR. For example, if we wanted to use the `%arg1` handle after it is consumed, it would cause undefined behavior that manifests as an assertion in the debug build, and likely as a segmentation fault in the release mode. ```mlir -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // The actual tiling transformation takes tile sizes as attributes. - %loop, %tiled = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] - : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) - - // This is trying to use an invalidated handle leading to undefined behavior. 
- transform.debug.emit_remark_at %arg1, "remark" : !transform.op<"linalg.matmul"> - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // The actual tiling transformation takes tile sizes as attributes. + %loop, %tiled = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] + : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) + + // This is trying to use an invalidated handle leading to undefined behavior. + transform.debug.emit_remark_at %arg1, "remark" : !transform.op<"linalg.matmul"> + transform.yield + } } ``` However, with the expensive checks enabled in the interpreter, a nice diagnostic is produced: ```sh -$ mlir-opt matmul.mlir --pass-pipeline=" - builtin.module(test-transform-dialect-interpreter{ - bind-first-extra-to-ops=linalg.matmul - bind-second-extra-to-ops=linalg.elemwise_binary - enable-expensive-checks})" -``` - -```sh -matmul.mlir:28:3: error: op uses a handle invalidated by a previously executed transform op +sequence.mlir:28:3: error: op uses a handle invalidated by a previously executed transform op transform.debug.emit_remark_at %mm, "elemwise_binaries" : !transform.any_op ^ -matmul.mlir:26:9: note: handle to invalidated ops +sequence.mlir:26:9: note: handle to invalidated ops %mm = transform.cast %matmul : !transform.op<"linalg.matmul"> to !transform.any_op ^ -matmul.mlir:27:19: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them +sequence.mlir:27:19: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them %loop, %tiled = transform.structured.tile_using_forall %mm tile_sizes [4, 32] ``` -One may observe that some operations such as `transform.cast` do not consume the operand (because they don’t erase the corresponding operation). So what would happen if we tried to use that operand instead? +When compile-time performance is a concern, and the transformation sequence is sufficiently stable, it is possible to disable expensive checks in the interpreter for improved performance by providing the `disable-expensive-checks` option to the pass or by setting the corresponding flag in the `TransformOptions` passed into `applyTransforms`. + +One may observe that some operations such as `transform.cast` do not consume the operand (because they don’t erase the corresponding operation). So what would happen if we tried to use that operand instead? ```mlir -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // We can cast one type to another as long as operations are compatible - // with both types. This creates "aliasing" handles. - %casted = transform.cast %arg1 : !transform.op<"linalg.matmul"> - to !transform.any_op - - // The actual tiling transformation takes tile sizes as attributes. 
-  %loop, %tiled = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32]
-    : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op)
-
-  // Consuming an operand invalidates the consumed handle and any other handle that is
-  // associated with the same payload operations, or payload operations nested in them.
-  transform.debug.emit_remark_at %casted, "remark"
-    : !transform.any_op
-  transform.yield
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(
+      %arg0: !transform.any_op,
+      %arg1: !transform.op<"linalg.matmul">,
+      %arg2: !transform.op<"linalg.elemwise_binary">) {
+    // We can cast one type to another as long as operations are compatible
+    // with both types. This creates "aliasing" handles.
+    %casted = transform.cast %arg1 : !transform.op<"linalg.matmul">
+        to !transform.any_op
+
+    // The actual tiling transformation takes tile sizes as attributes.
+    %loop, %tiled = transform.structured.tile_using_forall %arg1
+                    tile_sizes [4, 32]
+      : (!transform.op<"linalg.matmul">)
+      -> (!transform.any_op, !transform.any_op)
+
+    // Consuming an operand invalidates the consumed handle and any other handle
+    // that is associated with the same payload operations, or payload
+    // operations nested in them.
+    transform.debug.emit_remark_at %casted, "remark"
+        : !transform.any_op
+    transform.yield
+  }
 }
 ```
 
 Both `%arg1` and `%casted` reference the same payload operation. Extending the reference analogy, these references alias. Naturally, when the payload operation is erased, all references to it become dangling. This is also the case for handles. In fact, consuming an operand invalidates the operand handle as well as any other handle that is associated with any of the same payload operations. The payload IR consideration is recursive: a handle associated with a payload operation _nested_ in the erased one is also invalidated (because erasing the operation also erases its regions and all contained operations). The expensive-checks mode can also handle this case.
 
 ```sh
-matmul.mlir:28:3: error: op uses a handle invalidated by a previously executed transform op
+sequence.mlir:28:3: error: op uses a handle invalidated by a previously executed transform op
   transform.debug.emit_remark_at %matmul, "elemwise_binaries" : !transform.op<"linalg.matmul">
   ^
-matmul.mlir:21:29: note: handle to invalidated ops
+sequence.mlir:21:29: note: handle to invalidated ops
 ^bb0(%root: !transform.any_op, %matmul: !transform.op<"linalg.matmul">, %elemwise: !transform.op<"linalg.elemwise_binary">):
                             ^
-matmul.mlir:27:19: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them
+sequence.mlir:27:19: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them
   %loop, %tiled = transform.structured.tile_using_forall %mm tile_sizes [4, 32]
 ```
 
@@ -248,39 +259,41 @@ matmul.mlir:27:19: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them
 Going back to the transformation sequence, we have tiled the matrix multiplication, but we also want to tile and fuse the elementwise operations. The typical way of doing this in the structured operations paradigm is to tile the last operation in some acyclic dataflow graph, and then progressively fuse the operations that produce its operands.
This removes the need to explicitly tile all operations as fusion can adapt their sizes and inject recomputation if desired. So instead of tiling the matmul operation, we are going to tile the last operation in the chain, and then fuse the preceding operations into the loops produced by tiling. ```mlir -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // Since the %arg2 handle is associated with both elementwise operations, - // we need to split it into two handles so we can target only the second - // elementwise operation. - %add, %max = transform.split_handle %arg2 - : (!transform.op<"linalg.elemwise_binary">) - -> (!transform.any_op, !transform.any_op) - - // The actual tiling transformation takes tile sizes as attributes. It - // produces a handle to the loop generated during tiling. - %tiled_max, %loop = - transform.structured.tile_using_forall %max tile_sizes [8, 32] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // We can now fuse the other operations into the loop. Here, we fuse - // operations one by one. This requires the operation that is being fused to - // define the value used within the loop, so the order of such fusions is - // important. We could also use "transform.merge_handles" to obtain a single - // handle to all operations and give it to `fuse_into_containing_op` that - // would take care of the ordering in this case. - %add_fused, %loop_0 = - transform.structured.fuse_into_containing_op %add into %loop - : (!transform.any_op, !transform.any_op) - -> (!transform.any_op, !transform.any_op) - %matmul_fused, %loop_1 = - transform.structured.fuse_into_containing_op %arg1 into %loop_0 - : (!transform.op<"linalg.matmul">, !transform.any_op) - -> (!transform.any_op, !transform.any_op) +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 + : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) - transform.yield + // The actual tiling transformation takes tile sizes as attributes. It + // produces a handle to the loop generated during tiling. + %tiled_max, %loop = + transform.structured.tile_using_forall %max tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // We can now fuse the other operations into the loop. Here, we fuse + // operations one by one. This requires the operation that is being fused to + // define the value used within the loop, so the order of such fusions is + // important. We could also use "transform.merge_handles" to obtain a single + // handle to all operations and give it to `fuse_into_containing_op` that + // would take care of the ordering in this case. 
+ %add_fused, %loop_0 = + transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) + -> (!transform.any_op, !transform.any_op) + %matmul_fused, %loop_1 = + transform.structured.fuse_into_containing_op %arg1 into %loop_0 + : (!transform.op<"linalg.matmul">, !transform.any_op) + -> (!transform.any_op, !transform.any_op) + + transform.yield + } } ``` @@ -291,64 +304,68 @@ This achieves the desired tiling and fusion. Finally, let us assume there exists an efficient microkernel, or a hardware instruction expressed as an intrinsic function, for a 4x4 matrix multiplication. For this purpose, we need to tile the fused operation to the desired size, and then outline it. The resulting function call can then be replaced with a call to the microkernel. ```mlir -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // Since the %arg2 handle is associated with both elementwise operations, - // we need to split it into two handles so we can target only the second - // elementwise operation. - %add, %max = transform.split_handle %arg2 - : (!transform.op<"linalg.elemwise_binary">) - -> (!transform.any_op, !transform.any_op) - - // The actual tiling transformation takes tile sizes as attributes. It - // produces a handle to the loop generated during tiling. - %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // We can now fuse the other operations into the loop. Here, we fuse - // operations one by one. This requires the operation that is being fused to - // define the value used within the loop, so the order of such fusions is - // important. We could also use "transform.merge_handles" to obtain a single - // handle to all operations and give it to `fuse_into_containing_op` that - // would take care of the ordering in this case. - %add_fused, %loop_0 = - transform.structured.fuse_into_containing_op %add into %loop - : (!transform.any_op, !transform.any_op) - -> (!transform.any_op, !transform.any_op) - %matmul_fused, %loop_1 = - transform.structured.fuse_into_containing_op %arg1 into %loop_0 - : (!transform.op<"linalg.matmul">, !transform.any_op) +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 + : (!transform.op<"linalg.elemwise_binary">) -> (!transform.any_op, !transform.any_op) - // Tile again to get the desired size. Note that this time this tiles the - // "add" operation and fuses matmul into the loop, but doesn't affect the - // "max" operation. This illustrates the precise targeting with the transform - // dialect. Otherwise, it is difficult to differentiate "add" and "max", both - // of which having the same kind. - %tiled_2, %loop_2 = - transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] + // The actual tiling transformation takes tile sizes as attributes. It + // produces a handle to the loop generated during tiling. 
+ %tiled, %loop = transform.structured.tile_using_forall %max + tile_sizes [8, 32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused_2, %loop_3 = - transform.structured.fuse_into_containing_op %matmul_fused into %loop_2 - : (!transform.any_op, !transform.any_op) - -> (!transform.any_op, !transform.any_op) - // Since outlining is currently only implemented for region-holding operations - such as loops, use tiling to size 1 to materialize the outer loop that is - going to be outlined. - %_, %outline_target = - transform.structured.tile_using_forall %tiled_2 tile_sizes [1] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %matmul_fused_2 - into %outline_target - : (!transform.any_op, !transform.any_op) - -> (!transform.any_op, !transform.any_op) - %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} - : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">) - - transform.yield + // We can now fuse the other operations into the loop. Here, we fuse + // operations one by one. This requires the operation that is being fused to + // define the value used within the loop, so the order of such fusions is + // important. We could also use "transform.merge_handles" to obtain a single + // handle to all operations and give it to `fuse_into_containing_op` that + // would take care of the ordering in this case. + %add_fused, %loop_0 = + transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) + -> (!transform.any_op, !transform.any_op) + %matmul_fused, %loop_1 = + transform.structured.fuse_into_containing_op %arg1 into %loop_0 + : (!transform.op<"linalg.matmul">, !transform.any_op) + -> (!transform.any_op, !transform.any_op) + + // Tile again to get the desired size. Note that this time this tiles the + // "add" operation and fuses matmul into the loop, but doesn't affect the + // "max" operation. This illustrates the precise targeting with the + // transform dialect. Otherwise, it is difficult to differentiate "add" and + // "max", both of which have the same kind. + %tiled_2, %loop_2 = + transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused_2, %loop_3 = + transform.structured.fuse_into_containing_op %matmul_fused into %loop_2 + : (!transform.any_op, !transform.any_op) + -> (!transform.any_op, !transform.any_op) + + // Since outlining is currently only implemented for region-holding + // operations such as loops, use tiling to size 1 to materialize the outer + // loop that is going to be outlined.
+ %_, %outline_target = + transform.structured.tile_using_forall %tiled_2 tile_sizes [1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.structured.fuse_into_containing_op %matmul_fused_2 + into %outline_target + : (!transform.any_op, !transform.any_op) + -> (!transform.any_op, !transform.any_op) + %func, %call = transform.loop.outline %outline_target + {func_name = "outlined"} + : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">) + + transform.yield + } } ``` diff --git a/mlir/docs/Tutorials/transform/Ch2.md b/mlir/docs/Tutorials/transform/Ch2.md index ac6d7d42..1aaefd2 100644 --- a/mlir/docs/Tutorials/transform/Ch2.md +++ b/mlir/docs/Tutorials/transform/Ch2.md @@ -10,37 +10,40 @@ The Transform dialect uses the dialect extension mechanism to allow additional o // In MyExtension.cpp. #include "mlir/Dialect/Transform/IR/TransformDialect.h" -// Define a new Transform dialect extension. This uses the CRTP idiom to identify -// extensions. +// Define a new Transform dialect extension. This uses the CRTP idiom to +// identify extensions. class MyExtension : public ::mlir::transform::TransformDialectExtension<MyExtension> { public: // The extension must derive the base constructor. using Base::Base; - // This function initializes the extension, similarly to `initialize` in dialect - // definitions. List individual operations and dependent dialects here. + // This function initializes the extension, similarly to `initialize` in + // dialect definitions. List individual operations and dependent dialects + // here. void init(); }; void MyExtension::init() { - // Similarly to dialects, an extension can declare a dependent dialect. This dialect - // will be loaded along with the extension and, therefore, along with the Transform - // dialect. Only declare as dependent the dialects that contain the attributes or - // types used by transform operations. Do NOT declare as dependent the dialects - // produced during the transformation. + // Similarly to dialects, an extension can declare a dependent dialect. This + // dialect will be loaded along with the extension and, therefore, along with + // the Transform dialect. Only declare as dependent the dialects that contain + // the attributes or types used by transform operations. Do NOT declare as + // dependent the dialects produced during the transformation. + // // declareDependentDialect<MyDialect>(); - // When transformations are applied, they may produce new operations from previously - // unloaded dialects. Typically, a pass would need to declare itself dependent on - // the dialects containing such new operations. To avoid confusion with the dialects - // the extension itself depends on, the Transform dialects differentiates between: + // When transformations are applied, they may produce new operations from + // previously unloaded dialects. Typically, a pass would need to declare + // itself dependent on the dialects containing such new operations. To avoid + // confusion with the dialects the extension itself depends on, the Transform + // dialect differentiates between: // - dependent dialects, which are used by the transform operations, and - // - generated dialects, which contain the entities (attributes, operations, - // types) that may be produced by applying the transformation even when not - // present in the original payload IR.
- // In the following chapter, we will be add operations that generate function calls - // and structured control flow operations, so let's declare the corresponding - // dialects as generated. + // - generated dialects, which contain the entities (attributes, operations, + // types) that may be produced by applying the transformation even when + // not present in the original payload IR. + // In the following chapter, we will be adding operations that generate function + // calls and structured control flow operations, so let's declare the + // corresponding dialects as generated. declareGeneratedDialect<::mlir::scf::SCFDialect>(); declareGeneratedDialect<::mlir::func::FuncDialect>(); @@ -89,7 +92,7 @@ mlir_tablegen(MyExtension.cpp.inc -gen-op-defs) # Add a CMakeTarget we can depend on to ensure the generation happens before the compilation. add_public_tablegen_target(MyExtensionIncGen) -# Don't forget to generate the documentation, this will produce a MyExtension.md under +# Don't forget to generate the documentation, this will produce a MyExtension.md under # Dialects. add_mlir_doc(MyExtension MyExtension Dialects/ -gen-op-doc) ``` @@ -103,7 +106,8 @@ add_mlir_library( # Built from the following source files. MyExtension.cpp - # Make sure ODS declaration and definitions are generated before compiling this. + # Make sure ODS declarations and definitions are generated before compiling + # this. DEPENDS MyExtensionIncGen @@ -136,10 +140,10 @@ This will generate two files, `MyExtension.h.inc` and `MyExtension.cpp.inc`, tha void MyExtension::init() { // … - // Finally, we register the additional transform operations with the dialect. List all - // operations generated from ODS. This call will perform additional checks that the - // operations implement the transform and memory effect interfaces required by the - // dialect interpreter and assert if they do not. + // Finally, we register the additional transform operations with the dialect. + // List all operations generated from ODS. This call will perform additional + // checks that the operations implement the transform and memory effect + // interfaces required by the dialect interpreter and assert if they do not. registerTransformOps< #define GET_OP_LIST #include "MyExtension.cpp.inc" @@ -154,34 +158,36 @@ With this setup, we are now ready to define the new transform operation to rewri ```tablegen // In MyExtension.td. -// Define the new operation. By convention, prefix its name with the name of the dialect -// extension, "my.". The full operation name will be further prefixed with "transform.". +// Define the new operation. By convention, prefix its name with the name of the +// dialect extension, "my.". The full operation name will be further prefixed +// with "transform.". def ChangeCallTargetOp : Op<Transform_Dialect, "my.change_call_target", [DeclareOpInterfaceMethods<TransformOpInterface>, DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> { - // Provide a brief and a full description. It is recommended that the latter describes - // the effects on the operands and how the operation processes various failure modes. + // Provide a brief and a full description. It is recommended that the latter + // describes the effects on the operands and how the operation processes + // various failure modes. let summary = "Changes the callee of a call operation to the specified one"; let description = [{ - For each `func.call` payload operation associated with the handle, changes its - callee to be the symbol whose name is provided as an attribute to this operation.
+ For each `func.call` payload operation associated with the handle, changes + its callee to be the symbol whose name is provided as an attribute to this operation. - Generates a silenceable failure if the operand is associated with payload operations - that are not `func.call`. - Only reads the operand. + Generates a silenceable failure if the operand is associated with payload operations that are not `func.call`. Only reads the operand. }]; - // The arguments include the handle to the payload operations and the attribute that - // specifies the new callee. The handle must implement TransformHandleTypeInterface. - // We use a string attribute as the symbol may not exist in the transform IR so the - // verification may fail. + // The arguments include the handle to the payload operations and the + // attribute that specifies the new callee. The handle must implement + // TransformHandleTypeInterface. + // We use a string attribute as the symbol may not exist in the transform IR + // so the verification may fail. let arguments = (ins TransformHandleTypeInterface:$call, StrAttr:$new_target); - // The results are empty as the transformation does not produce any new payload. + // The results are empty as the transformation does not produce any new + // payload. let results = (outs); // Provide nice syntax. @@ -224,8 +230,8 @@ must be modified with the provided rewriter. // It can also carry additional user-defined state. ::mlir::transform::TransformState &state) { - // First, we need to obtain the list of payload operations that are associated with - // the operand handle. + // First, we need to obtain the list of payload operations that are associated + // with the operand handle. auto payload = state.getPayloadOps(getCall()); // Then, we iterate over the list of operands and call the actual IR-mutating @@ -280,56 +286,66 @@ void registerMyExtension(::mlir::DialectRegistry ®istry) { After registering the extension, it becomes possible to use our new operation in the Transform dialect interpreter. The upstream testing pass can be used as is. ```mlir -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // Since the %arg2 handle is associated with both elementwise operations, - // we need to split it into two handles so we can target only the second - // elementwise operation. - %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) - -> (!transform.any_op, !transform.any_op) - - // The actual tiling transformation takes tile sizes as attributes. It produces a - // handle to the loop generated during tiling. - %loop, %tiled = transform.structured.tile_using_forall %max tile_sizes [8, 32] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // We can now fuse the other operations into the loop. Here, we fuse - // operations one-by-one. This requires the operation that is being fused - // to define the value used within the loop, so the order of such fusions - // is important. We could also use "transform.merge_handles" to obtain - // a single handle to all operations and give it to `fuse_into_containing_op` - // that would take care of the ordering in this case. 
- %add_fused = transform.structured.fuse_into_containing_op %add into %loop - : (!transform.any_op, !transform.any_op) -> !transform.any_op - %matmul_fused = transform.structured.fuse_into_containing_op %arg1 into %loop - : (!transform.op<"linalg.matmul">, !transform.any_op) -> !transform.any_op - - // Tile again to get the desired size. Note that this time this tiles the - // "add" operation and fuses matmul into the loop, but doesn't affect the - // "max" operation. This illustrates the precise targeting with the transform - // dialect. Otherwise, it is difficult to differentiate "add" and "max", both - // of which having the same kind. - %loop_2, %tiled_2 = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused_2 = transform.structured.fuse_into_containing_op %matmul_fused into %loop_2 - : (!transform.any_op, !transform.any_op) -> !transform.any_op - - // Since outlining is currently only implemented for region-holding operations - // such as loops, use tiling to size 1 to materialize the outer loop that is - // going to be outlined. - %outline_target, %_ = transform.structured.tile_using_forall %tiled_2 tile_sizes [1] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %matmul_fused_2 into %outline_target - : (!transform.any_op, !transform.any_op) -> !transform.any_op - %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Rewrite the call target. - transform.my.change_call_target %call, "microkernel" : !transform.any_op - - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 + : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) + + // The actual tiling transformation takes tile sizes as attributes. It + // produces a handle to the loop generated during tiling. + %loop, %tiled = transform.structured.tile_using_forall %max + tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // We can now fuse the other operations into the loop. Here, we fuse + // operations one-by-one. This requires the operation that is being fused + // to define the value used within the loop, so the order of such fusions + // is important. We could also use "transform.merge_handles" to obtain + // a single handle to all operations and give it to + // `fuse_into_containing_op` that would take care of the ordering in this + // case. + %add_fused = transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) -> !transform.any_op + %matmul_fused = transform.structured.fuse_into_containing_op %arg1 + into %loop + : (!transform.op<"linalg.matmul">, !transform.any_op) + -> !transform.any_op + + // Tile again to get the desired size. Note that this time this tiles the + // "add" operation and fuses matmul into the loop, but doesn't affect the + // "max" operation. This illustrates the precise targeting with the + // transform dialect. 
Otherwise, it is difficult to differentiate "add" and + // "max", both of which have the same kind. + %loop_2, %tiled_2 = transform.structured.tile_using_forall %add_fused + tile_sizes [4, 4] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused_2 = transform.structured.fuse_into_containing_op %matmul_fused + into %loop_2 + : (!transform.any_op, !transform.any_op) -> !transform.any_op + + // Since outlining is currently only implemented for region-holding + // operations such as loops, use tiling to size 1 to materialize the outer + // loop that is going to be outlined. + %outline_target, %_ = transform.structured.tile_using_forall %tiled_2 tile_sizes [1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.structured.fuse_into_containing_op %matmul_fused_2 into %outline_target + : (!transform.any_op, !transform.any_op) -> !transform.any_op + %func, %call = transform.loop.outline %outline_target + {func_name = "outlined"} + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Rewrite the call target. + transform.my.change_call_target %call, "microkernel" : !transform.any_op + + transform.yield + } } ``` diff --git a/mlir/docs/Tutorials/transform/Ch3.md b/mlir/docs/Tutorials/transform/Ch3.md index 84251df..fa788d1 100644 --- a/mlir/docs/Tutorials/transform/Ch3.md +++ b/mlir/docs/Tutorials/transform/Ch3.md @@ -79,7 +79,7 @@ def CallOpInterfaceHandle // The type must implement `TransformHandleTypeInterface`. [DeclareTypeInterfaceMethods<TransformHandleTypeInterface>]> { - // The usual components of a type such as description, mnemonic and assembly format + // The usual components of a type such as description, mnemonic and assembly format // should be provided. let summary = "handle to payload operations implementing CallOpInterface"; let mnemonic = "my.call_op_interface"; @@ -87,7 +87,7 @@ } ``` -We will omit the generation of declaration and definitions using Tablegen for brevity as it is identical to the regular case. +We will omit the generation of declarations and definitions using Tablegen for brevity as it is identical to the regular case. To finalize the definition of a transform type, one must implement the interface methods. @@ -109,9 +109,9 @@ mlir::transform::CallOpInterfaceHandleType::checkPayload( if (llvm::isa<CallOpInterface>(op)) continue; - // By convention, these verifiers always emit a silenceable failure since they are + // By convention, these verifiers always emit a silenceable failure since they are // checking a precondition. - DiagnosedSilenceableFailure diag = emitSilenceableError(loc) + DiagnosedSilenceableFailure diag = emitSilenceableError(loc) << "expected the payload operation to implement CallOpInterface"; diag.attachNote(op->getLoc()) << "offending operation"; return diag; @@ -129,8 +129,8 @@ Additional attributes and types need to be registered in the extension, next to // In MyExtension.cpp. void MyExtension::init() { - // … - + // ... + registerTypes< #define GET_TYPEDEF_LIST #include "MyExtensionTypes.cpp.inc" diff --git a/mlir/docs/Tutorials/transform/Ch4.md b/mlir/docs/Tutorials/transform/Ch4.md index 9c9aba1..ad5221c 100644 --- a/mlir/docs/Tutorials/transform/Ch4.md +++ b/mlir/docs/Tutorials/transform/Ch4.md @@ -205,7 +205,7 @@ transform.named_sequence @__transform_main( %root: !transform.any_op {transform.readonly}) { // Collect groups of operations that match the criteria specified in the // named sequence.
- %matmul, %el1, %el2 = transform.collect_matching @match_matmul_elemwise in %root + %matmul, %el1, %el2 = transform.collect_matching @match_matmul_elemwise in %root : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) %elemwise = transform.merge_handles %el1, %el2 : !transform.any_op diff --git a/mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp b/mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp index 3a97531..874ad78 100644 --- a/mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp +++ b/mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp @@ -12,6 +12,7 @@ #include "MyExtension.h" +#include "mlir/Dialect/Transform/Transforms/Passes.h" #include "mlir/IR/DialectRegistry.h" #include "mlir/IR/MLIRContext.h" #include "mlir/InitAllDialects.h" @@ -20,14 +21,6 @@ #include "mlir/Transforms/Passes.h" #include <cstdlib> -// Forward declarations of test passes that used in this chapter for -// illustrative purposes. Test passes are not directly exposed for use in -// binaries other than mlir-opt, which is too big to serve as an example. -namespace mlir::test { -void registerTestTransformDialectEraseSchedulePass(); -void registerTestTransformDialectInterpreterPass(); -} // namespace mlir::test - namespace test { void registerTestTransformDialectExtension(mlir::DialectRegistry &); } // namespace test @@ -39,22 +32,15 @@ int main(int argc, char **argv) { mlir::registerAllExtensions(registry); registerMyExtension(registry); + // Register transform interpreter pass. + mlir::transform::registerInterpreterPass(); + // Register a handful of cleanup passes that we can run to make the output IR // look nicer. mlir::registerCanonicalizerPass(); mlir::registerCSEPass(); mlir::registerSymbolDCEPass(); - // Register the test passes. -#ifdef MLIR_INCLUDE_TESTS - mlir::test::registerTestTransformDialectEraseSchedulePass(); - mlir::test::registerTestTransformDialectInterpreterPass(); - test::registerTestTransformDialectExtension(registry); -#else - llvm::errs() << "warning: MLIR built without test passes, interpreter " - "testing will not be available\n"; -#endif // MLIR_INCLUDE_TESTS - // Delegate to the MLIR utility for parsing and pass management. return mlir::MlirOptMain(argc, argv, "transform-opt-ch2", registry) .succeeded() diff --git a/mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp b/mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp index 3c348c6..c9150c6 100644 --- a/mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp +++ b/mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp @@ -12,6 +12,7 @@ #include "MyExtension.h" +#include "mlir/Dialect/Transform/Transforms/Passes.h" #include "mlir/IR/DialectRegistry.h" #include "mlir/IR/MLIRContext.h" #include "mlir/InitAllDialects.h" @@ -20,18 +21,6 @@ #include "mlir/Transforms/Passes.h" #include <cstdlib> -// Forward declarations of test passes that used in this chapter for -// illustrative purposes. Test passes are not directly exposed for use in -// binaries other than mlir-opt, which is too big to serve as an example. -namespace mlir::test { -void registerTestTransformDialectEraseSchedulePass(); -void registerTestTransformDialectInterpreterPass(); -} // namespace mlir::test - -namespace test { -void registerTestTransformDialectExtension(mlir::DialectRegistry &); -} // namespace test - int main(int argc, char **argv) { // Register all "core" dialects and our transform dialect extension.
mlir::DialectRegistry registry; @@ -39,22 +28,15 @@ int main(int argc, char **argv) { mlir::registerAllExtensions(registry); registerMyExtension(registry); + // Register the interpreter pass. + mlir::transform::registerInterpreterPass(); + // Register a handful of cleanup passes that we can run to make the output IR // look nicer. mlir::registerCanonicalizerPass(); mlir::registerCSEPass(); mlir::registerSymbolDCEPass(); - // Register the test passes. -#ifdef MLIR_INCLUDE_TESTS - mlir::test::registerTestTransformDialectEraseSchedulePass(); - mlir::test::registerTestTransformDialectInterpreterPass(); - test::registerTestTransformDialectExtension(registry); -#else - llvm::errs() << "warning: MLIR built without test passes, interpreter " - "testing will not be available\n"; -#endif // MLIR_INCLUDE_TESTS - // Delegate to the MLIR utility for parsing and pass management. return mlir::MlirOptMain(argc, argv, "transform-opt-ch3", registry) .succeeded() diff --git a/mlir/examples/transform/Ch4/transform-opt/transform-opt.cpp b/mlir/examples/transform/Ch4/transform-opt/transform-opt.cpp index 1019066..03c84bd 100644 --- a/mlir/examples/transform/Ch4/transform-opt/transform-opt.cpp +++ b/mlir/examples/transform/Ch4/transform-opt/transform-opt.cpp @@ -21,10 +21,6 @@ #include "mlir/Transforms/Passes.h" #include <cstdlib> -namespace test { -void registerTestTransformDialectExtension(mlir::DialectRegistry &); -} // namespace test - int main(int argc, char **argv) { // Register all "core" dialects and our transform dialect extension. mlir::DialectRegistry registry; @@ -39,14 +35,6 @@ int main(int argc, char **argv) { mlir::registerSymbolDCEPass(); mlir::transform::registerInterpreterPass(); - // Register the test passes. -#ifdef MLIR_INCLUDE_TESTS - test::registerTestTransformDialectExtension(registry); -#else - llvm::errs() << "warning: MLIR built without test extension, interpreter " - "testing will not be available\n"; -#endif // MLIR_INCLUDE_TESTS - // Delegate to the MLIR utility for parsing and pass management. return mlir::MlirOptMain(argc, argv, "transform-opt-ch4", registry) .succeeded() diff --git a/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td b/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td index c3436fd..1d6eb24 100644 --- a/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td @@ -75,6 +75,10 @@ def InterpreterPass : Pass<"transform-interpreter"> { "Select the operation with 'transform.target_tag' attribute having " "the given value as payload IR root.
If empty select the pass " "anchor operation as the payload IR root.">, + ListOption<"debugBindTrailingArgs", "debug-bind-trailing-args", + "std::string", + "Binds trailing arguments of the entry point to the payload " + "operations with specified names.">, Option<"disableExpensiveChecks", "disable-expensive-checks", "bool", "false", "Disable expensive checks in the interpreter for a faster run.">, diff --git a/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h b/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h index 1737d72..738e0c5 100644 --- a/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h +++ b/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h @@ -84,6 +84,11 @@ LogicalResult applyTransformNamedSequence(Operation *payload, ModuleOp transformModule, const TransformOptions &options); +LogicalResult applyTransformNamedSequence(RaggedArray<MappedValue> bindings, + TransformOpInterface transformRoot, + ModuleOp transformModule, + const TransformOptions &options); + } // namespace transform } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Transform/Utils/RaggedArray.h b/mlir/include/mlir/Dialect/Transform/Utils/RaggedArray.h index 0ee2391..3d4083b 100644 --- a/mlir/include/mlir/Dialect/Transform/Utils/RaggedArray.h +++ b/mlir/include/mlir/Dialect/Transform/Utils/RaggedArray.h @@ -150,6 +150,9 @@ public: slices.resize(slices.size() + num, std::pair(-1, 0)); } + /// Removes the first subarray in-place. Invalidates iterators to all rows. + void removeFront() { slices.erase(slices.begin()); } + private: /// Appends the given elements to the storage and returns an ArrayRef /// pointing to them in the storage. diff --git a/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp b/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp index c875519..5073234 100644 --- a/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp +++ b/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" #include "mlir/Dialect/Transform/Transforms/Passes.h" #include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h" @@ -64,6 +65,20 @@ public: transform::detail::getPreloadedTransformModule(context); Operation *payloadRoot = findPayloadRoot(getOperation(), debugPayloadRootTag); + if (!payloadRoot) + return signalPassFailure(); + auto debugBindNames = llvm::map_to_vector( + debugBindTrailingArgs, + [&](const std::string &name) { return OperationName(name, context); }); + SmallVector<SmallVector<Operation *>, 2> trailingBindings; + trailingBindings.resize(debugBindNames.size()); + payloadRoot->walk([&](Operation *payload) { + for (auto &&[position, name] : llvm::enumerate(debugBindNames)) { + if (payload->getName() == name) + trailingBindings[position].push_back(payload); + } + }); + Operation *transformEntryPoint = transform::detail::findTransformEntryPoint( getOperation(), transformModule, entryPoint); if (!transformEntryPoint) { @@ -73,8 +88,15 @@ public: return signalPassFailure(); } + RaggedArray<transform::MappedValue> bindings; + bindings.push_back(ArrayRef<Operation *>{payloadRoot}); + for (SmallVector<Operation *> &trailing : trailingBindings) + bindings.push_back(std::move(trailing)); + if (failed(transform::applyTransformNamedSequence( - payloadRoot, transformEntryPoint, transformModule, + bindings, + cast<transform::TransformOpInterface>(transformEntryPoint), + transformModule,
options.enableExpensiveChecks(!disableExpensiveChecks)))) { return signalPassFailure(); } diff --git a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp index 2f74b76..8a9cd7c 100644 --- a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp +++ b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp @@ -191,22 +191,46 @@ LogicalResult transform::detail::assembleTransformLibraryFromPaths( LogicalResult transform::applyTransformNamedSequence( Operation *payload, Operation *transformRoot, ModuleOp transformModule, const TransformOptions &options) { + RaggedArray<MappedValue> bindings; + bindings.push_back(ArrayRef<Operation *>{payload}); + return applyTransformNamedSequence(bindings, + cast<TransformOpInterface>(transformRoot), + transformModule, options); +} + +LogicalResult transform::applyTransformNamedSequence( + RaggedArray<MappedValue> bindings, TransformOpInterface transformRoot, + ModuleOp transformModule, const TransformOptions &options) { + if (bindings.empty()) { + return transformRoot.emitError() + << "expected at least one binding for the root"; + } + if (bindings.at(0).size() != 1) { + return transformRoot.emitError() + << "expected one payload to be bound to the first argument, got " + << bindings.at(0).size(); + } + auto *payloadRoot = bindings.at(0).front().dyn_cast<Operation *>(); + if (!payloadRoot) { + return transformRoot->emitError() << "expected the object bound to the " + "first argument to be an operation"; + } + + bindings.removeFront(); + // `transformModule` may not be modified. if (transformModule && !transformModule->isAncestor(transformRoot)) { OwningOpRef<ModuleOp> clonedTransformModule(transformModule->clone()); if (failed(detail::mergeSymbolsInto( SymbolTable::getNearestSymbolTable(transformRoot), std::move(clonedTransformModule)))) { - return payload->emitError() << "failed to merge symbols"; + return payloadRoot->emitError() << "failed to merge symbols"; } } LLVM_DEBUG(DBGS() << "Apply\n" << *transformRoot << "\n"); - LLVM_DEBUG(DBGS() << "To\n" << *payload << "\n"); + LLVM_DEBUG(DBGS() << "To\n" << *payloadRoot << "\n"); - // Apply the transform to the IR, do not enforce top-level constraints.
- RaggedArray<MappedValue> noExtraMappings; - return applyTransforms(payload, cast<TransformOpInterface>(transformRoot), - noExtraMappings, options, + return applyTransforms(payloadRoot, transformRoot, bindings, options, /*enforceToplevelTransformOp=*/false); } diff --git a/mlir/test/Examples/transform/Ch1/invalidation-1.mlir b/mlir/test/Examples/transform/Ch1/invalidation-1.mlir index 69b10ae..2264ade 100644 --- a/mlir/test/Examples/transform/Ch1/invalidation-1.mlir +++ b/mlir/test/Examples/transform/Ch1/invalidation-1.mlir @@ -1,8 +1,7 @@ // RUN: mlir-opt %s \ -// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \ -// RUN: bind-first-extra-to-ops=linalg.matmul \ -// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \ -// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" \ +// RUN: --pass-pipeline="builtin.module(transform-interpreter{ \ +// RUN: debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary},\ +// RUN: canonicalize,cse,symbol-dce)" \ // RUN: --split-input-file --verify-diagnostics // ****************************** IMPORTANT NOTE ****************************** // // If you are changing this file, you may also need to change // // **************************************************************************** -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - // expected-note @below {{handle to invalidated ops}} - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // The actual tiling transformation takes tile sizes as attributes. - // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} - %tiled, %loop = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] - : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + // expected-note @below {{handle to invalidated ops}} + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // The actual tiling transformation takes tile sizes as attributes. + // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} + %tiled, %loop = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] + : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) - // This is trying to use an invalidated handle leading to undefined behavior. - // expected-error @below {{uses a handle invalidated by a previously executed transform op}} - transform.debug.emit_remark_at %arg1, "remark" : !transform.op<"linalg.matmul"> - transform.yield + // This is trying to use an invalidated handle leading to undefined behavior. + // expected-error @below {{uses a handle invalidated by a previously executed transform op}} + transform.debug.emit_remark_at %arg1, "remark" : !transform.op<"linalg.matmul"> + transform.yield + } } // Original function to optimize. @@ -52,27 +53,29 @@ func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>, // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // We can cast one type to another as long as operations are compatible - // with both types. This creates "aliasing" handles.
- // expected-note @below {{handle to invalidated ops}} - %casted = transform.cast %arg1 : !transform.op<"linalg.matmul"> to - !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // We can cast one type to another as long as operations are compatible + // with both types. This creates "aliasing" handles. + // expected-note @below {{handle to invalidated ops}} + %casted = transform.cast %arg1 : !transform.op<"linalg.matmul"> to + !transform.any_op - // The actual tiling transformation takes tile sizes as attributes. - // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} - %tiled, %loop = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] - : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) + // The actual tiling transformation takes tile sizes as attributes. + // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} + %tiled, %loop = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] + : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) - // Consuming an operand invalidates the consumed handle and any other handle that is - // associated with the same payload operations, or payload operations nested in them. - // expected-error @below {{uses a handle invalidated by a previously executed transform op}} - transform.debug.emit_remark_at %casted, "remark" - : !transform.any_op - transform.yield + // Consuming an operand invalidates the consumed handle and any other handle that is + // associated with the same payload operations, or payload operations nested in them. + // expected-error @below {{uses a handle invalidated by a previously executed transform op}} + transform.debug.emit_remark_at %casted, "remark" + : !transform.any_op + transform.yield + } } // Original function to optimize. 
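The tests above drive the interpreter through `transform-interpreter{debug-bind-trailing-args=...}`. For reference, below is a minimal sketch of the equivalent programmatic path, built on the `RaggedArray<MappedValue>` overload of `applyTransformNamedSequence` added by this patch; the helper name `runWithExtraBindings` and the hard-coded operation names are illustrative assumptions, not part of the patch.

```cpp
#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
#include "mlir/Dialect/Transform/Utils/RaggedArray.h"

using namespace mlir;

// Hypothetical helper: binds all linalg.matmul and linalg.elemwise_binary
// payload operations to the trailing arguments of the named sequence,
// mirroring debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary.
static LogicalResult
runWithExtraBindings(Operation *payloadRoot,
                     transform::TransformOpInterface entryPoint,
                     ModuleOp transformModule) {
  SmallVector<Operation *> matmuls, elemwises;
  payloadRoot->walk([&](Operation *op) {
    if (op->getName().getStringRef() == "linalg.matmul")
      matmuls.push_back(op);
    else if (op->getName().getStringRef() == "linalg.elemwise_binary")
      elemwises.push_back(op);
  });

  // Row 0 must contain exactly the payload root; each subsequent row is
  // bound to one trailing argument of the entry point.
  RaggedArray<transform::MappedValue> bindings;
  bindings.push_back(ArrayRef<Operation *>{payloadRoot});
  bindings.push_back(matmuls);
  bindings.push_back(elemwises);

  transform::TransformOptions options;
  // Expensive checks report uses of invalidated handles as diagnostics
  // instead of undefined behavior, as exercised by the tests in this patch.
  options.enableExpensiveChecks(true);
  return transform::applyTransformNamedSequence(bindings, entryPoint,
                                                transformModule, options);
}
```

When the compile time of running the schedule matters more than safety, the same sketch would pass `options.enableExpensiveChecks(false)` instead, matching the `disable-expensive-checks` pass option.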
diff --git a/mlir/test/Examples/transform/Ch1/invalidation-2.mlir b/mlir/test/Examples/transform/Ch1/invalidation-2.mlir index c4a2f1e..0a84a5c 100644 --- a/mlir/test/Examples/transform/Ch1/invalidation-2.mlir +++ b/mlir/test/Examples/transform/Ch1/invalidation-2.mlir @@ -1,10 +1,8 @@ // RUN: mlir-opt %s \ -// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \ -// RUN: bind-first-extra-to-ops=linalg.matmul \ -// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \ -// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" \ +// RUN: --pass-pipeline="builtin.module(transform-interpreter{ \ +// RUN: debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary},\ +// RUN: canonicalize,cse,symbol-dce)" \ // RUN: --split-input-file --verify-diagnostics - // ****************************** IMPORTANT NOTE ****************************** // // If you are changing this file, you may also need to change @@ -45,10 +43,11 @@ func.func private @microkernel( %init: tensor<4x4xf32>, %output: tensor<4x4xf32>) -> tensor<4x4xf32> -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { // Since the %arg2 handle is associated with both elementwise operations, // we need to split it into two handles so we can target only the second // elementwise operation. @@ -99,4 +98,5 @@ transform.sequence failures(propagate) { transform.debug.emit_remark_at %f, "fused" : !transform.any_op transform.yield + } } diff --git a/mlir/test/Examples/transform/Ch1/sequence.mlir b/mlir/test/Examples/transform/Ch1/sequence.mlir index 5de6e6e..3107adc 100644 --- a/mlir/test/Examples/transform/Ch1/sequence.mlir +++ b/mlir/test/Examples/transform/Ch1/sequence.mlir @@ -1,8 +1,7 @@ // RUN: mlir-opt %s \ -// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \ -// RUN: bind-first-extra-to-ops=linalg.matmul \ -// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \ -// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" |\ +// RUN: --pass-pipeline="builtin.module(transform-interpreter{ \ +// RUN: debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary},\ +// RUN: canonicalize,cse,symbol-dce)" |\ // RUN: FileCheck %s // ****************************** IMPORTANT NOTE ****************************** @@ -60,52 +59,54 @@ func.func private @microkernel( %init: tensor<4x4xf32>, %output: tensor<4x4xf32>) -> tensor<4x4xf32> -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // Since the %arg2 handle is associated with both elementwise operations, - // we need to split it into two handles so we can target only the second - // elementwise operation. - %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) - -> (!transform.any_op, !transform.any_op) - - // The actual tiling transformation takes tile sizes as attributes. It produces a - // handle to the loop generated during tiling. - %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // We can now fuse the other operations into the loop. 
Here, we fuse - // operations one-by-one. This requires the operation that is being fused - // to define the value used within the loop, so the order of such fusions - // is important. We could also use "transform.merge_handles" to obtain - // a single handle to all operations and give it to `fuse_into_containing_op` - // that would take care of the ordering in this case. - %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2 - : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Tile again to get the desired size. Note that this time this tiles the - // "add" operation and fuses matmul into the loop, but doesn't affect the - // "max" operation. This illustrates the precise targeting with the transform - // dialect. Otherwise, it is difficult to differentiate "add" and "max", both - // of which having the same kind. - %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused_2, %loop_second_2 = - transform.structured.fuse_into_containing_op %matmul_fused into %loop_second - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Since outlining is currently only implemented for region-holding operations - // such as loops, use tiling to size 1 to materialize the outer loop that is - // going to be outlined. - %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} - : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">) - - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) + + // The actual tiling transformation takes tile sizes as attributes. It produces a + // handle to the loop generated during tiling. + %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // We can now fuse the other operations into the loop. Here, we fuse + // operations one-by-one. This requires the operation that is being fused + // to define the value used within the loop, so the order of such fusions + // is important. We could also use "transform.merge_handles" to obtain + // a single handle to all operations and give it to `fuse_into_containing_op` + // that would take care of the ordering in this case. 
+ %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2 + : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Tile again to get the desired size. Note that this time this tiles the + // "add" operation and fuses matmul into the loop, but doesn't affect the + // "max" operation. This illustrates the precise targeting with the transform + // dialect. Otherwise, it is difficult to differentiate "add" and "max", both + // of which have the same kind. + %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused_2, %loop_second_2 = + transform.structured.fuse_into_containing_op %matmul_fused into %loop_second + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Since outlining is currently only implemented for region-holding operations + // such as loops, use tiling to size 1 to materialize the outer loop that is + // going to be outlined. + %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} + : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">) + + transform.yield + } } diff --git a/mlir/test/Examples/transform/Ch2/invalid.mlir b/mlir/test/Examples/transform/Ch2/invalid.mlir index ad53683..cb67389 100644 --- a/mlir/test/Examples/transform/Ch2/invalid.mlir +++ b/mlir/test/Examples/transform/Ch2/invalid.mlir @@ -1,11 +1,11 @@ -// RUN: transform-opt-ch2 %s --test-transform-dialect-interpreter --split-input-file --verify-diagnostics +// RUN: transform-opt-ch2 %s --transform-interpreter --split-input-file \ +// RUN: --verify-diagnostics // expected-note @below {{offending payload}} -module { - transform.sequence failures(propagate) { - ^bb0(%arg0: !transform.any_op): +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { // expected-error @below {{only applies to func.call payloads}} transform.my.change_call_target %arg0, "updated" : !transform.any_op - yield + transform.yield } } diff --git a/mlir/test/Examples/transform/Ch2/ops.mlir b/mlir/test/Examples/transform/Ch2/ops.mlir index d66f89b..410a6e3 100644 --- a/mlir/test/Examples/transform/Ch2/ops.mlir +++ b/mlir/test/Examples/transform/Ch2/ops.mlir @@ -1,4 +1,4 @@ -// RUN: transform-opt-ch2 %s --test-transform-dialect-interpreter | FileCheck %s +// RUN: transform-opt-ch2 %s --transform-interpreter | FileCheck %s // ****************************** IMPORTANT NOTE ****************************** // // If you are changing this file, you may also need to change // // **************************************************************************** @@ -17,10 +17,11 @@ func.func @test() { return } -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.any_op - // CHECK: transform.my.change_call_target %{{.*}}, "updated" : !transform.any_op - transform.my.change_call_target %call, "updated" : !transform.any_op -
transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.any_op + // CHECK: transform.my.change_call_target %{{.*}}, "updated" : !transform.any_op + transform.my.change_call_target %call, "updated" : !transform.any_op + transform.yield + } } diff --git a/mlir/test/Examples/transform/Ch2/sequence.mlir b/mlir/test/Examples/transform/Ch2/sequence.mlir index b6f32dc..976df1d 100644 --- a/mlir/test/Examples/transform/Ch2/sequence.mlir +++ b/mlir/test/Examples/transform/Ch2/sequence.mlir @@ -1,8 +1,7 @@ // RUN: transform-opt-ch2 %s \ -// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \ -// RUN: bind-first-extra-to-ops=linalg.matmul \ -// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \ -// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" |\ +// RUN: --pass-pipeline="builtin.module(transform-interpreter{ \ +// RUN: debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary},\ +// RUN: canonicalize,cse,symbol-dce)" |\ // RUN: FileCheck %s // ****************************** IMPORTANT NOTE ****************************** @@ -56,55 +55,57 @@ func.func private @microkernel( %init: tensor<4x4xf32>, %output: tensor<4x4xf32>) -> tensor<4x4xf32> -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // Since the %arg2 handle is associated with both elementwise operations, - // we need to split it into two handles so we can target only the second - // elementwise operation. - %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) - -> (!transform.any_op, !transform.any_op) +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) - // The actual tiling transformation takes tile sizes as attributes. It produces a - // handle to the loop generated during tiling. - %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + // The actual tiling transformation takes tile sizes as attributes. It produces a + // handle to the loop generated during tiling. + %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - // We can now fuse the other operations into the loop. Here, we fuse - // operations one-by-one. This requires the operation that is being fused - // to define the value used within the loop, so the order of such fusions - // is important. We could also use "transform.merge_handles" to obtain - // a single handle to all operations and give it to `fuse_into_containing_op` - // that would take care of the ordering in this case. 
- %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2 - : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) + // We can now fuse the other operations into the loop. Here, we fuse + // operations one-by-one. This requires the operation that is being fused + // to define the value used within the loop, so the order of such fusions + // is important. We could also use "transform.merge_handles" to obtain + // a single handle to all operations and give it to `fuse_into_containing_op` + // that would take care of the ordering in this case. + %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2 + : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) - // Tile again to get the desired size. Note that this time this tiles the - // "add" operation and fuses matmul into the loop, but doesn't affect the - // "max" operation. This illustrates the precise targeting with the transform - // dialect. Otherwise, it is difficult to differentiate "add" and "max", both - // of which having the same kind. - %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused_2, %loop_second_2 = - transform.structured.fuse_into_containing_op %matmul_fused into %loop_second - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + // Tile again to get the desired size. Note that this time this tiles the + // "add" operation and fuses matmul into the loop, but doesn't affect the + // "max" operation. This illustrates the precise targeting with the transform + // dialect. Otherwise, it is difficult to differentiate "add" and "max", both + // of which having the same kind. + %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused_2, %loop_second_2 = + transform.structured.fuse_into_containing_op %matmul_fused into %loop_second + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - // Since outlining is currently only implemented for region-holding operations - // such as loops, use tiling to size 1 to materialize the outer loop that is - // going to be outlined. - %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + // Since outlining is currently only implemented for region-holding operations + // such as loops, use tiling to size 1 to materialize the outer loop that is + // going to be outlined. 
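+    // (For reference: the `transform.loop.outline` op used at the end of
+    // this sequence moves the targeted loop into a new function with the
+    // given name and replaces it with a `func.call`; its two results are
+    // handles to the outlined function and to that call.)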
+ %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - // Rewrite the call target. - transform.my.change_call_target %call, "microkernel" : !transform.any_op + // Rewrite the call target. + transform.my.change_call_target %call, "microkernel" : !transform.any_op - transform.yield + transform.yield + } } diff --git a/mlir/test/Examples/transform/Ch3/invalid.mlir b/mlir/test/Examples/transform/Ch3/invalid.mlir index 2226295..acaabd5 100644 --- a/mlir/test/Examples/transform/Ch3/invalid.mlir +++ b/mlir/test/Examples/transform/Ch3/invalid.mlir @@ -1,10 +1,10 @@ -// RUN: transform-opt-ch3 %s --test-transform-dialect-interpreter --split-input-file --verify-diagnostics +// RUN: transform-opt-ch3 %s --transform-interpreter --split-input-file --verify-diagnostics // expected-note @below {{offending operation}} -module { - transform.sequence failures(suppress) { +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( // expected-error @below {{expected the payload operation to implement CallOpInterface}} - ^bb0(%arg0: !transform.my.call_op_interface): - yield + %arg0: !transform.my.call_op_interface) { + transform.yield } } diff --git a/mlir/test/Examples/transform/Ch3/ops.mlir b/mlir/test/Examples/transform/Ch3/ops.mlir index f4170b8..b2d47cc 100644 --- a/mlir/test/Examples/transform/Ch3/ops.mlir +++ b/mlir/test/Examples/transform/Ch3/ops.mlir @@ -1,4 +1,4 @@ -// RUN: transform-opt-ch3 %s --test-transform-dialect-interpreter \ +// RUN: transform-opt-ch3 %s --transform-interpreter \ // RUN: --allow-unregistered-dialect --split-input-file | FileCheck %s // ****************************** IMPORTANT NOTE ****************************** @@ -18,12 +18,13 @@ func.func @test1() { return } -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.op<"func.call"> - // CHECK: transform.my.change_call_target %{{.*}}, "updated" : !transform.op<"func.call"> - transform.my.change_call_target %call, "updated" : !transform.op<"func.call"> - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.op<"func.call"> + // CHECK: transform.my.change_call_target %{{.*}}, "updated" : !transform.op<"func.call"> + transform.my.change_call_target %call, "updated" : !transform.op<"func.call"> + transform.yield + } } // ----- @@ -37,10 +38,11 @@ func.func @test2() { return } -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.my.call_op_interface - // CHECK: transform.my.call_to_op %{{.*}} : (!transform.my.call_op_interface) -> !transform.any_op - transform.my.call_to_op %call : (!transform.my.call_op_interface) -> !transform.any_op - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence 
@__transform_main(%arg0: !transform.any_op) { + %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.my.call_op_interface + // CHECK: transform.my.call_to_op %{{.*}} : (!transform.my.call_op_interface) -> !transform.any_op + transform.my.call_to_op %call : (!transform.my.call_op_interface) -> !transform.any_op + transform.yield + } } diff --git a/mlir/test/Examples/transform/Ch3/sequence.mlir b/mlir/test/Examples/transform/Ch3/sequence.mlir index 9dd46b3..8dc33c3 100644 --- a/mlir/test/Examples/transform/Ch3/sequence.mlir +++ b/mlir/test/Examples/transform/Ch3/sequence.mlir @@ -1,8 +1,7 @@ -// RUN: transform-opt-ch2 %s \ -// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \ -// RUN: bind-first-extra-to-ops=linalg.matmul \ -// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \ -// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" |\ +// RUN: transform-opt-ch3 %s \ +// RUN: --pass-pipeline="builtin.module(transform-interpreter{ \ +// RUN: debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary},\ +// RUN: canonicalize,cse,symbol-dce)" |\ // RUN: FileCheck %s // ****************************** IMPORTANT NOTE ****************************** @@ -56,55 +55,57 @@ func.func private @microkernel( %init: tensor<4x4xf32>, %output: tensor<4x4xf32>) -> tensor<4x4xf32> -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // Since the %arg2 handle is associated with both elementwise operations, - // we need to split it into two handles so we can target only the second - // elementwise operation. - %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) - -> (!transform.any_op, !transform.any_op) - - // The actual tiling transformation takes tile sizes as attributes. It produces a - // handle to the loop generated during tiling. - %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // We can now fuse the other operations into the loop. Here, we fuse - // operations one-by-one. This requires the operation that is being fused - // to define the value used within the loop, so the order of such fusions - // is important. We could also use "transform.merge_handles" to obtain - // a single handle to all operations and give it to `fuse_into_containing_op` - // that would take care of the ordering in this case. - %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2 - : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Tile again to get the desired size. Note that this time this tiles the - // "add" operation and fuses matmul into the loop, but doesn't affect the - // "max" operation. This illustrates the precise targeting with the transform - // dialect. Otherwise, it is difficult to differentiate "add" and "max", both - // of which having the same kind. 
- %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused_2, %loop_second_2 = - transform.structured.fuse_into_containing_op %matmul_fused into %loop_second - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Since outlining is currently only implemented for region-holding operations - // such as loops, use tiling to size 1 to materialize the outer loop that is - // going to be outlined. - %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} - : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">) - - // Rewrite the call target. - transform.my.change_call_target %call, "microkernel" : !transform.op<"func.call"> - - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) + + // The actual tiling transformation takes tile sizes as attributes. It produces a + // handle to the loop generated during tiling. + %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // We can now fuse the other operations into the loop. Here, we fuse + // operations one-by-one. This requires the operation that is being fused + // to define the value used within the loop, so the order of such fusions + // is important. We could also use "transform.merge_handles" to obtain + // a single handle to all operations and give it to `fuse_into_containing_op` + // that would take care of the ordering in this case. + %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2 + : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Tile again to get the desired size. Note that this time this tiles the + // "add" operation and fuses matmul into the loop, but doesn't affect the + // "max" operation. This illustrates the precise targeting with the transform + // dialect. Otherwise, it is difficult to differentiate "add" and "max", both + // of which having the same kind. 
+ %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused_2, %loop_second_2 = + transform.structured.fuse_into_containing_op %matmul_fused into %loop_second + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Since outlining is currently only implemented for region-holding operations + // such as loops, use tiling to size 1 to materialize the outer loop that is + // going to be outlined. + %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} + : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">) + + // Rewrite the call target. + transform.my.change_call_target %call, "microkernel" : !transform.op<"func.call"> + + transform.yield + } } diff --git a/mlir/test/Examples/transform/ChH/full.mlir b/mlir/test/Examples/transform/ChH/full.mlir index d90d740..f8d9103 100644 --- a/mlir/test/Examples/transform/ChH/full.mlir +++ b/mlir/test/Examples/transform/ChH/full.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --test-transform-dialect-interpreter \ +// RUN: mlir-opt %s --transform-interpreter \ // RUN: --test-transform-dialect-erase-schedule \ // RUN: --math-uplift-to-fma \ // RUN: --convert-bufferization-to-memref \ @@ -115,9 +115,9 @@ module attributes { transform.with_named_sequence } { // have no effect on the Halide IR as of 294f80c49bf3bb8582446613c25fcce03b82. // Also note that the order of dimensions in Halide is inverted, e.g., co and // n are the outermost loops in the respective reorder directives. - transform.sequence failures(propagate) { + transform.named_sequence @__transform_main( // This argument will point to the top-level module. - ^bb0(%arg0: !transform.any_op): + %arg0: !transform.any_op) { // 1. Find the operations we are going to transform usnig their names. This // is a simplistic approach that works when there are few operations in the -- cgit v1.1 From 6d1396148977ca275df243a965ac504448bf5faa Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Fri, 9 Feb 2024 17:40:08 +0100 Subject: [libc++][test] Improves substitution naming (#80471) Using the `-dir` suffix for directories makes it easier to understand. 
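Concretely, the bridge configuration below now defines the directory
substitutions as, e.g.,

    config.substitutions.append(('%{lib-dir}', '@LIBCXX_LIBRARY_DIR@'))
    config.substitutions.append(('%{include-dir}', '@LIBCXX_GENERATED_INCLUDE_DIR@'))

so tests spell them %{libcxx-dir}, %{include-dir}, %{target-include-dir},
%{lib-dir}, %{module-dir} and %{test-tools-dir} instead of the old %{libcxx},
%{include}, %{target-include}, %{lib}, %{module} and %{test-tools}. The
expanded values are unchanged, so out-of-tree test configurations only need
the same mechanical rename.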
Fixes: https://github.com/llvm/llvm-project/issues/78310 --- libcxx/test/configs/apple-libc++-backdeployment.cfg.in | 4 ++-- libcxx/test/configs/apple-libc++-shared.cfg.in | 6 +++--- libcxx/test/configs/armv7m-picolibc-libc++.cfg.in | 4 ++-- libcxx/test/configs/cmake-bridge.cfg.in | 12 ++++++------ libcxx/test/configs/ibm-libc++-shared.cfg.in | 6 +++--- libcxx/test/configs/llvm-libc++-android-ndk.cfg.in | 4 ++-- libcxx/test/configs/llvm-libc++-mingw.cfg.in | 6 +++--- libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in | 6 +++--- libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in | 4 ++-- .../llvm-libc++-shared-no-vcruntime-clangcl.cfg.in | 6 +++--- libcxx/test/configs/llvm-libc++-shared.cfg.in | 4 ++-- libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in | 4 ++-- libcxx/test/configs/llvm-libc++-static.cfg.in | 4 ++-- .../assertions/headers_declare_verbose_abort.gen.py | 2 +- libcxx/test/libcxx/clang_modules_include.gen.py | 2 +- libcxx/test/libcxx/clang_tidy.gen.py | 6 +++--- libcxx/test/libcxx/double_include.gen.py | 2 +- libcxx/test/libcxx/header_inclusions.gen.py | 2 +- libcxx/test/libcxx/headers_in_modulemap.sh.py | 2 +- libcxx/test/libcxx/libcpp_version.gen.py | 2 +- libcxx/test/libcxx/module_std.gen.py | 6 +++--- libcxx/test/libcxx/module_std_compat.gen.py | 6 +++--- libcxx/test/libcxx/no_assert_include.gen.py | 2 +- libcxx/test/libcxx/system_reserved_names.gen.py | 2 +- libcxx/test/libcxx/transitive_includes.gen.py | 8 ++++---- .../vendor/apple/system-install-properties.sh.cpp | 18 +++++++++--------- .../libcxx/vendor/clang-cl/static-lib-exports.sh.cpp | 4 ++-- .../test/libcxx/vendor/mingw/static-lib-exports.sh.cpp | 4 ++-- libcxx/utils/libcxx/test/features.py | 2 +- libcxx/utils/libcxx/test/format.py | 4 ++-- 30 files changed, 72 insertions(+), 72 deletions(-) diff --git a/libcxx/test/configs/apple-libc++-backdeployment.cfg.in b/libcxx/test/configs/apple-libc++-backdeployment.cfg.in index b471c02..4259446 100644 --- a/libcxx/test/configs/apple-libc++-backdeployment.cfg.in +++ b/libcxx/test/configs/apple-libc++-backdeployment.cfg.in @@ -45,10 +45,10 @@ config.substitutions.append(('%{flags}', '-isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else '' )) config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include} -I %{libcxx}/test/support' + '-nostdinc++ -I %{include-dir} -I %{libcxx-dir}/test/support' )) config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -lc++' + '-nostdlib++ -L %{lib-dir} -lc++' )) config.substitutions.append(('%{exec}', '%{executor} --execdir %T --env DYLD_LIBRARY_PATH="%{cxx-runtime-root}:%{abi-runtime-root}:%{unwind-runtime-root}" -- ' diff --git a/libcxx/test/configs/apple-libc++-shared.cfg.in b/libcxx/test/configs/apple-libc++-shared.cfg.in index af1926e..2d0aee3 100644 --- a/libcxx/test/configs/apple-libc++-shared.cfg.in +++ b/libcxx/test/configs/apple-libc++-shared.cfg.in @@ -13,13 +13,13 @@ config.substitutions.append(('%{flags}', '-isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else '' )) config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include} -I %{libcxx}/test/support' + '-nostdinc++ -I %{include-dir} -I %{libcxx-dir}/test/support' )) config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -lc++' + '-nostdlib++ -L %{lib-dir} -lc++' )) config.substitutions.append(('%{exec}', - '%{executor} --execdir %T --env DYLD_LIBRARY_PATH=%{lib} -- ' + '%{executor} --execdir %T --env DYLD_LIBRARY_PATH=%{lib-dir} -- ' )) config.stdlib = 'apple-libc++' 
diff --git a/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in b/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in index a39d43a..8ca8603 100644 --- a/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in +++ b/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in @@ -5,7 +5,7 @@ libc_linker_script = '@CMAKE_INSTALL_PREFIX@/lib/picolibcpp.ld' config.substitutions.append(('%{flags}', '--sysroot=@CMAKE_INSTALL_PREFIX@')) config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support' + '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support' # Disable warnings in cxx_atomic_impl.h: # "large atomic operation may incur significant performance penalty; the @@ -17,7 +17,7 @@ config.substitutions.append(('%{compile_flags}', ' -include picolibc.h' )) config.substitutions.append(('%{link_flags}', - '-nostdlib -nostdlib++ -L %{lib} -lc++ -lc++abi' + '-nostdlib -nostdlib++ -L %{lib-dir} -lc++ -lc++abi' ' -lc -lm -lclang_rt.builtins -lsemihost -lcrt0-semihost' + ' -T {}'.format(libc_linker_script) + ' -Wl,--defsym=__flash=0x0' diff --git a/libcxx/test/configs/cmake-bridge.cfg.in b/libcxx/test/configs/cmake-bridge.cfg.in index 72b2ddf..84b3270 100644 --- a/libcxx/test/configs/cmake-bridge.cfg.in +++ b/libcxx/test/configs/cmake-bridge.cfg.in @@ -25,9 +25,9 @@ config.test_exec_root = os.path.join('@CMAKE_BINARY_DIR@', 'test') # Add substitutions for bootstrapping the test suite configuration import shlex config.substitutions.append(('%{cxx}', shlex.quote('@CMAKE_CXX_COMPILER@'))) -config.substitutions.append(('%{libcxx}', '@LIBCXX_SOURCE_DIR@')) -config.substitutions.append(('%{include}', '@LIBCXX_GENERATED_INCLUDE_DIR@')) -config.substitutions.append(('%{target-include}', '@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@')) -config.substitutions.append(('%{lib}', '@LIBCXX_LIBRARY_DIR@')) -config.substitutions.append(('%{module}', '@LIBCXX_GENERATED_MODULE_DIR@')) -config.substitutions.append(('%{test-tools}', '@LIBCXX_TEST_TOOLS_PATH@')) +config.substitutions.append(('%{libcxx-dir}', '@LIBCXX_SOURCE_DIR@')) +config.substitutions.append(('%{include-dir}', '@LIBCXX_GENERATED_INCLUDE_DIR@')) +config.substitutions.append(('%{target-include-dir}', '@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@')) +config.substitutions.append(('%{lib-dir}', '@LIBCXX_LIBRARY_DIR@')) +config.substitutions.append(('%{module-dir}', '@LIBCXX_GENERATED_MODULE_DIR@')) +config.substitutions.append(('%{test-tools-dir}', '@LIBCXX_TEST_TOOLS_PATH@')) diff --git a/libcxx/test/configs/ibm-libc++-shared.cfg.in b/libcxx/test/configs/ibm-libc++-shared.cfg.in index 50061e9..0f86e74 100644 --- a/libcxx/test/configs/ibm-libc++-shared.cfg.in +++ b/libcxx/test/configs/ibm-libc++-shared.cfg.in @@ -12,13 +12,13 @@ if lit.util.isAIXTriple(config.target_triple): config.substitutions.append(('%{flags}', '-pthread')) config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -D__LIBC_NO_CPP_MATH_OVERLOADS__ -I %{include} -I %{libcxx}/test/support' + '-nostdinc++ -D__LIBC_NO_CPP_MATH_OVERLOADS__ -I %{include-dir} -I %{libcxx-dir}/test/support' )) config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -lc++ -lc++abi -latomic -Wl,-bbigtoc' + '-nostdlib++ -L %{lib-dir} -lc++ -lc++abi -latomic -Wl,-bbigtoc' )) config.substitutions.append(('%{exec}', - '%{executor} --execdir %T --env LIBPATH=%{lib} -- ' + '%{executor} --execdir %T --env LIBPATH=%{lib-dir} -- ' )) # LIBCXX-AIX-FIXME is the feature name used to XFAIL the diff --git 
a/libcxx/test/configs/llvm-libc++-android-ndk.cfg.in b/libcxx/test/configs/llvm-libc++-android-ndk.cfg.in index 1be8527..d5f1ccc 100644 --- a/libcxx/test/configs/llvm-libc++-android-ndk.cfg.in +++ b/libcxx/test/configs/llvm-libc++-android-ndk.cfg.in @@ -16,7 +16,7 @@ config.substitutions.append(('%{flags}', '--sysroot @CMAKE_SYSROOT@' if '@CMAKE_SYSROOT@' else '' )) -compile_flags = '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support' +compile_flags = '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support' if re.match(r'i686-linux-android(21|22|23)$', config.target_triple): # 32-bit x86 Android has a bug where the stack is sometimes misaligned. # The problem appears limited to versions before Android N (API 24) and only @@ -31,7 +31,7 @@ config.substitutions.append(('%{compile_flags}', compile_flags)) # libc++_shared.so because older Bionic dynamic loaders don't support rpath # lookup. config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -lc++_shared' + '-nostdlib++ -L %{lib-dir} -lc++_shared' )) config.substitutions.append(('%{exec}', '%{executor}' + diff --git a/libcxx/test/configs/llvm-libc++-mingw.cfg.in b/libcxx/test/configs/llvm-libc++-mingw.cfg.in index eb77f11..8a0cc96 100644 --- a/libcxx/test/configs/llvm-libc++-mingw.cfg.in +++ b/libcxx/test/configs/llvm-libc++-mingw.cfg.in @@ -5,13 +5,13 @@ lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') config.substitutions.append(('%{flags}', '')) config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support' + '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support' )) config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -lc++' + '-nostdlib++ -L %{lib-dir} -lc++' )) config.substitutions.append(('%{exec}', - '%{executor} --execdir %T --prepend_env PATH=%{lib} -- ' + '%{executor} --execdir %T --prepend_env PATH=%{lib-dir} -- ' )) import os, site diff --git a/libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in b/libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in index 50d28eb..cca88c8 100644 --- a/libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in +++ b/libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in @@ -5,13 +5,13 @@ lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') config.substitutions.append(('%{flags}', '--driver-mode=g++')) config.substitutions.append(('%{compile_flags}', - '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX' + config.dbg_include + '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX' + config.dbg_include )) config.substitutions.append(('%{link_flags}', - '-nostdlib -L %{lib} -lc++ -l' + config.cxx_lib + '-nostdlib -L %{lib-dir} -lc++ -l' + config.cxx_lib )) config.substitutions.append(('%{exec}', - '%{executor} --execdir %T --prepend_env PATH=%{lib} -- ' + '%{executor} --execdir %T --prepend_env PATH=%{lib-dir} -- ' )) import os, site diff --git a/libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in b/libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in index a75e90b..7d107c8 100644 --- a/libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in +++ 
b/libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in @@ -6,10 +6,10 @@ lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') config.substitutions.append(('%{flags}', '-pthread')) config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support' + '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support' )) config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -Wl,-rpath,%{lib} -lc++ -lm' + '-nostdlib++ -L %{lib-dir} -Wl,-rpath,%{lib-dir} -lc++ -lm' )) config.substitutions.append(('%{exec}', '%{executor} --execdir %T -- ' diff --git a/libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in b/libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in index 4c88af3..a8ad920 100644 --- a/libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in +++ b/libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in @@ -6,13 +6,13 @@ lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') config.substitutions.append(('%{flags}', '--driver-mode=g++')) config.substitutions.append(('%{compile_flags}', - '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX -D_HAS_EXCEPTIONS=0' + config.dbg_include + '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX -D_HAS_EXCEPTIONS=0' + config.dbg_include )) config.substitutions.append(('%{link_flags}', - '-nostdlib -L %{lib} -lc++ -l' + config.cxx_lib + '-nostdlib -L %{lib-dir} -lc++ -l' + config.cxx_lib )) config.substitutions.append(('%{exec}', - '%{executor} --execdir %T --prepend_env PATH=%{lib} -- ' + '%{executor} --execdir %T --prepend_env PATH=%{lib-dir} -- ' )) import os, site diff --git a/libcxx/test/configs/llvm-libc++-shared.cfg.in b/libcxx/test/configs/llvm-libc++-shared.cfg.in index 143b3b3..5199f64 100644 --- a/libcxx/test/configs/llvm-libc++-shared.cfg.in +++ b/libcxx/test/configs/llvm-libc++-shared.cfg.in @@ -7,10 +7,10 @@ config.substitutions.append(('%{flags}', '-pthread' + (' -isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else '') )) config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support' + '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support' )) config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -Wl,-rpath,%{lib} -lc++' + '-nostdlib++ -L %{lib-dir} -Wl,-rpath,%{lib-dir} -lc++' )) config.substitutions.append(('%{exec}', '%{executor} --execdir %T -- ' diff --git a/libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in b/libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in index 4baaad7..7c700bf 100644 --- a/libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in +++ b/libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in @@ -5,10 +5,10 @@ lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') config.substitutions.append(('%{flags}', '--driver-mode=g++')) config.substitutions.append(('%{compile_flags}', - '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support -D_CRT_SECURE_NO_WARNINGS 
-D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX' + config.dbg_include + '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX' + config.dbg_include )) config.substitutions.append(('%{link_flags}', - '-nostdlib -L %{lib} -llibc++ -l' + config.cxx_lib + '-nostdlib -L %{lib-dir} -llibc++ -l' + config.cxx_lib )) config.substitutions.append(('%{exec}', '%{executor} --execdir %T -- ' diff --git a/libcxx/test/configs/llvm-libc++-static.cfg.in b/libcxx/test/configs/llvm-libc++-static.cfg.in index e866d4f..097cc4d 100644 --- a/libcxx/test/configs/llvm-libc++-static.cfg.in +++ b/libcxx/test/configs/llvm-libc++-static.cfg.in @@ -7,10 +7,10 @@ config.substitutions.append(('%{flags}', '-pthread' + (' -isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else '') )) config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support' + '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support' )) config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -lc++ -lc++abi' + '-nostdlib++ -L %{lib-dir} -lc++ -lc++abi' )) config.substitutions.append(('%{exec}', '%{executor} --execdir %T -- ' diff --git a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py b/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py index a4e1c3c..bd883aa 100644 --- a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py +++ b/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py @@ -10,7 +10,7 @@ # is required for users to be able to include any public header and then override # the function using a strong definition. -# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) diff --git a/libcxx/test/libcxx/clang_modules_include.gen.py b/libcxx/test/libcxx/clang_modules_include.gen.py index 26ef207..e3593ee 100644 --- a/libcxx/test/libcxx/clang_modules_include.gen.py +++ b/libcxx/test/libcxx/clang_modules_include.gen.py @@ -10,7 +10,7 @@ # This is important notably because the LLDB data formatters use # libc++ headers with modules enabled. -# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) diff --git a/libcxx/test/libcxx/clang_tidy.gen.py b/libcxx/test/libcxx/clang_tidy.gen.py index b2f1a17..19b6a99 100644 --- a/libcxx/test/libcxx/clang_tidy.gen.py +++ b/libcxx/test/libcxx/clang_tidy.gen.py @@ -8,7 +8,7 @@ # Run our custom libc++ clang-tidy checks on all public headers. 
-# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) @@ -27,8 +27,8 @@ for header in public_headers: {lit_header_restrictions.get(header, '')} // TODO: run clang-tidy with modules enabled once they are supported -// RUN{BLOCKLIT}: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --checks='-*,libcpp-*' --load=%{{test-tools}}/clang_tidy_checks/libcxx-tidy.plugin -- %{{compile_flags}} -fno-modules -// RUN{BLOCKLIT}: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --config-file=%{{libcxx}}/.clang-tidy -- -Wweak-vtables %{{compile_flags}} -fno-modules +// RUN{BLOCKLIT}: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --checks='-*,libcpp-*' --load=%{{test-tools-dir}}/clang_tidy_checks/libcxx-tidy.plugin -- %{{compile_flags}} -fno-modules +// RUN{BLOCKLIT}: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --config-file=%{{libcxx-dir}}/.clang-tidy -- -Wweak-vtables %{{compile_flags}} -fno-modules #include <{header}> """) diff --git a/libcxx/test/libcxx/double_include.gen.py b/libcxx/test/libcxx/double_include.gen.py index 85055df..2fcfa50 100644 --- a/libcxx/test/libcxx/double_include.gen.py +++ b/libcxx/test/libcxx/double_include.gen.py @@ -8,7 +8,7 @@ # Test that we can include each header in two TU's and link them together. -# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) diff --git a/libcxx/test/libcxx/header_inclusions.gen.py b/libcxx/test/libcxx/header_inclusions.gen.py index cdbc5b3..faaa4cf 100644 --- a/libcxx/test/libcxx/header_inclusions.gen.py +++ b/libcxx/test/libcxx/header_inclusions.gen.py @@ -9,7 +9,7 @@ # Test that all headers include all the other headers they're supposed to, as # prescribed by the Standard. -# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) diff --git a/libcxx/test/libcxx/headers_in_modulemap.sh.py b/libcxx/test/libcxx/headers_in_modulemap.sh.py index fe007f0..237b006 100644 --- a/libcxx/test/libcxx/headers_in_modulemap.sh.py +++ b/libcxx/test/libcxx/headers_in_modulemap.sh.py @@ -1,4 +1,4 @@ -# RUN: %{python} %s %{libcxx}/utils %{include} +# RUN: %{python} %s %{libcxx-dir}/utils %{include-dir} import sys diff --git a/libcxx/test/libcxx/libcpp_version.gen.py b/libcxx/test/libcxx/libcpp_version.gen.py index 47439b0..7d9519d 100644 --- a/libcxx/test/libcxx/libcpp_version.gen.py +++ b/libcxx/test/libcxx/libcpp_version.gen.py @@ -8,7 +8,7 @@ # Test that all headers define the _LIBCPP_VERSION macro. -# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) diff --git a/libcxx/test/libcxx/module_std.gen.py b/libcxx/test/libcxx/module_std.gen.py index a9a05a0..fc23985 100644 --- a/libcxx/test/libcxx/module_std.gen.py +++ b/libcxx/test/libcxx/module_std.gen.py @@ -16,7 +16,7 @@ # to be one monolitic test. Since the test doesn't take very long it's # not a huge issue. 
-# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys @@ -25,9 +25,9 @@ from libcxx.test.modules import module_test_generator generator = module_test_generator( "%t", - "%{module}", + "%{module-dir}", "%{clang-tidy}", - "%{test-tools}/clang_tidy_checks/libcxx-tidy.plugin", + "%{test-tools-dir}/clang_tidy_checks/libcxx-tidy.plugin", "%{cxx}", "%{flags} %{compile_flags}", "std", diff --git a/libcxx/test/libcxx/module_std_compat.gen.py b/libcxx/test/libcxx/module_std_compat.gen.py index 270d131..000aa29 100644 --- a/libcxx/test/libcxx/module_std_compat.gen.py +++ b/libcxx/test/libcxx/module_std_compat.gen.py @@ -16,7 +16,7 @@ # to be one monolitic test. Since the test doesn't take very long it's # not a huge issue. -# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys @@ -26,9 +26,9 @@ from libcxx.test.modules import module_test_generator generator = module_test_generator( "%t", - "%{module}", + "%{module-dir}", "%{clang-tidy}", - "%{test-tools}/clang_tidy_checks/libcxx-tidy.plugin", + "%{test-tools-dir}/clang_tidy_checks/libcxx-tidy.plugin", "%{cxx}", "%{flags} %{compile_flags}", "std.compat", diff --git a/libcxx/test/libcxx/no_assert_include.gen.py b/libcxx/test/libcxx/no_assert_include.gen.py index a5e733d..dd8006d 100644 --- a/libcxx/test/libcxx/no_assert_include.gen.py +++ b/libcxx/test/libcxx/no_assert_include.gen.py @@ -9,7 +9,7 @@ # Ensure that none of the standard C++ headers implicitly include cassert or # assert.h (because assert() is implemented as a macro). -# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) diff --git a/libcxx/test/libcxx/system_reserved_names.gen.py b/libcxx/test/libcxx/system_reserved_names.gen.py index 5b75dba..0d935a1 100644 --- a/libcxx/test/libcxx/system_reserved_names.gen.py +++ b/libcxx/test/libcxx/system_reserved_names.gen.py @@ -10,7 +10,7 @@ # alphabetic macros. Also ensure that we don't swallow the definition of user # provided macros (in other words, ensure that we push/pop correctly everywhere). -# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) diff --git a/libcxx/test/libcxx/transitive_includes.gen.py b/libcxx/test/libcxx/transitive_includes.gen.py index 43f92d9..28f223c 100644 --- a/libcxx/test/libcxx/transitive_includes.gen.py +++ b/libcxx/test/libcxx/transitive_includes.gen.py @@ -16,7 +16,7 @@ # forever, however we do try to group removals for a couple of releases # to avoid breaking users at every release. 
-# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) @@ -48,7 +48,7 @@ if regenerate_expected_results: all_traces.append(f'%t/trace-includes.{normalized_header}.txt') print(f"""\ -// RUN{BLOCKLIT}: %{{python}} %{{libcxx}}/test/libcxx/transitive_includes_to_csv.py {' '.join(all_traces)} > %{{libcxx}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv +// RUN{BLOCKLIT}: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes_to_csv.py {' '.join(all_traces)} > %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv """) else: @@ -83,8 +83,8 @@ else: // RUN{BLOCKLIT}: mkdir %t // RUN{BLOCKLIT}: %{{cxx}} %s %{{flags}} %{{compile_flags}} --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.txt -// RUN{BLOCKLIT}: %{{python}} %{{libcxx}}/test/libcxx/transitive_includes_to_csv.py %t/trace-includes.txt > %t/actual_transitive_includes.csv -// RUN{BLOCKLIT}: cat %{{libcxx}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv | awk '/^{escaped_header} / {{ print }}' > %t/expected_transitive_includes.csv +// RUN{BLOCKLIT}: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes_to_csv.py %t/trace-includes.txt > %t/actual_transitive_includes.csv +// RUN{BLOCKLIT}: cat %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv | awk '/^{escaped_header} / {{ print }}' > %t/expected_transitive_includes.csv // RUN{BLOCKLIT}: diff -w %t/expected_transitive_includes.csv %t/actual_transitive_includes.csv #include <{header}> """) diff --git a/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp b/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp index 6c84e0d..3e2e080 100644 --- a/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp +++ b/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp @@ -13,17 +13,17 @@ // Make sure we install the libc++ headers in the right location. // -// RUN: stat "%{include}/__config" +// RUN: stat "%{include-dir}/__config" // Make sure we install libc++.1.dylib and libc++experimental.a in the right location. // -// RUN: stat "%{lib}/libc++.1.dylib" -// RUN: stat "%{lib}/libc++experimental.a" +// RUN: stat "%{lib-dir}/libc++.1.dylib" +// RUN: stat "%{lib-dir}/libc++experimental.a" // Make sure we install a symlink from libc++.dylib to libc++.1.dylib. // -// RUN: stat "%{lib}/libc++.dylib" -// RUN: readlink "%{lib}/libc++.dylib" | grep "libc++.1.dylib" +// RUN: stat "%{lib-dir}/libc++.dylib" +// RUN: readlink "%{lib-dir}/libc++.dylib" | grep "libc++.1.dylib" // Make sure the install_name is /usr/lib. // @@ -34,15 +34,15 @@ // // TODO: We currently don't do that correctly in the CMake build. // -// XRUNX: otool -L "%{lib}/libc++.1.dylib" | grep '/usr/lib/libc++.1.dylib' -// XRUNX: ! otool -l "%{lib}/libc++.1.dylib" | grep -E "LC_RPATH|@loader_path|@rpath" +// XRUNX: otool -L "%{lib-dir}/libc++.1.dylib" | grep '/usr/lib/libc++.1.dylib' +// XRUNX: ! otool -l "%{lib-dir}/libc++.1.dylib" | grep -E "LC_RPATH|@loader_path|@rpath" // Make sure the compatibility_version of libc++ is 1.0.0. // Failure to respect this can result in applications not being able to find libc++ // when they are loaded by dyld, if the compatibility version was bumped. // -// RUN: otool -L "%{lib}/libc++.1.dylib" | grep "libc++.1.dylib" | grep "compatibility version 1.0.0" +// RUN: otool -L "%{lib-dir}/libc++.1.dylib" | grep "libc++.1.dylib" | grep "compatibility version 1.0.0" // Make sure we use the libdispatch backend for the PSTL. 
// -// RUN: grep "%{include}/__config_site" -e '#define _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH' +// RUN: grep "%{include-dir}/__config_site" -e '#define _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH' diff --git a/libcxx/test/libcxx/vendor/clang-cl/static-lib-exports.sh.cpp b/libcxx/test/libcxx/vendor/clang-cl/static-lib-exports.sh.cpp index 447454e..7ed1492 100644 --- a/libcxx/test/libcxx/vendor/clang-cl/static-lib-exports.sh.cpp +++ b/libcxx/test/libcxx/vendor/clang-cl/static-lib-exports.sh.cpp @@ -11,6 +11,6 @@ // This file checks that the built static libraries don't contain dllexport // directives in clang-cl builds. -// RUN: llvm-readobj --coff-directives "%{lib}/libc++.lib" | not grep -i "export:" > /dev/null +// RUN: llvm-readobj --coff-directives "%{lib-dir}/libc++.lib" | not grep -i "export:" > /dev/null -// RUN: llvm-readobj --coff-directives "%{lib}/libc++experimental.lib" | not grep -i "export:" > /dev/null +// RUN: llvm-readobj --coff-directives "%{lib-dir}/libc++experimental.lib" | not grep -i "export:" > /dev/null diff --git a/libcxx/test/libcxx/vendor/mingw/static-lib-exports.sh.cpp b/libcxx/test/libcxx/vendor/mingw/static-lib-exports.sh.cpp index 8f29f5a..e20269f 100644 --- a/libcxx/test/libcxx/vendor/mingw/static-lib-exports.sh.cpp +++ b/libcxx/test/libcxx/vendor/mingw/static-lib-exports.sh.cpp @@ -11,6 +11,6 @@ // This file checks that the built static libraries don't contain dllexport // directives in MinGW builds. -// RUN: llvm-readobj --coff-directives "%{lib}/libc++.a" | not grep -i "export:" > /dev/null +// RUN: llvm-readobj --coff-directives "%{lib-dir}/libc++.a" | not grep -i "export:" > /dev/null -// RUN: llvm-readobj --coff-directives "%{lib}/libc++experimental.a" | not grep -i "export:" > /dev/null +// RUN: llvm-readobj --coff-directives "%{lib-dir}/libc++experimental.a" | not grep -i "export:" > /dev/null diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py index a9fb64a..6ef4075 100644 --- a/libcxx/utils/libcxx/test/features.py +++ b/libcxx/utils/libcxx/test/features.py @@ -27,7 +27,7 @@ _msvcVersion = lambda cfg: (int(compilerMacros(cfg)["_MSC_VER"]) // 100, int(com def _getSuitableClangTidy(cfg): try: # If we didn't build the libcxx-tidy plugin via CMake, we can't run the clang-tidy tests. - if runScriptExitCode(cfg, ["stat %{test-tools}/clang_tidy_checks/libcxx-tidy.plugin"]) != 0: + if runScriptExitCode(cfg, ["stat %{test-tools-dir}/clang_tidy_checks/libcxx-tidy.plugin"]) != 0: return None # TODO MODULES require ToT due module specific fixes. diff --git a/libcxx/utils/libcxx/test/format.py b/libcxx/utils/libcxx/test/format.py index 1317521..229da22 100644 --- a/libcxx/utils/libcxx/test/format.py +++ b/libcxx/utils/libcxx/test/format.py @@ -172,7 +172,7 @@ def parseScript(test, preamble): f"{compileFlags} " "-Wno-reserved-module-identifier -Wno-reserved-user-defined-literal " "-fmodule-file=std=%T/std.pcm " # The std.compat module imports std. 
- "--precompile -o %T/std.compat.pcm -c %{module}/std.compat.cppm", + "--precompile -o %T/std.compat.pcm -c %{module-dir}/std.compat.cppm", ) moduleCompileFlags.extend( ["-fmodule-file=std.compat=%T/std.compat.pcm", "%T/std.compat.pcm"] @@ -188,7 +188,7 @@ def parseScript(test, preamble): "%dbg(MODULE std) %{cxx} %{flags} " f"{compileFlags} " "-Wno-reserved-module-identifier -Wno-reserved-user-defined-literal " - "--precompile -o %T/std.pcm -c %{module}/std.cppm", + "--precompile -o %T/std.pcm -c %{module-dir}/std.cppm", ) moduleCompileFlags.extend(["-fmodule-file=std=%T/std.pcm", "%T/std.pcm"]) -- cgit v1.1 From 4bf9fa5fb50497878edf8e277574ea9fb7d6bb7f Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Fri, 9 Feb 2024 17:41:46 +0100 Subject: [libc++][modules] Guard missing header validation on Windows. (#80478) On Windows the libc++ test suite sees the MSVC STL headers and may conclude these are libc++ headers when inspecting the name. Modules guard against forgetting to export new headers. Finding MSVC STL's headers gives false positives. Since the CI tests non-Windows platforms too, the validation will be disabled on Windows. Fixes: https://github.com/llvm/llvm-project/issues/79010 --------- Co-authored-by: Louis Dionne --- libcxx/modules/std.compat.cppm.in | 75 ++++++++++++++++++--------------- libcxx/modules/std.cppm.in | 75 ++++++++++++++++++--------------- libcxx/utils/generate_libcxx_cppm_in.py | 23 +++++++--- 3 files changed, 102 insertions(+), 71 deletions(-) diff --git a/libcxx/modules/std.compat.cppm.in b/libcxx/modules/std.compat.cppm.in index 651d6ec..1636371 100644 --- a/libcxx/modules/std.compat.cppm.in +++ b/libcxx/modules/std.compat.cppm.in @@ -46,39 +46,48 @@ module; #endif // *** Headers not yet available *** -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() 
+// +// This validation is mainly to aid libc++ developers to add modules for new +// headers. On Windows the Windows SDK can be in the include path. This SDK +// contains the MSVC STL headers. This may give false positives when MSVC STL +// provides a header libc++ has not implemented yet. Therefore this validation +// is not done on Windows. +// +#ifndef _WIN32 +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +#endif // _WIN32 export module std.compat; export import std; diff --git a/libcxx/modules/std.cppm.in b/libcxx/modules/std.cppm.in index 6ce8e28..3b59c28 100644 --- a/libcxx/modules/std.cppm.in +++ b/libcxx/modules/std.cppm.in @@ -168,39 +168,48 @@ module; #include // *** Headers not yet available *** -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in 
headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() +// +// This validation is mainly to aid libc++ developers to add modules for new +// headers. On Windows the Windows SDK can be in the include path. This SDK +// contains the MSVC STL headers. This may give false positives when MSVC STL +// provides a header libc++ has not implemented yet. Therefore this validation +// is not done on Windows. +// +#ifndef _WIN32 +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +#endif // _WIN32 export module std; diff --git a/libcxx/utils/generate_libcxx_cppm_in.py b/libcxx/utils/generate_libcxx_cppm_in.py index 2d3f829..0390ce5 100644 --- a/libcxx/utils/generate_libcxx_cppm_in.py +++ b/libcxx/utils/generate_libcxx_cppm_in.py @@ -57,18 +57,31 @@ module; else: module_cpp_in.write(f"#include <{header}>\n") - module_cpp_in.write("\n// *** Headers not yet available ***\n") + module_cpp_in.write( + """ +// *** Headers not yet available *** +// +// This validation is mainly to catch when a new header is added but adding the +// corresponding .inc file is forgotten. 
However, the check based on __has_include +// alone doesn't work on Windows because the Windows SDK is on the include path, +// and that means the MSVC STL headers can be found as well, tricking __has_include +// into thinking that libc++ provides the header. +// +#ifndef _WIN32 +""" + ) for header in sorted(headers_not_available): module_cpp_in.write( f"""\ -#if __has_include(<{header}>) -# error "please update the header information for <{header}> in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include(<{header}>) +# if __has_include(<{header}>) +# error "please update the header information for <{header}> in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include(<{header}>) """ ) module_cpp_in.write( - f""" + f"""#endif // _WIN32 + export module {module}; {'export import std;' if module == 'std.compat' else ''} -- cgit v1.1 From a7520d9727d2638047e5c464b2937581f64e2ce5 Mon Sep 17 00:00:00 2001 From: Shourya Goel Date: Fri, 9 Feb 2024 22:14:04 +0530 Subject: [Clang-tidy] bugprone-too-small-loop-variable - false-negative when const variable is used as loop bound (#81183) Changed LibASTMatcher to give an appropriate warning when a const loop bound is initialized with a function declaration. Fixes: #79580 --- .../clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp | 12 ++++++++---- clang-tools-extra/docs/ReleaseNotes.rst | 4 ++++ .../clang-tidy/checks/bugprone/too-small-loop-variable.rst | 4 ++++ .../clang-tidy/checkers/bugprone/too-small-loop-variable.cpp | 12 ++++++++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp index 8ba8b89..a73d46f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp @@ -82,10 +82,14 @@ void TooSmallLoopVariableCheck::registerMatchers(MatchFinder *Finder) { // We are interested in only those cases when the loop bound is a variable // value (not const, enum, etc.). StatementMatcher LoopBoundMatcher = - expr(ignoringParenImpCasts(allOf(hasType(isInteger()), - unless(integerLiteral()), - unless(hasType(isConstQualified())), - unless(hasType(enumType()))))) + expr(ignoringParenImpCasts(allOf( + hasType(isInteger()), unless(integerLiteral()), + unless(allOf( + hasType(isConstQualified()), + declRefExpr(to(varDecl(anyOf( + hasInitializer(ignoringParenImpCasts(integerLiteral())), + isConstexpr(), isConstinit())))))), + unless(hasType(enumType()))))) .bind(LoopUpperBoundName); // We use the loop increment expression only to make sure we found the right diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index e50914a..dff8dd2 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -117,6 +117,10 @@ Changes in existing checks options `HeaderFileExtensions` and `ImplementationFileExtensions` by the global options of the same name. +- Improved :doc:`bugprone-too-small-loop-variable + ` support by correctly + implementing the check for const loop boundary. 
+
 - Cleaned up :doc:`cppcoreguidelines-prefer-member-initializer
   ` by removing enforcement of rule `C.48
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst
index 0f45cc2..2c3ded9 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst
@@ -28,6 +28,10 @@ In a real use case size means a container's size which depends on the user input
 This algorithm works for a small amount of objects, but will lead to freeze for
 a larger user input.
 
+It's recommended to enable the compiler warning
+`-Wtautological-constant-out-of-range-compare` as well, since the check does not
+inspect compile-time constant loop boundaries to avoid overlaps with the warning.
+
 .. option:: MagnitudeBitsUpperLimit
 
    Upper limit for the magnitude bits of the loop variable. If it's set the check
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/too-small-loop-variable.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/too-small-loop-variable.cpp
index 3229deb..113150b 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/too-small-loop-variable.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/too-small-loop-variable.cpp
@@ -93,6 +93,18 @@ void voidBadForLoopWithMacroBound() {
   }
 }
 
+unsigned int getVal() {
+  return 300;
+}
+
+// The iteration's upper bound has a function declaration.
+void voidBadForLoop8() {
+  const unsigned int l = getVal();
+  for (unsigned char i = 0; i < l; ++i) {
+    // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: loop variable has narrower type 'unsigned char' than iteration's upper bound 'const unsigned int' [bugprone-too-small-loop-variable]
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 /// Correct loops: we should not warn here.
-- cgit v1.1


From 5afbed1968588fe443a8a55053c2f1eaa321d28e Mon Sep 17 00:00:00 2001
From: Jon Roelofs
Date: Fri, 9 Feb 2024 08:48:49 -0800
Subject: [llvm-objcopy] Fix the build after
 7ddc32052546abd41656d2e670f3902b1bf805a7. NFCI

---
 llvm/lib/ObjCopy/ELF/ELFObject.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.cpp b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
index c2de456..d7559ab 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObject.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
@@ -2787,7 +2787,7 @@ IHexWriter::getTotalSize(WritableMemoryBuffer &EmptyBuffer) const {
   IHexSectionWriterBase LengthCalc(EmptyBuffer);
   for (const SectionBase *Sec : Sections)
     if (Error Err = Sec->accept(LengthCalc))
-      return Err;
+      return std::move(Err);
 
   // We need space to write section records + StartAddress record
   // (if start address is not zero) + EndOfFile record.
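A note on why this one-line change is needed, since the message above only says it
fixes the build: getTotalSize returns Expected<size_t>, while Err is a named
llvm::Error, a move-only type. Returning a named object whose type differs from the
function's return type goes through a converting constructor, and before C++20's
extended implicit-move rules that constructor sees Err as an lvalue and therefore
tries Error's deleted copy constructor. A minimal sketch of the pattern (mayFail and
computeSize are hypothetical names for illustration, not the actual llvm-objcopy
code):

#include "llvm/Support/Error.h"
#include <cstddef>

llvm::Error mayFail(); // hypothetical helper, declared for illustration only

llvm::Expected<size_t> computeSize() {
  if (llvm::Error Err = mayFail()) {
    // return Err;         // rejected by pre-C++20 compilers: overload
    //                     // resolution picks Error's deleted copy constructor
    return std::move(Err); // fine: the Error binds as an rvalue
  }
  return 42;
}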
-- cgit v1.1 From 1245f5f4da8f88d031c0a69388d97e8a6d7f00b5 Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Fri, 9 Feb 2024 08:52:06 -0800 Subject: [clang][Driver] Add support for XROS_DEPLOYMENT_TARGET env var (#81011) --- clang/lib/Driver/ToolChains/Darwin.cpp | 6 ++++-- clang/lib/Driver/ToolChains/Darwin.h | 2 +- clang/test/Driver/xros-driver-requires-darwin-host.c | 13 +++++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 clang/test/Driver/xros-driver-requires-darwin-host.c diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index fae8ad1..cc1219d 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -1902,6 +1902,7 @@ getDeploymentTargetFromEnvironmentVariables(const Driver &TheDriver, "TVOS_DEPLOYMENT_TARGET", "WATCHOS_DEPLOYMENT_TARGET", "DRIVERKIT_DEPLOYMENT_TARGET", + "XROS_DEPLOYMENT_TARGET" }; static_assert(std::size(EnvVars) == Darwin::LastDarwinPlatform + 1, "Missing platform"); @@ -1914,14 +1915,15 @@ getDeploymentTargetFromEnvironmentVariables(const Driver &TheDriver, // default platform. if (!Targets[Darwin::MacOS].empty() && (!Targets[Darwin::IPhoneOS].empty() || - !Targets[Darwin::WatchOS].empty() || !Targets[Darwin::TvOS].empty())) { + !Targets[Darwin::WatchOS].empty() || !Targets[Darwin::TvOS].empty() || + !Targets[Darwin::XROS].empty())) { if (Triple.getArch() == llvm::Triple::arm || Triple.getArch() == llvm::Triple::aarch64 || Triple.getArch() == llvm::Triple::thumb) Targets[Darwin::MacOS] = ""; else Targets[Darwin::IPhoneOS] = Targets[Darwin::WatchOS] = - Targets[Darwin::TvOS] = ""; + Targets[Darwin::TvOS] = Targets[Darwin::XROS] = ""; } else { // Don't allow conflicts in any other platform. unsigned FirstTarget = std::size(Targets); diff --git a/clang/lib/Driver/ToolChains/Darwin.h b/clang/lib/Driver/ToolChains/Darwin.h index 5e60b08..10d4b69 100644 --- a/clang/lib/Driver/ToolChains/Darwin.h +++ b/clang/lib/Driver/ToolChains/Darwin.h @@ -300,7 +300,7 @@ public: WatchOS, DriverKit, XROS, - LastDarwinPlatform = DriverKit + LastDarwinPlatform = XROS }; enum DarwinEnvironmentKind { NativeEnvironment, diff --git a/clang/test/Driver/xros-driver-requires-darwin-host.c b/clang/test/Driver/xros-driver-requires-darwin-host.c new file mode 100644 index 0000000..e5bfcca --- /dev/null +++ b/clang/test/Driver/xros-driver-requires-darwin-host.c @@ -0,0 +1,13 @@ +// REQUIRES: system-darwin + +// RUN: env XROS_DEPLOYMENT_TARGET=1.0 %clang -arch arm64 -c -### %s 2>&1 | FileCheck %s + +// RUN: rm -rf %t.dir +// RUN: mkdir -p %t.dir/XROS1.0.sdk +// RUN: %clang -arch arm64 -isysroot %t.dir/XROS1.0.sdk -c -### %s 2>&1 | FileCheck %s +// RUN: mkdir -p %t.dir/XRSimulator1.0.sdk +// RUN: %clang -arch arm64 -isysroot %t.dir/XRSimulator1.0.sdk -c -### %s 2>&1 | FileCheck --check-prefix=CHECK_SIM %s + + +// CHECK: "-cc1"{{.*}} "-triple" "arm64-apple-xros1.0.0" +// CHECK_SIM: "-cc1"{{.*}} "-triple" "arm64-apple-xros1.0.0-simulator" -- cgit v1.1 From 94272a5a5d1549b32818805b82805e42c62ccfb4 Mon Sep 17 00:00:00 2001 From: Daniil Fukalov <1671137+dfukalov@users.noreply.github.com> Date: Fri, 9 Feb 2024 17:54:14 +0100 Subject: [OpenMP] Fix libomp debug build. (#81029) Disable libstdc++ assertions in the runtime library just like in https://reviews.llvm.org/D143168. 
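To sketch the failure mode this avoids (illustrative code, not part of the patch):
when _GLIBCXX_ASSERTIONS is defined, libstdc++ containers compile in extra checks,
and on recent libstdc++ versions the failure path of those checks references an
out-of-line handler that lives in libstdc++.so, which is exactly the dependency
libomp must not pick up.

#include <vector>

// Built with -D_GLIBCXX_ASSERTIONS, the subscript below expands to a checked
// access whose failure path calls into libstdc++'s assertion machinery.
int last(const std::vector<int> &v) { return v[v.size() - 1]; }

Because the preprocessor honors whichever of -D/-U comes last on the command line,
the added -U_GLIBCXX_ASSERTIONS cancels the -D injected by an assertions-enabled
LLVM build for libomp's own sources.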
--- openmp/runtime/src/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt index b0ecf12..ff129fe 100644 --- a/openmp/runtime/src/CMakeLists.txt +++ b/openmp/runtime/src/CMakeLists.txt @@ -152,6 +152,10 @@ if(UNIX) set(LIBOMP_DL_LIBS ${CMAKE_DL_LIBS}) endif() +# Disable libstdc++ assertions, even in an LLVM_ENABLE_ASSERTIONS build, to +# avoid an unwanted dependency on libstdc++.so. +add_definitions(-U_GLIBCXX_ASSERTIONS) + # Add the OpenMP library libomp_get_ldflags(LIBOMP_CONFIGURED_LDFLAGS) -- cgit v1.1 From c58c6aac7715d720358e317c26b6768940430ce9 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Fri, 9 Feb 2024 20:59:02 +0400 Subject: [clang][Sema] Add checks for validity of default ctor's class (#78898) Fixes #10518 Fixes #67914 Fixes #78388 Also addresses the second example in #49103 This patch is based on suggestion from @cor3ntin in https://github.com/llvm/llvm-project/issues/67914#issuecomment-1896011898 --- clang/docs/ReleaseNotes.rst | 4 ++ clang/lib/Sema/SemaDeclCXX.cpp | 7 ++ clang/test/SemaCXX/crash-GH10518.cpp | 22 ++++++ clang/test/SemaCXX/crash-GH49103-2.cpp | 13 ++++ clang/test/SemaCXX/crash-GH67914.cpp | 78 ++++++++++++++++++++++ clang/test/SemaCXX/crash-GH78388.cpp | 17 +++++ .../transform_error.mandates.verify.cpp | 2 +- 7 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 clang/test/SemaCXX/crash-GH10518.cpp create mode 100644 clang/test/SemaCXX/crash-GH49103-2.cpp create mode 100644 clang/test/SemaCXX/crash-GH67914.cpp create mode 100644 clang/test/SemaCXX/crash-GH78388.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index df3ad20..7631f3b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -182,6 +182,10 @@ Bug Fixes to Attribute Support Bug Fixes to C++ Support ^^^^^^^^^^^^^^^^^^^^^^^^ +- Fix crash when calling the constructor of an invalid class. + Fixes (`#10518 `_), + (`#67914 `_), + and (`#78388 `_) - Fix crash when using lifetimebound attribute in function with trailing return. 
Fixes (`#73619 `_) - Addressed an issue where constraints involving injected class types are perceived diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index fea8c50..ba233c9 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -5998,6 +5998,10 @@ void Sema::ActOnDefaultCtorInitializers(Decl *CDtorDecl) { if (CXXConstructorDecl *Constructor = dyn_cast(CDtorDecl)) { + if (CXXRecordDecl *ClassDecl = Constructor->getParent(); + !ClassDecl || ClassDecl->isInvalidDecl()) { + return; + } SetCtorInitializers(Constructor, /*AnyErrors=*/false); DiagnoseUninitializedFields(*this, Constructor); } @@ -14038,6 +14042,9 @@ void Sema::DefineImplicitDefaultConstructor(SourceLocation CurrentLocation, CXXRecordDecl *ClassDecl = Constructor->getParent(); assert(ClassDecl && "DefineImplicitDefaultConstructor - invalid constructor"); + if (ClassDecl->isInvalidDecl()) { + return; + } SynthesizedFunctionScope Scope(*this, Constructor); diff --git a/clang/test/SemaCXX/crash-GH10518.cpp b/clang/test/SemaCXX/crash-GH10518.cpp new file mode 100644 index 0000000..6c5f80a --- /dev/null +++ b/clang/test/SemaCXX/crash-GH10518.cpp @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -verify -std=c++98 %s +// RUN: %clang_cc1 -verify -std=c++11 %s +// RUN: %clang_cc1 -verify -std=c++14 %s +// RUN: %clang_cc1 -verify -std=c++17 %s +// RUN: %clang_cc1 -verify -std=c++20 %s +// RUN: %clang_cc1 -verify -std=c++23 %s +// RUN: %clang_cc1 -verify -std=c++2c %s + +// https://github.com/llvm/llvm-project/issues/10518 + +template +class A : public T { +}; + +template +class B : public A { +}; + +template +class B : public A { // expected-error 0-1 {{}} + B(T *t) {} +}; diff --git a/clang/test/SemaCXX/crash-GH49103-2.cpp b/clang/test/SemaCXX/crash-GH49103-2.cpp new file mode 100644 index 0000000..4c17a05 --- /dev/null +++ b/clang/test/SemaCXX/crash-GH49103-2.cpp @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -verify -std=c++98 %s +// RUN: %clang_cc1 -verify -std=c++11 %s +// RUN: %clang_cc1 -verify -std=c++14 %s +// RUN: %clang_cc1 -verify -std=c++17 %s +// RUN: %clang_cc1 -verify -std=c++20 %s +// RUN: %clang_cc1 -verify -std=c++23 %s +// RUN: %clang_cc1 -verify -std=c++2c %s + +// https://github.com/llvm/llvm-project/issues/49103 + +template struct A; // expected-note 0+ {{}} +struct S : __make_integer_seq { }; // expected-error 0+ {{}} +S s; diff --git a/clang/test/SemaCXX/crash-GH67914.cpp b/clang/test/SemaCXX/crash-GH67914.cpp new file mode 100644 index 0000000..fbaeac6 --- /dev/null +++ b/clang/test/SemaCXX/crash-GH67914.cpp @@ -0,0 +1,78 @@ +// RUN: %clang_cc1 -verify -std=c++98 %s +// RUN: %clang_cc1 -verify -std=c++11 %s +// RUN: %clang_cc1 -verify -std=c++14 %s +// RUN: %clang_cc1 -verify -std=c++17 %s +// RUN: %clang_cc1 -verify -std=c++20 %s +// RUN: %clang_cc1 -verify -std=c++23 %s +// RUN: %clang_cc1 -verify -std=c++2c %s + +// https://github.com/llvm/llvm-project/issues/67914 + +template < typename, int > +struct Mask; + +template < int, class > +struct conditional { + using type = Mask< int, 16 >; // expected-warning 0+ {{}} +}; + +template < class _Then > +struct conditional< 0, _Then > { + using type = _Then; // expected-warning 0+ {{}} +}; + +template < int _Bp, class, class _Then > +using conditional_t = typename conditional< _Bp, _Then >::type; // expected-warning 0+ {{}} + +template < typename, int > +struct Array; + +template < typename, int, bool, typename > +struct StaticArrayImpl; + +template < typename Value_, int Size_ > +struct Mask : StaticArrayImpl< Value_, Size_, 1, 
Mask< Value_, Size_ > > { // expected-note 0+ {{}} + template < typename T1 > + Mask(T1) {} // expected-note 0+ {{}} +}; + +template < typename T > +void load(typename T::MaskType mask) { + T::load_(mask); // expected-note 0+ {{}} +} + +template < typename Value_, int IsMask_, typename Derived_ > +struct StaticArrayImpl< Value_, 32, IsMask_, Derived_ > { + using Array1 = conditional_t< IsMask_, void, Array< Value_, 16 > >; // expected-warning 0+ {{}} + + template < typename Mask > + static Derived_ load_(Mask mask) { + return Derived_{load< Array1 >(mask.a1), Mask{}}; // expected-error 0+ {{}} + } + + Array1 a1; +}; + +template < typename Derived_ > +struct KMaskBase; + +template < typename Derived_ > +struct StaticArrayImpl< float, 16, 0, Derived_ > { + template < typename Mask > + static Derived_ load_(Mask mask); +}; + +template < typename Derived_ > +struct StaticArrayImpl< float, 16, 1, Mask< float, 16 > > : KMaskBase< Derived_ > {}; // expected-error 0+ {{}} + +template < typename Derived_ > +struct StaticArrayImpl< int, 16, 1, Derived_ > {}; + +template < typename Value_, int Size_ > +struct Array : StaticArrayImpl< Value_, Size_, 0, Array< Value_, Size_ > > { + using MaskType = Mask< Value_, Size_ >; // expected-warning 0+ {{}} +}; + +void test11_load_masked() { + load< Array< float, 32 > >{} == 0; // expected-error 0+ {{}} expected-warning 0+ {{}} expected-note 0+ {{}} +} diff --git a/clang/test/SemaCXX/crash-GH78388.cpp b/clang/test/SemaCXX/crash-GH78388.cpp new file mode 100644 index 0000000..cdec4d5 --- /dev/null +++ b/clang/test/SemaCXX/crash-GH78388.cpp @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -verify -std=c++98 %s +// RUN: %clang_cc1 -verify -std=c++11 %s +// RUN: %clang_cc1 -verify -std=c++14 %s +// RUN: %clang_cc1 -verify -std=c++17 %s +// RUN: %clang_cc1 -verify -std=c++20 %s +// RUN: %clang_cc1 -verify -std=c++23 %s +// RUN: %clang_cc1 -verify -std=c++2c %s + +// https://github.com/llvm/llvm-project/issues/78388 + +typedef mbstate_t; // expected-error 0+ {{}} expected-note 0+ {{}} + template < typename , typename , typename > + class a // expected-error 0+ {{}} + class b { // expected-error 0+ {{}} + namespace { // expected-note 0+ {{}} expected-note 0+ {{}} + template < typename c > b::operator=() { // expected-error 0+ {{}} expected-note 0+ {{}} + struct :a< c, char, stdmbstate_t > d // expected-error 0+ {{}} expected-warning 0+ {{}} diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp index 4f4f5839..508b01a 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp @@ -56,7 +56,7 @@ void test() { e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} // expected-error-re@*:* 0-1 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}A program that instantiates expected with a E that is not a valid argument for unexpected is ill-formed}} - // expected-error-re@*:* {{call to deleted constructor of {{.*}}}} + // expected-error-re@*:* 0-1 {{call to deleted constructor of {{.*}}}} // expected-error-re@*:* {{union member {{.*}} has reference type {{.*}}}} e.transform_error(return_no_object); // 
expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} -- cgit v1.1 From 9bb54b2aa006e3bf5df5eb8672075dd589fb9ba5 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Fri, 9 Feb 2024 09:01:05 -0800 Subject: Move the new test added in 2095655f to its own file ... and set an explicit target triple. Should fix buildbot issues like: https://lab.llvm.org/buildbot/#/builders/245/builds/20379/steps/5/logs/FAIL__Clang__warn-unused-filescoped_cpp --- clang/test/SemaCXX/warn-unused-filescoped-fmv.cpp | 18 ++++++++++++++++++ clang/test/SemaCXX/warn-unused-filescoped.cpp | 16 ---------------- 2 files changed, 18 insertions(+), 16 deletions(-) create mode 100644 clang/test/SemaCXX/warn-unused-filescoped-fmv.cpp diff --git a/clang/test/SemaCXX/warn-unused-filescoped-fmv.cpp b/clang/test/SemaCXX/warn-unused-filescoped-fmv.cpp new file mode 100644 index 0000000..8c21da5 --- /dev/null +++ b/clang/test/SemaCXX/warn-unused-filescoped-fmv.cpp @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify -Wunused -std=c++98 %s +// RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify -Wunused -std=c++14 %s + +__attribute__((target_version("fp16"))) +static int not_used_fmv(void) { return 1; } +__attribute__((target_version("fp16fml"))) +static int not_used_fmv(void) { return 2; } +__attribute__((target_version("default"))) +static int not_used_fmv(void) { return 0; } // expected-warning {{unused function 'not_used_fmv'}} + + +__attribute__((target_version("fp16"))) +static int definitely_used_fmv(void) { return 1; } +__attribute__((target_version("fp16fml"))) +static int definitely_used_fmv(void) { return 2; } +__attribute__((target_version("default"))) +static int definitely_used_fmv(void) { return 0; } +int definite_user(void) { return definitely_used_fmv(); } diff --git a/clang/test/SemaCXX/warn-unused-filescoped.cpp b/clang/test/SemaCXX/warn-unused-filescoped.cpp index 0c347e9..be8d350 100644 --- a/clang/test/SemaCXX/warn-unused-filescoped.cpp +++ b/clang/test/SemaCXX/warn-unused-filescoped.cpp @@ -236,20 +236,4 @@ constexpr int constexpr4() { return 2; } #endif } -__attribute__((target_version("fp16"))) -static int not_used_fmv(void) { return 1; } -__attribute__((target_version("fp16fml"))) -static int not_used_fmv(void) { return 2; } -__attribute__((target_version("default"))) -static int not_used_fmv(void) { return 0; } // expected-warning {{unused function 'not_used_fmv'}} - - -__attribute__((target_version("fp16"))) -static int definitely_used_fmv(void) { return 1; } -__attribute__((target_version("fp16fml"))) -static int definitely_used_fmv(void) { return 2; } -__attribute__((target_version("default"))) -static int definitely_used_fmv(void) { return 0; } -int definite_user(void) { return definitely_used_fmv(); } - #endif -- cgit v1.1 From 9dd8ba4429fc22063e6ce18017e7bdbd7552a927 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 9 Feb 2024 17:05:22 +0000 Subject: [InstCombine] Add memcpy test with !tbaa.struct with multiple fields. Add an additional test with a struct with multiple fields. 
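For context, the operands of a !tbaa.struct node are read as (offset, size,
TBAA tag) triples, one triple per field that the memory transfer copies. The new
node !6 added below, !{i64 0, i64 4, !2, i64 4, i64 4, !2}, therefore describes two
adjacent 4-byte fields both accessed through the "float" tag, roughly this source
layout (the struct name is invented for illustration):

struct Pair {
  float a; // offset 0, size 4
  float b; // offset 4, size 4
};

The other new node, !7, deliberately describes fields whose offsets and sizes do
not line up with the copied range, so the tests can observe what the transform does
when the metadata and the transfer size disagree.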
--- .../Transforms/InstCombine/struct-assign-tbaa.ll | 44 ++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll index 5c2ea39..1042c41 100644 --- a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll +++ b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll @@ -35,17 +35,55 @@ define ptr @test2() { ret ptr %tmp } +define void @test3_multiple_fields(ptr nocapture %a, ptr nocapture %b) { +; CHECK-LABEL: @test3_multiple_fields( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[B:%.*]], align 4 +; CHECK-NEXT: store i64 [[TMP0]], ptr [[A:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 4 %a, ptr align 4 %b, i64 8, i1 false), !tbaa.struct !6 + ret void +} + +define void @test4_multiple_copy_first_field(ptr nocapture %a, ptr nocapture %b) { +; CHECK-LABEL: @test4_multiple_copy_first_field( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[A:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 4 %a, ptr align 4 %b, i64 4, i1 false), !tbaa.struct !6 + ret void +} + +define void @test5_multiple_copy_more_than_first_field(ptr nocapture %a, ptr nocapture %b) { +; CHECK-LABEL: @test5_multiple_copy_more_than_first_field( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[A:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 4 %a, ptr align 4 %b, i64 4, i1 false), !tbaa.struct !7 + ret void +} + !0 = !{!"Simple C/C++ TBAA"} !1 = !{!"omnipotent char", !0} !2 = !{!5, !5, i64 0} !3 = !{i64 0, i64 4, !2} !4 = !{i64 0, i64 8, null} !5 = !{!"float", !0} +!6 = !{i64 0, i64 4, !2, i64 4, i64 4, !2} +!7 = !{i64 0, i64 2, !2, i64 4, i64 6, !2} ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ;. -; CHECK: [[TBAA0]] = !{!1, !1, i64 0} -; CHECK: [[META1:![0-9]+]] = !{!"float", !2} -; CHECK: [[META2:![0-9]+]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"float", [[META2:![0-9]+]]} +; CHECK: [[META2]] = !{!"Simple C/C++ TBAA"} ;. -- cgit v1.1 From 0d72f0beabc180754eae334f22f01e48a5032bbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Fri, 9 Feb 2024 17:13:37 +0000 Subject: [mlir][Vector] Fix "scalability" in CastAwayExtractStridedSliceLeadingOneDim (#81187) Makes sure that "scalability" flags in the `CastAwayExtractStridedSliceLeadingOneDim` pattern are correctly updated. 
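For readers unfamiliar with the notation in the tests below: a vector dimension
written in square brackets, as in vector<1x8x[8]xf16>, is scalable, meaning its
actual size is an unknown runtime multiple of the printed value. The regression was
that the rewritten extract built its result type from shape and element type only,
silently turning scalable dimensions into fixed ones. A condensed sketch of the
shape of the fix (the helper name is invented for illustration):

#include "mlir/IR/BuiltinTypes.h"

// Dropping leading unit dims must also drop the matching scalability flags;
// rebuilding the type without them would reset every dimension to fixed size.
mlir::VectorType dropLeadingDims(mlir::VectorType ty, unsigned n) {
  return mlir::VectorType::get(ty.getShape().drop_front(n),
                               ty.getElementType(),
                               ty.getScalableDims().drop_front(n));
}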
--- .../Vector/Transforms/VectorDropLeadUnitDim.cpp | 3 ++- .../Vector/vector-dropleadunitdim-transforms.mlir | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp index e1ed5d8..74382b0 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp @@ -73,7 +73,8 @@ struct CastAwayExtractStridedSliceLeadingOneDim VectorType oldDstType = extractOp.getType(); VectorType newDstType = VectorType::get(oldDstType.getShape().drop_front(dropCount), - oldDstType.getElementType()); + oldDstType.getElementType(), + oldDstType.getScalableDims().drop_front(dropCount)); Location loc = extractOp.getLoc(); diff --git a/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir b/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir index f601be0..bb2d30f 100644 --- a/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir @@ -206,6 +206,16 @@ func.func @cast_away_extract_strided_slice_leading_one_dims(%arg0: vector<1x8x8x return %0: vector<1x1x8xf16> } +// CHECK-LABEL: func @cast_away_extract_strided_slice_leading_one_dims_scalable +func.func @cast_away_extract_strided_slice_leading_one_dims_scalable(%arg0: vector<1x8x[8]xf16>) -> vector<1x1x[8]xf16> { + // CHECK: %[[SRC:.+]] = vector.extract %{{.*}}[0] : vector<8x[8]xf16> from vector<1x8x[8]xf16> + // CHECK: %[[EXTRACT:.+]] = vector.extract_strided_slice %[[SRC]] {offsets = [4], sizes = [1], strides = [1]} : vector<8x[8]xf16> to vector<1x[8]xf16> + %0 = vector.extract_strided_slice %arg0 {offsets = [0, 4], sizes = [1, 1], strides = [1, 1]} : vector<1x8x[8]xf16> to vector<1x1x[8]xf16> + // CHECK: %[[RET:.+]] = vector.broadcast %[[EXTRACT]] : vector<1x[8]xf16> to vector<1x1x[8]xf16> + // CHECK: return %[[RET]] + return %0: vector<1x1x[8]xf16> +} + // CHECK-LABEL: func @cast_away_insert_strided_slice_leading_one_dims func.func @cast_away_insert_strided_slice_leading_one_dims(%arg0: vector<1x8xf16>, %arg1: vector<1x8x8xf16>) -> vector<1x8x8xf16> { // CHECK: %[[SRC:.+]] = vector.extract %{{.*}}[0] : vector<8xf16> from vector<1x8xf16> @@ -217,6 +227,17 @@ func.func @cast_away_insert_strided_slice_leading_one_dims(%arg0: vector<1x8xf16 return %0: vector<1x8x8xf16> } +// CHECK-LABEL: func @cast_away_insert_strided_slice_leading_one_dims_scalable +func.func @cast_away_insert_strided_slice_leading_one_dims_scalable(%arg0: vector<1x[8]xf16>, %arg1: vector<1x8x[8]xf16>) -> vector<1x8x[8]xf16> { + // CHECK: %[[SRC:.+]] = vector.extract %{{.*}}[0] : vector<[8]xf16> from vector<1x[8]xf16> + // CHECK: %[[DST:.+]] = vector.extract %{{.*}}[0] : vector<8x[8]xf16> from vector<1x8x[8]xf16> + // CHECK: %[[INSERT:.+]] = vector.insert_strided_slice %[[SRC]], %[[DST]] {offsets = [0, 0], strides = [1]} : vector<[8]xf16> into vector<8x[8]xf16> + %0 = vector.insert_strided_slice %arg0, %arg1 {offsets = [0, 0, 0], strides = [1, 1]} : vector<1x[8]xf16> into vector<1x8x[8]xf16> + // CHECK: %[[RET:.+]] = vector.broadcast %[[INSERT]] : vector<8x[8]xf16> to vector<1x8x[8]xf16> + // CHECK: return %[[RET]] + return %0: vector<1x8x[8]xf16> +} + // CHECK-LABEL: func @cast_away_insert_strided_slice_leading_one_dims_one_element // CHECK-SAME: %[[ARG0:.+]]: vector<1x1xf16>, %{{.+}}: vector<1x1x1xf16> func.func 
@cast_away_insert_strided_slice_leading_one_dims_one_element(%arg0: vector<1x1xf16>, %arg1: vector<1x1x1xf16>) -> vector<1x1x1xf16> { -- cgit v1.1 From 2884d048396abc82c8356c4e350ef968fb24a0d7 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 9 Feb 2024 17:16:21 +0000 Subject: [SROA] Add additional tests for splitting up ops with !tbaa.struct. --- llvm/test/Transforms/SROA/tbaa-struct3.ll | 107 ++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 llvm/test/Transforms/SROA/tbaa-struct3.ll diff --git a/llvm/test/Transforms/SROA/tbaa-struct3.ll b/llvm/test/Transforms/SROA/tbaa-struct3.ll new file mode 100644 index 0000000..4910e0e --- /dev/null +++ b/llvm/test/Transforms/SROA/tbaa-struct3.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -p sroa -S %s | FileCheck %s + +define void @load_store_transfer_split_struct_tbaa_2_float(ptr dereferenceable(24) %res, float %a, float %b) { +; CHECK-LABEL: define void @load_store_transfer_split_struct_tbaa_2_float( +; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], float [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[B]] to i32 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[RES]], align 4 +; CHECK-NEXT: [[RES_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[RES]], i64 4 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[RES_SROA_IDX]], align 4 +; CHECK-NEXT: [[P:%.*]] = load ptr, ptr [[RES]], align 8 +; CHECK-NEXT: ret void +; +entry: + %tmp = alloca { float, float }, align 4 + store float %a, ptr %tmp, align 4 + %tmp.4 = getelementptr inbounds i8, ptr %tmp, i64 4 + store float %b, ptr %tmp.4, align 4 + %l1 = load i64, ptr %tmp, !tbaa.struct !0 + store i64 %l1, ptr %res, !tbaa.struct !0 + %p = load ptr, ptr %res, align 8 + ret void +} + +define void @memcpy_transfer(ptr dereferenceable(24) %res, float %a, float %b) { +; CHECK-LABEL: define void @memcpy_transfer( +; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], float [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L_PTR:%.*]] = load ptr, ptr [[RES]], align 8 +; CHECK-NEXT: store float [[A]], ptr [[L_PTR]], align 1, !tbaa.struct [[TBAA_STRUCT0:![0-9]+]] +; CHECK-NEXT: [[TMP_SROA_2_0_L_PTR_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[L_PTR]], i64 4 +; CHECK-NEXT: store float [[B]], ptr [[TMP_SROA_2_0_L_PTR_SROA_IDX]], align 1, !tbaa.struct [[TBAA_STRUCT5:![0-9]+]] +; CHECK-NEXT: ret void +; +entry: + %tmp = alloca { float, float }, align 4 + store float %a, ptr %tmp, align 4 + %__im_.i.i = getelementptr inbounds i8, ptr %tmp, i64 4 + store float %b, ptr %__im_.i.i, align 4 + %l.ptr = load ptr, ptr %res, align 8 + call void @llvm.memcpy.p0.p0.i64(ptr %l.ptr, ptr %tmp, i64 8, i1 false), !tbaa.struct !0 + ret void +} + +define void @memcpy_transfer_tbaa_field_and_size_do_not_align(ptr dereferenceable(24) %res, float %a, float %b) { +; CHECK-LABEL: define void @memcpy_transfer_tbaa_field_and_size_do_not_align( +; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], float [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L_PTR:%.*]] = load ptr, ptr [[RES]], align 8 +; CHECK-NEXT: store float [[A]], ptr [[L_PTR]], align 1, !tbaa.struct [[TBAA_STRUCT0]] +; CHECK-NEXT: [[TMP_SROA_2_0_L_PTR_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[L_PTR]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B]] to i32 +; CHECK-NEXT: [[TMP_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc 
i32 [[TMP0]] to i16 +; CHECK-NEXT: store i16 [[TMP_SROA_2_0_EXTRACT_TRUNC]], ptr [[TMP_SROA_2_0_L_PTR_SROA_IDX]], align 1, !tbaa.struct [[TBAA_STRUCT5]] +; CHECK-NEXT: ret void +; +entry: + %tmp = alloca { float, float }, align 4 + store float %a, ptr %tmp, align 4 + %__im_.i.i = getelementptr inbounds i8, ptr %tmp, i64 4 + store float %b, ptr %__im_.i.i, align 4 + %l.ptr = load ptr, ptr %res, align 8 + call void @llvm.memcpy.p0.p0.i64(ptr %l.ptr, ptr %tmp, i64 6, i1 false), !tbaa.struct !0 + ret void +} + +define void @load_store_transfer_split_struct_tbaa_2_i31(ptr dereferenceable(24) %res, i31 %a, i31 %b) { +; CHECK-LABEL: define void @load_store_transfer_split_struct_tbaa_2_i31( +; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], i31 [[A:%.*]], i31 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = alloca { i31, i31 }, align 4 +; CHECK-NEXT: store i31 [[A]], ptr [[TMP]], align 4 +; CHECK-NEXT: [[TMP_4_TMP_4_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 4 +; CHECK-NEXT: store i31 [[B]], ptr [[TMP_4_TMP_4_SROA_IDX]], align 4 +; CHECK-NEXT: [[TMP_0_L1:%.*]] = load i62, ptr [[TMP]], align 4, !tbaa.struct [[TBAA_STRUCT0]] +; CHECK-NEXT: store i62 [[TMP_0_L1]], ptr [[RES]], align 4, !tbaa.struct [[TBAA_STRUCT0]] +; CHECK-NEXT: ret void +; +entry: + %tmp = alloca { i31 , i31 }, align 4 + store i31 %a, ptr %tmp, align 4 + %tmp.4 = getelementptr inbounds i8, ptr %tmp, i64 4 + store i31 %b, ptr %tmp.4, align 4 + %l1 = load i62, ptr %tmp, !tbaa.struct !0 + store i62 %l1, ptr %res, !tbaa.struct !0 + ret void +} + + +; Function Attrs: mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite) +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #2 + +!0 = !{i64 0, i64 4, !1, i64 4, i64 4, !1} +!1 = !{!2, !2, i64 0} +!2 = !{!"float", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C++ TBAA"} +;. +; CHECK: [[TBAA_STRUCT0]] = !{i64 0, i64 4, [[META1:![0-9]+]], i64 4, i64 4, [[META1]]} +; CHECK: [[META1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} +; CHECK: [[META2]] = !{!"float", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +; CHECK: [[META4]] = !{!"Simple C++ TBAA"} +; CHECK: [[TBAA_STRUCT5]] = !{i64 0, i64 4, [[META1]]} +;. -- cgit v1.1 From bb5c3899d1936ebdf7ebf5ca4347ee2e057bee7f Mon Sep 17 00:00:00 2001 From: Zain Jaffal Date: Fri, 9 Feb 2024 17:24:41 +0000 Subject: [InstCombine] Optimise x / sqrt(y / z) with fast-math pattern. (#76737) Replace the pattern with x * sqrt(z/y) --------- Co-authored-by: Matt Arsenault --- .../InstCombine/InstCombineMulDivRem.cpp | 30 ++++++++++++++++++++++ llvm/test/Transforms/InstCombine/fdiv-sqrt.ll | 18 ++++++------- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index f9cee9d..5918567 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -1709,6 +1709,33 @@ static Instruction *foldFDivPowDivisor(BinaryOperator &I, return BinaryOperator::CreateFMulFMF(Op0, Pow, &I); } +/// Convert div to mul if we have an sqrt divisor iff sqrt's operand is a fdiv +/// instruction. 
+static Instruction *foldFDivSqrtDivisor(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + // X / sqrt(Y / Z) --> X * sqrt(Z / Y) + if (!I.hasAllowReassoc() || !I.hasAllowReciprocal()) + return nullptr; + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + auto *II = dyn_cast(Op1); + if (!II || II->getIntrinsicID() != Intrinsic::sqrt || !II->hasOneUse() || + !II->hasAllowReassoc() || !II->hasAllowReciprocal()) + return nullptr; + + Value *Y, *Z; + auto *DivOp = dyn_cast(II->getOperand(0)); + if (!DivOp || !DivOp->hasAllowReassoc() || !I.hasAllowReciprocal() || + !DivOp->hasOneUse()) + return nullptr; + if (match(DivOp, m_FDiv(m_Value(Y), m_Value(Z)))) { + Value *SwapDiv = Builder.CreateFDivFMF(Z, Y, DivOp); + Value *NewSqrt = + Builder.CreateUnaryIntrinsic(II->getIntrinsicID(), SwapDiv, II); + return BinaryOperator::CreateFMulFMF(Op0, NewSqrt, &I); + } + return nullptr; +} + Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { Module *M = I.getModule(); @@ -1816,6 +1843,9 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { if (Instruction *Mul = foldFDivPowDivisor(I, Builder)) return Mul; + if (Instruction *Mul = foldFDivSqrtDivisor(I, Builder)) + return Mul; + // pow(X, Y) / X --> pow(X, Y-1) if (I.hasAllowReassoc() && match(Op0, m_OneUse(m_Intrinsic(m_Specific(Op1), diff --git a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll index 346271b..361837e 100644 --- a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll +++ b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll @@ -6,9 +6,9 @@ declare double @llvm.sqrt.f64(double) define double @sqrt_div_fast(double %x, double %y, double %z) { ; CHECK-LABEL: @sqrt_div_fast( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[DIV:%.*]] = fdiv fast double [[Y:%.*]], [[Z:%.*]] -; CHECK-NEXT: [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[DIV]]) -; CHECK-NEXT: [[DIV1:%.*]] = fdiv fast double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: [[TMP0:%.*]] = fdiv fast double [[Z:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = call fast double @llvm.sqrt.f64(double [[TMP0]]) +; CHECK-NEXT: [[DIV1:%.*]] = fmul fast double [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret double [[DIV1]] ; entry: @@ -36,9 +36,9 @@ entry: define double @sqrt_div_reassoc_arcp(double %x, double %y, double %z) { ; CHECK-LABEL: @sqrt_div_reassoc_arcp( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]] -; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]]) -; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc arcp double [[Z:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[TMP0]]) +; CHECK-NEXT: [[DIV1:%.*]] = fmul reassoc arcp double [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret double [[DIV1]] ; entry: @@ -96,9 +96,9 @@ entry: define double @sqrt_div_arcp_missing(double %x, double %y, double %z) { ; CHECK-LABEL: @sqrt_div_arcp_missing( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc double [[Y:%.*]], [[Z:%.*]] -; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]]) -; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double [[Z:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[TMP0]]) +; CHECK-NEXT: [[DIV1:%.*]] = fmul reassoc arcp double [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret double [[DIV1]] ; entry: -- cgit v1.1 
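A note on the algebra behind the transform above: for positive, finite y/z the
identity x / sqrt(y/z) = x * sqrt(z/y) holds because 1 / sqrt(a) = sqrt(1/a), so an
expensive division by a square root becomes a multiplication. In IEEE arithmetic
the two forms can round differently and disagree for zero, negative, infinite or
NaN inputs, which is why the fold requires both reassoc and arcp on every
instruction involved. A small self-contained check of the identity (a sketch, not
part of the patch):

#include <cmath>
#include <cstdio>

int main() {
  const double x = 3.0, y = 2.0, z = 5.0;
  // The two forms agree to within rounding when y / z is positive and finite.
  std::printf("%.17g\n", x / std::sqrt(y / z));
  std::printf("%.17g\n", x * std::sqrt(z / y));
  return 0;
}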
From 301f6840522e3d924cf00ab6a04f93f1354142f5 Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Fri, 9 Feb 2024 17:52:30 +0000 Subject: [Flang][OpenMP] NFC: Refactor reduction code (#79876) Introduces a new enumeration to list all Fortran reduction identifiers. Moves the combiner code-generation into a separate function for possible reuse in array context in future. --- flang/lib/Lower/OpenMP.cpp | 344 ++++++++++++++++++++++----------------------- 1 file changed, 172 insertions(+), 172 deletions(-) diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp index ad4cffc..fd18b21 100644 --- a/flang/lib/Lower/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP.cpp @@ -731,21 +731,59 @@ static void checkMapType(mlir::Location location, mlir::Type type) { class ReductionProcessor { public: - enum IntrinsicProc { MAX, MIN, IAND, IOR, IEOR }; - static IntrinsicProc + // TODO: Move this enumeration to the OpenMP dialect + enum ReductionIdentifier { + ID, + USER_DEF_OP, + ADD, + SUBTRACT, + MULTIPLY, + AND, + OR, + EQV, + NEQV, + MAX, + MIN, + IAND, + IOR, + IEOR + }; + static ReductionIdentifier getReductionType(const Fortran::parser::ProcedureDesignator &pd) { - auto redType = llvm::StringSwitch>( + auto redType = llvm::StringSwitch>( getRealName(pd).ToString()) - .Case("max", IntrinsicProc::MAX) - .Case("min", IntrinsicProc::MIN) - .Case("iand", IntrinsicProc::IAND) - .Case("ior", IntrinsicProc::IOR) - .Case("ieor", IntrinsicProc::IEOR) + .Case("max", ReductionIdentifier::MAX) + .Case("min", ReductionIdentifier::MIN) + .Case("iand", ReductionIdentifier::IAND) + .Case("ior", ReductionIdentifier::IOR) + .Case("ieor", ReductionIdentifier::IEOR) .Default(std::nullopt); assert(redType && "Invalid Reduction"); return *redType; } + static ReductionIdentifier getReductionType( + Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp) { + switch (intrinsicOp) { + case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: + return ReductionIdentifier::ADD; + case Fortran::parser::DefinedOperator::IntrinsicOperator::Subtract: + return ReductionIdentifier::SUBTRACT; + case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: + return ReductionIdentifier::MULTIPLY; + case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: + return ReductionIdentifier::AND; + case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: + return ReductionIdentifier::EQV; + case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: + return ReductionIdentifier::OR; + case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: + return ReductionIdentifier::NEQV; + default: + llvm_unreachable("unexpected intrinsic operator in reduction"); + } + } + static bool supportedIntrinsicProcReduction( const Fortran::parser::ProcedureDesignator &pd) { const auto *name{Fortran::parser::Unwrap(pd)}; @@ -753,17 +791,14 @@ public: if (!name->symbol->GetUltimate().attrs().test( Fortran::semantics::Attr::INTRINSIC)) return false; - auto redType = llvm::StringSwitch>( - getRealName(name).ToString()) - .Case("max", IntrinsicProc::MAX) - .Case("min", IntrinsicProc::MIN) - .Case("iand", IntrinsicProc::IAND) - .Case("ior", IntrinsicProc::IOR) - .Case("ieor", IntrinsicProc::IEOR) - .Default(std::nullopt); - if (redType) - return true; - return false; + auto redType = llvm::StringSwitch(getRealName(name).ToString()) + .Case("max", true) + .Case("min", true) + .Case("iand", true) + .Case("ior", true) + .Case("ieor", true) + .Default(false); + return redType; } static const Fortran::semantics::SourceName @@ 
-817,32 +852,30 @@ public: /// reductionOpName. For example: /// 0 + x = x, /// 1 * x = x - static int getOperationIdentity( - Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp, - mlir::Location loc) { - switch (intrinsicOp) { - case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: - case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: - case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: + static int getOperationIdentity(ReductionIdentifier redId, + mlir::Location loc) { + switch (redId) { + case ReductionIdentifier::ADD: + case ReductionIdentifier::OR: + case ReductionIdentifier::NEQV: return 0; - case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: - case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: - case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: + case ReductionIdentifier::MULTIPLY: + case ReductionIdentifier::AND: + case ReductionIdentifier::EQV: return 1; default: TODO(loc, "Reduction of some intrinsic operators is not supported"); } } - static mlir::Value getIntrinsicProcInitValue( - mlir::Location loc, mlir::Type type, - const Fortran::parser::ProcedureDesignator &procDesignator, - fir::FirOpBuilder &builder) { + static mlir::Value getReductionInitValue(mlir::Location loc, mlir::Type type, + ReductionIdentifier redId, + fir::FirOpBuilder &builder) { assert((fir::isa_integer(type) || fir::isa_real(type) || type.isa()) && "only integer, logical and real types are currently supported"); - switch (getReductionType(procDesignator)) { - case IntrinsicProc::MAX: { + switch (redId) { + case ReductionIdentifier::MAX: { if (auto ty = type.dyn_cast()) { const llvm::fltSemantics &sem = ty.getFloatSemantics(); return builder.createRealConstant( @@ -852,7 +885,7 @@ public: int64_t minInt = llvm::APInt::getSignedMinValue(bits).getSExtValue(); return builder.createIntegerConstant(loc, type, minInt); } - case IntrinsicProc::MIN: { + case ReductionIdentifier::MIN: { if (auto ty = type.dyn_cast()) { const llvm::fltSemantics &sem = ty.getFloatSemantics(); return builder.createRealConstant( @@ -862,46 +895,50 @@ public: int64_t maxInt = llvm::APInt::getSignedMaxValue(bits).getSExtValue(); return builder.createIntegerConstant(loc, type, maxInt); } - case IntrinsicProc::IOR: { + case ReductionIdentifier::IOR: { unsigned bits = type.getIntOrFloatBitWidth(); int64_t zeroInt = llvm::APInt::getZero(bits).getSExtValue(); return builder.createIntegerConstant(loc, type, zeroInt); } - case IntrinsicProc::IEOR: { + case ReductionIdentifier::IEOR: { unsigned bits = type.getIntOrFloatBitWidth(); int64_t zeroInt = llvm::APInt::getZero(bits).getSExtValue(); return builder.createIntegerConstant(loc, type, zeroInt); } - case IntrinsicProc::IAND: { + case ReductionIdentifier::IAND: { unsigned bits = type.getIntOrFloatBitWidth(); int64_t allOnInt = llvm::APInt::getAllOnes(bits).getSExtValue(); return builder.createIntegerConstant(loc, type, allOnInt); } - } - llvm_unreachable("Unknown Reduction Intrinsic"); - } + case ReductionIdentifier::ADD: + case ReductionIdentifier::MULTIPLY: + case ReductionIdentifier::AND: + case ReductionIdentifier::OR: + case ReductionIdentifier::EQV: + case ReductionIdentifier::NEQV: + if (type.isa()) + return builder.create( + loc, type, + builder.getFloatAttr(type, + (double)getOperationIdentity(redId, loc))); + + if (type.isa()) { + mlir::Value intConst = builder.create( + loc, builder.getI1Type(), + builder.getIntegerAttr(builder.getI1Type(), + getOperationIdentity(redId, loc))); + return 
builder.createConvert(loc, type, intConst); + } - static mlir::Value getIntrinsicOpInitValue( - mlir::Location loc, mlir::Type type, - Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp, - fir::FirOpBuilder &builder) { - if (type.isa()) return builder.create( loc, type, - builder.getFloatAttr(type, - (double)getOperationIdentity(intrinsicOp, loc))); - - if (type.isa()) { - mlir::Value intConst = builder.create( - loc, builder.getI1Type(), - builder.getIntegerAttr(builder.getI1Type(), - getOperationIdentity(intrinsicOp, loc))); - return builder.createConvert(loc, type, intConst); + builder.getIntegerAttr(type, getOperationIdentity(redId, loc))); + case ReductionIdentifier::ID: + case ReductionIdentifier::USER_DEF_OP: + case ReductionIdentifier::SUBTRACT: + TODO(loc, "Reduction of some identifier types is not supported"); } - - return builder.create( - loc, type, - builder.getIntegerAttr(type, getOperationIdentity(intrinsicOp, loc))); + llvm_unreachable("Unhandled Reduction identifier : getReductionInitValue"); } template @@ -915,118 +952,46 @@ public: return builder.create(loc, op1, op2); } - /// Creates an OpenMP reduction declaration and inserts it into the provided - /// symbol table. The declaration has a constant initializer with the neutral - /// value `initValue`, and the reduction combiner carried over from `reduce`. - /// TODO: Generalize this for non-integer types, add atomic region. - static mlir::omp::ReductionDeclareOp createReductionDecl( - fir::FirOpBuilder &builder, llvm::StringRef reductionOpName, - const Fortran::parser::ProcedureDesignator &procDesignator, - mlir::Type type, mlir::Location loc) { - mlir::OpBuilder::InsertionGuard guard(builder); - mlir::ModuleOp module = builder.getModule(); - - auto decl = - module.lookupSymbol(reductionOpName); - if (decl) - return decl; - - mlir::OpBuilder modBuilder(module.getBodyRegion()); - - decl = modBuilder.create( - loc, reductionOpName, type); - builder.createBlock(&decl.getInitializerRegion(), - decl.getInitializerRegion().end(), {type}, {loc}); - builder.setInsertionPointToEnd(&decl.getInitializerRegion().back()); - mlir::Value init = - getIntrinsicProcInitValue(loc, type, procDesignator, builder); - builder.create(loc, init); - - builder.createBlock(&decl.getReductionRegion(), - decl.getReductionRegion().end(), {type, type}, - {loc, loc}); - - builder.setInsertionPointToEnd(&decl.getReductionRegion().back()); - mlir::Value op1 = decl.getReductionRegion().front().getArgument(0); - mlir::Value op2 = decl.getReductionRegion().front().getArgument(1); - + static mlir::Value createScalarCombiner(fir::FirOpBuilder &builder, + mlir::Location loc, + ReductionIdentifier redId, + mlir::Type type, mlir::Value op1, + mlir::Value op2) { mlir::Value reductionOp; - switch (getReductionType(procDesignator)) { - case IntrinsicProc::MAX: + switch (redId) { + case ReductionIdentifier::MAX: reductionOp = getReductionOperation( builder, type, loc, op1, op2); break; - case IntrinsicProc::MIN: + case ReductionIdentifier::MIN: reductionOp = getReductionOperation( builder, type, loc, op1, op2); break; - case IntrinsicProc::IOR: + case ReductionIdentifier::IOR: assert((type.isIntOrIndex()) && "only integer is expected"); reductionOp = builder.create(loc, op1, op2); break; - case IntrinsicProc::IEOR: + case ReductionIdentifier::IEOR: assert((type.isIntOrIndex()) && "only integer is expected"); reductionOp = builder.create(loc, op1, op2); break; - case IntrinsicProc::IAND: + case ReductionIdentifier::IAND: assert((type.isIntOrIndex()) && 
"only integer is expected"); reductionOp = builder.create(loc, op1, op2); break; - } - - builder.create(loc, reductionOp); - return decl; - } - - /// Creates an OpenMP reduction declaration and inserts it into the provided - /// symbol table. The declaration has a constant initializer with the neutral - /// value `initValue`, and the reduction combiner carried over from `reduce`. - /// TODO: Generalize this for non-integer types, add atomic region. - static mlir::omp::ReductionDeclareOp createReductionDecl( - fir::FirOpBuilder &builder, llvm::StringRef reductionOpName, - Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp, - mlir::Type type, mlir::Location loc) { - mlir::OpBuilder::InsertionGuard guard(builder); - mlir::ModuleOp module = builder.getModule(); - - auto decl = - module.lookupSymbol(reductionOpName); - if (decl) - return decl; - - mlir::OpBuilder modBuilder(module.getBodyRegion()); - - decl = modBuilder.create( - loc, reductionOpName, type); - builder.createBlock(&decl.getInitializerRegion(), - decl.getInitializerRegion().end(), {type}, {loc}); - builder.setInsertionPointToEnd(&decl.getInitializerRegion().back()); - mlir::Value init = getIntrinsicOpInitValue(loc, type, intrinsicOp, builder); - builder.create(loc, init); - - builder.createBlock(&decl.getReductionRegion(), - decl.getReductionRegion().end(), {type, type}, - {loc, loc}); - - builder.setInsertionPointToEnd(&decl.getReductionRegion().back()); - mlir::Value op1 = decl.getReductionRegion().front().getArgument(0); - mlir::Value op2 = decl.getReductionRegion().front().getArgument(1); - - mlir::Value reductionOp; - switch (intrinsicOp) { - case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: + case ReductionIdentifier::ADD: reductionOp = getReductionOperation( builder, type, loc, op1, op2); break; - case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: + case ReductionIdentifier::MULTIPLY: reductionOp = getReductionOperation( builder, type, loc, op1, op2); break; - case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: { + case ReductionIdentifier::AND: { mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); @@ -1036,7 +1001,7 @@ public: reductionOp = builder.createConvert(loc, type, andiOp); break; } - case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: { + case ReductionIdentifier::OR: { mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); @@ -1045,7 +1010,7 @@ public: reductionOp = builder.createConvert(loc, type, oriOp); break; } - case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: { + case ReductionIdentifier::EQV: { mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); @@ -1055,7 +1020,7 @@ public: reductionOp = builder.createConvert(loc, type, cmpiOp); break; } - case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: { + case ReductionIdentifier::NEQV: { mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); @@ -1069,7 +1034,46 @@ public: TODO(loc, "Reduction of some intrinsic operators is not supported"); } + return reductionOp; + } + + /// Creates an OpenMP reduction declaration and inserts it into the provided + /// symbol table. 
The declaration has a constant initializer with the neutral + /// value `initValue`, and the reduction combiner carried over from `reduce`. + /// TODO: Generalize this for non-integer types, add atomic region. + static mlir::omp::ReductionDeclareOp createReductionDecl( + fir::FirOpBuilder &builder, llvm::StringRef reductionOpName, + const ReductionIdentifier redId, mlir::Type type, mlir::Location loc) { + mlir::OpBuilder::InsertionGuard guard(builder); + mlir::ModuleOp module = builder.getModule(); + + auto decl = + module.lookupSymbol(reductionOpName); + if (decl) + return decl; + + mlir::OpBuilder modBuilder(module.getBodyRegion()); + + decl = modBuilder.create( + loc, reductionOpName, type); + builder.createBlock(&decl.getInitializerRegion(), + decl.getInitializerRegion().end(), {type}, {loc}); + builder.setInsertionPointToEnd(&decl.getInitializerRegion().back()); + mlir::Value init = getReductionInitValue(loc, type, redId, builder); + builder.create(loc, init); + + builder.createBlock(&decl.getReductionRegion(), + decl.getReductionRegion().end(), {type, type}, + {loc, loc}); + + builder.setInsertionPointToEnd(&decl.getReductionRegion().back()); + mlir::Value op1 = decl.getReductionRegion().front().getArgument(0); + mlir::Value op2 = decl.getReductionRegion().front().getArgument(1); + + mlir::Value reductionOp = + createScalarCombiner(builder, loc, redId, type, op1, op2); builder.create(loc, reductionOp); + return decl; } @@ -1092,15 +1096,15 @@ public: const auto &intrinsicOp{ std::get( redDefinedOp->u)}; - switch (intrinsicOp) { - case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: - case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: - case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: - case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: - case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: - case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: + ReductionIdentifier redId = getReductionType(intrinsicOp); + switch (redId) { + case ReductionIdentifier::ADD: + case ReductionIdentifier::MULTIPLY: + case ReductionIdentifier::AND: + case ReductionIdentifier::EQV: + case ReductionIdentifier::OR: + case ReductionIdentifier::NEQV: break; - default: TODO(currentLocation, "Reduction of some intrinsic operators is not supported"); @@ -1120,11 +1124,11 @@ public: decl = createReductionDecl( firOpBuilder, getReductionName(intrinsicOp, firOpBuilder.getI1Type()), - intrinsicOp, redType, currentLocation); + redId, redType, currentLocation); else if (redType.isIntOrIndexOrFloat()) { decl = createReductionDecl(firOpBuilder, getReductionName(intrinsicOp, redType), - intrinsicOp, redType, currentLocation); + redId, redType, currentLocation); } else { TODO(currentLocation, "Reduction of some types is not supported"); } @@ -1138,6 +1142,8 @@ public: &redOperator.u)) { if (ReductionProcessor::supportedIntrinsicProcReduction( *reductionIntrinsic)) { + ReductionProcessor::ReductionIdentifier redId = + ReductionProcessor::getReductionType(*reductionIntrinsic); for (const Fortran::parser::OmpObject &ompObject : objectList.v) { if (const auto *name{ Fortran::parser::Unwrap(ompObject)}) { @@ -1154,7 +1160,7 @@ public: firOpBuilder, getReductionName(getRealName(*reductionIntrinsic).ToString(), redType), - *reductionIntrinsic, redType, currentLocation); + redId, redType, currentLocation); reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get( firOpBuilder.getContext(), decl.getSymName())); } @@ -4174,7 +4180,7 @@ void 
Fortran::lower::genOpenMPReduction( if (!ReductionProcessor::supportedIntrinsicProcReduction( *reductionIntrinsic)) continue; - ReductionProcessor::IntrinsicProc redIntrinsicProc = + ReductionProcessor::ReductionIdentifier redId = ReductionProcessor::getReductionType(*reductionIntrinsic); for (const Fortran::parser::OmpObject &ompObject : objectList.v) { if (const auto *name{ @@ -4195,10 +4201,8 @@ void Fortran::lower::genOpenMPReduction( if (reductionOp == nullptr) continue; - if (redIntrinsicProc == - ReductionProcessor::IntrinsicProc::MAX || - redIntrinsicProc == - ReductionProcessor::IntrinsicProc::MIN) { + if (redId == ReductionProcessor::ReductionIdentifier::MAX || + redId == ReductionProcessor::ReductionIdentifier::MIN) { assert(mlir::isa(reductionOp) && "Selection Op not found in reduction intrinsic"); mlir::Operation *compareOp = @@ -4206,13 +4210,9 @@ void Fortran::lower::genOpenMPReduction( updateReduction(compareOp, firOpBuilder, loadVal, reductionVal); } - if (redIntrinsicProc == - ReductionProcessor::IntrinsicProc::IOR || - redIntrinsicProc == - ReductionProcessor::IntrinsicProc::IEOR || - redIntrinsicProc == - ReductionProcessor::IntrinsicProc::IAND) { - + if (redId == ReductionProcessor::ReductionIdentifier::IOR || + redId == ReductionProcessor::ReductionIdentifier::IEOR || + redId == ReductionProcessor::ReductionIdentifier::IAND) { updateReduction(reductionOp, firOpBuilder, loadVal, reductionVal); } -- cgit v1.1 From b2b3a5248540320e74347fcdaffbd148d1e9d494 Mon Sep 17 00:00:00 2001 From: Mats Petersson Date: Fri, 9 Feb 2024 18:05:51 +0000 Subject: Skip compiler directives between OMP PARALLEL DO and the loop (#81021) This fixes a compilation error when code like this is presented to the compiler: !$OMP PARALLEL DO !DIR$ VECTOR ALIGNED DO 20 i=1,N a = a + 0.5 20 CONTINUE The directive itself is later ignored (with a warning that this is happening), but because the compiler already errored out before that point, it completely fails to compile this code. Other compilers accept the code without complaints. --- flang/lib/Semantics/canonicalize-omp.cpp | 16 +++++++++++----- flang/test/Semantics/OpenMP/loop-association.f90 | 8 ++++++++ 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/flang/lib/Semantics/canonicalize-omp.cpp b/flang/lib/Semantics/canonicalize-omp.cpp index 013fb40..01adcf5 100644 --- a/flang/lib/Semantics/canonicalize-omp.cpp +++ b/flang/lib/Semantics/canonicalize-omp.cpp @@ -90,7 +90,11 @@ private: auto &dir{std::get(beginDir.t)}; nextIt = it; - if (++nextIt != block.end()) { + while (++nextIt != block.end()) { + // Ignore compiler directives. + if (auto *directive{GetConstructIf(*nextIt)}) + continue; + if (auto *doCons{GetConstructIf(*nextIt)}) { if (doCons->GetLoopControl()) { // move DoConstruct @@ -111,12 +115,14 @@ private: "DO loop after the %s directive must have loop control"_err_en_US, parser::ToUpperCaseLetters(dir.source.ToString())); } - return; // found do-loop + } else { + messages_.Say(dir.source, + "A DO loop must follow the %s directive"_err_en_US, + parser::ToUpperCaseLetters(dir.source.ToString())); } + // If we get here, we either found a loop, or issued an error message. 
+ return; } - messages_.Say(dir.source, - "A DO loop must follow the %s directive"_err_en_US, - parser::ToUpperCaseLetters(dir.source.ToString())); } void RewriteOmpAllocations(parser::ExecutionPart &body) { diff --git a/flang/test/Semantics/OpenMP/loop-association.f90 b/flang/test/Semantics/OpenMP/loop-association.f90 index 8a28fd8..d216766 100644 --- a/flang/test/Semantics/OpenMP/loop-association.f90 +++ b/flang/test/Semantics/OpenMP/loop-association.f90 @@ -30,6 +30,14 @@ c = c - 1 END DO outer + ! Accept directives between parallel do and actual loop. + !$OMP PARALLEL DO + !DIR$ VECTOR ALIGNED + DO 20 i=1,N + a = a + 0.5 +20 CONTINUE + !$OMP END PARALLEL DO + c = 16 !ERROR: DO loop after the PARALLEL DO directive must have loop control !$omp parallel do -- cgit v1.1 From d86f21693c5fb8eaa597cfcb15813ffc52d00847 Mon Sep 17 00:00:00 2001 From: Piotr Zegar Date: Fri, 9 Feb 2024 18:19:53 +0000 Subject: [clang-tidy][NFC] Fixes in release notes and documentation Minor fixes in documentation & release notes. --- clang-tools-extra/docs/ReleaseNotes.rst | 12 ++++++------ .../checks/readability/avoid-return-with-void-value.rst | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index dff8dd2..ee68c8f 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -118,8 +118,8 @@ Changes in existing checks global options of the same name. - Improved :doc:`bugprone-too-small-loop-variable - ` support by correctly - implementing the check for const loop boundary. + ` check by incorporating + better support for ``const`` loop boundaries. - Cleaned up :doc:`cppcoreguidelines-prefer-member-initializer ` @@ -163,13 +163,13 @@ Changes in existing checks Removed checks ^^^^^^^^^^^^^^ -Miscellaneous -^^^^^^^^^^^^^ - - Removed `cert-dcl21-cpp`, which was deprecated since :program:`clang-tidy` 17, since the rule DCL21-CPP has been removed from the CERT guidelines. -- Fixed incorrect formatting in ``clang-apply-repalcements`` when no ``--format`` +Miscellaneous +^^^^^^^^^^^^^ + +- Fixed incorrect formatting in ``clang-apply-replacements`` when no ``--format`` option is specified. Now ``clang-apply-replacements`` applies formatting only with the option. diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-return-with-void-value.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-return-with-void-value.rst index d802f9b..b079581 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-return-with-void-value.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-return-with-void-value.rst @@ -29,7 +29,7 @@ that should be written as g(); return; -to make clear that ``g()`` is called and immediately afterwards the function +to make clear that ``g()`` is called and immediately afterwards the function returns (nothing). In C, the same issue is detected by the compiler if the ``-Wpedantic`` mode @@ -46,6 +46,6 @@ Options .. option:: StrictMode The value `false` specifies that a direct return statement shall - be excluded from the analysis if it is the only statement not - contained in a block like ``if (cond) return g();``. The default + be excluded from the analysis if it is the only statement not + contained in a block, like ``if (cond) return g();``. The default value is `true`. 
-- cgit v1.1 From 407f9c06ea2a4f3fc32647ba22e5b60f695ca4b3 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Fri, 9 Feb 2024 10:33:58 -0800 Subject: [clang][driver] Set TLSDESC as the default for Android on RISC-V (#81198) --- clang/test/Driver/tls-dialect.c | 4 ++++ llvm/include/llvm/TargetParser/Triple.h | 5 +---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/clang/test/Driver/tls-dialect.c b/clang/test/Driver/tls-dialect.c index 4e105ce..f73915b 100644 --- a/clang/test/Driver/tls-dialect.c +++ b/clang/test/Driver/tls-dialect.c @@ -3,6 +3,10 @@ // RUN: %clang -### --target=riscv64-linux %s 2>&1 | FileCheck --check-prefix=NODESC %s // RUN: %clang -### --target=x86_64-linux -mtls-dialect=gnu %s 2>&1 | FileCheck --check-prefix=NODESC %s +/// Android supports TLSDESC by default on RISC-V +/// TLSDESC is not on by default in Linux, even on RISC-V, and is covered above +// RUN: %clang -### --target=riscv64-android %s 2>&1 | FileCheck --check-prefix=DESC %s + /// LTO // RUN: %clang -### --target=riscv64-linux -flto -mtls-dialect=desc %s 2>&1 | FileCheck --check-prefix=LTO-DESC %s // RUN: %clang -### --target=riscv64-linux -flto %s 2>&1 | FileCheck --check-prefix=LTO-NODESC %s diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 98d8490..e732070 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -1035,10 +1035,7 @@ public: /// True if the target supports both general-dynamic and TLSDESC, and TLSDESC /// is enabled by default. - bool hasDefaultTLSDESC() const { - // TODO: Improve check for other platforms, like Android, and RISC-V - return false; - } + bool hasDefaultTLSDESC() const { return isAndroid() && isRISCV64(); } /// Tests whether the target uses -data-sections as default. bool hasDefaultDataSections() const { -- cgit v1.1 From 0329c1b6d838ec983f215244549b3c5ff2d5fb51 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 9 Feb 2024 10:38:03 -0800 Subject: [ELF] --no-rosegment: don't mark read-only PT_LOAD segments executable (#81223) Once we move `.lrodata` after .bss (#78521), or if we use `SECTIONS` commands, certain read-only sections may be in their own PT_LOAD, not in the traditional "text segment". Current --no-rosegment code may unnecessarily mark read-only PT_LOAD executable. Fix it. --- lld/ELF/Writer.cpp | 28 ++++++++++++++++------------ lld/test/ELF/segments.s | 2 +- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 6df43a3..53ca70b 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -2353,17 +2353,12 @@ static bool needsPtLoad(OutputSection *sec) { return true; } -// Linker scripts are responsible for aligning addresses. Unfortunately, most -// linker scripts are designed for creating two PT_LOADs only, one RX and one -// RW. This means that there is no alignment in the RO to RX transition and we -// cannot create a PT_LOAD there. +// Adjust phdr flags according to certain options. static uint64_t computeFlags(uint64_t flags) { if (config->omagic) return PF_R | PF_W | PF_X; if (config->executeOnly && (flags & PF_X)) return flags & ~PF_R; - if (config->singleRoRx && !(flags & PF_W)) - return flags | PF_X; return flags; } @@ -2451,7 +2446,7 @@ SmallVector Writer::createPhdrs(Partition &part) { // Segments are contiguous memory regions that has the same attributes // (e.g. executable or writable). There is one phdr for each segment. 
// Therefore, we need to create a new phdr when the next section has - // different flags or is loaded at a discontiguous address or memory region + // incompatible flags or is loaded at a discontiguous address or memory region // using AT or AT> linker script command, respectively. // // As an exception, we don't create a separate load segment for the ELF @@ -2465,13 +2460,22 @@ SmallVector Writer::createPhdrs(Partition &part) { // so when hasSectionsCommand, since we cannot introduce the extra alignment // needed to create a new LOAD) uint64_t newFlags = computeFlags(sec->getPhdrFlags()); + // When --no-rosegment is specified, RO and RX sections are compatible. + uint32_t diff = flags ^ newFlags; + if (config->singleRoRx && !(newFlags & PF_W)) + diff &= ~PF_X; + if (diff) + load = nullptr; + bool sameLMARegion = load && !sec->lmaExpr && sec->lmaRegion == load->firstSec->lmaRegion; - if (!(load && newFlags == flags && sec != relroEnd && - sec->memRegion == load->firstSec->memRegion && - (sameLMARegion || load->lastSec == Out::programHeaders) && - (script->hasSectionsCommand || sec->type == SHT_NOBITS || - load->lastSec->type != SHT_NOBITS))) { + if (load && sec != relroEnd && + sec->memRegion == load->firstSec->memRegion && + (sameLMARegion || load->lastSec == Out::programHeaders) && + (script->hasSectionsCommand || sec->type == SHT_NOBITS || + load->lastSec->type != SHT_NOBITS)) { + load->p_flags |= newFlags; + } else { load = addHdr(PT_LOAD, newFlags); flags = newFlags; } diff --git a/lld/test/ELF/segments.s b/lld/test/ELF/segments.s index ee17117..1fe248a 100644 --- a/lld/test/ELF/segments.s +++ b/lld/test/ELF/segments.s @@ -44,7 +44,7 @@ # NOROSEGMENT1-NEXT: LOAD 0x001006 0x0000000000000006 0x0000000000000006 0x000001 0x000001 RW 0x1000 # NOROSEGMENT1-NEXT: LOAD 0x001007 0x0000000000000007 0x0000000000000007 0x000002 0x000002 R E 0x1000 # NOROSEGMENT1-NEXT: LOAD 0x001009 0x0000000000000009 0x0000000000000009 0x000001 0x000001 RW 0x1000 -# NOROSEGMENT1-NEXT: LOAD 0x00100a 0x000000000000000a 0x000000000000000a 0x000001 0x000001 R E 0x1000 +# NOROSEGMENT1-NEXT: LOAD 0x00100a 0x000000000000000a 0x000000000000000a 0x000001 0x000001 R 0x1000 # NOROSEGMENT1-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 # RUN: ld.lld -N a.o -o omagic -- cgit v1.1 From 314ef9617e87b2cba9dd278e228ab03453500054 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 9 Feb 2024 10:41:37 -0800 Subject: [flang][cuda] Lower attribute for module variables (#81226) Propagate the CUDA attribute to the fir.global operation for simple module variables.
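As a sketch, the lowering path added here reduces to the following (condensed from the diff below; the converter, symbol, and location values are illustrative):

    // In defineModuleVariable(): translate the symbol's CUDA data attribute
    // and thread it through to the global definition.
    fir::CUDAAttributeAttr cudaAttr =
        Fortran::lower::translateSymbolCUDAAttribute(
            converter.getFirOpBuilder().getContext(), sym);
    defineGlobal(converter, var, globalName, linkage, cudaAttr);

    // In FirOpBuilder::createGlobal(): when present, attach the attribute
    // as a named attribute on the created fir.global operation.
    if (cudaAttr)
      attrs.push_back(mlir::NamedAttribute(
          fir::GlobalOp::getCudaAttrAttrName(globalOpName), cudaAttr));

so that, for example, a module variable declared `real, device :: mod_b_ra` is expected to yield a `fir.global @_QMcuda_varEmod_b_ra` carrying a device `cuda_attr`, as the updated test below checks.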
--- flang/include/flang/Optimizer/Builder/FIRBuilder.h | 6 ++- flang/include/flang/Optimizer/Dialect/FIROps.td | 3 +- flang/lib/Lower/ConvertVariable.cpp | 16 ++++--- flang/lib/Optimizer/Builder/FIRBuilder.cpp | 23 ++++++---- flang/test/Lower/CUDA/cuda-data-attribute.cuf | 51 +++++++++++++--------- 5 files changed, 61 insertions(+), 38 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h index 5384f6e..f50dacd 100644 --- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h +++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h @@ -230,12 +230,14 @@ public: llvm::StringRef name, mlir::StringAttr linkage = {}, mlir::Attribute value = {}, bool isConst = false, - bool isTarget = false); + bool isTarget = false, + fir::CUDAAttributeAttr cudaAttr = {}); fir::GlobalOp createGlobal(mlir::Location loc, mlir::Type type, llvm::StringRef name, bool isConst, bool isTarget, std::function bodyBuilder, - mlir::StringAttr linkage = {}); + mlir::StringAttr linkage = {}, + fir::CUDAAttributeAttr cudaAttr = {}); /// Create a global constant (read-only) value. fir::GlobalOp createGlobalConstant(mlir::Location loc, mlir::Type type, diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index b954a0c..d505fed 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -2737,7 +2737,8 @@ def fir_GlobalOp : fir_Op<"global", [IsolatedFromAbove, Symbol]> { OptionalAttr:$initVal, OptionalAttr:$constant, OptionalAttr:$target, - OptionalAttr:$linkName + OptionalAttr:$linkName, + OptionalAttr:$cuda_attr ); let regions = (region AtMostRegion<1>:$region); diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index f14267f..2f23757 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -138,7 +138,8 @@ static bool isConstant(const Fortran::semantics::Symbol &sym) { static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, const Fortran::lower::pft::Variable &var, llvm::StringRef globalName, - mlir::StringAttr linkage); + mlir::StringAttr linkage, + fir::CUDAAttributeAttr cudaAttr = {}); static mlir::Location genLocation(Fortran::lower::AbstractConverter &converter, const Fortran::semantics::Symbol &sym) { @@ -462,7 +463,8 @@ void Fortran::lower::createGlobalInitialization( static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, const Fortran::lower::pft::Variable &var, llvm::StringRef globalName, - mlir::StringAttr linkage) { + mlir::StringAttr linkage, + fir::CUDAAttributeAttr cudaAttr) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); const Fortran::semantics::Symbol &sym = var.getSymbol(); mlir::Location loc = genLocation(converter, sym); @@ -500,8 +502,9 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, } } if (!global) - global = builder.createGlobal(loc, symTy, globalName, linkage, - mlir::Attribute{}, isConst, var.isTarget()); + global = + builder.createGlobal(loc, symTy, globalName, linkage, mlir::Attribute{}, + isConst, var.isTarget(), cudaAttr); if (Fortran::semantics::IsAllocatableOrPointer(sym) && !Fortran::semantics::IsProcedure(sym)) { const auto *details = @@ -2219,7 +2222,10 @@ void Fortran::lower::defineModuleVariable( // Do nothing. Mapping will be done on user side. 
} else { std::string globalName = converter.mangleName(sym); - defineGlobal(converter, var, globalName, linkage); + fir::CUDAAttributeAttr cudaAttr = + Fortran::lower::translateSymbolCUDAAttribute( + converter.getFirOpBuilder().getContext(), sym); + defineGlobal(converter, var, globalName, linkage, cudaAttr); } } diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index 141f8fc..68fe8de 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -271,19 +271,24 @@ mlir::Value fir::FirOpBuilder::createHeapTemporary( /// Create a global variable in the (read-only) data section. A global variable /// must have a unique name to identify and reference it. -fir::GlobalOp fir::FirOpBuilder::createGlobal(mlir::Location loc, - mlir::Type type, - llvm::StringRef name, - mlir::StringAttr linkage, - mlir::Attribute value, - bool isConst, bool isTarget) { +fir::GlobalOp fir::FirOpBuilder::createGlobal( + mlir::Location loc, mlir::Type type, llvm::StringRef name, + mlir::StringAttr linkage, mlir::Attribute value, bool isConst, + bool isTarget, fir::CUDAAttributeAttr cudaAttr) { auto module = getModule(); auto insertPt = saveInsertionPoint(); if (auto glob = module.lookupSymbol(name)) return glob; setInsertionPoint(module.getBody(), module.getBody()->end()); - auto glob = - create(loc, name, isConst, isTarget, type, value, linkage); + llvm::SmallVector attrs; + if (cudaAttr) { + auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(), + module.getContext()); + attrs.push_back(mlir::NamedAttribute( + fir::GlobalOp::getCudaAttrAttrName(globalOpName), cudaAttr)); + } + auto glob = create(loc, name, isConst, isTarget, type, value, + linkage, attrs); restoreInsertionPoint(insertPt); return glob; } @@ -291,7 +296,7 @@ fir::GlobalOp fir::FirOpBuilder::createGlobal(mlir::Location loc, fir::GlobalOp fir::FirOpBuilder::createGlobal( mlir::Location loc, mlir::Type type, llvm::StringRef name, bool isConst, bool isTarget, std::function bodyBuilder, - mlir::StringAttr linkage) { + mlir::StringAttr linkage, fir::CUDAAttributeAttr cudaAttr) { auto module = getModule(); auto insertPt = saveInsertionPoint(); if (auto glob = module.lookupSymbol(name)) diff --git a/flang/test/Lower/CUDA/cuda-data-attribute.cuf b/flang/test/Lower/CUDA/cuda-data-attribute.cuf index b02701b..7596c6b 100644 --- a/flang/test/Lower/CUDA/cuda-data-attribute.cuf +++ b/flang/test/Lower/CUDA/cuda-data-attribute.cuf @@ -3,6 +3,18 @@ ! Test lowering of CUDA attribute on variables. +module cuda_var + real, constant :: mod_a_rc +! CHECK: fir.global @_QMcuda_varEmod_a_rc {cuda_attr = #fir.cuda} : f32 + real, device :: mod_b_ra +! CHECK: fir.global @_QMcuda_varEmod_b_ra {cuda_attr = #fir.cuda} : f32 + real, allocatable, managed :: mod_c_rm +! CHECK: fir.global @_QMcuda_varEmod_c_rm {cuda_attr = #fir.cuda} : !fir.box> + real, allocatable, pinned :: mod_d_rp +! CHECK: fir.global @_QMcuda_varEmod_d_rp {cuda_attr = #fir.cuda} : !fir.box> + +contains + subroutine local_var_attrs real, constant :: rc real, device :: rd @@ -10,46 +22,43 @@ subroutine local_var_attrs real, allocatable, pinned :: rp end subroutine -! CHECK-LABEL: func.func @_QPlocal_var_attrs() -! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QFlocal_var_attrsErc"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QFlocal_var_attrsErd"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! 
CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFlocal_var_attrsErm"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) -! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFlocal_var_attrsErp"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +! CHECK-LABEL: func.func @_QMcuda_varPlocal_var_attrs() +! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFlocal_var_attrsErc"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFlocal_var_attrsErd"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMcuda_varFlocal_var_attrsErm"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMcuda_varFlocal_var_attrsErp"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) -! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QFlocal_var_attrsErc"} : (!fir.ref) -> !fir.ref -! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QFlocal_var_attrsErd"} : (!fir.ref) -> !fir.ref -! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFlocal_var_attrsErm"} : (!fir.ref>>) -> !fir.ref>> -! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFlocal_var_attrsErp"} : (!fir.ref>>) -> !fir.ref>> +! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFlocal_var_attrsErc"} : (!fir.ref) -> !fir.ref +! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFlocal_var_attrsErd"} : (!fir.ref) -> !fir.ref +! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMcuda_varFlocal_var_attrsErm"} : (!fir.ref>>) -> !fir.ref>> +! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMcuda_varFlocal_var_attrsErp"} : (!fir.ref>>) -> !fir.ref>> subroutine dummy_arg_constant(dc) real, constant :: dc end subroutine -! CHECK-LABEL: func.func @_QPdummy_arg_constant( +! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_constant( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "dc", fir.cuda_attr = #fir.cuda} -! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, uniq_name = "_QFdummy_arg_constantEdc"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFdummy_arg_constantEdc"} : (!fir.ref) -> (!fir.ref, !fir.ref) subroutine dummy_arg_device(dd) real, device :: dd end subroutine -! CHECK-LABEL: func.func @_QPdummy_arg_device( +! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_device( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "dd", fir.cuda_attr = #fir.cuda}) { -! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, uniq_name = "_QFdummy_arg_deviceEdd"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFdummy_arg_deviceEdd"} : (!fir.ref) -> (!fir.ref, !fir.ref) subroutine dummy_arg_managed(dm) real, allocatable, managed :: dm end subroutine -! CHECK-LABEL: func.func @_QPdummy_arg_managed( +! 
CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_managed( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref>> {fir.bindc_name = "dm", fir.cuda_attr = #fir.cuda}) { -! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFdummy_arg_managedEdm"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMcuda_varFdummy_arg_managedEdm"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) subroutine dummy_arg_pinned(dp) real, allocatable, pinned :: dp end subroutine -! CHECK-LABEL: func.func @_QPdummy_arg_pinned( +! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_pinned( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref>> {fir.bindc_name = "dp", fir.cuda_attr = #fir.cuda}) { -! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFdummy_arg_pinnedEdp"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) - - - - +! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMcuda_varFdummy_arg_pinnedEdp"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +end module -- cgit v1.1 From 2e4d2762b5f8c6b0ae02c2a9d517e009f470b8a6 Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Fri, 9 Feb 2024 10:55:56 -0800 Subject: [X86][CodeGen] Emit float128 libcalls for math functions (#79611) Make LLVM emit libcalls to proper float128 variants for float128 types. --- llvm/lib/CodeGen/TargetLoweringBase.cpp | 40 +++++++++++++++++++++ llvm/test/CodeGen/X86/GlobalISel/roundeven.ll | 2 +- llvm/test/CodeGen/X86/fp128-libcalls-strict.ll | 48 +++++++++++++------------- llvm/test/CodeGen/X86/fp128-libcalls.ll | 24 ++++++------- llvm/test/CodeGen/X86/frem.ll | 2 +- 5 files changed, 78 insertions(+), 38 deletions(-) diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 16cd14b..d8302ba 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -122,6 +122,46 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) { for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC) setLibcallCallingConv((RTLIB::Libcall)LC, CallingConv::C); + // Use the f128 variants of math functions on x86_64 + if (TT.getArch() == Triple::ArchType::x86_64) { + setLibcallName(RTLIB::REM_F128, "fmodf128"); + setLibcallName(RTLIB::FMA_F128, "fmaf128"); + setLibcallName(RTLIB::SQRT_F128, "sqrtf128"); + setLibcallName(RTLIB::CBRT_F128, "cbrtf128"); + setLibcallName(RTLIB::LOG_F128, "logf128"); + setLibcallName(RTLIB::LOG_FINITE_F128, "__logf128_finite"); + setLibcallName(RTLIB::LOG2_F128, "log2f128"); + setLibcallName(RTLIB::LOG2_FINITE_F128, "__log2f128_finite"); + setLibcallName(RTLIB::LOG10_F128, "log10f128"); + setLibcallName(RTLIB::LOG10_FINITE_F128, "__log10f128_finite"); + setLibcallName(RTLIB::EXP_F128, "expf128"); + setLibcallName(RTLIB::EXP_FINITE_F128, "__expf128_finite"); + setLibcallName(RTLIB::EXP2_F128, "exp2f128"); + setLibcallName(RTLIB::EXP2_FINITE_F128, "__exp2f128_finite"); + setLibcallName(RTLIB::EXP10_F128, "exp10f128"); + setLibcallName(RTLIB::SIN_F128, "sinf128"); + setLibcallName(RTLIB::COS_F128, "cosf128"); + setLibcallName(RTLIB::SINCOS_F128, "sincosf128"); + setLibcallName(RTLIB::POW_F128, "powf128"); + setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite"); + setLibcallName(RTLIB::CEIL_F128, "ceilf128"); + setLibcallName(RTLIB::TRUNC_F128, "truncf128"); + setLibcallName(RTLIB::RINT_F128, "rintf128"); + 
setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128"); + setLibcallName(RTLIB::ROUND_F128, "roundf128"); + setLibcallName(RTLIB::ROUNDEVEN_F128, "roundevenf128"); + setLibcallName(RTLIB::FLOOR_F128, "floorf128"); + setLibcallName(RTLIB::COPYSIGN_F128, "copysignf128"); + setLibcallName(RTLIB::FMIN_F128, "fminf128"); + setLibcallName(RTLIB::FMAX_F128, "fmaxf128"); + setLibcallName(RTLIB::LROUND_F128, "lroundf128"); + setLibcallName(RTLIB::LLROUND_F128, "llroundf128"); + setLibcallName(RTLIB::LRINT_F128, "lrintf128"); + setLibcallName(RTLIB::LLRINT_F128, "llrintf128"); + setLibcallName(RTLIB::LDEXP_F128, "ldexpf128"); + setLibcallName(RTLIB::FREXP_F128, "frexpf128"); + } + // For IEEE quad-precision libcall names, PPC uses "kf" instead of "tf". if (TT.isPPC()) { setLibcallName(RTLIB::ADD_F128, "__addkf3"); diff --git a/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll b/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll index 119821e..dae27ff 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll @@ -44,7 +44,7 @@ define fp128 @roundeven_f128(fp128 %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: callq roundevenl +; CHECK-NEXT: callq roundevenf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll index 4722ce6..47234c3 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -163,7 +163,7 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp { ; CHECK-LABEL: fma: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq fmal@PLT +; CHECK-NEXT: callq fmaf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -204,7 +204,7 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp { ; CHECK-LABEL: frem: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq fmodl@PLT +; CHECK-NEXT: callq fmodf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -241,7 +241,7 @@ define fp128 @ceil(fp128 %x) nounwind strictfp { ; CHECK-LABEL: ceil: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq ceill@PLT +; CHECK-NEXT: callq ceilf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -274,7 +274,7 @@ define fp128 @cos(fp128 %x) nounwind strictfp { ; CHECK-LABEL: cos: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq cosl@PLT +; CHECK-NEXT: callq cosf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -307,7 +307,7 @@ define fp128 @exp(fp128 %x) nounwind strictfp { ; CHECK-LABEL: exp: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq expl@PLT +; CHECK-NEXT: callq expf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -340,7 +340,7 @@ define fp128 @exp2(fp128 %x) nounwind strictfp { ; CHECK-LABEL: exp2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq exp2l@PLT +; CHECK-NEXT: callq exp2f128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -373,7 +373,7 @@ define fp128 @floor(fp128 %x) nounwind strictfp { ; CHECK-LABEL: floor: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq floorl@PLT +; CHECK-NEXT: callq floorf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -406,7 +406,7 @@ define fp128 @log(fp128 %x) nounwind strictfp { ; CHECK-LABEL: log: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq logl@PLT +; CHECK-NEXT: callq logf128 ; 
CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -439,7 +439,7 @@ define fp128 @log10(fp128 %x) nounwind strictfp { ; CHECK-LABEL: log10: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq log10l@PLT +; CHECK-NEXT: callq log10f128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -472,7 +472,7 @@ define fp128 @log2(fp128 %x) nounwind strictfp { ; CHECK-LABEL: log2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq log2l@PLT +; CHECK-NEXT: callq log2f128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -505,7 +505,7 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp { ; CHECK-LABEL: maxnum: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq fmaxl@PLT +; CHECK-NEXT: callq fmaxf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -542,7 +542,7 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp { ; CHECK-LABEL: minnum: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq fminl@PLT +; CHECK-NEXT: callq fminf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -579,7 +579,7 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp { ; CHECK-LABEL: nearbyint: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq nearbyintl@PLT +; CHECK-NEXT: callq nearbyintf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -612,7 +612,7 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp { ; CHECK-LABEL: pow: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq powl@PLT +; CHECK-NEXT: callq powf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -683,7 +683,7 @@ define fp128 @rint(fp128 %x) nounwind strictfp { ; CHECK-LABEL: rint: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq rintl@PLT +; CHECK-NEXT: callq rintf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -716,7 +716,7 @@ define fp128 @round(fp128 %x) nounwind strictfp { ; CHECK-LABEL: round: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq roundl@PLT +; CHECK-NEXT: callq roundf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -749,7 +749,7 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp { ; CHECK-LABEL: roundeven: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq roundevenl@PLT +; CHECK-NEXT: callq roundevenf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -782,7 +782,7 @@ define fp128 @sin(fp128 %x) nounwind strictfp { ; CHECK-LABEL: sin: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq sinl@PLT +; CHECK-NEXT: callq sinf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -815,7 +815,7 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp { ; CHECK-LABEL: sqrt: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq sqrtl@PLT +; CHECK-NEXT: callq sqrtf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -848,7 +848,7 @@ define fp128 @trunc(fp128 %x) nounwind strictfp { ; CHECK-LABEL: trunc: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq truncl@PLT +; CHECK-NEXT: callq truncf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -881,7 +881,7 @@ define i32 @lrint(fp128 %x) nounwind strictfp { ; CHECK-LABEL: lrint: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq lrintl@PLT +; CHECK-NEXT: callq lrintf128 ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: retq ; @@ -904,7 +904,7 @@ define i64 @llrint(fp128 %x) nounwind strictfp { ; CHECK-LABEL: llrint: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq llrintl@PLT +; CHECK-NEXT: callq 
llrintf128 ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: retq ; @@ -927,7 +927,7 @@ define i32 @lround(fp128 %x) nounwind strictfp { ; CHECK-LABEL: lround: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq lroundl@PLT +; CHECK-NEXT: callq lroundf128 ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: retq ; @@ -950,7 +950,7 @@ define i64 @llround(fp128 %x) nounwind strictfp { ; CHECK-LABEL: llround: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq llroundl@PLT +; CHECK-NEXT: callq llroundf128 ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/fp128-libcalls.ll b/llvm/test/CodeGen/X86/fp128-libcalls.ll index 4e7e6b4..6946ca2 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls.ll @@ -299,7 +299,7 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind { ; CHECK-LABEL: Test128Rem: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq fmodl@PLT +; CHECK-NEXT: callq fmodf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -335,7 +335,7 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind { ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: movaps vf128(%rip), %xmm0 -; CHECK-NEXT: callq fmodl@PLT +; CHECK-NEXT: callq fmodf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -370,7 +370,7 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Sqrt: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq sqrtl@PLT +; CHECK-NEXT: callq sqrtf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -401,7 +401,7 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Sin: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq sinl@PLT +; CHECK-NEXT: callq sinf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -432,7 +432,7 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Cos: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq cosl@PLT +; CHECK-NEXT: callq cosf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -463,7 +463,7 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Ceil: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq ceill@PLT +; CHECK-NEXT: callq ceilf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -494,7 +494,7 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Floor: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq floorl@PLT +; CHECK-NEXT: callq floorf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -525,7 +525,7 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Trunc: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq truncl@PLT +; CHECK-NEXT: callq truncf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -556,7 +556,7 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Nearbyint: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq nearbyintl@PLT +; CHECK-NEXT: callq nearbyintf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -587,7 
+587,7 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Rint: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq rintl@PLT +; CHECK-NEXT: callq rintf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -618,7 +618,7 @@ define dso_local void @Test128Round(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Round: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq roundl@PLT +; CHECK-NEXT: callq roundf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -648,7 +648,7 @@ declare fp128 @llvm.round.f128(fp128) define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind { ; CHECK-LABEL: Test128FMA: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: jmp fmal@PLT # TAILCALL +; CHECK-NEXT: jmp fmaf128@PLT # TAILCALL ; ; X86-LABEL: Test128FMA: ; X86: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/frem.ll b/llvm/test/CodeGen/X86/frem.ll index d91d428..35d16c3 100644 --- a/llvm/test/CodeGen/X86/frem.ll +++ b/llvm/test/CodeGen/X86/frem.ll @@ -82,7 +82,7 @@ define void @frem_f128(fp128 %a0, fp128 %a1, ptr%p3) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: callq fmodl@PLT +; CHECK-NEXT: callq fmodf128 ; CHECK-NEXT: vmovaps %xmm0, (%rbx) ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq -- cgit v1.1 From 647010a06f3af725a2e674f025bc0e04aa1fbbff Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 9 Feb 2024 10:56:33 -0800 Subject: [RISCV] Remove unnecessary check for RVE from determineCalleeSaves. NFCI The SavedRegs BitVector is checked against the CSR list later. We have a separate CSR list for RVE that excludes X16-31, so we don't need to filter here. If it were needed, it would also be needed for the next block of code, which didn't have an RVE check. --- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index b12b497..60f92af 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -1003,9 +1003,7 @@ void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF, }; for (auto Reg : CSRegs) - // Only save x0-x15 for RVE. - if (Reg < RISCV::X16 || !Subtarget.isRVE()) - SavedRegs.set(Reg) + SavedRegs.set(Reg); // According to psABI, if ilp32e/lp64e ABIs are used with an ISA that // has any of the registers x16-x31 and f0-f31, then these registers are -- cgit v1.1 From 5948d4de1d965d88c8ca05cc84bd94a28fa53ba4 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 9 Feb 2024 11:09:44 -0800 Subject: [RISCV] Add test coverage for buildvectors with long vslidedown sequences In advance of an upcoming change.
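Several of the tests added below build a vector lane by lane from scattered loads (the "gather" variants). For context, a hypothetical C++-level source with roughly that shape (not part of the patch; assumes Clang's vector_size extension, with offsets mirroring the test's GEPs):

    #include <stdint.h>
    typedef int8_t v16i8 __attribute__((vector_size(16)));

    v16i8 gather16(const int8_t *p) {
      v16i8 v = {0};  // zero-initialize so every lane is well defined
      v[0]  = p[0];   v[1]  = p[1];   v[2]  = p[22];  v[3]  = p[31];
      v[4]  = p[44];  v[5]  = p[55];  v[6]  = p[623]; v[7]  = p[75];
      v[8]  = p[82];  v[9]  = p[93];  v[10] = p[105]; v[11] = p[161];
      v[12] = p[124]; v[13] = p[163]; v[14] = p[144]; v[15] = p[154];
      return v;
    }

With the V extension, the resulting insertelement chain is currently lowered to a zero-strided splat of the first element (vlse8.v) followed by a long vslide1down.vx sequence, which is what the CHECK lines below pin down ahead of the upcoming change.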
--- .../CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll | 74 +++ .../RISCV/rvv/fixed-vectors-int-buildvec.ll | 509 +++++++++++++++++++++ 2 files changed, 583 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index 57b2193..a2bd862 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -1394,3 +1394,77 @@ define <2 x double> @vid_step2_v2f64() { ; CHECK-NEXT: ret ret <2 x double> } + + +define <8 x float> @buildvec_v8f32_zvl256(float %e0, float %e1, float %e2, float %e3, float %e4, float %e5, float %e6, float %e7) vscale_range(4, 128) { +; CHECK-LABEL: buildvec_v8f32_zvl256: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa1 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa2 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa3 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa4 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa5 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa6 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa7 +; CHECK-NEXT: ret + %v0 = insertelement <8 x float> poison, float %e0, i64 0 + %v1 = insertelement <8 x float> %v0, float %e1, i64 1 + %v2 = insertelement <8 x float> %v1, float %e2, i64 2 + %v3 = insertelement <8 x float> %v2, float %e3, i64 3 + %v4 = insertelement <8 x float> %v3, float %e4, i64 4 + %v5 = insertelement <8 x float> %v4, float %e5, i64 5 + %v6 = insertelement <8 x float> %v5, float %e6, i64 6 + %v7 = insertelement <8 x float> %v6, float %e7, i64 7 + ret <8 x float> %v7 +} + + +define <8 x double> @buildvec_v8f64_zvl256(double %e0, double %e1, double %e2, double %e3, double %e4, double %e5, double %e6, double %e7) vscale_range(4, 128) { +; CHECK-LABEL: buildvec_v8f64_zvl256: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e64, m2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa1 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa2 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa3 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa4 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa5 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa6 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa7 +; CHECK-NEXT: ret + %v0 = insertelement <8 x double> poison, double %e0, i64 0 + %v1 = insertelement <8 x double> %v0, double %e1, i64 1 + %v2 = insertelement <8 x double> %v1, double %e2, i64 2 + %v3 = insertelement <8 x double> %v2, double %e3, i64 3 + %v4 = insertelement <8 x double> %v3, double %e4, i64 4 + %v5 = insertelement <8 x double> %v4, double %e5, i64 5 + %v6 = insertelement <8 x double> %v5, double %e6, i64 6 + %v7 = insertelement <8 x double> %v6, double %e7, i64 7 + ret <8 x double> %v7 +} + +define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, double %e3, double %e4, double %e5, double %e6, double %e7) vscale_range(8, 128) { +; CHECK-LABEL: buildvec_v8f64_zvl512: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e64, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa1 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa2 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa3 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa4 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa5 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa6 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa7 +; CHECK-NEXT: ret + %v0 = insertelement <8 x double> poison, double %e0, i64 0 + %v1 = insertelement <8 x double> %v0, double %e1, i64 1 + %v2 = insertelement <8 x double> %v1, 
double %e2, i64 2 + %v3 = insertelement <8 x double> %v2, double %e3, i64 3 + %v4 = insertelement <8 x double> %v3, double %e4, i64 4 + %v5 = insertelement <8 x double> %v4, double %e5, i64 5 + %v6 = insertelement <8 x double> %v5, double %e6, i64 6 + %v7 = insertelement <8 x double> %v6, double %e7, i64 7 + ret <8 x double> %v7 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index dfafbfb..e691e63 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -1178,3 +1178,512 @@ define <8 x i64> @v8xi64_exact_undef_prefix(i64 %a, i64 %b, i64 %c, i64 %d) vsca %v4 = insertelement <8 x i64> %v3, i64 %d, i32 7 ret <8 x i64> %v4 } + + +define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { +; RV32-LABEL: buildvec_v16i8_loads_contigous: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: lbu a1, 1(a0) +; RV32-NEXT: lbu a2, 2(a0) +; RV32-NEXT: lbu a3, 3(a0) +; RV32-NEXT: lbu a4, 4(a0) +; RV32-NEXT: lbu a5, 5(a0) +; RV32-NEXT: lbu a6, 6(a0) +; RV32-NEXT: lbu a7, 7(a0) +; RV32-NEXT: lbu t0, 8(a0) +; RV32-NEXT: lbu t1, 9(a0) +; RV32-NEXT: lbu t2, 10(a0) +; RV32-NEXT: lbu t3, 11(a0) +; RV32-NEXT: lbu t4, 12(a0) +; RV32-NEXT: lbu t5, 13(a0) +; RV32-NEXT: lbu t6, 14(a0) +; RV32-NEXT: lbu s0, 15(a0) +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vlse8.v v8, (a0), zero +; RV32-NEXT: vslide1down.vx v8, v8, a1 +; RV32-NEXT: vslide1down.vx v8, v8, a2 +; RV32-NEXT: vslide1down.vx v8, v8, a3 +; RV32-NEXT: vslide1down.vx v8, v8, a4 +; RV32-NEXT: vslide1down.vx v8, v8, a5 +; RV32-NEXT: vslide1down.vx v8, v8, a6 +; RV32-NEXT: vslide1down.vx v8, v8, a7 +; RV32-NEXT: vslide1down.vx v8, v8, t0 +; RV32-NEXT: vslide1down.vx v8, v8, t1 +; RV32-NEXT: vslide1down.vx v8, v8, t2 +; RV32-NEXT: vslide1down.vx v8, v8, t3 +; RV32-NEXT: vslide1down.vx v8, v8, t4 +; RV32-NEXT: vslide1down.vx v8, v8, t5 +; RV32-NEXT: vslide1down.vx v8, v8, t6 +; RV32-NEXT: vslide1down.vx v8, v8, s0 +; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: buildvec_v16i8_loads_contigous: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset s0, -8 +; RV64-NEXT: lbu a1, 1(a0) +; RV64-NEXT: lbu a2, 2(a0) +; RV64-NEXT: lbu a3, 3(a0) +; RV64-NEXT: lbu a4, 4(a0) +; RV64-NEXT: lbu a5, 5(a0) +; RV64-NEXT: lbu a6, 6(a0) +; RV64-NEXT: lbu a7, 7(a0) +; RV64-NEXT: lbu t0, 8(a0) +; RV64-NEXT: lbu t1, 9(a0) +; RV64-NEXT: lbu t2, 10(a0) +; RV64-NEXT: lbu t3, 11(a0) +; RV64-NEXT: lbu t4, 12(a0) +; RV64-NEXT: lbu t5, 13(a0) +; RV64-NEXT: lbu t6, 14(a0) +; RV64-NEXT: lbu s0, 15(a0) +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vlse8.v v8, (a0), zero +; RV64-NEXT: vslide1down.vx v8, v8, a1 +; RV64-NEXT: vslide1down.vx v8, v8, a2 +; RV64-NEXT: vslide1down.vx v8, v8, a3 +; RV64-NEXT: vslide1down.vx v8, v8, a4 +; RV64-NEXT: vslide1down.vx v8, v8, a5 +; RV64-NEXT: vslide1down.vx v8, v8, a6 +; RV64-NEXT: vslide1down.vx v8, v8, a7 +; RV64-NEXT: vslide1down.vx v8, v8, t0 +; RV64-NEXT: vslide1down.vx v8, v8, t1 +; RV64-NEXT: vslide1down.vx v8, v8, t2 +; RV64-NEXT: vslide1down.vx v8, v8, t3 +; RV64-NEXT: vslide1down.vx v8, v8, t4 +; RV64-NEXT: vslide1down.vx v8, v8, t5 +; RV64-NEXT: 
vslide1down.vx v8, v8, t6 +; RV64-NEXT: vslide1down.vx v8, v8, s0 +; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %p2 = getelementptr i8, ptr %p, i32 1 + %p3 = getelementptr i8, ptr %p, i32 2 + %p4 = getelementptr i8, ptr %p, i32 3 + %p5 = getelementptr i8, ptr %p, i32 4 + %p6 = getelementptr i8, ptr %p, i32 5 + %p7 = getelementptr i8, ptr %p, i32 6 + %p8 = getelementptr i8, ptr %p, i32 7 + %p9 = getelementptr i8, ptr %p, i32 8 + %p10 = getelementptr i8, ptr %p, i32 9 + %p11 = getelementptr i8, ptr %p, i32 10 + %p12 = getelementptr i8, ptr %p, i32 11 + %p13 = getelementptr i8, ptr %p, i32 12 + %p14 = getelementptr i8, ptr %p, i32 13 + %p15 = getelementptr i8, ptr %p, i32 14 + %p16 = getelementptr i8, ptr %p, i32 15 + + %ld1 = load i8, ptr %p + %ld2 = load i8, ptr %p2 + %ld3 = load i8, ptr %p3 + %ld4 = load i8, ptr %p4 + %ld5 = load i8, ptr %p5 + %ld6 = load i8, ptr %p6 + %ld7 = load i8, ptr %p7 + %ld8 = load i8, ptr %p8 + %ld9 = load i8, ptr %p9 + %ld10 = load i8, ptr %p10 + %ld11 = load i8, ptr %p11 + %ld12 = load i8, ptr %p12 + %ld13 = load i8, ptr %p13 + %ld14 = load i8, ptr %p14 + %ld15 = load i8, ptr %p15 + %ld16 = load i8, ptr %p16 + + %v1 = insertelement <16 x i8> poison, i8 %ld1, i32 0 + %v2 = insertelement <16 x i8> %v1, i8 %ld2, i32 1 + %v3 = insertelement <16 x i8> %v2, i8 %ld3, i32 2 + %v4 = insertelement <16 x i8> %v3, i8 %ld4, i32 3 + %v5 = insertelement <16 x i8> %v4, i8 %ld5, i32 4 + %v6 = insertelement <16 x i8> %v5, i8 %ld6, i32 5 + %v7 = insertelement <16 x i8> %v6, i8 %ld7, i32 6 + %v8 = insertelement <16 x i8> %v7, i8 %ld8, i32 7 + %v9 = insertelement <16 x i8> %v8, i8 %ld9, i32 8 + %v10 = insertelement <16 x i8> %v9, i8 %ld10, i32 9 + %v11 = insertelement <16 x i8> %v10, i8 %ld11, i32 10 + %v12 = insertelement <16 x i8> %v11, i8 %ld12, i32 11 + %v13 = insertelement <16 x i8> %v12, i8 %ld13, i32 12 + %v14 = insertelement <16 x i8> %v13, i8 %ld14, i32 13 + %v15 = insertelement <16 x i8> %v14, i8 %ld15, i32 14 + %v16 = insertelement <16 x i8> %v15, i8 %ld16, i32 15 + ret <16 x i8> %v16 +} + + +define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { +; RV32-LABEL: buildvec_v16i8_loads_gather: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: lbu a1, 1(a0) +; RV32-NEXT: lbu a2, 22(a0) +; RV32-NEXT: lbu a3, 31(a0) +; RV32-NEXT: lbu a4, 44(a0) +; RV32-NEXT: lbu a5, 55(a0) +; RV32-NEXT: lbu a6, 623(a0) +; RV32-NEXT: lbu a7, 75(a0) +; RV32-NEXT: lbu t0, 82(a0) +; RV32-NEXT: lbu t1, 93(a0) +; RV32-NEXT: lbu t2, 105(a0) +; RV32-NEXT: lbu t3, 161(a0) +; RV32-NEXT: lbu t4, 124(a0) +; RV32-NEXT: lbu t5, 163(a0) +; RV32-NEXT: lbu t6, 144(a0) +; RV32-NEXT: lbu s0, 154(a0) +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vlse8.v v8, (a0), zero +; RV32-NEXT: vslide1down.vx v8, v8, a1 +; RV32-NEXT: vslide1down.vx v8, v8, a2 +; RV32-NEXT: vslide1down.vx v8, v8, a3 +; RV32-NEXT: vslide1down.vx v8, v8, a4 +; RV32-NEXT: vslide1down.vx v8, v8, a5 +; RV32-NEXT: vslide1down.vx v8, v8, a6 +; RV32-NEXT: vslide1down.vx v8, v8, a7 +; RV32-NEXT: vslide1down.vx v8, v8, t0 +; RV32-NEXT: vslide1down.vx v8, v8, t1 +; RV32-NEXT: vslide1down.vx v8, v8, t2 +; RV32-NEXT: vslide1down.vx v8, v8, t3 +; RV32-NEXT: vslide1down.vx v8, v8, t4 +; RV32-NEXT: vslide1down.vx v8, v8, t5 +; RV32-NEXT: vslide1down.vx v8, v8, t6 +; RV32-NEXT: vslide1down.vx v8, v8, s0 +; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; 
RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: buildvec_v16i8_loads_gather: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset s0, -8 +; RV64-NEXT: lbu a1, 1(a0) +; RV64-NEXT: lbu a2, 22(a0) +; RV64-NEXT: lbu a3, 31(a0) +; RV64-NEXT: lbu a4, 44(a0) +; RV64-NEXT: lbu a5, 55(a0) +; RV64-NEXT: lbu a6, 623(a0) +; RV64-NEXT: lbu a7, 75(a0) +; RV64-NEXT: lbu t0, 82(a0) +; RV64-NEXT: lbu t1, 93(a0) +; RV64-NEXT: lbu t2, 105(a0) +; RV64-NEXT: lbu t3, 161(a0) +; RV64-NEXT: lbu t4, 124(a0) +; RV64-NEXT: lbu t5, 163(a0) +; RV64-NEXT: lbu t6, 144(a0) +; RV64-NEXT: lbu s0, 154(a0) +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vlse8.v v8, (a0), zero +; RV64-NEXT: vslide1down.vx v8, v8, a1 +; RV64-NEXT: vslide1down.vx v8, v8, a2 +; RV64-NEXT: vslide1down.vx v8, v8, a3 +; RV64-NEXT: vslide1down.vx v8, v8, a4 +; RV64-NEXT: vslide1down.vx v8, v8, a5 +; RV64-NEXT: vslide1down.vx v8, v8, a6 +; RV64-NEXT: vslide1down.vx v8, v8, a7 +; RV64-NEXT: vslide1down.vx v8, v8, t0 +; RV64-NEXT: vslide1down.vx v8, v8, t1 +; RV64-NEXT: vslide1down.vx v8, v8, t2 +; RV64-NEXT: vslide1down.vx v8, v8, t3 +; RV64-NEXT: vslide1down.vx v8, v8, t4 +; RV64-NEXT: vslide1down.vx v8, v8, t5 +; RV64-NEXT: vslide1down.vx v8, v8, t6 +; RV64-NEXT: vslide1down.vx v8, v8, s0 +; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %p2 = getelementptr i8, ptr %p, i32 1 + %p3 = getelementptr i8, ptr %p, i32 22 + %p4 = getelementptr i8, ptr %p, i32 31 + %p5 = getelementptr i8, ptr %p, i32 44 + %p6 = getelementptr i8, ptr %p, i32 55 + %p7 = getelementptr i8, ptr %p, i32 623 + %p8 = getelementptr i8, ptr %p, i32 75 + %p9 = getelementptr i8, ptr %p, i32 82 + %p10 = getelementptr i8, ptr %p, i32 93 + %p11 = getelementptr i8, ptr %p, i32 105 + %p12 = getelementptr i8, ptr %p, i32 161 + %p13 = getelementptr i8, ptr %p, i32 124 + %p14 = getelementptr i8, ptr %p, i32 163 + %p15 = getelementptr i8, ptr %p, i32 144 + %p16 = getelementptr i8, ptr %p, i32 154 + + %ld1 = load i8, ptr %p + %ld2 = load i8, ptr %p2 + %ld3 = load i8, ptr %p3 + %ld4 = load i8, ptr %p4 + %ld5 = load i8, ptr %p5 + %ld6 = load i8, ptr %p6 + %ld7 = load i8, ptr %p7 + %ld8 = load i8, ptr %p8 + %ld9 = load i8, ptr %p9 + %ld10 = load i8, ptr %p10 + %ld11 = load i8, ptr %p11 + %ld12 = load i8, ptr %p12 + %ld13 = load i8, ptr %p13 + %ld14 = load i8, ptr %p14 + %ld15 = load i8, ptr %p15 + %ld16 = load i8, ptr %p16 + + %v1 = insertelement <16 x i8> poison, i8 %ld1, i32 0 + %v2 = insertelement <16 x i8> %v1, i8 %ld2, i32 1 + %v3 = insertelement <16 x i8> %v2, i8 %ld3, i32 2 + %v4 = insertelement <16 x i8> %v3, i8 %ld4, i32 3 + %v5 = insertelement <16 x i8> %v4, i8 %ld5, i32 4 + %v6 = insertelement <16 x i8> %v5, i8 %ld6, i32 5 + %v7 = insertelement <16 x i8> %v6, i8 %ld7, i32 6 + %v8 = insertelement <16 x i8> %v7, i8 %ld8, i32 7 + %v9 = insertelement <16 x i8> %v8, i8 %ld9, i32 8 + %v10 = insertelement <16 x i8> %v9, i8 %ld10, i32 9 + %v11 = insertelement <16 x i8> %v10, i8 %ld11, i32 10 + %v12 = insertelement <16 x i8> %v11, i8 %ld12, i32 11 + %v13 = insertelement <16 x i8> %v12, i8 %ld13, i32 12 + %v14 = insertelement <16 x i8> %v13, i8 %ld14, i32 13 + %v15 = insertelement <16 x i8> %v14, i8 %ld15, i32 14 + %v16 = insertelement <16 x i8> %v15, i8 %ld16, i32 15 + ret <16 x i8> %v16 +} + +define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { +; CHECK-LABEL: buildvec_v16i8_undef_low_half: +; CHECK: # 
%bb.0: +; CHECK-NEXT: addi a1, a0, 82 +; CHECK-NEXT: lbu a2, 93(a0) +; CHECK-NEXT: lbu a3, 105(a0) +; CHECK-NEXT: lbu a4, 161(a0) +; CHECK-NEXT: lbu a5, 124(a0) +; CHECK-NEXT: lbu a6, 163(a0) +; CHECK-NEXT: lbu a7, 144(a0) +; CHECK-NEXT: lbu a0, 154(a0) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vlse8.v v8, (a1), zero +; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vslide1down.vx v8, v8, a4 +; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vslide1down.vx v8, v8, a6 +; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 +; CHECK-NEXT: ret + %p9 = getelementptr i8, ptr %p, i32 82 + %p10 = getelementptr i8, ptr %p, i32 93 + %p11 = getelementptr i8, ptr %p, i32 105 + %p12 = getelementptr i8, ptr %p, i32 161 + %p13 = getelementptr i8, ptr %p, i32 124 + %p14 = getelementptr i8, ptr %p, i32 163 + %p15 = getelementptr i8, ptr %p, i32 144 + %p16 = getelementptr i8, ptr %p, i32 154 + + %ld9 = load i8, ptr %p9 + %ld10 = load i8, ptr %p10 + %ld11 = load i8, ptr %p11 + %ld12 = load i8, ptr %p12 + %ld13 = load i8, ptr %p13 + %ld14 = load i8, ptr %p14 + %ld15 = load i8, ptr %p15 + %ld16 = load i8, ptr %p16 + + %v9 = insertelement <16 x i8> poison, i8 %ld9, i32 8 + %v10 = insertelement <16 x i8> %v9, i8 %ld10, i32 9 + %v11 = insertelement <16 x i8> %v10, i8 %ld11, i32 10 + %v12 = insertelement <16 x i8> %v11, i8 %ld12, i32 11 + %v13 = insertelement <16 x i8> %v12, i8 %ld13, i32 12 + %v14 = insertelement <16 x i8> %v13, i8 %ld14, i32 13 + %v15 = insertelement <16 x i8> %v14, i8 %ld15, i32 14 + %v16 = insertelement <16 x i8> %v15, i8 %ld16, i32 15 + ret <16 x i8> %v16 +} + +define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) { +; CHECK-LABEL: buildvec_v16i8_undef_high_half: +; CHECK: # %bb.0: +; CHECK-NEXT: lbu a1, 1(a0) +; CHECK-NEXT: lbu a2, 22(a0) +; CHECK-NEXT: lbu a3, 31(a0) +; CHECK-NEXT: lbu a4, 44(a0) +; CHECK-NEXT: lbu a5, 55(a0) +; CHECK-NEXT: lbu a6, 623(a0) +; CHECK-NEXT: lbu a7, 75(a0) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vlse8.v v8, (a0), zero +; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vslide1down.vx v8, v8, a4 +; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vslide1down.vx v8, v8, a6 +; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vslidedown.vi v8, v8, 8 +; CHECK-NEXT: ret + %p2 = getelementptr i8, ptr %p, i32 1 + %p3 = getelementptr i8, ptr %p, i32 22 + %p4 = getelementptr i8, ptr %p, i32 31 + %p5 = getelementptr i8, ptr %p, i32 44 + %p6 = getelementptr i8, ptr %p, i32 55 + %p7 = getelementptr i8, ptr %p, i32 623 + %p8 = getelementptr i8, ptr %p, i32 75 + + %ld1 = load i8, ptr %p + %ld2 = load i8, ptr %p2 + %ld3 = load i8, ptr %p3 + %ld4 = load i8, ptr %p4 + %ld5 = load i8, ptr %p5 + %ld6 = load i8, ptr %p6 + %ld7 = load i8, ptr %p7 + %ld8 = load i8, ptr %p8 + + %v1 = insertelement <16 x i8> poison, i8 %ld1, i32 0 + %v2 = insertelement <16 x i8> %v1, i8 %ld2, i32 1 + %v3 = insertelement <16 x i8> %v2, i8 %ld3, i32 2 + %v4 = insertelement <16 x i8> %v3, i8 %ld4, i32 3 + %v5 = insertelement <16 x i8> %v4, i8 %ld5, i32 4 + %v6 = insertelement <16 x i8> %v5, i8 %ld6, i32 5 + %v7 = insertelement <16 x i8> %v6, i8 %ld7, i32 6 + %v8 = insertelement <16 x i8> %v7, i8 %ld8, i32 7 + ret <16 x i8> %v8 +} + +define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { +; CHECK-LABEL: buildvec_v16i8_undef_edges: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a1, a0, 31 +; 
CHECK-NEXT: lbu a2, 44(a0) +; CHECK-NEXT: lbu a3, 55(a0) +; CHECK-NEXT: lbu a4, 623(a0) +; CHECK-NEXT: lbu a5, 75(a0) +; CHECK-NEXT: lbu a6, 82(a0) +; CHECK-NEXT: lbu a7, 93(a0) +; CHECK-NEXT: lbu t0, 105(a0) +; CHECK-NEXT: lbu a0, 161(a0) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vlse8.v v8, (a1), zero +; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vslide1down.vx v8, v8, a4 +; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vslide1down.vx v8, v8, a6 +; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vslide1down.vx v8, v8, t0 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vi v8, v8, 4 +; CHECK-NEXT: ret + %p4 = getelementptr i8, ptr %p, i32 31 + %p5 = getelementptr i8, ptr %p, i32 44 + %p6 = getelementptr i8, ptr %p, i32 55 + %p7 = getelementptr i8, ptr %p, i32 623 + %p8 = getelementptr i8, ptr %p, i32 75 + %p9 = getelementptr i8, ptr %p, i32 82 + %p10 = getelementptr i8, ptr %p, i32 93 + %p11 = getelementptr i8, ptr %p, i32 105 + %p12 = getelementptr i8, ptr %p, i32 161 + + %ld4 = load i8, ptr %p4 + %ld5 = load i8, ptr %p5 + %ld6 = load i8, ptr %p6 + %ld7 = load i8, ptr %p7 + %ld8 = load i8, ptr %p8 + %ld9 = load i8, ptr %p9 + %ld10 = load i8, ptr %p10 + %ld11 = load i8, ptr %p11 + %ld12 = load i8, ptr %p12 + + %v4 = insertelement <16 x i8> poison, i8 %ld4, i32 3 + %v5 = insertelement <16 x i8> %v4, i8 %ld5, i32 4 + %v6 = insertelement <16 x i8> %v5, i8 %ld6, i32 5 + %v7 = insertelement <16 x i8> %v6, i8 %ld7, i32 6 + %v8 = insertelement <16 x i8> %v7, i8 %ld8, i32 7 + %v9 = insertelement <16 x i8> %v8, i8 %ld9, i32 8 + %v10 = insertelement <16 x i8> %v9, i8 %ld10, i32 9 + %v11 = insertelement <16 x i8> %v10, i8 %ld11, i32 10 + %v12 = insertelement <16 x i8> %v11, i8 %ld12, i32 11 + ret <16 x i8> %v12 +} + +define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { +; CHECK-LABEL: buildvec_v16i8_loads_undef_scattered: +; CHECK: # %bb.0: +; CHECK-NEXT: lbu a1, 1(a0) +; CHECK-NEXT: lbu a2, 44(a0) +; CHECK-NEXT: lbu a3, 55(a0) +; CHECK-NEXT: lbu a4, 75(a0) +; CHECK-NEXT: lbu a5, 82(a0) +; CHECK-NEXT: lbu a6, 93(a0) +; CHECK-NEXT: lbu a7, 124(a0) +; CHECK-NEXT: lbu t0, 144(a0) +; CHECK-NEXT: lbu t1, 154(a0) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vlse8.v v8, (a0), zero +; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslide1down.vx v8, v8, a4 +; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vslide1down.vx v8, v8, a6 +; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslide1down.vx v8, v8, t0 +; CHECK-NEXT: vslide1down.vx v8, v8, t1 +; CHECK-NEXT: ret + %p2 = getelementptr i8, ptr %p, i32 1 + %p3 = getelementptr i8, ptr %p, i32 22 + %p4 = getelementptr i8, ptr %p, i32 31 + %p5 = getelementptr i8, ptr %p, i32 44 + %p6 = getelementptr i8, ptr %p, i32 55 + %p7 = getelementptr i8, ptr %p, i32 623 + %p8 = getelementptr i8, ptr %p, i32 75 + %p9 = getelementptr i8, ptr %p, i32 82 + %p10 = getelementptr i8, ptr %p, i32 93 + %p11 = getelementptr i8, ptr %p, i32 105 + %p12 = getelementptr i8, ptr %p, i32 161 + %p13 = getelementptr i8, ptr %p, i32 124 + %p14 = getelementptr i8, ptr %p, i32 163 + %p15 = getelementptr i8, ptr %p, i32 144 + %p16 = getelementptr i8, ptr %p, i32 154 + + %ld1 = load i8, ptr %p + %ld2 = load i8, 
ptr %p2 + %ld3 = load i8, ptr %p3 + %ld4 = load i8, ptr %p4 + %ld5 = load i8, ptr %p5 + %ld6 = load i8, ptr %p6 + %ld7 = load i8, ptr %p7 + %ld8 = load i8, ptr %p8 + %ld9 = load i8, ptr %p9 + %ld10 = load i8, ptr %p10 + %ld11 = load i8, ptr %p11 + %ld12 = load i8, ptr %p12 + %ld13 = load i8, ptr %p13 + %ld14 = load i8, ptr %p14 + %ld15 = load i8, ptr %p15 + %ld16 = load i8, ptr %p16 + + %v1 = insertelement <16 x i8> poison, i8 %ld1, i32 0 + %v2 = insertelement <16 x i8> %v1, i8 %ld2, i32 1 + %v3 = insertelement <16 x i8> %v2, i8 undef, i32 2 + %v4 = insertelement <16 x i8> %v3, i8 undef, i32 3 + %v5 = insertelement <16 x i8> %v4, i8 %ld5, i32 4 + %v6 = insertelement <16 x i8> %v5, i8 %ld6, i32 5 + %v7 = insertelement <16 x i8> %v6, i8 undef, i32 6 + %v8 = insertelement <16 x i8> %v7, i8 %ld8, i32 7 + %v9 = insertelement <16 x i8> %v8, i8 %ld9, i32 8 + %v10 = insertelement <16 x i8> %v9, i8 %ld10, i32 9 + %v11 = insertelement <16 x i8> %v10, i8 undef, i32 10 + %v12 = insertelement <16 x i8> %v11, i8 undef, i32 11 + %v13 = insertelement <16 x i8> %v12, i8 %ld13, i32 12 + %v14 = insertelement <16 x i8> %v13, i8 undef, i32 13 + %v15 = insertelement <16 x i8> %v14, i8 %ld15, i32 14 + %v16 = insertelement <16 x i8> %v15, i8 %ld16, i32 15 + ret <16 x i8> %v16 +} -- cgit v1.1 From 2a6b521b36fb538a49564323ecd457d7b08b1325 Mon Sep 17 00:00:00 2001 From: Yinying Li <107574043+yinying-lisa-li@users.noreply.github.com> Date: Fri, 9 Feb 2024 14:34:36 -0500 Subject: [mlir][sparse] Add more tests and verification for n:m (#81186) 1. Add python test for n out of m 2. Add more methods for python binding 3. Add verification for n:m and invalid encoding tests 4. Add e2e test for n:m Previous PRs for n:m #80501 #79935 --- mlir/include/mlir-c/Dialect/SparseTensor.h | 10 ++ mlir/lib/Bindings/Python/DialectSparseTensor.cpp | 38 +++++++- mlir/lib/CAPI/Dialect/SparseTensor.cpp | 18 ++++ .../SparseTensor/IR/Detail/LvlTypeParser.cpp | 34 ++++--- .../Dialect/SparseTensor/IR/Detail/LvlTypeParser.h | 4 +- .../SparseTensor/IR/SparseTensorDialect.cpp | 31 ++++++ .../Dialect/SparseTensor/invalid_encoding.mlir | 106 +++++++++++++++++++++ .../Dialect/SparseTensor/CPU/sparse_ds.mlir | 22 +++++ mlir/test/python/dialects/sparse_tensor/dialect.py | 84 ++++++++++++++++ 9 files changed, 331 insertions(+), 16 deletions(-) diff --git a/mlir/include/mlir-c/Dialect/SparseTensor.h b/mlir/include/mlir-c/Dialect/SparseTensor.h index 2c71b00..d549f5d 100644 --- a/mlir/include/mlir-c/Dialect/SparseTensor.h +++ b/mlir/include/mlir-c/Dialect/SparseTensor.h @@ -84,6 +84,16 @@ mlirSparseTensorEncodingAttrGetPosWidth(MlirAttribute attr); MLIR_CAPI_EXPORTED int mlirSparseTensorEncodingAttrGetCrdWidth(MlirAttribute attr); +MLIR_CAPI_EXPORTED unsigned +mlirSparseTensorEncodingAttrGetStructuredN(MlirSparseTensorLevelType lvlType); + +MLIR_CAPI_EXPORTED unsigned +mlirSparseTensorEncodingAttrGetStructuredM(MlirSparseTensorLevelType lvlType); + +MLIR_CAPI_EXPORTED MlirSparseTensorLevelType +mlirSparseTensorEncodingAttrBuildLvlType( + enum MlirBaseSparseTensorLevelType lvlType, unsigned n, unsigned m); + #ifdef __cplusplus } #endif diff --git a/mlir/lib/Bindings/Python/DialectSparseTensor.cpp b/mlir/lib/Bindings/Python/DialectSparseTensor.cpp index 607534c..74f4d24 100644 --- a/mlir/lib/Bindings/Python/DialectSparseTensor.cpp +++ b/mlir/lib/Bindings/Python/DialectSparseTensor.cpp @@ -60,6 +60,15 @@ static void populateDialectSparseTensorSubmodule(const py::module &m) { py::arg("lvl_to_dim"), py::arg("pos_width"), py::arg("crd_width"), 
py::arg("context") = py::none(), "Gets a sparse_tensor.encoding from parameters.") + .def_classmethod( + "build_level_type", + [](py::object cls, MlirBaseSparseTensorLevelType lvlType, unsigned n, + unsigned m) { + return mlirSparseTensorEncodingAttrBuildLvlType(lvlType, n, m); + }, + py::arg("cls"), py::arg("lvl_type"), py::arg("n") = 0, + py::arg("m") = 0, + "Builds a sparse_tensor.encoding.level_type from parameters.") .def_property_readonly( "lvl_types", [](MlirAttribute self) { @@ -89,7 +98,34 @@ static void populateDialectSparseTensorSubmodule(const py::module &m) { .def_property_readonly("pos_width", mlirSparseTensorEncodingAttrGetPosWidth) .def_property_readonly("crd_width", - mlirSparseTensorEncodingAttrGetCrdWidth); + mlirSparseTensorEncodingAttrGetCrdWidth) + .def_property_readonly( + "structured_n", + [](MlirAttribute self) -> unsigned { + const int lvlRank = mlirSparseTensorEncodingGetLvlRank(self); + return mlirSparseTensorEncodingAttrGetStructuredN( + mlirSparseTensorEncodingAttrGetLvlType(self, lvlRank - 1)); + }) + .def_property_readonly( + "structured_m", + [](MlirAttribute self) -> unsigned { + const int lvlRank = mlirSparseTensorEncodingGetLvlRank(self); + return mlirSparseTensorEncodingAttrGetStructuredM( + mlirSparseTensorEncodingAttrGetLvlType(self, lvlRank - 1)); + }) + .def_property_readonly("lvl_types_enum", [](MlirAttribute self) { + const int lvlRank = mlirSparseTensorEncodingGetLvlRank(self); + std::vector ret; + ret.reserve(lvlRank); + for (int l = 0; l < lvlRank; l++) { + // Convert level type to 32 bits to ignore n and m for n_out_of_m + // format. + ret.push_back( + static_cast(static_cast( + mlirSparseTensorEncodingAttrGetLvlType(self, l)))); + } + return ret; + }); } PYBIND11_MODULE(_mlirDialectsSparseTensor, m) { diff --git a/mlir/lib/CAPI/Dialect/SparseTensor.cpp b/mlir/lib/CAPI/Dialect/SparseTensor.cpp index a34b9a29..4e1bd45 100644 --- a/mlir/lib/CAPI/Dialect/SparseTensor.cpp +++ b/mlir/lib/CAPI/Dialect/SparseTensor.cpp @@ -94,3 +94,21 @@ int mlirSparseTensorEncodingAttrGetPosWidth(MlirAttribute attr) { int mlirSparseTensorEncodingAttrGetCrdWidth(MlirAttribute attr) { return cast(unwrap(attr)).getCrdWidth(); } + +MlirSparseTensorLevelType +mlirSparseTensorEncodingAttrBuildLvlType(MlirBaseSparseTensorLevelType lvlType, + unsigned n, unsigned m) { + LevelType lt = static_cast(lvlType); + return static_cast(*buildLevelType( + *getLevelFormat(lt), isOrderedLT(lt), isUniqueLT(lt), n, m)); +} + +unsigned +mlirSparseTensorEncodingAttrGetStructuredN(MlirSparseTensorLevelType lvlType) { + return getN(static_cast(lvlType)); +} + +unsigned +mlirSparseTensorEncodingAttrGetStructuredM(MlirSparseTensorLevelType lvlType) { + return getM(static_cast(lvlType)); +} diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp index 752d6e6..0fb0d27 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp @@ -35,14 +35,22 @@ FailureOr LvlTypeParser::parseLvlType(AsmParser &parser) const { ERROR_IF(failed(parser.parseOptionalKeyword(&base)), "expected valid level format (e.g. 
dense, compressed or singleton)") uint64_t properties = 0; - SmallVector structure; + SmallVector structured; if (base.compare("structured") == 0) { ParseResult res = parser.parseCommaSeparatedList( mlir::OpAsmParser::Delimiter::OptionalSquare, - [&]() -> ParseResult { return parseStructure(parser, &structure); }, - " in block n out of m"); + [&]() -> ParseResult { return parseStructured(parser, &structured); }, + " in structured n out of m"); FAILURE_IF_FAILED(res) + if (structured.size() != 2) { + parser.emitError(loc, "expected exactly 2 structured sizes"); + return failure(); + } + if (structured[0] > structured[1]) { + parser.emitError(loc, "expected n <= m in n_out_of_m"); + return failure(); + } } ParseResult res = parser.parseCommaSeparatedList( @@ -57,12 +65,8 @@ FailureOr LvlTypeParser::parseLvlType(AsmParser &parser) const { } else if (base.compare("compressed") == 0) { properties |= static_cast(LevelFormat::Compressed); } else if (base.compare("structured") == 0) { - if (structure.size() != 2) { - parser.emitError(loc, "expected exactly 2 structure sizes"); - return failure(); - } properties |= static_cast(LevelFormat::NOutOfM); - properties |= nToBits(structure[0]) | mToBits(structure[1]); + properties |= nToBits(structured[0]) | mToBits(structured[1]); } else if (base.compare("loose_compressed") == 0) { properties |= static_cast(LevelFormat::LooseCompressed); } else if (base.compare("singleton") == 0) { @@ -95,20 +99,24 @@ ParseResult LvlTypeParser::parseProperty(AsmParser &parser, } ParseResult -LvlTypeParser::parseStructure(AsmParser &parser, - SmallVector *structure) const { +LvlTypeParser::parseStructured(AsmParser &parser, + SmallVector *structured) const { int intVal; auto loc = parser.getCurrentLocation(); OptionalParseResult intValParseResult = parser.parseOptionalInteger(intVal); if (intValParseResult.has_value()) { if (failed(*intValParseResult)) { - parser.emitError(loc, "failed to parse block size"); + parser.emitError(loc, "failed to parse structured size"); + return failure(); + } + if (intVal < 0) { + parser.emitError(loc, "expected structured size to be >= 0"); return failure(); } - structure->push_back(intVal); + structured->push_back(intVal); return success(); } - parser.emitError(loc, "expected valid integer for block size"); + parser.emitError(loc, "expected valid integer for structured size"); return failure(); } diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h index 6a13112..1ac8254 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h +++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h @@ -22,8 +22,8 @@ public: private: ParseResult parseProperty(AsmParser &parser, uint64_t *properties) const; - ParseResult parseStructure(AsmParser &parser, - SmallVector *structure) const; + ParseResult parseStructured(AsmParser &parser, + SmallVector *structured) const; }; } // namespace ir_detail diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index 67b1d79..aed43f2 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -657,6 +657,37 @@ LogicalResult SparseTensorEncodingAttr::verify( return emitError() << "expected all singleton lvlTypes " "following a singleton level"; } + // TODO: audit formats that actually are supported by backend. 
+ if (auto it = std::find_if(lvlTypes.begin(), lvlTypes.end(), isNOutOfMLT); + it != std::end(lvlTypes)) { + if (it != lvlTypes.end() - 1) + return emitError() << "expected n_out_of_m to be the last level type"; + if (!std::all_of(lvlTypes.begin(), it, + [](LevelType i) { return isDenseLT(i); })) + return emitError() << "expected all dense lvlTypes " + "before a n_out_of_m level"; + if (dimToLvl && (dimToLvl.getNumDims() != dimToLvl.getNumResults())) { + if (!isBlockSparsity(dimToLvl)) { + return emitError() + << "expected 1xm block structure for n_out_of_m level"; + } + auto sizes = getBlockSize(dimToLvl); + unsigned coefficient = 0; + for (const auto &elem : sizes) { + if (elem != 0) { + if (elem != coefficient && coefficient != 0) { + return emitError() << "expected only one blocked level " + "with the same coefficients"; + } + coefficient = elem; + } + } + if (coefficient != getM(*it)) { + return emitError() << "expected coefficients of Affine expressions " + "to be equal to m of n_out_of_m level"; + } + } + } // Before we can check that the level-rank is consistent/coherent // across all fields, we need to define it. The source-of-truth for // the `getLvlRank` method is the length of the level-types array, diff --git a/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir b/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir index 2d189cc..a52a46b4 100644 --- a/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir +++ b/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir @@ -315,3 +315,109 @@ func.func private @BSR(%arg0: tensor) { func.func private @BSR_explicit(%arg0: tensor) { return } + +// ----- + +// expected-error@+6 {{expected structured size to be >= 0}} +#NOutOfM = #sparse_tensor.encoding<{ + map = ( i, j, k ) -> + ( i : dense, + k floordiv 4 : dense, + j : dense, + k mod 4 : structured[-2, 4] + ) +}> +func.func private @NOutOfM(%arg0: tensor) { + return +} + +// ----- + +// expected-error@+6 {{expected n <= m in n_out_of_m}} +#NOutOfM = #sparse_tensor.encoding<{ + map = ( i, j, k ) -> + ( i : dense, + k floordiv 4 : dense, + j : dense, + k mod 4 : structured[5, 4] + ) +}> +func.func private @NOutOfM(%arg0: tensor) { + return +} + +// ----- + +// expected-error@+1 {{expected all dense lvlTypes before a n_out_of_m level}} +#NOutOfM = #sparse_tensor.encoding<{ + map = ( i, j, k ) -> + ( i : dense, + k floordiv 4 : compressed, + j : dense, + k mod 4 : structured[2, 4] + ) +}> +func.func private @NOutOfM(%arg0: tensor) { + return +} + +// ----- + +// expected-error@+1 {{expected n_out_of_m to be the last level type}} +#NOutOfM = #sparse_tensor.encoding<{ + map = ( i, j, k ) -> + ( i : dense, + k floordiv 4 : structured[2, 4], + j : dense, + k mod 4 : compressed + ) +}> +func.func private @NOutOfM(%arg0: tensor) { + return +} + +// ----- + +// expected-error@+1 {{expected 1xm block structure for n_out_of_m level}} +#NOutOfM = #sparse_tensor.encoding<{ + map = ( i, j, k ) -> + ( i : dense, + k floordiv 2 : dense, + j : dense, + k mod 4 : structured[2, 4] + ) +}> +func.func private @NOutOfM(%arg0: tensor) { + return +} + +// ----- + +// expected-error@+1 {{expected coefficients of Affine expressions to be equal to m of n_out_of_m level}} +#NOutOfM = #sparse_tensor.encoding<{ + map = ( i, j, k ) -> + ( i : dense, + k floordiv 2 : dense, + j : dense, + k mod 2 : structured[2, 4] + ) +}> +func.func private @NOutOfM(%arg0: tensor) { + return +} + +// ----- + +// expected-error@+1 {{expected only one blocked level with the same coefficients}} +#NOutOfM = #sparse_tensor.encoding<{ + map = 
( i, j, k ) -> + ( i floordiv 2 : dense, + i mod 2 : dense, + j : dense, + k floordiv 4 : dense, + k mod 4 : structured[2, 4] + ) +}> +func.func private @NOutOfM(%arg0: tensor) { + return +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir index ec5c758..251944c 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir @@ -45,6 +45,13 @@ crdWidth = 8 }> +#NV_58 = #sparse_tensor.encoding<{ + map = ( i, j ) -> ( i : dense, + j floordiv 8 : dense, + j mod 8 : structured[5, 8]), + crdWidth = 8 +}> + module { func.func private @getTensorFilename(index) -> (!Filename) @@ -65,6 +72,7 @@ module { %A1 = sparse_tensor.new %fileName : !Filename to tensor %A2 = sparse_tensor.new %fileName : !Filename to tensor %A3 = sparse_tensor.new %fileName : !Filename to tensor + %A4 = sparse_tensor.new %fileName : !Filename to tensor // // CSR: @@ -113,10 +121,24 @@ module { %vecv3 = vector.transfer_read %val3[%c0], %f0 : memref, vector<12xf64> vector.print %vecv3 : vector<12xf64> + // + // NV_58 + // + // CHECK-NEXT: ( 2, 3, 5, 7, 1, 2, 4, 7, 0, 2, 4, 5 ) + // CHECK-NEXT: ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 ) + // + %crd4 = sparse_tensor.coordinates %A4 {level = 2 : index } : tensor to memref + %vecc4 = vector.transfer_read %crd4[%c0], %u0 : memref, vector<12xi8> + vector.print %vecc4 : vector<12xi8> + %val4 = sparse_tensor.values %A4 : tensor to memref + %vecv4 = vector.transfer_read %val4[%c0], %f0 : memref, vector<12xf64> + vector.print %vecv4 : vector<12xf64> + // Release the resources. bufferization.dealloc_tensor %A1: tensor bufferization.dealloc_tensor %A2: tensor bufferization.dealloc_tensor %A3: tensor + bufferization.dealloc_tensor %A4: tensor return } diff --git a/mlir/test/python/dialects/sparse_tensor/dialect.py b/mlir/test/python/dialects/sparse_tensor/dialect.py index 412c579..1fa7030 100644 --- a/mlir/test/python/dialects/sparse_tensor/dialect.py +++ b/mlir/test/python/dialects/sparse_tensor/dialect.py @@ -52,6 +52,90 @@ def testEncodingAttr1D(): print(f"created_pos_width: {created.pos_width}") +# CHECK-LABEL: TEST: testEncodingAttrStructure +@run +def testEncodingAttrStructure(): + with Context() as ctx: + parsed = Attribute.parse( + "#sparse_tensor.encoding<{" + " map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense," + " d1 mod 4 : structured[2, 4])," + " posWidth = 16," + " crdWidth = 32" + "}>" + ) + # CHECK: #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : structured[2, 4]), posWidth = 16, crdWidth = 32 }> + print(parsed) + + casted = st.EncodingAttr(parsed) + # CHECK: equal: True + print(f"equal: {casted == parsed}") + + # CHECK: lvl_types: [65536, 65536, 4406637494272] + print(f"lvl_types: {casted.lvl_types}") + # CHECK: lvl_types_enum: [, , ] + print(f"lvl_types_enum: {casted.lvl_types_enum}") + # CHECK: structured_n: 2 + print(f"structured_n: {casted.structured_n}") + # CHECK: structured_m: 4 + print(f"structured_m: {casted.structured_m}") + # CHECK: dim_to_lvl: (d0, d1) -> (d0, d1 floordiv 4, d1 mod 4) + print(f"dim_to_lvl: {casted.dim_to_lvl}") + # CHECK: lvl_to_dim: (d0, d1, d2) -> (d0, d1 * 4 + d2) + print(f"lvl_to_dim: {casted.lvl_to_dim}") + # CHECK: pos_width: 16 + print(f"pos_width: {casted.pos_width}") + # CHECK: crd_width: 32 + print(f"crd_width: {casted.crd_width}") + + created = st.EncodingAttr.get( + casted.lvl_types, casted.dim_to_lvl, casted.lvl_to_dim, 
0, 0 + ) + # CHECK: #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : structured[2, 4]) }> + print(created) + # CHECK: created_equal: False + print(f"created_equal: {created == casted}") + + built_2_4 = st.EncodingAttr.build_level_type(st.LevelType.n_out_of_m, 2, 4) + dim_to_lvl = AffineMap.get( + 2, + 0, + [ + AffineExpr.get_dim(0), + AffineExpr.get_floor_div(AffineExpr.get_dim(1), 4), + AffineExpr.get_mod(AffineExpr.get_dim(1), 4), + ], + ) + lvl_to_dim = AffineMap.get( + 3, + 0, + [ + AffineExpr.get_dim(0), + AffineExpr.get_add( + AffineExpr.get_mul(AffineExpr.get_dim(1), 4), + AffineExpr.get_dim(2), + ), + ], + ) + built = st.EncodingAttr.get( + [st.LevelType.dense, st.LevelType.dense, built_2_4], + dim_to_lvl, + lvl_to_dim, + 0, + 0, + ) + # CHECK: #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : structured[2, 4]) }> + print(built) + # CHECK: built_equal: True + print(f"built_equal: {built == created}") + + # Verify that the factory creates an instance of the proper type. + # CHECK: is_proper_instance: True + print(f"is_proper_instance: {isinstance(created, st.EncodingAttr)}") + # CHECK: created_pos_width: 0 + print(f"created_pos_width: {created.pos_width}") + + # CHECK-LABEL: TEST: testEncodingAttr2D @run def testEncodingAttr2D(): -- cgit v1.1 From 07dc85ba0cc84e7034ad2a0575c644cfeab60b39 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 9 Feb 2024 13:39:03 -0600 Subject: [NVVMReflect] Improve folding inside of the NVVMReflect pass (#81253) Summary: The previous patch did very simple folding that only worked for directly used branches. This patch improves this by traversing the use-def chain to simplify every constant subexpression until it reaches a terminator we can delete. The support should work for all expected cases now. --- llvm/docs/NVPTXUsage.rst | 3 +- llvm/lib/Target/NVPTX/NVVMReflect.cpp | 70 ++++++---------------- llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll | 78 ++++++++++++++++++++----- 3 files changed, 82 insertions(+), 69 deletions(-) diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index b5e3918..6a55b12 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -298,8 +298,7 @@ input IR module ``module.bc``, the following compilation flow is recommended: The ``NVVMReflect`` pass will attempt to remove dead code even without optimizations. This allows potentially incompatible instructions to be avoided -at all optimizations levels. This currently only works for simple conditionals -like the above example. +at all optimization levels by using the ``__CUDA_ARCH`` argument. 1. Save list of external functions in ``module.bc`` 2. Link ``module.bc`` with ``libdevice.compute_XX.YY.bc`` diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 3794ad9b..64fedf3 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -90,7 +90,7 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } SmallVector ToRemove; - SmallVector ToSimplify; + SmallVector ToSimplify; // Go through the calls in this function. Each call to __nvvm_reflect or // llvm.nvvm.reflect should be a CallInst with a ConstantArray argument. @@ -177,9 +177,8 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } // If the immediate user is a simple comparison we want to simplify it. - // TODO: This currently does not handle switch instructions. 
for (User *U : Call->users()) - if (ICmpInst *I = dyn_cast(U)) + if (Instruction *I = dyn_cast(U)) ToSimplify.push_back(I); Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal)); @@ -190,56 +189,21 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { I->eraseFromParent(); // The code guarded by __nvvm_reflect may be invalid for the target machine. - // We need to do some basic dead code elimination to trim invalid code before - // it reaches the backend at all optimization levels. - SmallVector Simplified; - for (ICmpInst *Cmp : ToSimplify) { - Constant *LHS = dyn_cast(Cmp->getOperand(0)); - Constant *RHS = dyn_cast(Cmp->getOperand(1)); - - if (!LHS || !RHS) - continue; - - // If the comparison is a compile time constant we simply propagate it. - Constant *C = ConstantFoldCompareInstOperands( - Cmp->getPredicate(), LHS, RHS, Cmp->getModule()->getDataLayout()); - - if (!C) - continue; - - for (User *U : Cmp->users()) - if (BranchInst *I = dyn_cast(U)) - Simplified.push_back(I); - - Cmp->replaceAllUsesWith(C); - Cmp->eraseFromParent(); - } - - // Each instruction here is a conditional branch off of a constant true or - // false value. Simply replace it with an unconditional branch to the - // appropriate basic block and delete the rest if it is trivially dead. - DenseSet Removed; - for (BranchInst *Branch : Simplified) { - if (Removed.contains(Branch)) - continue; - - ConstantInt *C = dyn_cast(Branch->getCondition()); - if (!C || (!C->isOne() && !C->isZero())) - continue; - - BasicBlock *TrueBB = - C->isOne() ? Branch->getSuccessor(0) : Branch->getSuccessor(1); - BasicBlock *FalseBB = - C->isOne() ? Branch->getSuccessor(1) : Branch->getSuccessor(0); - - // This transformation is only correct on simple edges. - if (!FalseBB->hasNPredecessors(1)) - continue; - - ReplaceInstWithInst(Branch, BranchInst::Create(TrueBB)); - if (FalseBB->use_empty() && !FalseBB->getFirstNonPHIOrDbg()) { - Removed.insert(FalseBB->getFirstNonPHIOrDbg()); - changeToUnreachable(FalseBB->getFirstNonPHIOrDbg()); + // Traverse the use-def chain, continually simplifying constant expressions + // until we find a terminator that we can then remove. 
+ while (!ToSimplify.empty()) { + Instruction *I = ToSimplify.pop_back_val(); + if (Constant *C = + ConstantFoldInstruction(I, F.getParent()->getDataLayout())) { + for (User *U : I->users()) + if (Instruction *I = dyn_cast(U)) + ToSimplify.push_back(I); + + I->replaceAllUsesWith(C); + if (isInstructionTriviallyDead(I)) + I->eraseFromParent(); + } else if (I->isTerminator()) { + ConstantFoldTerminator(I->getParent()); } } diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll index 9dcdf5b..0088d6c 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll @@ -102,23 +102,24 @@ if.end: ret void } -; SM_52: .visible .func (.param .b32 func_retval0) qux() -; SM_52: mov.u32 %[[REG1:.+]], %[[REG2:.+]]; -; SM_52: st.param.b32 [func_retval0+0], %[[REG1:.+]]; -; SM_52: ret; -; SM_70: .visible .func (.param .b32 func_retval0) qux() -; SM_70: mov.u32 %[[REG1:.+]], %[[REG2:.+]]; -; SM_70: st.param.b32 [func_retval0+0], %[[REG1:.+]]; -; SM_70: ret; -; SM_90: .visible .func (.param .b32 func_retval0) qux() -; SM_90: st.param.b32 [func_retval0+0], %[[REG1:.+]]; -; SM_90: ret; +; SM_52: .visible .func (.param .b32 func_retval0) qux() +; SM_52: mov.b32 %[[REG:.+]], 3; +; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_52-NEXT: ret; +; +; SM_70: .visible .func (.param .b32 func_retval0) qux() +; SM_70: mov.b32 %[[REG:.+]], 2; +; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_70-NEXT: ret; +; +; SM_90: .visible .func (.param .b32 func_retval0) qux() +; SM_90: mov.b32 %[[REG:.+]], 1; +; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_90-NEXT: ret; define i32 @qux() { entry: %call = call i32 @__nvvm_reflect(ptr noundef @.str) - %cmp = icmp uge i32 %call, 700 - %conv = zext i1 %cmp to i32 - switch i32 %conv, label %sw.default [ + switch i32 %call, label %sw.default [ i32 900, label %sw.bb i32 700, label %sw.bb1 i32 520, label %sw.bb2 @@ -173,3 +174,52 @@ if.exit: exit: ret float 0.000000e+00 } + +; SM_52: .visible .func (.param .b32 func_retval0) prop() +; SM_52: mov.b32 %[[REG:.+]], 3; +; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_52-NEXT: ret; +; +; SM_70: .visible .func (.param .b32 func_retval0) prop() +; SM_70: mov.b32 %[[REG:.+]], 2; +; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_70-NEXT: ret; +; +; SM_90: .visible .func (.param .b32 func_retval0) prop() +; SM_90: mov.b32 %[[REG:.+]], 1; +; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_90-NEXT: ret; +define i32 @prop() { +entry: + %call = call i32 @__nvvm_reflect(ptr @.str) + %conv = zext i32 %call to i64 + %div = udiv i64 %conv, 100 + %cmp = icmp eq i64 %div, 9 + br i1 %cmp, label %if.then, label %if.else + +if.then: + br label %return + +if.else: + %div2 = udiv i64 %conv, 100 + %cmp3 = icmp eq i64 %div2, 7 + br i1 %cmp3, label %if.then5, label %if.else6 + +if.then5: + br label %return + +if.else6: + %div7 = udiv i64 %conv, 100 + %cmp8 = icmp eq i64 %div7, 5 + br i1 %cmp8, label %if.then10, label %if.else11 + +if.then10: + br label %return + +if.else11: + br label %return + +return: + %retval = phi i32 [ 1, %if.then ], [ 2, %if.then5 ], [ 3, %if.then10 ], [ 4, %if.else11 ] + ret i32 %retval +} -- cgit v1.1 From 9d9cc3706f59499f443ce4ebaeb24f7c8417e797 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Sch=C3=A4pers?= Date: Fri, 9 Feb 2024 20:40:16 +0100 Subject: [clang-format][docs] Fix version (#81185) #78752 was not merged in time for 
clang-format 18. --- clang/docs/ClangFormatStyleOptions.rst | 2 +- clang/include/clang/Tooling/Inclusions/IncludeStyle.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 0a8cc18..4ccdd2d 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -4156,7 +4156,7 @@ the configuration (without a prefix: ``Auto``). .. _MainIncludeChar: -**MainIncludeChar** (``MainIncludeCharDiscriminator``) :versionbadge:`clang-format 18` :ref:`¶ ` +**MainIncludeChar** (``MainIncludeCharDiscriminator``) :versionbadge:`clang-format 19` :ref:`¶ ` When guessing whether a #include is the "main" include, only the include directives that use the specified character are considered. diff --git a/clang/include/clang/Tooling/Inclusions/IncludeStyle.h b/clang/include/clang/Tooling/Inclusions/IncludeStyle.h index c91e4a6..d167b7e 100644 --- a/clang/include/clang/Tooling/Inclusions/IncludeStyle.h +++ b/clang/include/clang/Tooling/Inclusions/IncludeStyle.h @@ -164,7 +164,7 @@ struct IncludeStyle { /// When guessing whether a #include is the "main" include, only the include /// directives that use the specified character are considered. - /// \version 18 + /// \version 19 MainIncludeCharDiscriminator MainIncludeChar; }; -- cgit v1.1 From 7ad7db0d9960859de10d23fa84aa581c154d152c Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 9 Feb 2024 11:45:54 -0800 Subject: [RISCV] Fix typo in ABI name in test. NFC ilp64->lp64. --- llvm/test/CodeGen/RISCV/push-pop-opt-crash.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/push-pop-opt-crash.ll b/llvm/test/CodeGen/RISCV/push-pop-opt-crash.ll index ca6895d..5edf3cf 100644 --- a/llvm/test/CodeGen/RISCV/push-pop-opt-crash.ll +++ b/llvm/test/CodeGen/RISCV/push-pop-opt-crash.ll @@ -1,8 +1,8 @@ ; RUN: llc -mattr=+zcmp -verify-machineinstrs \ -; RUN: -mtriple=riscv32 -target-abi ilp32 < %s \ +; RUN: -mtriple=riscv32 -target-abi=ilp32 < %s \ ; RUN: | FileCheck %s -check-prefixes=RV32IZCMP ; RUN: llc -mattr=+zcmp -verify-machineinstrs \ -; RUN: -mtriple=riscv64 -target-abi ilp64 < %s \ +; RUN: -mtriple=riscv64 -target-abi=lp64 < %s \ ; RUN: | FileCheck %s -check-prefixes=RV64IZCMP ; This source code exposed a crash in the RISC-V Zcmp Push/Pop optimization -- cgit v1.1 From 81c4bf946a377b1dc90c02ff3ff8240e78db0edb Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 9 Feb 2024 12:01:49 -0800 Subject: [ELF] Improve _etext/_edata tests --- lld/test/ELF/riscv-section-layout.s | 25 +++++++++-------- lld/test/ELF/x86-64-section-layout.s | 54 +++++++++++++++++++++--------------- 2 files changed, 45 insertions(+), 34 deletions(-) diff --git a/lld/test/ELF/riscv-section-layout.s b/lld/test/ELF/riscv-section-layout.s index 56ac95d..10e0feb 100644 --- a/lld/test/ELF/riscv-section-layout.s +++ b/lld/test/ELF/riscv-section-layout.s @@ -3,20 +3,20 @@ # RUN: llvm-mc -filetype=obj -triple=riscv32 %s -o %t.32.o # RUN: ld.lld -pie %t.32.o -o %t.32 -# RUN: llvm-readelf -S -s %t.32 | FileCheck %s --check-prefix=NOSDATA +# RUN: llvm-readelf -S -sX %t.32 | FileCheck %s --check-prefix=NOSDATA # RUN: llvm-mc -filetype=obj -triple=riscv32 --defsym=SDATA=1 %s -o %t.32s.o # RUN: ld.lld -pie %t.32s.o -o %t.32s -# RUN: llvm-readelf -S -s %t.32s | FileCheck %s +# RUN: llvm-readelf -S -sX %t.32s | FileCheck %s # RUN: llvm-mc -filetype=obj -triple=riscv64 %s -o %t.64.o # RUN: ld.lld -pie %t.64.o -o %t.64 -# RUN: llvm-readelf -S -s 
%t.64 | FileCheck %s --check-prefix=NOSDATA +# RUN: llvm-readelf -S -sX %t.64 | FileCheck %s --check-prefix=NOSDATA # RUN: llvm-mc -filetype=obj -triple=riscv64 --defsym=SDATA=1 %s -o %t.64s.o # RUN: ld.lld -pie %t.64s.o -o %t.64s -# RUN: llvm-readelf -S -s %t.64s | FileCheck %s +# RUN: llvm-readelf -S -sX %t.64s | FileCheck %s # NOSDATA: .text -# NOSDATA-NEXT: .tdata +# NOSDATA-NEXT: .tdata PROGBITS [[#%x,TDATA:]] # NOSDATA-NEXT: .tbss # NOSDATA-NEXT: .dynamic # NOSDATA-NEXT: .got @@ -28,9 +28,10 @@ ## exist, define __global_pointer$ and set its st_shndx arbitrarily to 1. ## The symbol value should not be used by the program. -# NOSDATA-DAG: [[#]]: {{0*}}[[#BSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] _edata -# NOSDATA-DAG: [[#]]: {{0*}}[[#BSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] __bss_start -# NOSDATA-DAG: [[#]]: {{0*}}800 0 NOTYPE GLOBAL DEFAULT 1 __global_pointer$ +# NOSDATA-DAG: [[#]]: {{.*}} 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _etext +# NOSDATA-DAG: [[#]]: {{0*}}[[#BSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] (.data) _edata +# NOSDATA-DAG: [[#]]: {{0*}}[[#BSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] (.bss) __bss_start +# NOSDATA-DAG: [[#]]: {{0*}}800 0 NOTYPE GLOBAL DEFAULT 1 (.dynsym) __global_pointer$ # CHECK: .text # CHECK-NEXT: .tdata @@ -43,11 +44,11 @@ # CHECK-NEXT: .sbss NOBITS [[#%x,SBSS:]] # CHECK-NEXT: .bss -# CHECK-DAG: [[#]]: {{0*}}[[#SBSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] _edata -# CHECK-DAG: [[#]]: {{0*}}[[#SBSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] __bss_start -# CHECK-DAG: [[#]]: {{0*}}[[#SDATA+0x800]] 0 NOTYPE GLOBAL DEFAULT [[#]] __global_pointer$ +# CHECK-DAG: [[#]]: {{0*}}[[#SBSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] (.sdata) _edata +# CHECK-DAG: [[#]]: {{0*}}[[#SBSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] (.sbss) __bss_start +# CHECK-DAG: [[#]]: {{0*}}[[#SDATA+0x800]] 0 NOTYPE GLOBAL DEFAULT [[#]] (.sdata) __global_pointer$ -.globl _edata, __bss_start +.globl _etext, _edata, __bss_start lla gp, __global_pointer$ .section .data,"aw",@progbits; .long _GLOBAL_OFFSET_TABLE_ - . 
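For context on the symbols these tests exercise: `_etext`, `_edata`, `__bss_start`, and `_end` are the traditional linker-defined layout symbols, marking (respectively) the first address past the text segment, the first address past initialized data, the start of .bss, and the first address past the whole image. The following is a minimal, hypothetical C consumer, not part of this patch; only the symbol names come from the tests, and the printed addresses are illustrative:

/* Hypothetical consumer of the linker-defined layout symbols checked in
 * the tests above. Declaring them as arrays avoids accidental loads;
 * only their addresses are meaningful. */
#include <stdio.h>

extern char _etext[];      /* first address past the text segment     */
extern char _edata[];      /* first address past the initialized data */
extern char __bss_start[]; /* start of zero-initialized data (.bss)   */
extern char _end[];        /* first address past the .bss section     */

int main(void) {
  printf("_etext      = %p\n", (void *)_etext);
  printf("_edata      = %p\n", (void *)_edata);
  printf("__bss_start = %p\n", (void *)__bss_start);
  printf("_end        = %p\n", (void *)_end);
  return 0;
}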
diff --git a/lld/test/ELF/x86-64-section-layout.s b/lld/test/ELF/x86-64-section-layout.s index 3720127..f292877 100644 --- a/lld/test/ELF/x86-64-section-layout.s +++ b/lld/test/ELF/x86-64-section-layout.s @@ -6,31 +6,31 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64 --defsym=BSS=1 a.s -o a.o # RUN: ld.lld --section-start=.note=0x200300 a.o -o a -# RUN: llvm-readelf -S -l a | FileCheck %s +# RUN: llvm-readelf -S -l -sX a | FileCheck %s # RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a1.o # RUN: ld.lld --section-start=.note=0x200300 a1.o -o a1 -# RUN: llvm-readelf -S a1 | FileCheck %s --check-prefix=CHECK1 +# RUN: llvm-readelf -S -sX a1 | FileCheck %s --check-prefix=CHECK1 # RUN: ld.lld -T b.lds -z norelro a.o -o b # RUN: llvm-readelf -S -l b | FileCheck %s --check-prefix=CHECK2 -# CHECK: Name Type Address Off Size ES Flg Lk Inf Al -# CHECK-NEXT: NULL 0000000000000000 000000 000000 00 0 0 0 -# CHECK-NEXT: .note NOTE 0000000000200300 000300 000001 00 A 0 0 1 -# CHECK-NEXT: .lrodata PROGBITS 0000000000200301 000301 000002 00 Al 0 0 1 -# CHECK-NEXT: .rodata PROGBITS 0000000000200303 000303 000001 00 A 0 0 1 -# CHECK-NEXT: .text PROGBITS 0000000000201304 000304 000001 00 AX 0 0 4 -# CHECK-NEXT: .tdata PROGBITS 0000000000202305 000305 000001 00 WAT 0 0 1 -# CHECK-NEXT: .tbss NOBITS 0000000000202306 000306 000002 00 WAT 0 0 1 -# CHECK-NEXT: .relro_padding NOBITS 0000000000202306 000306 000cfa 00 WA 0 0 1 -# CHECK-NEXT: .data PROGBITS 0000000000203306 000306 000001 00 WA 0 0 1 -# CHECK-NEXT: .bss NOBITS 0000000000203307 000307 001800 00 WA 0 0 1 +# CHECK: Name Type Address Off Size ES Flg Lk Inf Al +# CHECK-NEXT: NULL 0000000000000000 000000 000000 00 0 0 0 +# CHECK-NEXT: .note NOTE 0000000000200300 000300 000001 00 A 0 0 1 +# CHECK-NEXT: .lrodata PROGBITS 0000000000200301 000301 000002 00 Al 0 0 1 +# CHECK-NEXT: .rodata PROGBITS 0000000000200303 000303 000001 00 A 0 0 1 +# CHECK-NEXT: .text PROGBITS 0000000000201304 000304 000001 00 AX 0 0 4 +# CHECK-NEXT: .tdata PROGBITS 0000000000202305 000305 000001 00 WAT 0 0 1 +# CHECK-NEXT: .tbss NOBITS 0000000000202306 000306 000002 00 WAT 0 0 1 +# CHECK-NEXT: .relro_padding NOBITS 0000000000202306 000306 000cfa 00 WA 0 0 1 +# CHECK-NEXT: .data PROGBITS 0000000000203306 000306 000001 00 WA 0 0 1 +# CHECK-NEXT: .bss NOBITS 0000000000203307 000307 001800 00 WA 0 0 1 ## We spend size(.bss) % MAXPAGESIZE bytes for .bss. 
-# CHECK-NEXT: .ldata PROGBITS 0000000000205b07 000b07 000002 00 WAl 0 0 1 -# CHECK-NEXT: .ldata2 PROGBITS 0000000000205b09 000b09 000001 00 WAl 0 0 1 -# CHECK-NEXT: .lbss NOBITS 0000000000205b0a 000b0a 000002 00 WAl 0 0 1 -# CHECK-NEXT: .comment PROGBITS 0000000000000000 000b0a {{.*}} 01 MS 0 0 1 +# CHECK-NEXT: .ldata PROGBITS 0000000000205b07 000b07 000002 00 WAl 0 0 1 +# CHECK-NEXT: .ldata2 PROGBITS 0000000000205b09 000b09 000001 00 WAl 0 0 1 +# CHECK-NEXT: .lbss NOBITS 0000000000205b0a 000b0a 001201 00 WAl 0 0 1 +# CHECK-NEXT: .comment PROGBITS 0000000000000000 000b0a {{.*}} 01 MS 0 0 1 # CHECK: Program Headers: # CHECK-NEXT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align @@ -39,13 +39,23 @@ # CHECK-NEXT: LOAD 0x000304 0x0000000000201304 0x0000000000201304 0x000001 0x000001 R E 0x1000 # CHECK-NEXT: LOAD 0x000305 0x0000000000202305 0x0000000000202305 0x000001 0x000cfb RW 0x1000 # CHECK-NEXT: LOAD 0x000306 0x0000000000203306 0x0000000000203306 0x000001 0x001801 RW 0x1000 -# CHECK-NEXT: LOAD 0x000b07 0x0000000000205b07 0x0000000000205b07 0x000003 0x000005 RW 0x1000 +# CHECK-NEXT: LOAD 0x000b07 0x0000000000205b07 0x0000000000205b07 0x000003 0x001204 RW 0x1000 + +# CHECK: 0000000000201304 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _start +# CHECK-NEXT: 0000000000201305 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _etext +# CHECK-NEXT: 0000000000205b0a 0 NOTYPE GLOBAL DEFAULT [[#]] (.ldata2) _edata +# CHECK-NEXT: 0000000000206d0b 0 NOTYPE GLOBAL DEFAULT [[#]] (.lbss) _end # CHECK1: .data PROGBITS 0000000000203306 000306 000001 00 WA 0 0 1 # CHECK1-NEXT: .ldata PROGBITS 0000000000203307 000307 000002 00 WAl 0 0 1 # CHECK1-NEXT: .ldata2 PROGBITS 0000000000203309 000309 000001 00 WAl 0 0 1 # CHECK1-NEXT: .comment PROGBITS 0000000000000000 00030a {{.*}} 01 MS 0 0 1 +# CHECK1: 0000000000201304 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _start +# CHECK1-NEXT: 0000000000201305 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _etext +# CHECK1-NEXT: 000000000020330a 0 NOTYPE GLOBAL DEFAULT [[#]] (.ldata2) _edata +# CHECK1-NEXT: 000000000020330a 0 NOTYPE GLOBAL DEFAULT [[#]] (.ldata2) _end + # CHECK2: .note NOTE 0000000000200300 000300 000001 00 A 0 0 1 # CHECK2-NEXT: .lrodata PROGBITS 0000000000200301 000301 000001 00 Al 0 0 1 ## With a SECTIONS command, we suppress the default rule placing .lrodata.* into .lrodata. 
@@ -59,7 +69,7 @@ # CHECK2-NEXT: .bss NOBITS 0000000000200307 000307 001800 00 WA 0 0 1 # CHECK2-NEXT: .ldata PROGBITS 0000000000201b07 001b07 000002 00 WAl 0 0 1 # CHECK2-NEXT: .ldata2 PROGBITS 0000000000201b09 001b09 000001 00 WAl 0 0 1 -# CHECK2-NEXT: .lbss NOBITS 0000000000201b0a 001b0a 000002 00 WAl 0 0 1 +# CHECK2-NEXT: .lbss NOBITS 0000000000201b0a 001b0a 001201 00 WAl 0 0 1 # CHECK2-NEXT: .comment PROGBITS 0000000000000000 001b0a {{.*}} 01 MS 0 0 1 # CHECK2: Program Headers: @@ -67,11 +77,11 @@ # CHECK2-NEXT: PHDR 0x000040 0x0000000000200040 0x0000000000200040 {{.*}} {{.*}} R 0x8 # CHECK2-NEXT: LOAD 0x000000 0x0000000000200000 0x0000000000200000 0x000304 0x000304 R 0x1000 # CHECK2-NEXT: LOAD 0x000304 0x0000000000200304 0x0000000000200304 0x000001 0x000001 R E 0x1000 -# CHECK2-NEXT: LOAD 0x000305 0x0000000000200305 0x0000000000200305 0x001805 0x001807 RW 0x1000 +# CHECK2-NEXT: LOAD 0x000305 0x0000000000200305 0x0000000000200305 0x001805 0x002a06 RW 0x1000 # CHECK2-NEXT: TLS 0x000305 0x0000000000200305 0x0000000000200305 0x000001 0x000003 R 0x1 #--- a.s -.globl _start +.globl _start, _etext, _edata, _end _start: ret @@ -92,7 +102,7 @@ _start: ## Input .ldata.rel.ro sections are placed in the output .ldata section. .section .ldata.rel.ro,"awl"; .space 1 .ifdef BSS -.section .lbss,"awl",@nobits; .space 1 +.section .lbss,"awl",@nobits; .space 0x1200 ## Input .lbss.rel.ro sections are placed in the output .lbss section. .section .lbss.rel.ro,"awl",@nobits; .space 1 .endif -- cgit v1.1 From bb180856ec28efe305dc77ca4bb3db12d8932edf Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 9 Feb 2024 14:04:59 -0600 Subject: [NVPTX][Fix] Update minimum CPU for NVPTX intrinsics test Summary: This test requires at least sm_30 to run, but that is still below the minimum supported version of sm_52 currently. Just set this to sm_60 so the tests pass in the future. --- llvm/test/CodeGen/NVPTX/intrinsics.ll | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll index 7e45b1f..2994f60 100644 --- a/llvm/test/CodeGen/NVPTX/intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll @@ -1,7 +1,7 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} +; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s +; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_60 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 | %ptxas-verify %} ; CHECK-LABEL: test_fabsf( define float @test_fabsf(float %f) { -- cgit v1.1 From 5f26b902d59b98ffa450f7bae508b330d0184d0d Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 9 Feb 2024 12:09:42 -0800 Subject: [ELF] Apply forgotten change to #81223 --- lld/ELF/Writer.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 53ca70b..bd4db1e 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -2446,8 +2446,8 @@ SmallVector Writer::createPhdrs(Partition &part) { // Segments are contiguous memory regions that has the same attributes // (e.g. executable or writable). There is one phdr for each segment. 
// Therefore, we need to create a new phdr when the next section has - // compatible flags or is loaded at a discontiguous address or memory region - // using AT or AT> linker script command, respectively. + // incompatible flags or is loaded at a discontiguous address or memory + // region using AT or AT> linker script command, respectively. // // As an exception, we don't create a separate load segment for the ELF // headers, even if the first "real" output has an AT or AT> attribute. @@ -2461,10 +2461,10 @@ SmallVector Writer::createPhdrs(Partition &part) { // needed to create a new LOAD) uint64_t newFlags = computeFlags(sec->getPhdrFlags()); // When --no-rosegment is specified, RO and RX sections are compatible. - uint32_t diff = flags ^ newFlags; + uint32_t incompatible = flags ^ newFlags; if (config->singleRoRx && !(newFlags & PF_W)) - diff &= ~PF_X; - if (diff) + incompatible &= ~PF_X; + if (incompatible) load = nullptr; bool sameLMARegion = -- cgit v1.1 From 3c707310a3e0233c1bc364a408e6fb43e56e1b78 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 9 Feb 2024 14:11:01 -0600 Subject: [NVPTX] Add clang builtin for `__nvvm_reflect` intrinsic (#81277) Summary: Some recent support made usage of `__nvvm_reflect` more consistent. We should expose it as a builtin rather than forcing users to externally define the function. --- clang/include/clang/Basic/BuiltinsNVPTX.def | 1 + clang/test/CodeGen/builtins-nvptx.c | 8 ++++++++ clang/test/CodeGenOpenCL/reflect.cl | 28 ++++++++++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 3 ++- llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll | 4 ++-- llvm/test/CodeGen/NVPTX/nvvm-reflect.ll | 4 ++-- 6 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/reflect.cl diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 7819e71..8d3c5e6 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -159,6 +159,7 @@ BUILTIN(__nvvm_read_ptx_sreg_pm3, "i", "n") BUILTIN(__nvvm_prmt, "UiUiUiUi", "") BUILTIN(__nvvm_exit, "v", "r") +BUILTIN(__nvvm_reflect, "UicC*", "r") TARGET_BUILTIN(__nvvm_nanosleep, "vUi", "n", AND(SM_70, PTX63)) // Min Max diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index ad7c27f..4dba767 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -44,6 +44,14 @@ __device__ int read_tid() { } +__device__ bool reflect() { + +// CHECK: call i32 @llvm.nvvm.reflect(ptr {{.*}}) + + unsigned x = __nvvm_reflect("__CUDA_ARCH"); + return x >= 700; +} + __device__ int read_ntid() { // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() diff --git a/clang/test/CodeGenOpenCL/reflect.cl b/clang/test/CodeGenOpenCL/reflect.cl new file mode 100644 index 0000000..9ae4a5f --- /dev/null +++ b/clang/test/CodeGenOpenCL/reflect.cl @@ -0,0 +1,28 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -emit-llvm -O0 -o - | FileCheck %s + +// CHECK-LABEL: define dso_local zeroext i1 @device_function( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.reflect(ptr addrspacecast (ptr addrspace(4) @.str to ptr)) +// CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[TMP0]], 700 +// CHECK-NEXT: ret i1 [[CMP]] +// +bool device_function() { + return __nvvm_reflect("__CUDA_ARCH") >= 700; +} + +// 
CHECK-LABEL: define dso_local spir_kernel void @kernel_function( +// CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// CHECK-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR]], align 4 +// CHECK-NEXT: [[CALL:%.*]] = call zeroext i1 @device_function() #[[ATTR3:[0-9]+]] +// CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CALL]] to i32 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR]], align 4 +// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP0]], align 4 +// CHECK-NEXT: ret void +// +__kernel void kernel_function(__global int *i) { + *i = device_function(); +} diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index d825dc8..726cea0 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1625,7 +1625,8 @@ def int_nvvm_compiler_warn : Intrinsic<[], [llvm_anyptr_ty], [], "llvm.nvvm.compiler.warn">; def int_nvvm_reflect : - Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty], [IntrNoMem], "llvm.nvvm.reflect">; + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem], "llvm.nvvm.reflect">, + ClangBuiltin<"__nvvm_reflect">; // isspacep.{const, global, local, shared} def int_nvvm_isspacep_const diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll index 1cb5c87..46ab79d 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll @@ -41,7 +41,7 @@ exit: ret float %ret } -declare i32 @llvm.nvvm.reflect.p0(ptr) +declare i32 @llvm.nvvm.reflect(ptr) ; CHECK-LABEL: define noundef i32 @intrinsic define i32 @intrinsic() { @@ -49,7 +49,7 @@ define i32 @intrinsic() { ; USE_FTZ_0: ret i32 0 ; USE_FTZ_1: ret i32 1 %ptr = tail call ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4) @str) - %reflect = tail call i32 @llvm.nvvm.reflect.p0(ptr %ptr) + %reflect = tail call i32 @llvm.nvvm.reflect(ptr %ptr) ret i32 %reflect } diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll index 9b1939f..2ed9f7c 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll @@ -41,7 +41,7 @@ exit: ret float %ret } -declare i32 @llvm.nvvm.reflect.p0(ptr) +declare i32 @llvm.nvvm.reflect(ptr) ; CHECK-LABEL: define noundef i32 @intrinsic define i32 @intrinsic() { @@ -49,7 +49,7 @@ define i32 @intrinsic() { ; USE_FTZ_0: ret i32 0 ; USE_FTZ_1: ret i32 1 %ptr = tail call ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4) @str) - %reflect = tail call i32 @llvm.nvvm.reflect.p0(ptr %ptr) + %reflect = tail call i32 @llvm.nvvm.reflect(ptr %ptr) ret i32 %reflect } -- cgit v1.1 From 2cbe5a33a5fda257747d75863bd9ccb8920b9249 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 9 Feb 2024 12:26:12 -0800 Subject: [llvm-objcopy] Fix the build again after 7ddc320 --- llvm/lib/ObjCopy/ELF/ELFObject.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.cpp b/llvm/lib/ObjCopy/ELF/ELFObject.cpp index d7559ab..b9b9167 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObject.cpp +++ b/llvm/lib/ObjCopy/ELF/ELFObject.cpp @@ -2980,7 +2980,7 @@ SRECWriter::getTotalSize(WritableMemoryBuffer &EmptyBuffer) const { SRECSizeCalculator SizeCalc(EmptyBuffer, 0); for (const SectionBase *Sec : Sections) if 
(Error Err = Sec->accept(SizeCalc)) - return Err; + return std::move(Err); SizeCalc.writeRecords(Obj.Entry); // We need to add the size of the Header and Terminator records. -- cgit v1.1 From 228e9d5bcfcb6411d2a257b560464323d0248c35 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 9 Feb 2024 15:31:08 -0500 Subject: [clang] Document the type_visibility attribute (#79157) I was looking for the documentation of that attribute, and the best I could find was a Stackoverflow answer or the commit message that originally introduced the attribute. I figured I might as well document what I find to save everyone time in the future. --- clang/include/clang/Basic/Attr.td | 4 ++-- clang/include/clang/Basic/AttrDocs.td | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index b2d5309..45a29e7 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -3226,8 +3226,8 @@ def TypeVisibility : InheritableAttr { let Args = [EnumArgument<"Visibility", "VisibilityType", ["default", "hidden", "internal", "protected"], ["Default", "Hidden", "Hidden", "Protected"]>]; -// let Subjects = [Tag, ObjCInterface, Namespace]; - let Documentation = [Undocumented]; + // let Subjects = SubjectList<[Tag, ObjCInterface, Namespace], ErrorDiag>; + let Documentation = [TypeVisibilityDocs]; } def VecReturn : InheritableAttr { diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 19a98a0..8d36909 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -5585,6 +5585,25 @@ See :doc:`LTOVisibility`. }]; } +def TypeVisibilityDocs : Documentation { + let Category = DocCatType; + let Content = [{ +The ``type_visibility`` attribute allows the visibility of a type and its vague +linkage objects (vtable, typeinfo, typeinfo name) to be controlled separately from +the visibility of functions and data members of the type. + +For example, this can be used to give default visibility to the typeinfo and the vtable +of a type while still keeping hidden visibility on its member functions and static data +members. + +This attribute can only be applied to types and namespaces. + +If both ``visibility`` and ``type_visibility`` are applied to a type or a namespace, the +visibility specified with the ``type_visibility`` attribute overrides the visibility +provided with the regular ``visibility`` attribute. + }]; +} + def RenderScriptKernelAttributeDocs : Documentation { let Category = DocCatFunction; let Content = [{ -- cgit v1.1 From bd65547805a4b02be8f8c9e7acf1df850164da53 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 9 Feb 2024 13:04:49 -0800 Subject: [workflows] Create a more descriptive title and body when creating a PR for backports (#80396) When a backport request is made, the resulting pull request will have a title like this: : And a body that says: Backport .. 
Requested By: --- .github/workflows/issue-release-workflow.yml | 1 + llvm/utils/git/github-automation.py | 29 ++++++++++++++++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/.github/workflows/issue-release-workflow.yml b/.github/workflows/issue-release-workflow.yml index 33a1e89..448c1c5 100644 --- a/.github/workflows/issue-release-workflow.yml +++ b/.github/workflows/issue-release-workflow.yml @@ -65,4 +65,5 @@ jobs: release-workflow \ --branch-repo-token ${{ secrets.RELEASE_WORKFLOW_PUSH_SECRET }} \ --issue-number ${{ github.event.issue.number }} \ + --requested-by ${{ github.event.issue.user.login }} \ auto diff --git a/llvm/utils/git/github-automation.py b/llvm/utils/git/github-automation.py index e2b84ae..b475eff 100755 --- a/llvm/utils/git/github-automation.py +++ b/llvm/utils/git/github-automation.py @@ -343,6 +343,7 @@ class ReleaseWorkflow: branch_repo_name: str, branch_repo_token: str, llvm_project_dir: str, + requested_by: str, ) -> None: self._token = token self._repo_name = repo @@ -353,6 +354,7 @@ class ReleaseWorkflow: else: self._branch_repo_token = self.token self._llvm_project_dir = llvm_project_dir + self._requested_by = requested_by @property def token(self) -> str: @@ -383,6 +385,10 @@ class ReleaseWorkflow: return self._llvm_project_dir @property + def requested_by(self) -> str: + return self._requested_by + + @property def repo(self) -> github.Repository.Repository: return github.Github(self.token).get_repo(self.repo_name) @@ -536,7 +542,7 @@ class ReleaseWorkflow: self.issue_remove_cherry_pick_failed_label() return self.create_pull_request( - self.branch_repo_owner, self.repo_name, branch_name + self.branch_repo_owner, self.repo_name, branch_name, commits ) def check_if_pull_request_exists( @@ -545,7 +551,9 @@ class ReleaseWorkflow: pulls = repo.get_pulls(head=head) return pulls.totalCount != 0 - def create_pull_request(self, owner: str, repo_name: str, branch: str) -> bool: + def create_pull_request( + self, owner: str, repo_name: str, branch: str, commits: List[str] + ) -> bool: """ Create a pull request in `self.repo_name`. 
The base branch of the pull request will be chosen based on the the milestone attached to @@ -567,9 +575,15 @@ class ReleaseWorkflow: print("PR already exists...") return True try: + commit_message = repo.get_commit(commits[-1]).commit.message + message_lines = commit_message.splitlines() + title = "{}: {}".format(release_branch_for_issue, message_lines[0]) + body = "Backport {}\n\nRequested by: @{}".format( + " ".join(commits), self.requested_by + ) pull = repo.create_pull( - title=f"PR for {issue_ref}", - body="resolves {}".format(issue_ref), + title=title, + body=body, base=release_branch_for_issue, head=head, maintainer_can_modify=False, @@ -683,6 +697,12 @@ llvmbot_git_config_parser = subparsers.add_parser( "setup-llvmbot-git", help="Set the default user and email for the git repo in LLVM_PROJECT_DIR to llvmbot", ) +release_workflow_parser.add_argument( + "--requested-by", + type=str, + required=True, + help="The user that requested this backport", +) args = parser.parse_args() @@ -712,6 +732,7 @@ elif args.command == "release-workflow": args.branch_repo, args.branch_repo_token, args.llvm_project_dir, + args.requested_by, ) if not release_workflow.release_branch_for_issue: release_workflow.issue_notify_no_milestone(sys.stdin.readlines()) -- cgit v1.1 From 967374123bd6eee23db9a57fcac7324e420648c5 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 9 Feb 2024 15:05:18 -0600 Subject: [libc] Bump up minimum PTX version to 6.3 Summary: I neglected the fact that `activemask` is a 6.2 or 6.3 feature, so building this on older machines is incorrect. Bump this up to 6.3 for now so it works. In the future we will try to get rid of the N architecture business. --- libc/cmake/modules/LLVMLibCObjectRules.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake index 70e64a6..ef1f248 100644 --- a/libc/cmake/modules/LLVMLibCObjectRules.cmake +++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake @@ -106,13 +106,13 @@ function(get_nvptx_compile_options output_var gpu_arch) list(APPEND nvptx_options "-Wno-unknown-cuda-version") list(APPEND nvptx_options "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false") if(${gpu_arch} STREQUAL "sm_35") - list(APPEND nvptx_options "--cuda-feature=+ptx60") + list(APPEND nvptx_options "--cuda-feature=+ptx63") elseif(${gpu_arch} STREQUAL "sm_37") - list(APPEND nvptx_options "--cuda-feature=+ptx60") + list(APPEND nvptx_options "--cuda-feature=+ptx63") elseif(${gpu_arch} STREQUAL "sm_50") - list(APPEND nvptx_options "--cuda-feature=+ptx60") + list(APPEND nvptx_options "--cuda-feature=+ptx63") elseif(${gpu_arch} STREQUAL "sm_52") - list(APPEND nvptx_options "--cuda-feature=+ptx60") + list(APPEND nvptx_options "--cuda-feature=+ptx63") elseif(${gpu_arch} STREQUAL "sm_53") list(APPEND nvptx_options "--cuda-feature=+ptx63") elseif(${gpu_arch} STREQUAL "sm_60") -- cgit v1.1 From 7fd1466433a05f4e2e183914a8bd7c372bb0b8a7 Mon Sep 17 00:00:00 2001 From: Richard Dzenis Date: Fri, 9 Feb 2024 23:07:07 +0200 Subject: [mlir] Fix CallOpInterface extraClassDeclaration to be fully namespace qualified (#81258) `extraClassDeclaration` of `CallOpInterface` can be inherited by other `OpInterfaces` into foreign namespaces, thus types must be fully qualified to prevent compiler errors, for example: def MyCaller : OpInterface<"MyCaller", [CallOpInterface]> { let cppNamespace = "::MyNamespace"; } --- mlir/include/mlir/Interfaces/CallInterfaces.td | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/mlir/include/mlir/Interfaces/CallInterfaces.td b/mlir/include/mlir/Interfaces/CallInterfaces.td index 3e9c002..752de74 100644 --- a/mlir/include/mlir/Interfaces/CallInterfaces.td +++ b/mlir/include/mlir/Interfaces/CallInterfaces.td @@ -68,7 +68,7 @@ def CallOpInterface : OpInterface<"CallOpInterface"> { /// `symbolTable` is an optional parameter that will allow for using a /// cached symbol table for symbol lookups instead of performing an O(N) /// scan. - Operation *resolveCallable(SymbolTableCollection *symbolTable = nullptr); + ::mlir::Operation *resolveCallable(::mlir::SymbolTableCollection *symbolTable = nullptr); }]; } -- cgit v1.1 From 1d0f86ba80543931d467d6ce3f2ad8cdde514710 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 9 Feb 2024 13:39:08 -0800 Subject: [Sema] Warn unused functions for FMV based on the target attribute (#81302) The spurious -Wunused-function warning issue for `target_version` #80227 also applied to `__attribute__((target(...)))` based FMV. #81167 removed warnings for all `target`-based FMV. This patch restores the warnings for `__attribute__((target("default")))`. --- clang/lib/AST/Decl.cpp | 6 +++++- clang/test/SemaCXX/attr-target-mv-warn-unused.cpp | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 clang/test/SemaCXX/attr-target-mv-warn-unused.cpp diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index e281f2d..5d6bb72 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3538,7 +3538,11 @@ bool FunctionDecl::isTargetMultiVersion() const { } bool FunctionDecl::isTargetMultiVersionDefault() const { - return isMultiVersion() && hasAttr() && + if (!isMultiVersion()) + return false; + if (hasAttr()) + return getAttr()->isDefaultVersion(); + return hasAttr() && getAttr()->isDefaultVersion(); } diff --git a/clang/test/SemaCXX/attr-target-mv-warn-unused.cpp b/clang/test/SemaCXX/attr-target-mv-warn-unused.cpp new file mode 100644 index 0000000..1901589 --- /dev/null +++ b/clang/test/SemaCXX/attr-target-mv-warn-unused.cpp @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -verify -Wunused %s + +__attribute__((target("sse3"))) +static int not_used_fmv() { return 1; } +__attribute__((target("avx2"))) +static int not_used_fmv() { return 2; } +__attribute__((target("default"))) +static int not_used_fmv() { return 0; } // expected-warning {{unused function 'not_used_fmv'}} + +__attribute__((target("sse3"))) +static int definitely_used_fmv() { return 1; } +__attribute__((target("avx2"))) +static int definitely_used_fmv() { return 2; } +__attribute__((target("default"))) +static int definitely_used_fmv() { return 0; } +int definite_user() { return definitely_used_fmv(); } -- cgit v1.1 From c7a0db1e20251f436e3d500eac03bd9be1d88b45 Mon Sep 17 00:00:00 2001 From: yozhu <101743168+yozhu@users.noreply.github.com> Date: Fri, 9 Feb 2024 13:55:08 -0800 Subject: [CFI][annotation] Leave alone function pointers in function annotations (#80173) Function annotation, as part of llvm.metadata, is for the function itself and doesn't apply to its corresponding jump table entry, so with CFI we shouldn't replace function pointer in function annotation with pointer to its corresponding jump table entry. 
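As a source-level illustration of the case being fixed (hypothetical C, separate from the .ll test added below; the names `hot_path` and `perf_critical` are invented): Clang lowers a function-level `annotate` attribute into an entry of the `llvm.global.annotations` metadata global, and that entry refers to the annotated function itself, so under CFI it should keep pointing at the real body rather than at the jump table entry:

/* Hypothetical sketch: built with CFI enabled (e.g. -flto
 * -fsanitize=cfi-icall), the indirect call through 'fp' is checked via
 * the jump table, while the annotate attribute below lands in
 * llvm.global.annotations and must keep referencing 'hot_path' itself. */
__attribute__((annotate("perf_critical")))
int hot_path(int x) { return x + 1; }

int call_through(int (*fp)(int), int v) {
  return fp(v); /* CFI-checked indirect call */
}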
--- llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 28 ++++++++- .../Transforms/LowerTypeTests/cfi-annotation.ll | 68 ++++++++++++++++++++++ 2 files changed, 94 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 733f290..633fcb3 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -470,6 +470,9 @@ class LowerTypeTestsModule { Function *WeakInitializerFn = nullptr; + GlobalVariable *GlobalAnnotation; + DenseSet FunctionAnnotations; + bool shouldExportConstantsAsAbsoluteSymbols(); uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL); TypeIdLowering importTypeId(StringRef TypeId); @@ -531,6 +534,10 @@ class LowerTypeTestsModule { /// replace each use, which is a direct function call. void replaceDirectCalls(Value *Old, Value *New); + bool isFunctionAnnotation(Value *V) const { + return FunctionAnnotations.contains(V); + } + public: LowerTypeTestsModule(Module &M, ModuleAnalysisManager &AM, ModuleSummaryIndex *ExportSummary, @@ -1377,8 +1384,11 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( // (all?) targets. Switch to a runtime initializer. SmallSetVector GlobalVarUsers; findGlobalVariableUsersOf(F, GlobalVarUsers); - for (auto *GV : GlobalVarUsers) + for (auto *GV : GlobalVarUsers) { + if (GV == GlobalAnnotation) + continue; moveInitializerToModuleConstructor(GV); + } // Can not RAUW F with an expression that uses F. Replace with a temporary // placeholder first. @@ -1837,6 +1847,16 @@ LowerTypeTestsModule::LowerTypeTestsModule( } OS = TargetTriple.getOS(); ObjectFormat = TargetTriple.getObjectFormat(); + + // Function annotation describes or applies to function itself, and + // shouldn't be associated with jump table thunk generated for CFI. + GlobalAnnotation = M.getGlobalVariable("llvm.global.annotations"); + if (GlobalAnnotation && GlobalAnnotation->hasInitializer()) { + const ConstantArray *CA = + cast(GlobalAnnotation->getInitializer()); + for (Value *Op : CA->operands()) + FunctionAnnotations.insert(Op); + } } bool LowerTypeTestsModule::runForTesting(Module &M, ModuleAnalysisManager &AM) { @@ -1896,10 +1916,14 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, if (isa(U.getUser())) continue; - // Skip direct calls to externally defined or non-dso_local functions + // Skip direct calls to externally defined or non-dso_local functions. if (isDirectCall(U) && (Old->isDSOLocal() || !IsJumpTableCanonical)) continue; + // Skip function annotation. + if (isFunctionAnnotation(U.getUser())) + continue; + // Must handle Constants specially, we cannot call replaceUsesOfWith on a // constant because they are uniqued. 
    if (auto *C = dyn_cast<Constant>(U.getUser())) {

diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll b/llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll
new file mode 100644
index 0000000..034af89
--- /dev/null
+++ b/llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll
@@ -0,0 +1,68 @@
+; REQUIRES: aarch64-registered-target
+
+; RUN: opt -passes=lowertypetests %s -o %t.o
+; RUN: llvm-dis %t.o -o - | FileCheck %s --check-prefix=CHECK-foobar
+; CHECK-foobar: {{llvm.global.annotations = .*[foo|bar], .*[foo|bar],}}
+; RUN: llvm-dis %t.o -o - | FileCheck %s --check-prefix=CHECK-cfi
+; CHECK-cfi-NOT: {{llvm.global.annotations = .*cfi.*}}
+
+target triple = "aarch64-none-linux-gnu"
+
+@.src = private unnamed_addr constant [7 x i8] c"test.c\00", align 1
+@.str = private unnamed_addr constant [30 x i8] c"annotation_string_literal_bar\00", section "llvm.metadata"
+@.str.1 = private unnamed_addr constant [7 x i8] c"test.c\00", section "llvm.metadata"
+@.str.2 = private unnamed_addr constant [30 x i8] c"annotation_string_literal_foo\00", section "llvm.metadata"
+@llvm.global.annotations = appending global [2 x { ptr, ptr, ptr, i32, ptr }] [{ ptr, ptr, ptr, i32, ptr } { ptr @bar, ptr @.str, ptr @.str.1, i32 2, ptr null }, { ptr, ptr, ptr, i32, ptr } { ptr @foo, ptr @.str.2, ptr @.str.1, i32 1, ptr null }], section "llvm.metadata"
+
+define i32 @bar(i32 noundef %0) #0 !type !8 !type !9 {
+  %2 = alloca i32, align 4
+  store i32 %0, ptr %2, align 4
+  %3 = load i32, ptr %2, align 4
+  %4 = call i32 @foo(i32 noundef %3)
+  ret i32 %4
+}
+
+declare !type !8 !type !9 i32 @foo(i32 noundef) #1
+
+define i32 @test(i32 noundef %0) #0 !type !8 !type !9 {
+  %2 = alloca i32, align 4
+  %3 = alloca ptr, align 8
+  store i32 %0, ptr %2, align 4
+  %4 = load i32, ptr %2, align 4
+  %5 = icmp sgt i32 %4, 0
+  %6 = zext i1 %5 to i64
+  %7 = select i1 %5, ptr @foo, ptr @bar
+  store ptr %7, ptr %3, align 8
+  %8 = load ptr, ptr %3, align 8
+  %9 = call i1 @llvm.type.test(ptr %8, metadata !"_ZTSFiiE"), !nosanitize !10
+  br i1 %9, label %11, label %10, !nosanitize !10
+
+10:
+  call void @llvm.ubsantrap(i8 2) #4, !nosanitize !10
+  unreachable, !nosanitize !10
+
+11:
+  %12 = load i32, ptr %2, align 4
+  %13 = call i32 %8(i32 noundef %12)
+  ret i32 %13
+}
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare void @llvm.ubsantrap(i8 immarg)
+
+attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" }
+attributes #1 = { "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" }
+attributes #4 = { noreturn nounwind }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 4, !"CFI Canonical Jump Tables", i32 0}
+!2 = !{i32 8, !"PIC Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 1}
+!5 = !{i32 1, !"ThinLTO", i32 0}
+!6 = !{i32 1, !"EnableSplitLTOUnit", i32 1}
+!8 = !{i64 0, !"_ZTSFiiE"}
+!9 = !{i64 0, !"_ZTSFiiE.generalized"}
+!10 = !{}
-- 
cgit v1.1


From 7ff488708c0caa1b31af7ad677b9b321209f6738 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?=
Date: Fri, 9 Feb 2024 13:57:26 -0800
Subject: [flang][cuda][NFC] Rename CUDAAttribute to CUDADataAttribute (#81323)

The
newly introduced `CUDAAttribute` is meant for CUDA attributes associated with variable. In order to not clash with the future attribute for function/subroutine, rename `CUDAAttribute` to `CUDADataAttribute`. --- flang/include/flang/Lower/ConvertVariable.h | 6 +++--- flang/include/flang/Optimizer/Builder/FIRBuilder.h | 4 ++-- flang/include/flang/Optimizer/Builder/HLFIRTools.h | 2 +- flang/include/flang/Optimizer/Dialect/FIRAttr.td | 7 ++++--- flang/include/flang/Optimizer/Dialect/FIROps.td | 4 ++-- flang/include/flang/Optimizer/HLFIR/HLFIROps.td | 4 ++-- flang/include/flang/Optimizer/Support/Utils.h | 20 +++++++++---------- flang/lib/Lower/CallInterface.cpp | 2 +- flang/lib/Lower/ConvertVariable.cpp | 23 +++++++++++----------- flang/lib/Optimizer/Builder/FIRBuilder.cpp | 4 ++-- flang/lib/Optimizer/Builder/HLFIRTools.cpp | 2 +- flang/lib/Optimizer/Dialect/FIRAttr.cpp | 2 +- flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp | 2 +- .../Optimizer/HLFIR/Transforms/ConvertToFIR.cpp | 4 ++-- flang/unittests/Optimizer/FortranVariableTest.cpp | 8 ++++---- 15 files changed, 48 insertions(+), 46 deletions(-) diff --git a/flang/include/flang/Lower/ConvertVariable.h b/flang/include/flang/Lower/ConvertVariable.h index cdbf050..b13bb41 100644 --- a/flang/include/flang/Lower/ConvertVariable.h +++ b/flang/include/flang/Lower/ConvertVariable.h @@ -139,9 +139,9 @@ translateSymbolAttributes(mlir::MLIRContext *mlirContext, /// Translate the CUDA Fortran attributes of \p sym into the FIR CUDA attribute /// representation. -fir::CUDAAttributeAttr -translateSymbolCUDAAttribute(mlir::MLIRContext *mlirContext, - const Fortran::semantics::Symbol &sym); +fir::CUDADataAttributeAttr +translateSymbolCUDADataAttribute(mlir::MLIRContext *mlirContext, + const Fortran::semantics::Symbol &sym); /// Map a symbol to a given fir::ExtendedValue. This will generate an /// hlfir.declare when lowering to HLFIR and map the hlfir.declare result to the diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h index f50dacd..39821f1 100644 --- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h +++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h @@ -231,13 +231,13 @@ public: mlir::StringAttr linkage = {}, mlir::Attribute value = {}, bool isConst = false, bool isTarget = false, - fir::CUDAAttributeAttr cudaAttr = {}); + fir::CUDADataAttributeAttr cudaAttr = {}); fir::GlobalOp createGlobal(mlir::Location loc, mlir::Type type, llvm::StringRef name, bool isConst, bool isTarget, std::function bodyBuilder, mlir::StringAttr linkage = {}, - fir::CUDAAttributeAttr cudaAttr = {}); + fir::CUDADataAttributeAttr cudaAttr = {}); /// Create a global constant (read-only) value. fir::GlobalOp createGlobalConstant(mlir::Location loc, mlir::Type type, diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h index fe69ffa..170e134 100644 --- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h +++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h @@ -237,7 +237,7 @@ fir::FortranVariableOpInterface genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, const fir::ExtendedValue &exv, llvm::StringRef name, fir::FortranVariableFlagsAttr flags, - fir::CUDAAttributeAttr cudaAttr = {}); + fir::CUDADataAttributeAttr cudaAttr = {}); /// Generate an hlfir.associate to build a variable from an expression value. 
/// The type of the variable must be provided so that scalar logicals are diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td index bc73124..422ad53 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td +++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td @@ -66,8 +66,8 @@ def CUDAshared : I32EnumAttrCase<"Shared", 4, "shared">; def CUDAunified : I32EnumAttrCase<"Unified", 5, "unified">; // Texture is omitted since it is obsolete and rejected by semantic. -def fir_CUDAAttribute : I32EnumAttr< - "CUDAAttribute", +def fir_CUDADataAttribute : I32EnumAttr< + "CUDADataAttribute", "CUDA Fortran variable attributes", [CUDAconstant, CUDAdevice, CUDAmanaged, CUDApinned, CUDAshared, CUDAunified]> { @@ -75,7 +75,8 @@ def fir_CUDAAttribute : I32EnumAttr< let cppNamespace = "::fir"; } -def fir_CUDAAttributeAttr : EnumAttr { +def fir_CUDADataAttributeAttr : + EnumAttr { let assemblyFormat = [{ ```<` $value `>` }]; } diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index d505fed..9f198a4 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -2738,7 +2738,7 @@ def fir_GlobalOp : fir_Op<"global", [IsolatedFromAbove, Symbol]> { OptionalAttr:$constant, OptionalAttr:$target, OptionalAttr:$linkName, - OptionalAttr:$cuda_attr + OptionalAttr:$cuda_attr ); let regions = (region AtMostRegion<1>:$region); @@ -3029,7 +3029,7 @@ def fir_DeclareOp : fir_Op<"declare", [AttrSizedOperandSegments, Variadic:$typeparams, Builtin_StringAttr:$uniq_name, OptionalAttr:$fortran_attrs, - OptionalAttr:$cuda_attr + OptionalAttr:$cuda_attr ); let results = (outs AnyRefOrBox); diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td index f22e9a7..c82eae1 100644 --- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td +++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td @@ -89,7 +89,7 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments, Variadic:$typeparams, Builtin_StringAttr:$uniq_name, OptionalAttr:$fortran_attrs, - OptionalAttr:$cuda_attr + OptionalAttr:$cuda_attr ); let results = (outs AnyFortranVariable, AnyRefOrBoxLike); @@ -103,7 +103,7 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments, OpBuilder<(ins "mlir::Value":$memref, "llvm::StringRef":$uniq_name, CArg<"mlir::Value", "{}">:$shape, CArg<"mlir::ValueRange", "{}">:$typeparams, CArg<"fir::FortranVariableFlagsAttr", "{}">:$fortran_attrs, - CArg<"fir::CUDAAttributeAttr", "{}">:$cuda_attr)>]; + CArg<"fir::CUDADataAttributeAttr", "{}">:$cuda_attr)>]; let extraClassDeclaration = [{ /// Get the variable original base (same as input). 
It lacks diff --git a/flang/include/flang/Optimizer/Support/Utils.h b/flang/include/flang/Optimizer/Support/Utils.h index 586701b..84c550a 100644 --- a/flang/include/flang/Optimizer/Support/Utils.h +++ b/flang/include/flang/Optimizer/Support/Utils.h @@ -273,32 +273,32 @@ inline void genMinMaxlocReductionLoop( builder.setInsertionPointAfter(ifMaskTrueOp); } -inline fir::CUDAAttributeAttr -getCUDAAttribute(mlir::MLIRContext *mlirContext, - std::optional cudaAttr) { +inline fir::CUDADataAttributeAttr +getCUDADataAttribute(mlir::MLIRContext *mlirContext, + std::optional cudaAttr) { if (cudaAttr) { - fir::CUDAAttribute attr; + fir::CUDADataAttribute attr; switch (*cudaAttr) { case Fortran::common::CUDADataAttr::Constant: - attr = fir::CUDAAttribute::Constant; + attr = fir::CUDADataAttribute::Constant; break; case Fortran::common::CUDADataAttr::Device: - attr = fir::CUDAAttribute::Device; + attr = fir::CUDADataAttribute::Device; break; case Fortran::common::CUDADataAttr::Managed: - attr = fir::CUDAAttribute::Managed; + attr = fir::CUDADataAttribute::Managed; break; case Fortran::common::CUDADataAttr::Pinned: - attr = fir::CUDAAttribute::Pinned; + attr = fir::CUDADataAttribute::Pinned; break; case Fortran::common::CUDADataAttr::Shared: - attr = fir::CUDAAttribute::Shared; + attr = fir::CUDADataAttribute::Shared; break; case Fortran::common::CUDADataAttr::Texture: // Obsolete attribute return {}; } - return fir::CUDAAttributeAttr::get(mlirContext, attr); + return fir::CUDADataAttributeAttr::get(mlirContext, attr); } return {}; } diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp index f67ee88..9c32b71 100644 --- a/flang/lib/Lower/CallInterface.cpp +++ b/flang/lib/Lower/CallInterface.cpp @@ -972,7 +972,7 @@ private: if (obj.cudaDataAttr) attrs.emplace_back( mlir::StringAttr::get(&mlirContext, fir::getCUDAAttrName()), - fir::getCUDAAttribute(&mlirContext, obj.cudaDataAttr)); + fir::getCUDADataAttribute(&mlirContext, obj.cudaDataAttr)); // TODO: intents that require special care (e.g finalization) diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 2f23757..b2279a3 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -139,7 +139,7 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, const Fortran::lower::pft::Variable &var, llvm::StringRef globalName, mlir::StringAttr linkage, - fir::CUDAAttributeAttr cudaAttr = {}); + fir::CUDADataAttributeAttr cudaAttr = {}); static mlir::Location genLocation(Fortran::lower::AbstractConverter &converter, const Fortran::semantics::Symbol &sym) { @@ -464,7 +464,7 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, const Fortran::lower::pft::Variable &var, llvm::StringRef globalName, mlir::StringAttr linkage, - fir::CUDAAttributeAttr cudaAttr) { + fir::CUDADataAttributeAttr cudaAttr) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); const Fortran::semantics::Symbol &sym = var.getSymbol(); mlir::Location loc = genLocation(converter, sym); @@ -1583,11 +1583,11 @@ fir::FortranVariableFlagsAttr Fortran::lower::translateSymbolAttributes( return fir::FortranVariableFlagsAttr::get(mlirContext, flags); } -fir::CUDAAttributeAttr Fortran::lower::translateSymbolCUDAAttribute( +fir::CUDADataAttributeAttr Fortran::lower::translateSymbolCUDADataAttribute( mlir::MLIRContext *mlirContext, const Fortran::semantics::Symbol &sym) { std::optional cudaAttr = Fortran::semantics::GetCUDADataAttr(&sym); - 
return fir::getCUDAAttribute(mlirContext, cudaAttr); + return fir::getCUDADataAttribute(mlirContext, cudaAttr); } /// Map a symbol to its FIR address and evaluated specification expressions. @@ -1629,8 +1629,9 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter, auto name = converter.mangleName(sym); fir::FortranVariableFlagsAttr attributes = Fortran::lower::translateSymbolAttributes(builder.getContext(), sym); - fir::CUDAAttributeAttr cudaAttr = - Fortran::lower::translateSymbolCUDAAttribute(builder.getContext(), sym); + fir::CUDADataAttributeAttr cudaAttr = + Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(), + sym); if (isCrayPointee) { mlir::Type baseType = @@ -1722,9 +1723,9 @@ void Fortran::lower::genDeclareSymbol( fir::FortranVariableFlagsAttr attributes = Fortran::lower::translateSymbolAttributes( builder.getContext(), sym.GetUltimate(), extraFlags); - fir::CUDAAttributeAttr cudaAttr = - Fortran::lower::translateSymbolCUDAAttribute(builder.getContext(), - sym.GetUltimate()); + fir::CUDADataAttributeAttr cudaAttr = + Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(), + sym.GetUltimate()); auto name = converter.mangleName(sym); hlfir::EntityWithAttributes declare = hlfir::genDeclare(loc, builder, exv, name, attributes, cudaAttr); @@ -2222,8 +2223,8 @@ void Fortran::lower::defineModuleVariable( // Do nothing. Mapping will be done on user side. } else { std::string globalName = converter.mangleName(sym); - fir::CUDAAttributeAttr cudaAttr = - Fortran::lower::translateSymbolCUDAAttribute( + fir::CUDADataAttributeAttr cudaAttr = + Fortran::lower::translateSymbolCUDADataAttribute( converter.getFirOpBuilder().getContext(), sym); defineGlobal(converter, var, globalName, linkage, cudaAttr); } diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index 68fe8de..3cce39f 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -274,7 +274,7 @@ mlir::Value fir::FirOpBuilder::createHeapTemporary( fir::GlobalOp fir::FirOpBuilder::createGlobal( mlir::Location loc, mlir::Type type, llvm::StringRef name, mlir::StringAttr linkage, mlir::Attribute value, bool isConst, - bool isTarget, fir::CUDAAttributeAttr cudaAttr) { + bool isTarget, fir::CUDADataAttributeAttr cudaAttr) { auto module = getModule(); auto insertPt = saveInsertionPoint(); if (auto glob = module.lookupSymbol(name)) @@ -296,7 +296,7 @@ fir::GlobalOp fir::FirOpBuilder::createGlobal( fir::GlobalOp fir::FirOpBuilder::createGlobal( mlir::Location loc, mlir::Type type, llvm::StringRef name, bool isConst, bool isTarget, std::function bodyBuilder, - mlir::StringAttr linkage, fir::CUDAAttributeAttr cudaAttr) { + mlir::StringAttr linkage, fir::CUDADataAttributeAttr cudaAttr) { auto module = getModule(); auto insertPt = saveInsertionPoint(); if (auto glob = module.lookupSymbol(name)) diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index 61e5311..4ffa303f 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -199,7 +199,7 @@ fir::FortranVariableOpInterface hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, const fir::ExtendedValue &exv, llvm::StringRef name, fir::FortranVariableFlagsAttr flags, - fir::CUDAAttributeAttr cudaAttr) { + fir::CUDADataAttributeAttr cudaAttr) { mlir::Value base = fir::getBase(exv); assert(fir::conformsWithPassByRef(base.getType()) && diff --git 
a/flang/lib/Optimizer/Dialect/FIRAttr.cpp b/flang/lib/Optimizer/Dialect/FIRAttr.cpp index 04431b6..218fa50 100644 --- a/flang/lib/Optimizer/Dialect/FIRAttr.cpp +++ b/flang/lib/Optimizer/Dialect/FIRAttr.cpp @@ -298,5 +298,5 @@ void fir::printFirAttribute(FIROpsDialect *dialect, mlir::Attribute attr, void FIROpsDialect::registerAttributes() { addAttributes(); + UpperBoundAttr, CUDADataAttributeAttr>(); } diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp index 85644c1..8bc92a9 100644 --- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp +++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp @@ -124,7 +124,7 @@ void hlfir::DeclareOp::build(mlir::OpBuilder &builder, llvm::StringRef uniq_name, mlir::Value shape, mlir::ValueRange typeparams, fir::FortranVariableFlagsAttr fortran_attrs, - fir::CUDAAttributeAttr cuda_attr) { + fir::CUDADataAttributeAttr cuda_attr) { auto nameAttr = builder.getStringAttr(uniq_name); mlir::Type inputType = memref.getType(); bool hasExplicitLbs = hasExplicitLowerBounds(shape); diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp index b15fb59..cd534ba 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp @@ -320,12 +320,12 @@ public: mlir::Location loc = declareOp->getLoc(); mlir::Value memref = declareOp.getMemref(); fir::FortranVariableFlagsAttr fortranAttrs; - fir::CUDAAttributeAttr cudaAttr; + fir::CUDADataAttributeAttr cudaAttr; if (auto attrs = declareOp.getFortranAttrs()) fortranAttrs = fir::FortranVariableFlagsAttr::get(rewriter.getContext(), *attrs); if (auto attr = declareOp.getCudaAttr()) - cudaAttr = fir::CUDAAttributeAttr::get(rewriter.getContext(), *attr); + cudaAttr = fir::CUDADataAttributeAttr::get(rewriter.getContext(), *attr); auto firDeclareOp = rewriter.create( loc, memref.getType(), memref, declareOp.getShape(), declareOp.getTypeparams(), declareOp.getUniqName(), fortranAttrs, diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp index 4b101ce..790f735 100644 --- a/flang/unittests/Optimizer/FortranVariableTest.cpp +++ b/flang/unittests/Optimizer/FortranVariableTest.cpp @@ -50,7 +50,7 @@ TEST_F(FortranVariableTest, SimpleScalar) { auto declare = builder->create(loc, addr.getType(), addr, /*shape=*/mlir::Value{}, /*typeParams=*/std::nullopt, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, - /*cuda_attr=*/fir::CUDAAttributeAttr{}); + /*cuda_attr=*/fir::CUDADataAttributeAttr{}); fir::FortranVariableOpInterface fortranVariable = declare; EXPECT_FALSE(fortranVariable.isArray()); @@ -76,7 +76,7 @@ TEST_F(FortranVariableTest, CharacterScalar) { auto declare = builder->create(loc, addr.getType(), addr, /*shape=*/mlir::Value{}, typeParams, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, - /*cuda_attr=*/fir::CUDAAttributeAttr{}); + /*cuda_attr=*/fir::CUDADataAttributeAttr{}); fir::FortranVariableOpInterface fortranVariable = declare; EXPECT_FALSE(fortranVariable.isArray()); @@ -107,7 +107,7 @@ TEST_F(FortranVariableTest, SimpleArray) { auto declare = builder->create(loc, addr.getType(), addr, shape, /*typeParams*/ std::nullopt, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, - /*cuda_attr=*/fir::CUDAAttributeAttr{}); + /*cuda_attr=*/fir::CUDADataAttributeAttr{}); fir::FortranVariableOpInterface fortranVariable = declare; EXPECT_TRUE(fortranVariable.isArray()); @@ -138,7 +138,7 @@ 
TEST_F(FortranVariableTest, CharacterArray) { auto declare = builder->create(loc, addr.getType(), addr, shape, typeParams, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, - /*cuda_attr=*/fir::CUDAAttributeAttr{}); + /*cuda_attr=*/fir::CUDADataAttributeAttr{}); fir::FortranVariableOpInterface fortranVariable = declare; EXPECT_TRUE(fortranVariable.isArray()); -- cgit v1.1 From 0b77b19292457b9f2020e290980f1803a16eea34 Mon Sep 17 00:00:00 2001 From: choikwa <5455710+choikwa@users.noreply.github.com> Date: Fri, 9 Feb 2024 17:10:04 -0500 Subject: [AMDGPU] Add test to show s_cselect generation from uniform select (#79384) --- llvm/test/CodeGen/AMDGPU/uniform-select.ll | 219 +++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/uniform-select.ll diff --git a/llvm/test/CodeGen/AMDGPU/uniform-select.ll b/llvm/test/CodeGen/AMDGPU/uniform-select.ll new file mode 100644 index 0000000..0cb4086 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/uniform-select.ll @@ -0,0 +1,219 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX1030 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX1100 %s + +define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { +; GFX90A-LABEL: test_insert_extract: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NEXT: s_mov_b32 s2, 0 +; GFX90A-NEXT: s_and_b64 vcc, exec, -1 +; GFX90A-NEXT: s_mov_b32 s3, 0 +; GFX90A-NEXT: s_mov_b32 s4, 0 +; GFX90A-NEXT: s_mov_b32 s5, 0 +; GFX90A-NEXT: s_mov_b32 s6, 0 +; GFX90A-NEXT: .LBB0_1: ; %for.body +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cmp_eq_u32 s1, 1 +; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX90A-NEXT: s_cselect_b32 s7, s4, s3 +; GFX90A-NEXT: s_cmp_eq_u32 s1, 2 +; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX90A-NEXT: s_cselect_b32 s7, s5, s7 +; GFX90A-NEXT: s_cmp_eq_u32 s1, 3 +; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX90A-NEXT: s_cselect_b32 s7, s6, s7 +; GFX90A-NEXT: s_or_b32 s7, s7, s0 +; GFX90A-NEXT: s_cmp_eq_u32 s1, 1 +; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX90A-NEXT: s_and_b64 s[10:11], s[8:9], exec +; GFX90A-NEXT: s_cselect_b32 s4, s7, s4 +; GFX90A-NEXT: s_cmp_eq_u32 s1, 3 +; GFX90A-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX90A-NEXT: s_and_b64 s[12:13], s[10:11], exec +; GFX90A-NEXT: s_cselect_b32 s6, s7, s6 +; GFX90A-NEXT: s_cmp_eq_u32 s1, 2 +; GFX90A-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX90A-NEXT: s_and_b64 s[14:15], s[12:13], exec +; GFX90A-NEXT: s_cselect_b32 s5, s7, s5 +; GFX90A-NEXT: s_cmp_eq_u32 s1, 0 +; GFX90A-NEXT: s_cselect_b32 s3, s7, s3 +; GFX90A-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] +; GFX90A-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX90A-NEXT: s_cselect_b32 s2, 0, s2 +; GFX90A-NEXT: s_mov_b64 vcc, vcc +; GFX90A-NEXT: s_cbranch_vccnz .LBB0_1 +; GFX90A-NEXT: ; %bb.2: ; %DummyReturnBlock +; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: test_insert_extract: +; GFX940: ; %bb.0: ; %entry +; 
GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NEXT: s_mov_b32 s2, 0 +; GFX940-NEXT: s_and_b64 vcc, exec, -1 +; GFX940-NEXT: s_mov_b32 s3, 0 +; GFX940-NEXT: s_mov_b32 s4, 0 +; GFX940-NEXT: s_mov_b32 s5, 0 +; GFX940-NEXT: s_mov_b32 s6, 0 +; GFX940-NEXT: .LBB0_1: ; %for.body +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_cmp_eq_u32 s1, 1 +; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX940-NEXT: s_cselect_b32 s7, s4, s3 +; GFX940-NEXT: s_cmp_eq_u32 s1, 2 +; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX940-NEXT: s_cselect_b32 s7, s5, s7 +; GFX940-NEXT: s_cmp_eq_u32 s1, 3 +; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX940-NEXT: s_cselect_b32 s7, s6, s7 +; GFX940-NEXT: s_or_b32 s7, s7, s0 +; GFX940-NEXT: s_cmp_eq_u32 s1, 1 +; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX940-NEXT: s_and_b64 s[10:11], s[8:9], exec +; GFX940-NEXT: s_cselect_b32 s4, s7, s4 +; GFX940-NEXT: s_cmp_eq_u32 s1, 3 +; GFX940-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX940-NEXT: s_and_b64 s[12:13], s[10:11], exec +; GFX940-NEXT: s_cselect_b32 s6, s7, s6 +; GFX940-NEXT: s_cmp_eq_u32 s1, 2 +; GFX940-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX940-NEXT: s_and_b64 s[14:15], s[12:13], exec +; GFX940-NEXT: s_cselect_b32 s5, s7, s5 +; GFX940-NEXT: s_cmp_eq_u32 s1, 0 +; GFX940-NEXT: s_cselect_b32 s3, s7, s3 +; GFX940-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] +; GFX940-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX940-NEXT: s_cselect_b32 s2, 0, s2 +; GFX940-NEXT: s_mov_b64 vcc, vcc +; GFX940-NEXT: s_cbranch_vccnz .LBB0_1 +; GFX940-NEXT: ; %bb.2: ; %DummyReturnBlock +; GFX940-NEXT: s_endpgm +; +; GFX1030-LABEL: test_insert_extract: +; GFX1030: ; %bb.0: ; %entry +; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1030-NEXT: s_mov_b32 s2, 0 +; GFX1030-NEXT: s_mov_b32 s3, 0 +; GFX1030-NEXT: s_mov_b32 s4, 0 +; GFX1030-NEXT: s_mov_b32 s5, 0 +; GFX1030-NEXT: s_mov_b32 s6, 0 +; GFX1030-NEXT: s_mov_b32 vcc_lo, exec_lo +; GFX1030-NEXT: .p2align 6 +; GFX1030-NEXT: .LBB0_1: ; %for.body +; GFX1030-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1030-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-NEXT: s_cmp_eq_u32 s1, 1 +; GFX1030-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1030-NEXT: s_cselect_b32 s7, s4, s3 +; GFX1030-NEXT: s_cmp_eq_u32 s1, 2 +; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1030-NEXT: s_cselect_b32 s7, s5, s7 +; GFX1030-NEXT: s_cmp_eq_u32 s1, 3 +; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1030-NEXT: s_cselect_b32 s7, s6, s7 +; GFX1030-NEXT: s_or_b32 s7, s7, s0 +; GFX1030-NEXT: s_cmp_eq_u32 s1, 1 +; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1030-NEXT: s_and_b32 s9, s8, exec_lo +; GFX1030-NEXT: s_cselect_b32 s4, s7, s4 +; GFX1030-NEXT: s_cmp_eq_u32 s1, 3 +; GFX1030-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1030-NEXT: s_and_b32 s10, s9, exec_lo +; GFX1030-NEXT: s_cselect_b32 s6, s7, s6 +; GFX1030-NEXT: s_cmp_eq_u32 s1, 2 +; GFX1030-NEXT: s_cselect_b32 s10, -1, 0 +; GFX1030-NEXT: s_and_b32 s11, s10, exec_lo +; GFX1030-NEXT: s_cselect_b32 s5, s7, s5 +; GFX1030-NEXT: s_cmp_eq_u32 s1, 0 +; GFX1030-NEXT: s_cselect_b32 s3, s7, s3 +; GFX1030-NEXT: s_or_b32 s7, s10, s8 +; GFX1030-NEXT: s_or_b32 s7, s9, s7 +; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1030-NEXT: 
s_cselect_b32 s2, 0, s2 +; GFX1030-NEXT: s_cbranch_vccnz .LBB0_1 +; GFX1030-NEXT: ; %bb.2: ; %DummyReturnBlock +; GFX1030-NEXT: s_endpgm +; +; GFX1100-LABEL: test_insert_extract: +; GFX1100: ; %bb.0: ; %entry +; GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1100-NEXT: s_mov_b32 s2, 0 +; GFX1100-NEXT: s_mov_b32 s3, 0 +; GFX1100-NEXT: s_mov_b32 s4, 0 +; GFX1100-NEXT: s_mov_b32 s5, 0 +; GFX1100-NEXT: s_mov_b32 s6, 0 +; GFX1100-NEXT: s_mov_b32 vcc_lo, exec_lo +; GFX1100-NEXT: .p2align 6 +; GFX1100-NEXT: .LBB0_1: ; %for.body +; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-NEXT: s_cmp_eq_u32 s1, 1 +; GFX1100-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1100-NEXT: s_cselect_b32 s7, s4, s3 +; GFX1100-NEXT: s_cmp_eq_u32 s1, 2 +; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1100-NEXT: s_cselect_b32 s7, s5, s7 +; GFX1100-NEXT: s_cmp_eq_u32 s1, 3 +; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1100-NEXT: s_cselect_b32 s7, s6, s7 +; GFX1100-NEXT: s_or_b32 s7, s7, s0 +; GFX1100-NEXT: s_cmp_eq_u32 s1, 1 +; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1100-NEXT: s_and_b32 s9, s8, exec_lo +; GFX1100-NEXT: s_cselect_b32 s4, s7, s4 +; GFX1100-NEXT: s_cmp_eq_u32 s1, 3 +; GFX1100-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1100-NEXT: s_and_b32 s10, s9, exec_lo +; GFX1100-NEXT: s_cselect_b32 s6, s7, s6 +; GFX1100-NEXT: s_cmp_eq_u32 s1, 2 +; GFX1100-NEXT: s_cselect_b32 s10, -1, 0 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1100-NEXT: s_and_b32 s11, s10, exec_lo +; GFX1100-NEXT: s_cselect_b32 s5, s7, s5 +; GFX1100-NEXT: s_cmp_eq_u32 s1, 0 +; GFX1100-NEXT: s_cselect_b32 s3, s7, s3 +; GFX1100-NEXT: s_or_b32 s7, s10, s8 +; GFX1100-NEXT: s_or_b32 s7, s9, s7 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1100-NEXT: s_cselect_b32 s2, 0, s2 +; GFX1100-NEXT: s_cbranch_vccnz .LBB0_1 +; GFX1100-NEXT: ; %bb.2: ; %DummyReturnBlock +; GFX1100-NEXT: s_endpgm +entry: + %init = insertelement <4 x i32> zeroinitializer, i32 0, i64 0 + br label %for.body + +for.body: ; preds = %for.body, %entry + %x1 = phi <4 x i32> [ %init, %entry ], [ %i4, %for.body ] + %x2 = phi <4 x i32> [ zeroinitializer, %entry ], [ %i2, %for.body ] + %idxprom = zext i32 %q to i64 + %e1 = extractelement <4 x i32> %x2, i64 %idxprom + %add = or i32 %e1, %p + %i2 = insertelement <4 x i32> %x2, i32 %add, i64 %idxprom + %e3 = extractelement <4 x i32> %x1, i64 %idxprom + %i4 = insertelement <4 x i32> %x1, i32 %e3, i64 0 + br label %for.body +} + -- cgit v1.1 From 01706e767777aeac9d5a22617d522826b64fce3e Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Fri, 9 Feb 2024 14:22:47 -0800 Subject: [llvm-nm][WebAssembly] Print function symbol sizes (#81315) nm already prints sizes for data symbols. Do that for function symbols too, and update objdump to also print size information. 
Implements item 3 from https://github.com/llvm/llvm-project/issues/76107 --- llvm/include/llvm/Object/Wasm.h | 1 + llvm/lib/Object/WasmObjectFile.cpp | 14 ++++++++++++++ llvm/test/MC/WebAssembly/alias-offset.s | 8 ++++---- llvm/test/MC/WebAssembly/alias.s | 6 +++--- llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml | 2 +- llvm/test/Object/wasm-linked-symbol-table.yaml | 6 +++--- llvm/test/tools/llvm-nm/wasm/linked.yaml | 5 +++++ llvm/test/tools/llvm-nm/wasm/print-size.test | 2 +- llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml | 4 ++-- .../llvm-objdump/wasm/linked-symbol-table-namesec.yaml | 12 ++++++------ llvm/test/tools/llvm-objdump/wasm/symbol-table.test | 12 ++++++------ llvm/tools/llvm-nm/llvm-nm.cpp | 7 ++----- llvm/tools/llvm-objdump/llvm-objdump.cpp | 3 +++ 13 files changed, 51 insertions(+), 31 deletions(-) diff --git a/llvm/include/llvm/Object/Wasm.h b/llvm/include/llvm/Object/Wasm.h index 13d9a17e..b8f7bb45 100644 --- a/llvm/include/llvm/Object/Wasm.h +++ b/llvm/include/llvm/Object/Wasm.h @@ -179,6 +179,7 @@ public: Expected getSymbolType(DataRefImpl Symb) const override; Expected getSymbolSection(DataRefImpl Symb) const override; uint32_t getSymbolSectionId(SymbolRef Sym) const; + uint32_t getSymbolSize(SymbolRef Sym) const; // Overrides from SectionRef. void moveSectionNext(DataRefImpl &Sec) const override; diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index 1d68687..04e2b80 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ b/llvm/lib/Object/WasmObjectFile.cpp @@ -1932,6 +1932,20 @@ uint32_t WasmObjectFile::getSymbolSectionIdImpl(const WasmSymbol &Sym) const { } } +uint32_t WasmObjectFile::getSymbolSize(SymbolRef Symb) const { + const WasmSymbol &Sym = getWasmSymbol(Symb); + if (!Sym.isDefined()) + return 0; + if (Sym.isTypeData()) + return Sym.Info.DataRef.Size; + if (Sym.isTypeFunction()) + return functions()[Sym.Info.ElementIndex - getNumImportedFunctions()].Size; + // Currently symbol size is only tracked for data segments and functions. In + // principle we could also track size (e.g. binary size) for tables, globals + // and element segments etc too. 
+ return 0; +} + void WasmObjectFile::moveSectionNext(DataRefImpl &Sec) const { Sec.d.a++; } Expected WasmObjectFile::getSectionName(DataRefImpl Sec) const { diff --git a/llvm/test/MC/WebAssembly/alias-offset.s b/llvm/test/MC/WebAssembly/alias-offset.s index e45b17d..4899922 100644 --- a/llvm/test/MC/WebAssembly/alias-offset.s +++ b/llvm/test/MC/WebAssembly/alias-offset.s @@ -12,10 +12,10 @@ sym_a: .set sym_b, sym_a + 4 # CHECK-LABEL: SYMBOL TABLE: -# CHECK-NEXT: 00000000 l O DATA foo -# CHECK-NEXT: 00000004 l O DATA sym_a -# CHECK-NEXT: 00000008 l O DATA sym_b -# CHECK-NEXT: 00000001 l F CODE main +# CHECK-NEXT: 00000000 l O DATA 00000004 foo +# CHECK-NEXT: 00000004 l O DATA 00000008 sym_a +# CHECK-NEXT: 00000008 l O DATA 00000004 sym_b +# CHECK-NEXT: 00000001 l F CODE 00000012 main .text .section .text,"",@ diff --git a/llvm/test/MC/WebAssembly/alias.s b/llvm/test/MC/WebAssembly/alias.s index b0a7539..8ed46f5 100644 --- a/llvm/test/MC/WebAssembly/alias.s +++ b/llvm/test/MC/WebAssembly/alias.s @@ -10,6 +10,6 @@ sym_a: .set sym_b, sym_a -# CHECK: 00000000 l O DATA foo -# CHECK: 00000004 l O DATA sym_a -# CHECK: 00000004 l O DATA sym_b +# CHECK: 00000000 l O DATA 00000004 foo +# CHECK: 00000004 l O DATA 00000004 sym_a +# CHECK: 00000004 l O DATA 00000004 sym_b diff --git a/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml b/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml index c730417..5dfa394 100644 --- a/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml +++ b/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml @@ -2,7 +2,7 @@ # RUN: llvm-nm -P %t.wasm | FileCheck %s # # Test that names from the linking section override those from the name section -# CHECK: foo T 1 0 +# CHECK: foo T 1 3 # CHECK-NOT: my_func_local_name --- !WASM diff --git a/llvm/test/Object/wasm-linked-symbol-table.yaml b/llvm/test/Object/wasm-linked-symbol-table.yaml index 6dd949a..eccdc2c 100644 --- a/llvm/test/Object/wasm-linked-symbol-table.yaml +++ b/llvm/test/Object/wasm-linked-symbol-table.yaml @@ -2,9 +2,9 @@ # RUN: llvm-objdump -t %t.wasm | FileCheck %s # # CHECK: SYMBOL TABLE: -# CHECK-NEXT: 0000009f g F CODE my_func_export -# CHECK-NEXT: 0000002a g O DATA my_global_export -# CHECK-NEXT: 00000000 g TABLE my_table_export +# CHECK-NEXT: 0000009f g F CODE 00000003 my_func_export +# CHECK-NEXT: 0000002a g O DATA 00000000 my_global_export +# CHECK-NEXT: 00000000 g TABLE 00000000 my_table_export --- !WASM FileHeader: diff --git a/llvm/test/tools/llvm-nm/wasm/linked.yaml b/llvm/test/tools/llvm-nm/wasm/linked.yaml index 992c181..6aee4b9 100644 --- a/llvm/test/tools/llvm-nm/wasm/linked.yaml +++ b/llvm/test/tools/llvm-nm/wasm/linked.yaml @@ -1,10 +1,15 @@ # RUN: yaml2obj %s -o %t.wasm # RUN: llvm-nm %t.wasm | FileCheck %s +# RUN: llvm-nm -P %t.wasm | FileCheck %s --check-prefix=POSIX # CHECK: 0000009f T my_func_export # CHECK-NEXT: 0000002a D my_global_export # CHECK-NEXT: 00000000 D my_table_export +# POSIX: my_func_export T 9f 3 +# POSIX-NEXT: my_global_export D 2a 0 +# POSIX-NEXT: my_table_export D 0 0 + --- !WASM FileHeader: Version: 0x1 diff --git a/llvm/test/tools/llvm-nm/wasm/print-size.test b/llvm/test/tools/llvm-nm/wasm/print-size.test index c166edb..610929b 100644 --- a/llvm/test/tools/llvm-nm/wasm/print-size.test +++ b/llvm/test/tools/llvm-nm/wasm/print-size.test @@ -43,4 +43,4 @@ Sections: Size: 32 # CHECK: 00000000 00000020 D a_data_symbol -# CHECK: 00000001 00000000 T a_func +# CHECK: 00000001 0000000d T a_func diff --git 
a/llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml b/llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml index 9c1e90a..f4abf12 100644 --- a/llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml +++ b/llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml @@ -2,8 +2,8 @@ # RUN: llvm-objdump -t %t.so | FileCheck %s # # CHECK: SYMBOL TABLE: -# CHECK-NEXT: 00000001 g F CODE my_func_export -# CHECK-NEXT: 0000002a g O DATA my_global_export +# CHECK-NEXT: 00000001 g F CODE 00000003 my_func_export +# CHECK-NEXT: 0000002a g O DATA 00000000 my_global_export --- !WASM FileHeader: diff --git a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml index 622a606..dc87e62 100644 --- a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml +++ b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml @@ -2,12 +2,12 @@ # RUN: llvm-objdump -t %t.wasm | FileCheck %s # # CHECK: SYMBOL TABLE: -# CHECK-NEXT: 00000000 F *UND* my_func_import_name -# CHECK-NEXT: 00000083 g F CODE my_func_export_name -# CHECK-NEXT: 00000086 l F CODE my_func_local_name -# CHECK-NEXT: 00000000 *UND* my_global_import_name -# CHECK-NEXT: 00000001 g GLOBAL my_global_export_name -# CHECK-NEXT: 00000000 l O DATA my_datasegment_name +# CHECK-NEXT: 00000000 F *UND* 00000000 my_func_import_name +# CHECK-NEXT: 00000083 g F CODE 00000003 my_func_export_name +# CHECK-NEXT: 00000086 l F CODE 00000003 my_func_local_name +# CHECK-NEXT: 00000000 *UND* 00000000 my_global_import_name +# CHECK-NEXT: 00000001 g GLOBAL 00000000 my_global_export_name +# CHECK-NEXT: 00000000 l O DATA 00000004 my_datasegment_name --- !WASM FileHeader: diff --git a/llvm/test/tools/llvm-objdump/wasm/symbol-table.test b/llvm/test/tools/llvm-objdump/wasm/symbol-table.test index b7301a2..ccb0746 100644 --- a/llvm/test/tools/llvm-objdump/wasm/symbol-table.test +++ b/llvm/test/tools/llvm-objdump/wasm/symbol-table.test @@ -1,9 +1,9 @@ RUN: llvm-objdump -t %p/Inputs/trivial.obj.wasm | FileCheck %s CHECK: SYMBOL TABLE: -CHECK-NEXT: 00000001 g F CODE main -CHECK-NEXT: 00000000 l O DATA .L.str -CHECK-NEXT: 00000000 F *UND* puts -CHECK-NEXT: 00000019 l F CODE .LSomeOtherFunction_bitcast -CHECK-NEXT: 00000000 F *UND* SomeOtherFunction -CHECK-NEXT: 00000010 g O DATA var +CHECK-NEXT: 00000001 g F CODE 00000018 main +CHECK-NEXT: 00000000 l O DATA 0000000d .L.str +CHECK-NEXT: 00000000 F *UND* 00000000 puts +CHECK-NEXT: 00000019 l F CODE 0000000b .LSomeOtherFunction_bitcast +CHECK-NEXT: 00000000 F *UND* 00000000 SomeOtherFunction +CHECK-NEXT: 00000010 g O DATA 00000004 var diff --git a/llvm/tools/llvm-nm/llvm-nm.cpp b/llvm/tools/llvm-nm/llvm-nm.cpp index da5998b..e3b8145 100644 --- a/llvm/tools/llvm-nm/llvm-nm.cpp +++ b/llvm/tools/llvm-nm/llvm-nm.cpp @@ -1854,11 +1854,8 @@ static bool getSymbolNamesFromObject(SymbolicFile &Obj, dyn_cast(&Obj)) S.Size = XCOFFObj->getSymbolSize(Sym.getRawDataRefImpl()); - if (const WasmObjectFile *WasmObj = dyn_cast(&Obj)) { - const WasmSymbol &WasmSym = WasmObj->getWasmSymbol(Sym); - if (WasmSym.isTypeData() && !WasmSym.isUndefined()) - S.Size = WasmSym.Info.DataRef.Size; - } + if (const WasmObjectFile *WasmObj = dyn_cast(&Obj)) + S.Size = WasmObj->getSymbolSize(Sym); if (PrintAddress && isa(Obj)) { SymbolRef SymRef(Sym); diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index de52ebc..0e4f4e1 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ 
b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -2947,6 +2947,9 @@ void Dumper::printSymbol(const SymbolRef &Symbol, Symbol.getRawDataRefImpl())); else if (O.isELF()) outs() << '\t' << format(Fmt, ELFSymbolRef(Symbol).getSize()); + else if (O.isWasm()) + outs() << '\t' + << format(Fmt, cast(O).getSymbolSize(Symbol)); if (O.isELF()) { if (!SymbolVersions.empty()) { -- cgit v1.1 From 9397d23671f26ab8631e90f688ae2ea212f3c770 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 9 Feb 2024 14:26:49 -0800 Subject: [docs] --save-temps=: add single quotes after #80921 and update --save-temps --- clang/include/clang/Driver/Options.td | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 4f498db..31503fc 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5392,13 +5392,13 @@ def regcall4 : Flag<["-"], "regcall4">, Group, MarshallingInfoFlag>; def save_temps_EQ : Joined<["-", "--"], "save-temps=">, Flags<[NoXarchOption]>, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, - HelpText<"Save intermediate compilation results. can be set to cwd for " - "current working directory, or obj which will save temporary files in the " + HelpText<"Save intermediate compilation results. can be set to 'cwd' for " + "current working directory, or 'obj' which will save temporary files in the " "same directory as the final output file">; def save_temps : Flag<["-", "--"], "save-temps">, Flags<[NoXarchOption]>, Visibility<[ClangOption, FlangOption, FC1Option]>, Alias, AliasArgs<["cwd"]>, - HelpText<"Save intermediate compilation results">; + HelpText<"Alias for --save-temps=cwd">; def save_stats_EQ : Joined<["-", "--"], "save-stats=">, Flags<[NoXarchOption]>, HelpText<"Save llvm statistics.">; def save_stats : Flag<["-", "--"], "save-stats">, Flags<[NoXarchOption]>, -- cgit v1.1 From 0267f9800ea23921120faa4b1d46ac5806e5eca1 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 9 Feb 2024 15:01:04 -0800 Subject: [workflows] Add a new workflow for testing release branch CI (#81073) Since we commit all changes to the release branch CI to main first, we need someway to test that these changes to main don't break the CI. --- .github/workflows/llvm-project-workflow-tests.yml | 32 +++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .github/workflows/llvm-project-workflow-tests.yml diff --git a/.github/workflows/llvm-project-workflow-tests.yml b/.github/workflows/llvm-project-workflow-tests.yml new file mode 100644 index 0000000..a2539b2 --- /dev/null +++ b/.github/workflows/llvm-project-workflow-tests.yml @@ -0,0 +1,32 @@ +# This workflow will test the llvm-project-tests workflow in PRs +# targetting the main branch. Since this workflow doesn't normally +# run on main PRs, we need some way to test it to ensure new updates +# don't break it. + +name: LLVM Workflow Test + +permissions: + contents: read + +on: + pull_request: + branches: + - 'main' + paths: + - '.github/workflows/llvm-project-tests.yml' + - '.github/workflows/llvm-project-workflow-tests.yml' + +concurrency: + # Skip intermediate builds: always. + # Cancel intermediate builds: only if it is a pull request build. 
+ group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} + +jobs: + llvm-test: + if: github.repository_owner == 'llvm' + name: Build and Test + uses: ./.github/workflows/llvm-project-tests.yml + with: + build_target: check-all + projects: clang;lld;libclc;lldb -- cgit v1.1 From fbba818a78f591d89f25768ba31783714d526532 Mon Sep 17 00:00:00 2001 From: Philipp Tomsich Date: Fri, 9 Feb 2024 15:22:09 -0800 Subject: [AArch64] Add the Ampere1B core (#81297) The Ampere1B is Ampere's third-generation core implementing a superscalar, out-of-order microarchitecture with nested virtualization, speculative side-channel mitigation and architectural support for defense against ROP/JOP style software attacks. Ampere1B is an ARMv8.7+ implementation, adding support for the FEAT WFxT, FEAT CSSC, FEAT PAN3 and FEAT AFP extensions. It also includes all features of the second-generation Ampere1A, such as the Memory Tagging Extension and SM3/SM4 cryptography instructions. --- clang/test/Driver/aarch64-cssc.c | 1 + clang/test/Misc/target-invalid-cpu-note.c | 4 ++-- .../llvm/TargetParser/AArch64TargetParser.h | 6 +++++ llvm/lib/Target/AArch64/AArch64.td | 26 ++++++++++++++++++++++ llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 1 + llvm/lib/Target/AArch64/AArch64Subtarget.h | 1 + llvm/lib/TargetParser/Host.cpp | 1 + llvm/test/CodeGen/AArch64/cpus.ll | 1 + llvm/test/CodeGen/AArch64/neon-dot-product.ll | 1 + llvm/test/CodeGen/AArch64/remat.ll | 1 + llvm/test/MC/AArch64/armv8.2a-dotprod.s | 3 +++ .../test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt | 1 + llvm/unittests/TargetParser/Host.cpp | 3 +++ llvm/unittests/TargetParser/TargetParserTest.cpp | 14 +++++++++++- 14 files changed, 61 insertions(+), 3 deletions(-) diff --git a/clang/test/Driver/aarch64-cssc.c b/clang/test/Driver/aarch64-cssc.c index a3e1866..5df0ea7 100644 --- a/clang/test/Driver/aarch64-cssc.c +++ b/clang/test/Driver/aarch64-cssc.c @@ -9,6 +9,7 @@ // RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -march=armv9.4-a %s 2>&1 | FileCheck %s // RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -march=armv9.4-a+cssc %s 2>&1 | FileCheck %s // RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -march=armv9.4-a+nocssc %s 2>&1 | FileCheck %s --check-prefix=NO_CSSC +// RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -mcpu=ampere1b %s 2>&1 | FileCheck %s // CHECK: "target-features"="{{.*}},+cssc // NO_CSSC: "target-features"="{{.*}},-cssc diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c index 2f10bfb..39ed02f 100644 --- a/clang/test/Misc/target-invalid-cpu-note.c +++ b/clang/test/Misc/target-invalid-cpu-note.c @@ -5,11 +5,11 @@ // RUN: not %clang_cc1 -triple arm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix AARCH64 // AARCH64: error: unknown target CPU 'not-a-cpu' -// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, 
exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, cobalt-100, grace{{$}} +// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}} // RUN: not %clang_cc1 -triple arm64--- -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE_AARCH64 // TUNE_AARCH64: error: unknown target CPU 'not-a-cpu' -// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, cobalt-100, grace{{$}} +// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}} // RUN: not %clang_cc1 -triple i386--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix X86 // X86: error: unknown target CPU 'not-a-cpu' diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index cce9d6d..ed9944b 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -805,6 +805,12 @@ inline constexpr CpuInfo CpuInfos[] = { {AArch64::AEK_FP16, AArch64::AEK_RAND, AArch64::AEK_SM4, AArch64::AEK_SHA3, AArch64::AEK_SHA2, AArch64::AEK_AES, AArch64::AEK_MTE, AArch64::AEK_SB, AArch64::AEK_SSBS}))}, + 
{"ampere1b", ARMV8_7A, + (AArch64::ExtensionBitset({AArch64::AEK_FP16, AArch64::AEK_RAND, + AArch64::AEK_SM4, AArch64::AEK_SHA3, + AArch64::AEK_SHA2, AArch64::AEK_AES, + AArch64::AEK_MTE, AArch64::AEK_SB, + AArch64::AEK_SSBS, AArch64::AEK_CSSC}))}, }; // An alias for a CPU. diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 02fb01c..00833b4 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -1376,6 +1376,24 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A", FeatureLdpAlignedOnly, FeatureStpAlignedOnly]>; +def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B", + "Ampere Computing Ampere-1B processors", [ + FeaturePostRAScheduler, + FeatureFuseAES, + FeatureFuseAdrpAdd, + FeatureAddrLSLFast, + FeatureALULSLFast, + FeatureAggressiveFMA, + FeatureArithmeticBccFusion, + FeatureCmpBccFusion, + FeatureFuseAddress, + FeatureFuseLiterals, + FeatureStorePairSuppress, + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive, + FeatureLdpAlignedOnly, + FeatureStpAlignedOnly]>; + def ProcessorFeatures { list A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeaturePerfMon]; @@ -1530,6 +1548,11 @@ def ProcessorFeatures { FeatureMTE, FeatureSSBS, FeatureRandGen, FeatureSB, FeatureSM4, FeatureSHA2, FeatureSHA3, FeatureAES]; + list Ampere1B = [HasV8_7aOps, FeatureNEON, FeaturePerfMon, + FeatureMTE, FeatureSSBS, FeatureRandGen, + FeatureSB, FeatureSM4, FeatureSHA2, + FeatureSHA3, FeatureAES, FeatureCSSC, + FeatureWFxT]; // ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64. The extensions do not @@ -1697,6 +1720,9 @@ def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1, def : ProcessorModel<"ampere1a", Ampere1Model, ProcessorFeatures.Ampere1A, [TuneAmpere1A]>; +def : ProcessorModel<"ampere1b", Ampere1Model, ProcessorFeatures.Ampere1B, + [TuneAmpere1B]>; + //===----------------------------------------------------------------------===// // Assembly parser //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 6550c12..2b01deb 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -296,6 +296,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { break; case Ampere1: case Ampere1A: + case Ampere1B: CacheLineSize = 64; PrefFunctionAlignment = Align(64); PrefLoopAlignment = Align(64); diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 0292c01..01cc471 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -42,6 +42,7 @@ public: A64FX, Ampere1, Ampere1A, + Ampere1B, AppleA7, AppleA10, AppleA11, diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index f1197c2..4466d50 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -321,6 +321,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { return StringSwitch(Part) .Case("0xac3", "ampere1") .Case("0xac4", "ampere1a") + .Case("0xac5", "ampere1b") .Default("generic"); } diff --git a/llvm/test/CodeGen/AArch64/cpus.ll b/llvm/test/CodeGen/AArch64/cpus.ll index b248660..7b45d0f 100644 --- a/llvm/test/CodeGen/AArch64/cpus.ll 
+++ b/llvm/test/CodeGen/AArch64/cpus.ll @@ -37,6 +37,7 @@ ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=a64fx 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1a 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1b 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID ; CHECK-NOT: {{.*}} is not a recognized processor for this target diff --git a/llvm/test/CodeGen/AArch64/neon-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-dot-product.ll index 23d1e43..cf09a46 100644 --- a/llvm/test/CodeGen/AArch64/neon-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-dot-product.ll @@ -7,6 +7,7 @@ ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n2 < %s | FileCheck %s ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1 < %s | FileCheck %s ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1a < %s | FileCheck %s +; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1b < %s | FileCheck %s declare <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) declare <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) diff --git a/llvm/test/CodeGen/AArch64/remat.ll b/llvm/test/CodeGen/AArch64/remat.ll index 483c4d7..704c87f 100644 --- a/llvm/test/CodeGen/AArch64/remat.ll +++ b/llvm/test/CodeGen/AArch64/remat.ll @@ -26,6 +26,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=thunderx3t110 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=ampere1 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=ampere1a -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=ampere1b -o - %s | FileCheck %s %X = type { i64, i64, i64 } declare void @f(ptr) diff --git a/llvm/test/MC/AArch64/armv8.2a-dotprod.s b/llvm/test/MC/AArch64/armv8.2a-dotprod.s index a49ed14..4d964090 100644 --- a/llvm/test/MC/AArch64/armv8.2a-dotprod.s +++ b/llvm/test/MC/AArch64/armv8.2a-dotprod.s @@ -15,6 +15,7 @@ // RUN: llvm-mc -triple aarch64 -mattr=+v8r,+dotprod -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: llvm-mc -triple aarch64 -mcpu=ampere1 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: llvm-mc -triple aarch64 -mcpu=ampere1a -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD +// RUN: llvm-mc -triple aarch64 -mcpu=ampere1b -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: not llvm-mc -triple aarch64 -mattr=+v8.2a -show-encoding < %s 2> %t // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s @@ -42,6 +43,8 @@ // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s // RUN: not llvm-mc -triple aarch64 -mcpu=ampere1a -mattr=-dotprod -show-encoding < %s 2> %t // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s +// RUN: not llvm-mc -triple aarch64 -mcpu=ampere1b -mattr=-dotprod -show-encoding < %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s udot v0.2s, v1.8b, v2.8b sdot v0.2s, v1.8b, v2.8b diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt b/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt index 907d0c3..259cb9d 100644 --- a/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt +++ b/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt @@ -14,6 +14,7 @@ # RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=neoverse-n2 --disassemble < %s | FileCheck %s # RUN: llvm-mc -triple 
aarch64-none-linux-gnu -mcpu=ampere1 --disassemble < %s | FileCheck %s # RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=ampere1a --disassemble < %s | FileCheck %s +# RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=ampere1b --disassemble < %s | FileCheck %s # CHECK: ldaprb w0, [x0] # CHECK: ldaprh w0, [x0] diff --git a/llvm/unittests/TargetParser/Host.cpp b/llvm/unittests/TargetParser/Host.cpp index 5f15161..6aa1d7a 100644 --- a/llvm/unittests/TargetParser/Host.cpp +++ b/llvm/unittests/TargetParser/Host.cpp @@ -122,6 +122,9 @@ TEST(getLinuxHostCPUName, AArch64) { EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0xc0\n" "CPU part : 0xac4"), "ampere1a"); + EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0xc0\n" + "CPU part : 0xac5"), + "ampere1b"); // MSM8992/4 weirdness StringRef MSM8992ProcCpuInfo = R"( diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index e7f9973..e89fc68 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1601,6 +1601,18 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_PAUTH})), "8.6-A"), ARMCPUTestParams( + "ampere1b", "armv8.7-a", "crypto-neon-fp-armv8", + (AArch64::ExtensionBitset( + {AArch64::AEK_CRC, AArch64::AEK_FP, AArch64::AEK_FP16, + AArch64::AEK_SIMD, AArch64::AEK_RAS, AArch64::AEK_LSE, + AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, + AArch64::AEK_SM4, AArch64::AEK_SHA3, AArch64::AEK_BF16, + AArch64::AEK_SHA2, AArch64::AEK_AES, AArch64::AEK_I8MM, + AArch64::AEK_SSBS, AArch64::AEK_SB, AArch64::AEK_RAND, + AArch64::AEK_MTE, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH, AArch64::AEK_CSSC})), + "8.7-A"), + ARMCPUTestParams( "neoverse-512tvb", "armv8.4-a", "crypto-neon-fp-armv8", (AArch64::ExtensionBitset( {AArch64::AEK_RAS, AArch64::AEK_SVE, AArch64::AEK_SSBS, @@ -1679,7 +1691,7 @@ INSTANTIATE_TEST_SUITE_P( ARMCPUTestParams::PrintToStringParamName); // Note: number of CPUs includes aliases. -static constexpr unsigned NumAArch64CPUArchs = 68; +static constexpr unsigned NumAArch64CPUArchs = 69; TEST(TargetParserTest, testAArch64CPUArchList) { SmallVector List; -- cgit v1.1 From 7b2eff6306c1a20f69f16bc485dbc229c8ada40d Mon Sep 17 00:00:00 2001 From: Philipp Tomsich Date: Fri, 9 Feb 2024 15:31:03 -0800 Subject: [AArch64] Add FeatureFuseAddSub2RegAndConstOne for Ampere1A (#81295) Ampere1A introduced fusion for A+B+1/A-B-1. However, the feature flag to enable that fusion case was never added to TuneAmpere1A. This commit corrects that omission. --- llvm/lib/Target/AArch64/AArch64.td | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 00833b4..8f8cc15 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -1372,6 +1372,7 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A", FeatureFuseAddress, FeatureFuseLiterals, FeatureFuseLiterals, + FeatureFuseAddSub2RegAndConstOne, FeatureStorePairSuppress, FeatureLdpAlignedOnly, FeatureStpAlignedOnly]>; -- cgit v1.1 From ff2e8788d277cbb8c47fa2a8ea87dec7e06307aa Mon Sep 17 00:00:00 2001 From: Philipp Tomsich Date: Fri, 9 Feb 2024 15:31:30 -0800 Subject: [AArch64] Add FeatureFuseAdrpAdd for Ampere1/1A (#81293) Both Ampere1 and Ampere1A support fusion of ADRP+ADD. This adds the missing feature to enable fusion-aware scheduling for this case.
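A quick aside on how these tuning bits take effect: a SubtargetFeature such as FeatureFuseAdrpAdd or FeatureFuseAddSub2RegAndConstOne only sets a boolean on the subtarget, and the AArch64 macro-fusion hook consults that boolean before asking the scheduler to keep a candidate instruction pair adjacent. The sketch below shows the shape of that check with simplified, invented types (Opc, Subtarget, and the reduced signature are illustrative); it is not the actual code in AArch64MacroFusion.cpp.

    // Simplified stand-ins for machine opcodes and the subtarget.
    enum class Opc { ADRP, ADDXri, Other };

    struct Subtarget {
      bool HasFuseAdrpAdd = false; // set once the tune list carries FeatureFuseAdrpAdd
    };

    // Shaped like LLVM's shouldScheduleAdjacent() fusion hook: returning true
    // asks the scheduler to keep the pair back-to-back so the core can fuse it.
    static bool shouldScheduleAdjacent(const Subtarget &ST, Opc First, Opc Second) {
      // Without the feature bit there is no fusion hint at all, which is the
      // omission these two patches correct.
      return ST.HasFuseAdrpAdd && First == Opc::ADRP && Second == Opc::ADDXri;
    }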
--- llvm/lib/Target/AArch64/AArch64.td | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 8f8cc15..5098dd8 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -1349,6 +1349,7 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1", "Ampere Computing Ampere-1 processors", [ FeaturePostRAScheduler, FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureAddrLSLFast, FeatureALULSLFast, FeatureAggressiveFMA, @@ -1364,6 +1365,7 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A", "Ampere Computing Ampere-1A processors", [ FeaturePostRAScheduler, FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureAddrLSLFast, FeatureALULSLFast, FeatureAggressiveFMA, -- cgit v1.1 From 4f0ee665b58f3f70cd7e8edad6704b2b053b7ea9 Mon Sep 17 00:00:00 2001 From: Philipp Tomsich Date: Fri, 9 Feb 2024 15:34:37 -0800 Subject: [AArch64] [NFC] Remove duplicate FeatureFuseLiterals from Ampere1A (#81292) --- llvm/lib/Target/AArch64/AArch64.td | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 5098dd8..e76204f 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -1373,7 +1373,6 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A", FeatureCmpBccFusion, FeatureFuseAddress, FeatureFuseLiterals, - FeatureFuseLiterals, FeatureFuseAddSub2RegAndConstOne, FeatureStorePairSuppress, FeatureLdpAlignedOnly, -- cgit v1.1 From 014401158bbbc6899144905c1eb9e44fac86867e Mon Sep 17 00:00:00 2001 From: Philipp Tomsich Date: Fri, 9 Feb 2024 15:48:46 -0800 Subject: [AArch64] Add Ampere1B scheduling/pipeline model (#81338) The Ampere1B core is enabled with a new scheduling/pipeline model, as it provides significant updates over the Ampere1 core; it reduces latencies on many instructions, has some micro-ops reassigned between the XY and X units, and provides modelling for the instructions added since Ampere1 and Ampere1A. 
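To make the scheduling model below easier to read: every SchedWriteRes definition ties one write type to a result latency, a micro-op count, and the pipeline units those micro-ops occupy. As an illustrative C++ rendering only (SchedWriteEntry is an invented type; the real tables are generated by TableGen from these definitions), a single entry of the model amounts to:

    #include <string>
    #include <vector>

    // Illustrative view of what one SchedWriteRes record encodes.
    struct SchedWriteEntry {
      std::vector<std::string> Units; // pipeline resources the micro-ops occupy
      unsigned Latency;               // cycles until the result is available
      unsigned NumMicroOps;           // micro-ops issued across those units
    };

    // Ampere1BWrite_4cyc_2XY from the patch: two micro-ops on the FP/vector
    // XY units, with the result ready after four cycles.
    const SchedWriteEntry Ampere1BWrite_4cyc_2XY{
        {"Ampere1BUnitXY", "Ampere1BUnitXY"}, /*Latency=*/4, /*NumMicroOps=*/2};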
--- llvm/lib/Target/AArch64/AArch64.td | 3 +- llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td | 1061 +++++++++++++++++++++++ 2 files changed, 1063 insertions(+), 1 deletion(-) create mode 100644 llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index e76204f..156c48e 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -837,6 +837,7 @@ include "AArch64SchedA64FX.td" include "AArch64SchedThunderX3T110.td" include "AArch64SchedTSV110.td" include "AArch64SchedAmpere1.td" +include "AArch64SchedAmpere1B.td" include "AArch64SchedNeoverseN1.td" include "AArch64SchedNeoverseN2.td" include "AArch64SchedNeoverseV1.td" @@ -1722,7 +1723,7 @@ def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1, def : ProcessorModel<"ampere1a", Ampere1Model, ProcessorFeatures.Ampere1A, [TuneAmpere1A]>; -def : ProcessorModel<"ampere1b", Ampere1Model, ProcessorFeatures.Ampere1B, +def : ProcessorModel<"ampere1b", Ampere1BModel, ProcessorFeatures.Ampere1B, [TuneAmpere1B]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td new file mode 100644 index 0000000..43da762 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td @@ -0,0 +1,1061 @@ +//=- AArch64SchedAmpere1B.td - Ampere-1B scheduling def -----*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the Ampere Computing Ampere-1B to +// support instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +// The Ampere-1 core is an out-of-order micro-architecture. The front +// end has branch prediction, with a 10-cycle recovery time from a +// mispredicted branch. Instructions coming out of the front end are +// decoded into internal micro-ops (uops). + +def Ampere1BModel : SchedMachineModel { + let IssueWidth = 4; // 4-way decode and dispatch + let MicroOpBufferSize = 192; // micro-op re-order buffer size + let LoadLatency = 3; // Optimistic load latency + let MispredictPenalty = 10; // Branch mispredict penalty + let LoopMicroOpBufferSize = 32; // Instruction queue size + let CompleteModel = 0; + + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + SMEUnsupported.F, + PAUnsupported.F); +} + +let SchedModel = Ampere1BModel in { + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Ampere-1. +// Ampere-1 has 12 pipelines that 8 independent scheduler (4 integer, 2 FP, +// and 2 memory) issue into. The integer and FP schedulers can each issue +// one uop per cycle, while the memory schedulers can each issue one load +// and one store address calculation per cycle. 
+ +def Ampere1BUnitA : ProcResource<2>; // integer single-cycle, branch, and flags r/w +def Ampere1BUnitB : ProcResource<2>; // integer single-cycle, and complex shifts +def Ampere1BUnitBS : ProcResource<1>; // integer multi-cycle +def Ampere1BUnitL : ProcResource<2>; // load +def Ampere1BUnitS : ProcResource<2>; // store address calculation +def Ampere1BUnitX : ProcResource<1>; // FP and vector operations, and flag write +def Ampere1BUnitY : ProcResource<1>; // FP and vector operations, and crypto +def Ampere1BUnitZ : ProcResource<1>; // FP store data and FP-to-integer moves + +def Ampere1BUnitAB : ProcResGroup<[Ampere1BUnitA, Ampere1BUnitB]>; +def Ampere1BUnitXY : ProcResGroup<[Ampere1BUnitX, Ampere1BUnitY]>; + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Ampere-1. + +def Ampere1BWrite_1cyc_1A : SchedWriteRes<[Ampere1BUnitA]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_2A : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitA]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1BWrite_1cyc_1B : SchedWriteRes<[Ampere1BUnitB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_1BS_1B : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitB]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1BWrite_1cyc_1AB : SchedWriteRes<[Ampere1BUnitAB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_1AB_1A : SchedWriteRes<[Ampere1BUnitAB, Ampere1BUnitA]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1BWrite_1cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_1S : SchedWriteRes<[Ampere1BUnitS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_2S : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1BWrite_2cyc_1Y : SchedWriteRes<[Ampere1BUnitY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1BWrite_2cyc_2AB : SchedWriteRes<[Ampere1BUnitAB, Ampere1BUnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1BWrite_2cyc_1B_1AB : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1BWrite_2cyc_1B_1S : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitS]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1BWrite_2cyc_1B_1S_1AB : SchedWriteRes<[Ampere1BUnitB, + Ampere1BUnitS, + Ampere1BUnitAB]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1BWrite_2cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1BWrite_2cyc_1S_1Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitZ]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1BWrite_3cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1Z : SchedWriteRes<[Ampere1BUnitZ]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1S_1Z : SchedWriteRes<[Ampere1BUnitS, + 
Ampere1BUnitZ]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def Ampere1BWrite_3cyc_1S_2Z : SchedWriteRes<[Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def Ampere1BWrite_3cyc_2S_2Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def Ampere1BWrite_4cyc_1BS_1AB : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitAB]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1BWrite_4cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1BWrite_4cyc_1L_1B : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitB]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1BWrite_4cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1BWrite_4cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1BWrite_4cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1BWrite_5cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1BWrite_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1BUnitXY, + Ampere1BUnitS + Ampere1BUnitZ]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def Ampere1BWrite_5cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1BWrite_5cyc_4S_4Z : SchedWriteRes<[Ampere1BUnitL, + Ampere1BUnitBS]> { + let Latency = 5; + let NumMicroOps = 8; +} + +def Ampere1BWrite_5cyc_1L_1BS : SchedWriteRes<[Ampere1BUnitL, + Ampere1BUnitBS]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1BWrite_5cyc_3L : SchedWriteRes<[Ampere1BUnitL, + Ampere1BUnitL, + Ampere1BUnitL]> { + let Latency = 5; + let NumMicroOps = 3; +} + +def Ampere1BWrite_5cyc_4L : SchedWriteRes<[Ampere1BUnitL, + Ampere1BUnitL, + Ampere1BUnitL, + Ampere1BUnitL]> { + let Latency = 5; + let NumMicroOps = 4; +} + +def Ampere1BWrite_5cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1BWrite_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 5; + let NumMicroOps = 6; +} + +def Ampere1BWrite_6cyc_1BS_1A : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitA]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1BWrite_6cyc_1BS_2A : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitA, + Ampere1BUnitA]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1BWrite_6cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1BWrite_6cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1BWrite_6cyc_3XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1BWrite_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 6; + let NumMicroOps = 6; +} + +def Ampere1BWrite_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 6; + let NumMicroOps = 9; +} + +def Ampere1BWrite_7cyc_1BS_1XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + 
+def Ampere1BWrite_7cyc_1XY_1Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitZ]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1BWrite_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 7; + let NumMicroOps = 12; +} + +def Ampere1BWrite_8cyc_1BS_1L : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitL]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1BWrite_8cyc_1BS_1XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1BWrite_8cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1BWrite_8cyc_4XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def Ampere1BWrite_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 9; + let NumMicroOps = 14; +} + +def Ampere1BWrite_9cyc_1A_1BS_1X : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitX]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1BWrite_9cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitXY]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1BWrite_9cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 9; + let NumMicroOps = 1; +} + +def Ampere1BWrite_9cyc_3XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1BWrite_11cyc_1BS_2XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1BWrite_12cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 12; + let NumMicroOps = 1; +} + +def Ampere1BWrite_13cyc_1BS_1X : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitX]> { + let Latency = 13; + let NumMicroOps = 2; +} + +def Ampere1BWrite_17cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 17; + let NumMicroOps = 1; +} + +def Ampere1BWrite_19cyc_2BS_1X : SchedWriteRes<[Ampere1BUnitBS, + Ampere1BUnitBS, + Ampere1BUnitX]> { + let Latency = 13; + let NumMicroOps = 3; +} + +def Ampere1BWrite_19cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 19; + let NumMicroOps = 1; +} + +def Ampere1BWrite_21cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 21; + let NumMicroOps = 1; +} + +def Ampere1BWrite_33cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 33; + let NumMicroOps = 1; +} + +def Ampere1BWrite_39cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 39; + let NumMicroOps = 1; +} + +def Ampere1BWrite_63cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 63; + let NumMicroOps = 1; +} + +// For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4), +// which are a single uop, and for extended registers, which have full flexibility +// across Unit A or B for both uops. 
+def Ampere1BWrite_Arith : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar]>; + +def Ampere1BWrite_ArithFlagsetting : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar]>; + +//===----------------------------------------------------------------------===// +// Map the target-defined scheduler read/write resources and latencies for Ampere-1. +// This provides a coarse model, which is then specialised below. + +def : WriteRes; // MOVN, MOVZ +def : WriteRes; // ALU +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} // ALU of Shifted-Reg +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} // ALU of Extended-Reg +def : WriteRes; // EXTR shifts a reg pair +def : WriteRes; // Shift/Scale +def : WriteRes { + let Latency = 13; +} // 32-bit Divide +def : WriteRes { + let Latency = 19; +} // 64-bit Divide +def : WriteRes { + let Latency = 3; +} // 32-bit Multiply +def : WriteRes { + let Latency = 3; +} // 64-bit Multiply +def : WriteRes; +def : WriteRes; +def : WriteRes { + let Latency = 3; +} // Load from base addr plus immediate offset +def : WriteRes { + let Latency = 1; +} // Store to base addr plus immediate offset +def : WriteRes { + let Latency = 1; + let NumMicroOps = 1; +} // Store a register pair. +def : WriteRes; +def : WriteRes { + let Latency = 3; + let NumMicroOps = 1; +} // Load from a register index (maybe scaled). +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} // Store to a register index (maybe scaled). +def : WriteRes { + let Latency = 2; +} // General floating-point ops. +def : WriteRes { + let Latency = 3; +} // Floating-point compare. +def : WriteRes { + let Latency = 3; +} // Float conversion. +def : WriteRes { +} // Float-int register copy. +def : WriteRes { + let Latency = 2; +} // Float-int register copy. +def : WriteRes { + let Latency = 4; +} // Floating-point multiply. +def : WriteRes { + let Latency = 19; +} // Floating-point division. +def : WriteRes { + let Latency = 3; +} // 64bit Vector D ops. +def : WriteRes { + let Latency = 3; +} // 128bit Vector Q ops. +def : WriteRes { + let Latency = 4; +} // Vector loads. +def : WriteRes { + let Latency = 2; +} // Vector stores. + +def : WriteRes { let Unsupported = 1; } + +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +def : WriteRes { + let Latency = 3; +} // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP + +// Forwarding logic. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// Specialising the scheduling model further for Ampere-1B. 
+ +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs COPY)>; + +// Branch instructions +def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs Bcc, BL, RET)>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; +def : InstRW<[Ampere1BWrite_1cyc_2A], (instrs BLR)>; + +// Common Short Sequence Compression (CSSC) +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs ABS)>; +def : InstRW<[Ampere1BWrite_1cyc_1BS], (instrs CNT)>; +def : InstRW<[Ampere1BWrite_1cyc_1AB_1A], (instrs SMAX, SMIN)>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs CTZ)>; +def : InstRW<[Ampere1BWrite_1cyc_1AB_1A], (instrs UMAX, USMIN)>; + +// Cryptography instructions +// -- AES encryption/decryption +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^AES[DE]")>; +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^AESI?MC")>; +// -- Polynomial multiplication +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^PMUL", "^PMULL")>; +// -- SHA-256 hash +def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA256(H|H2)")>; +// -- SHA-256 schedule update +def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA256SU[01]")>; +// -- SHA-3 instructions +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>; +// -- SHA-512 hash +def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA512(H|H2)")>; +// -- SHA-512 schedule update +def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA512SU[01]")>; +// -- SHA1 choose/majority/parity +def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA1[CMP]")>; +// -- SHA1 hash/schedule update +def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA1SU[01]")>; +def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA1H")>; +// -- SM3 hash +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SM3PARTW[12]$", "^SM3SS1$", "^SM3TT[12][AB]$"0)>; +def : InstRW<[Ampere1BWrite_4cyc_1X], (instrs SM4E, SM4ENCKEY)>; + +// FP and vector load instructions +// -- Load 1-element structure to one/all lanes +// ---- all lanes +def : InstRW<[Ampere1BWrite_6cyc_1L_1XY], + (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// ---- one lane +def : InstRW<[Ampere1BWrite_6cyc_1L_1XY], + (instregex "^LD1i(8|16|32|64)")>; +// -- Load 1-element structure to one/all lanes, 1D size +def : InstRW<[Ampere1BWrite_4cyc_1L], + (instregex "^LD1Rv1d")>; +// -- Load 1-element structures to 1 register +def : InstRW<[Ampere1BWrite_4cyc_1L], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 2 registers +def : InstRW<[Ampere1BWrite_4cyc_2L], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 3 registers +def : InstRW<[Ampere1BWrite_5cyc_3L], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 4 registers +def : InstRW<[Ampere1BWrite_5cyc_4L], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 2-element structure to all lanes of 2 registers, 1D size +def : InstRW<[Ampere1BWrite_4cyc_2L], + (instregex "^LD2Rv1d")>; +// -- Load 2-element structure to all lanes of 2 registers, other sizes +def : InstRW<[Ampere1BWrite_6cyc_2L_2XY], + (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 2-element structure to one lane of 2 registers +def : InstRW<[Ampere1BWrite_6cyc_2L_2XY], + (instregex "^LD2i(8|16|32|64)")>; +// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size +def : InstRW<[Ampere1BWrite_6cyc_2L_2XY], + (instregex "^LD2Twov(16b|8h|4s|2d)")>; +// -- Load 2-element structures to 2 registers, 8B/4H/2S size +def : 
InstRW<[Ampere1BWrite_8cyc_2L_3XY], + (instregex "^LD2Twov(8b|4h|2s)")>; +// -- Load 3-element structure to all lanes of 3 registers, 1D size +def : InstRW<[Ampere1BWrite_5cyc_3L], + (instregex "^LD3Rv1d")>; +// -- Load 3-element structure to all lanes of 3 registers, other sizes +def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], + (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 3-element structure to one lane of 3 registers +def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], + (instregex "^LD3i(8|16|32|64)")>; +// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1BWrite_8cyc_3L_3XY], + (instregex "^LD3Threev(16b|8h|4s)")>; +// -- Load 3-element structures to 3 registers, 2D size +def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], + (instregex "^LD3Threev2d")>; +// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1BWrite_9cyc_3L_3XY], + (instregex "^LD3Threev(8b|4h|2s)")>; +// -- Load 4-element structure to all lanes of 4 registers, 1D size +def : InstRW<[Ampere1BWrite_5cyc_4L], + (instregex "^LD4Rv1d")>; +// -- Load 4-element structure to all lanes of 4 registers, other sizes +def : InstRW<[Ampere1BWrite_7cyc_4L_4XY], + (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 4-element structure to one lane of 4 registers +def : InstRW<[Ampere1BWrite_7cyc_4L_4XY], + (instregex "^LD4i(8|16|32|64)")>; +// -- Load 4-element structures to 4 registers, 2D size +def : InstRW<[Ampere1BWrite_8cyc_4L_4XY], + (instregex "^LD4Fourv2d")>; +// -- Load 4-element structures to 4 registers, 2S size +def : InstRW<[Ampere1BWrite_11cyc_4L_8XY], + (instregex "^LD4Fourv2s")>; +// -- Load 4-element structures to 4 registers, other sizes +def : InstRW<[Ampere1BWrite_10cyc_4L_8XY], + (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>; +// -- Load pair, Q-form +def : InstRW<[Ampere1BWrite_4cyc_2L], (instregex "LDN?PQ")>; +// -- Load pair, S/D-form +def : InstRW<[Ampere1BWrite_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>; +// -- Load register +def : InstRW<[Ampere1BWrite_4cyc_1L], (instregex "LDU?R[BHSDQ]i")>; +// -- Load register, sign-extended register +def : InstRW<[Ampere1BWrite_4cyc_1L], (instregex "LDR[BHSDQ]ro(W|X)")>; + +// FP and vector store instructions +// -- Store 1-element structure from one lane of 1 register +def : InstRW<[Ampere1BWrite_4cyc_1XY_1S_1Z], + (instregex "^ST1i(8|16|32|64)")>; +// -- Store 1-element structures from 1 register +def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 2 registers +def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 3 registers +def : InstRW<[Ampere1BWrite_4cyc_3S_3Z], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 4 registers +def : InstRW<[Ampere1BWrite_5cyc_4S_4Z], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 2-element structure from one lane of 2 registers +def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z], + (instregex "^ST2i(8|16|32|64)")>; +// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes +def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z], + (instregex "^ST2Twov(16b|8h|4s|2d)")>; +// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1BWrite_6cyc_2XY_2S_2Z], + (instregex "^ST2Twov(8b|4h|2s)")>; +// -- Store 3-element structure from one lane of 3 registers +def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z], + (instregex "^ST3i(8|16|32|64)")>; +// -- Store 
3-element structures from 3 registers +def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z], + (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 4-element structure from one lane of 4 registers +def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], + (instregex "^ST4i(8|16|32|64)")>; +// -- Store 4-element structures from 4 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], + (instregex "^ST4Fourv(16b|8h|4s)")>; +// -- Store 4-element structures from 4 registers, 2D sizes +def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], + (instregex "^ST4Fourv2d")>; +// -- Store 4-element structures from 4 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1BWrite_9cyc_6XY_4S_4Z], + (instregex "^ST4Fourv(8b|4h|2s)")>; +// -- Store pair, Q-form +def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], (instregex "^STN?PQ")>; +// -- Store pair, S/D-form +def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], (instregex "^STN?P[SD]")>; +// -- Store register +def : InstRW<[Ampere1BWrite_2cyc_1S_2Z], (instregex "^STU?R[BHSDQ](ui|i)")>; +// -- Store register, sign-extended register offset +def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>; + +// FP data processing, bfloat16 format +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFCVT)>; +def : InstRW<[Ampere1BWrite_8cyc_2XY], (instrs BFCVTN, BFCVTN2)>; +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFMMLA)>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^BFMLAL")>; + +// FP data processing, scalar/vector, half precision +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>; +def : InstRW<[Ampere1BWrite_3cyc_1X], + (instregex "^FCMPE?H")>; +def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X], + (instregex "^FCCMPE?H")>; +def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY], + (instregex "^FCSELH")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>; +// Convert FP to integer, H-form +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^[SUd]CVTFv.[fi]16")>; +// Convert to FP from GPR, H-form +def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]toH$")>; +// Convert to FP from GPR, fixed-point, H-form +def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX]Hri$")>; +def : InstRW<[Ampere1BWrite_9cyc_1X], (instrs FDIVHrr)>; +def : InstRW<[Ampere1BWrite_17cyc_1X], (instregex "^FDIVv.[if]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>; +def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>; +def : InstRW<[Ampere1BWrite_9cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if]16")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX16)>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if]16")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>; +// FP square root, H-form +def : InstRW<[Ampere1BWrite_21cyc_1X], (instrs FSQRTHr)>; +// FP square root, 
vector-form, F16 +def : InstRW<[Ampere1BWrite_39cyc_1X], (instregex "^FSQRTv.f16")>; + +// FP data processing, scalar/vector, single/double precision +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1X], + (instregex "^FCMPE?(S|D)")>; +def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X], + (instregex "^FCCMPE?(S|D)")>; +def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY], + (instregex "^FCSEL(S|D)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>; +// Convert FP to integer, S/D-form +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^[SUd]CVTFv.[fi](32|64)")>; +// Convert to FP from GPR, S/D-form +def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]to[DS]$")>; +// Convert to FP from GPR, fixed-point, S/D-form +def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX][SD]ri$")>; +def : InstRW<[Ampere1BWrite_19cyc_1X], (instregex "^FDIVv.[if](64)", "FDIVD")>; +def : InstRW<[Ampere1BWrite_12cyc_1X], (instregex "^FDIVv.[if](32)", "FDIVS")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX32, FMULX64)>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?MUL")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT(32|64)")>; +def : InstRW<[Ampere1BWrite_63cyc_1X], (instregex "^FSQRTv.f64", "^FSQRTDr")>; +def : InstRW<[Ampere1BWrite_33cyc_1X], (instregex "^FSQRTv.f32", "^FSQRTSr")>; + +// FP miscellaneous instructions +def : InstRW<[Ampere1BWrite_7cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[HSD]Hr")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[HSD][SD]r")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVTLv")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT(N|XN)v")>; +def : InstRW<[Ampere1BWrite_7cyc_1X_1Z], (instrs FJCVTZS)>; +def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>; +def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>; +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>; +def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "^FMOVXDHighr")>; +def : InstRW<[Ampere1BWrite_3cyc_1Z], (instregex "^FMOV[WX][HSD]r")>; + +// Integer arithmetic and logical instructions +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "ADC(W|X)r", "SBC(W|X)r")>; +def : InstRW<[Ampere1BWrite_Arith], + (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[sx]")>; +def : InstRW<[Ampere1BWrite_1cyc_1AB], + (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[ri]")>; +def : 
InstRW<[Ampere1BWrite_ArithFlagsetting], + (instregex "(ADD|AND|BIC|SUB)S[WX]r[sx]")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "(ADD|AND|BIC|SUB)S[WX]r[ri]")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "(ADC|SBC)S[WX]r")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs RMIF)>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "(CCMN|CCMP)(X|W)")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>; +def : InstRW<[Ampere1BWrite_13cyc_1BS_1X], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[Ampere1BWrite_19cyc_2BS_1X], (instrs SDIVXr, UDIVXr)>; +def : InstRW<[Ampere1BWrite_3cyc_1BS], + (instregex "(S|U)MULHr")>; +def : InstRW<[Ampere1BWrite_4cyc_1BS_1AB], + (instregex "(S|U)?M(ADD|SUB)L?r")>; + +// Integer load instructions +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "(LDNP|LDP|LDPSW)(X|W)")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDR(B|D|H|Q|S)ui")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDR(D|Q|W|X)l")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDTR(B|H|W|X)i")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDTRS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDURS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>; +def : InstRW<[Ampere1BWrite_1cyc_1L], + (instrs PRFMl, PRFUMi, PRFUMi)>; +def : InstRW<[Ampere1BWrite_1cyc_1L], + (instrs PRFMroW, PRFMroX)>; + +// Integer miscellaneous instructions +def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs ADR, ADRP)>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "EXTR(W|X)")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>; +def : InstRW<[Ampere1BWrite_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "CLS(W|X)")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs SETF8, SETF16)>; +def : InstRW<[Ampere1BWrite_1cyc_1AB], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; +def : InstRW<[Ampere1BWrite_1cyc_1B], + (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], + (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>; + +// Integer store instructions +def : InstRW<[Ampere1BWrite_1cyc_2S], (instregex "STNP(X|W)i")>; +def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STPXi)>; +def : InstRW<[Ampere1BWrite_2cyc_1B_1S], (instrs STPWi)>; +def : InstRW<[Ampere1BWrite_2cyc_1B_1S_1AB], (instregex "STP(W|X)(pre|post)")>; +def : InstRW<[Ampere1BWrite_1cyc_1S], (instrs STTRBi, STTRHi, STTRWi, STTRXi)>; +def : InstRW<[Ampere1BWrite_1cyc_1S], (instregex "STUR(BB|HH|X|W)i", + "STR(X|W)ui", + "STUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STRWroX, STRXroX)>; +def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STRWroW, STRXroW)>; + +// Memory tagging + +// Insert Random Tags +def : InstRW<[Ampere1BWrite_1cyc_1BS_1B], (instrs IRG, IRGstack)>; +// Load allocation tag +def : InstRW<[Ampere1BWrite_4cyc_1L_1B], (instrs LDG, LDGM)>; +// Store allocation tags +def : InstRW<[Ampere1BWrite_1cyc_1S], + (instrs STGi, STGM, STGPreIndex, STGPostIndex)>; +// Store allocation tags and pair of registers +def : InstRW<[Ampere1BWrite_1cyc_2S], + (instrs STGPi, STGPpre, STGPpost)>; +// Store allocation tags and zero data +def : InstRW<[Ampere1BWrite_1cyc_1S], + (instrs STZGi, STZGM, STZGPreIndex, STZGPostIndex)>; +// Store two tags +def : InstRW<[Ampere1BWrite_1cyc_2S], + (instrs ST2Gi, ST2GPreIndex, ST2GPostIndex)>; +// 
Store two tags and zero data +def : InstRW<[Ampere1BWrite_1cyc_2S], + (instrs STZ2Gi, STZ2GPreIndex, STZ2GPostIndex)>; +// Subtract Pointer +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBP)>; +// Subtract Pointer, flagset +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBPS)>; +// Insert Tag Mask +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs GMI)>; +// Arithmetic, immediate to logical address tag +def : InstRW<[Ampere1BWrite_1cyc_B], (instrs ADDG, SUBG)>; + +// Pointer authentication +def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^AUT")>; +def : InstRW<[Ampere1BWrite_6cyc_1BS_1A], + (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>; +def : InstRW<[Ampere1BWrite_6cyc_1BS_2A], + (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>; +def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^PAC")>; +def : InstRW<[Ampere1BWrite_8cyc_1BS_1L], (instregex "^LDRA(A|B)")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs XPACD, XPACI)>; + +// Vector integer instructions +// -- absolute difference +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv", + "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>; +// -- arithmetic +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD", + "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW", + "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>; +// -- arithmetic, horizontal, 16B +def : InstRW<[Ampere1BWrite_8cyc_4XY], + (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>; +def : InstRW<[Ampere1BWrite_8cyc_4XY], + (instregex "^[SU](MIN|MAX)Vv16i8v")>; +// -- arithmetic, horizontal, 4H/4S +def : InstRW<[Ampere1BWrite_4cyc_2XY], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>; +def : InstRW<[Ampere1BWrite_4cyc_2XY], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>; +// -- arithmetic, horizontal, 8B/8H +def : InstRW<[Ampere1BWrite_6cyc_3XY], + (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>; +def : InstRW<[Ampere1BWrite_6cyc_3XY], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>; +// -- arithmetic, narrowing +def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>; +// -- arithmetic, pairwise +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>; +// -- arithmetic, saturating +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>; +// -- bit count +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^(CLS|CLZ|CNT)v")>; +// -- compare +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv", + "^CMHIv", "^CMHSv")>; +// -- compare non-zero +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^CMTSTv")>; +// -- dot product +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>; +// -- fp reciprocal estimate +def : InstRW<[Ampere1BWrite_6cyc_1X], (instregex "^FRECPEv", "^FRSQRTEv")>; +// -- integer reciprocal estimate +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>; +// -- logical +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; +// -- logical, narrowing +def : InstRW<[Ampere1BWrite_6cyc_2XY], + (instregex "RSHRNv", + "SHRNv", "SQSHRNv", "SQSHRUNv", + "UQXTNv")>; +// -- matrix multiply +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instrs SMMLA, UMMLA, USMMLA)>; +// -- max/min +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^SMAXv", 
"^SMINv", "^UMAXv", "^UMINv")>; +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; +// -- move immediate +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>; +// -- multiply +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>; +// -- multiply accumulate +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>; +// -- negation, saturating +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>; +// -- reverse bits/bytes +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>; +// -- shift +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// -- shift and accumulate +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>; +// -- shift, saturating +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU", + "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL", + "^UQSHL")>; + +// Vector miscellaneous instructions +// -- duplicate element +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^DUPv.+lane")>; +// -- duplicate from GPR +def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^DUPv.+gpr")>; +// -- extract narrow +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^XTNv")>; +// -- insert/extract element +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>; +// -- move FP immediate +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOVv")>; +// -- move element to GPR +def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "(S|U)MOVv")>; +// -- move from GPR to any element +def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>; +// -- table lookup +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>; +def : InstRW<[Ampere1BWrite_4cyc_2XY], + (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; +def : InstRW<[Ampere1BWrite_6cyc_3XY], + (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; +def : InstRW<[Ampere1BWrite_8cyc_4XY], + (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; +// -- transpose +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; +// -- zip/unzip +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>; + +} // SchedModel = Ampere1BModel -- cgit v1.1 From 78145a6bd0023ff1c218dda59b192345d773ebe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 9 Feb 2024 15:57:16 -0800 Subject: [flang][cuda] Lower attribute for procedure (#81336) This PR adds a new attribute to represent the CUDA attribute attached to procedure. This attribute is attached to the func.func operation during lowering. Other procedures information such as `launch_bounds` and `cluster_dims` will be added separately. 
--- flang/include/flang/Optimizer/Dialect/FIRAttr.td | 54 ++++++++++++++++-------- flang/include/flang/Optimizer/Support/Utils.h | 27 ++++++++++++ flang/lib/Lower/CallInterface.cpp | 7 +++ flang/lib/Optimizer/Dialect/FIRAttr.cpp | 2 +- flang/test/Lower/CUDA/cuda-proc-attribute.cuf | 34 +++++++++++++++ 5 files changed, 106 insertions(+), 18 deletions(-) create mode 100644 flang/test/Lower/CUDA/cuda-proc-attribute.cuf diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td index 422ad53..00e293e 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td +++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td @@ -58,19 +58,34 @@ def fir_FortranVariableFlagsAttr : fir_Attr<"FortranVariableFlags"> { "::fir::FortranVariableFlagsAttr::get($_builder.getContext(), $0)"; } -def CUDAconstant : I32EnumAttrCase<"Constant", 0, "constant">; -def CUDAdevice : I32EnumAttrCase<"Device", 1, "device">; -def CUDAmanaged : I32EnumAttrCase<"Managed", 2, "managed">; -def CUDApinned : I32EnumAttrCase<"Pinned", 3, "pinned">; -def CUDAshared : I32EnumAttrCase<"Shared", 4, "shared">; -def CUDAunified : I32EnumAttrCase<"Unified", 5, "unified">; -// Texture is omitted since it is obsolete and rejected by semantic. +def fir_BoxFieldAttr : I32EnumAttr< + "BoxFieldAttr", "", + [ + I32EnumAttrCase<"base_addr", 0>, + I32EnumAttrCase<"derived_type", 1> + ]> { + let cppNamespace = "fir"; +} + +// mlir::SideEffects::Resource for modelling operations which add debugging information +def DebuggingResource : Resource<"::fir::DebuggingResource">; + +//===----------------------------------------------------------------------===// +// CUDA Fortran specific attributes +//===----------------------------------------------------------------------===// def fir_CUDADataAttribute : I32EnumAttr< "CUDADataAttribute", "CUDA Fortran variable attributes", - [CUDAconstant, CUDAdevice, CUDAmanaged, CUDApinned, CUDAshared, - CUDAunified]> { + [ + I32EnumAttrCase<"Constant", 0, "constant">, + I32EnumAttrCase<"Device", 1, "device">, + I32EnumAttrCase<"Managed", 2, "managed">, + I32EnumAttrCase<"Pinned", 3, "pinned">, + I32EnumAttrCase<"Shared", 4, "shared">, + I32EnumAttrCase<"Unified", 5, "unified">, + // Texture is omitted since it is obsolete and rejected by semantic. 
+ ]> { let genSpecializedAttr = 0; let cppNamespace = "::fir"; } @@ -80,17 +95,22 @@ def fir_CUDADataAttributeAttr : let assemblyFormat = [{ ```<` $value `>` }]; } -def fir_BoxFieldAttr : I32EnumAttr< - "BoxFieldAttr", "", +def fir_CUDAProcAttribute : I32EnumAttr< + "CUDAProcAttribute", "CUDA Fortran procedure attributes", [ - I32EnumAttrCase<"base_addr", 0>, - I32EnumAttrCase<"derived_type", 1> + I32EnumAttrCase<"Host", 0, "host">, + I32EnumAttrCase<"Device", 1, "device">, + I32EnumAttrCase<"HostDevice", 2, "host_device">, + I32EnumAttrCase<"Global", 3, "global">, + I32EnumAttrCase<"GridGlobal", 4, "grid_global">, ]> { - let cppNamespace = "fir"; + let genSpecializedAttr = 0; + let cppNamespace = "::fir"; } - -// mlir::SideEffects::Resource for modelling operations which add debugging information -def DebuggingResource : Resource<"::fir::DebuggingResource">; +def fir_CUDAProcAttributeAttr : + EnumAttr { + let assemblyFormat = [{ ```<` $value `>` }]; +} #endif // FIR_DIALECT_FIR_ATTRS diff --git a/flang/include/flang/Optimizer/Support/Utils.h b/flang/include/flang/Optimizer/Support/Utils.h index 84c550a..4e06bf8 100644 --- a/flang/include/flang/Optimizer/Support/Utils.h +++ b/flang/include/flang/Optimizer/Support/Utils.h @@ -303,6 +303,33 @@ getCUDADataAttribute(mlir::MLIRContext *mlirContext, return {}; } +inline fir::CUDAProcAttributeAttr getCUDAProcAttribute( + mlir::MLIRContext *mlirContext, + std::optional cudaAttr) { + if (cudaAttr) { + fir::CUDAProcAttribute attr; + switch (*cudaAttr) { + case Fortran::common::CUDASubprogramAttrs::Host: + attr = fir::CUDAProcAttribute::Host; + break; + case Fortran::common::CUDASubprogramAttrs::Device: + attr = fir::CUDAProcAttribute::Device; + break; + case Fortran::common::CUDASubprogramAttrs::HostDevice: + attr = fir::CUDAProcAttribute::HostDevice; + break; + case Fortran::common::CUDASubprogramAttrs::Global: + attr = fir::CUDAProcAttribute::Global; + break; + case Fortran::common::CUDASubprogramAttrs::Grid_Global: + attr = fir::CUDAProcAttribute::GridGlobal; + break; + } + return fir::CUDAProcAttributeAttr::get(mlirContext, attr); + } + return {}; +} + } // namespace fir #endif // FORTRAN_OPTIMIZER_SUPPORT_UTILS_H diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp index 9c32b71..41597c1 100644 --- a/flang/lib/Lower/CallInterface.cpp +++ b/flang/lib/Lower/CallInterface.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "flang/Lower/CallInterface.h" +#include "flang/Common/Fortran.h" #include "flang/Evaluate/fold.h" #include "flang/Lower/Bridge.h" #include "flang/Lower/Mangler.h" @@ -559,6 +560,12 @@ void Fortran::lower::CallInterface::declare() { func.setArgAttrs(placeHolder.index(), placeHolder.value().attributes); side().setFuncAttrs(func); } + if (characteristic && characteristic->cudaSubprogramAttrs) { + func.getOperation()->setAttr( + fir::getCUDAAttrName(), + fir::getCUDAProcAttribute(func.getContext(), + *characteristic->cudaSubprogramAttrs)); + } } } diff --git a/flang/lib/Optimizer/Dialect/FIRAttr.cpp b/flang/lib/Optimizer/Dialect/FIRAttr.cpp index 218fa50..8df7a6c 100644 --- a/flang/lib/Optimizer/Dialect/FIRAttr.cpp +++ b/flang/lib/Optimizer/Dialect/FIRAttr.cpp @@ -298,5 +298,5 @@ void fir::printFirAttribute(FIROpsDialect *dialect, mlir::Attribute attr, void FIROpsDialect::registerAttributes() { addAttributes(); + UpperBoundAttr, CUDADataAttributeAttr, CUDAProcAttributeAttr>(); } diff --git a/flang/test/Lower/CUDA/cuda-proc-attribute.cuf 
b/flang/test/Lower/CUDA/cuda-proc-attribute.cuf new file mode 100644 index 0000000..0507310 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-proc-attribute.cuf @@ -0,0 +1,34 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s +! RUN: bbc -emit-hlfir -fcuda %s -o - | fir-opt -convert-hlfir-to-fir | FileCheck %s + +! Test lowering of CUDA attribute on procedures. + +attributes(host) subroutine sub_host(); end +! CHECK: func.func @_QPsub_host() attributes {fir.cuda_attr = #fir.cuda_proc<host>} + +attributes(device) subroutine sub_device(); end +! CHECK: func.func @_QPsub_device() attributes {fir.cuda_attr = #fir.cuda_proc<device>} + +attributes(host) attributes(device) subroutine sub_host_device; end +! CHECK: func.func @_QPsub_host_device() attributes {fir.cuda_attr = #fir.cuda_proc<host_device>} + +attributes(device) attributes(host) subroutine sub_device_host; end +! CHECK: func.func @_QPsub_device_host() attributes {fir.cuda_attr = #fir.cuda_proc<host_device>} + +attributes(global) subroutine sub_global(); end +! CHECK: func.func @_QPsub_global() attributes {fir.cuda_attr = #fir.cuda_proc<global>} + +attributes(grid_global) subroutine sub_grid_global(); end +! CHECK: func.func @_QPsub_grid_global() attributes {fir.cuda_attr = #fir.cuda_proc<grid_global>} + +attributes(host) integer function fct_host(); end +! CHECK: func.func @_QPfct_host() -> i32 attributes {fir.cuda_attr = #fir.cuda_proc<host>} + +attributes(device) integer function fct_device(); end +! CHECK: func.func @_QPfct_device() -> i32 attributes {fir.cuda_attr = #fir.cuda_proc<device>} + +attributes(host) attributes(device) integer function fct_host_device; end +! CHECK: func.func @_QPfct_host_device() -> i32 attributes {fir.cuda_attr = #fir.cuda_proc<host_device>} + +attributes(device) attributes(host) integer function fct_device_host; end +! CHECK: func.func @_QPfct_device_host() -> i32 attributes {fir.cuda_attr = #fir.cuda_proc<host_device>} -- cgit v1.1 From 8509f75d618a41b946391a73cdbfee53565fbf85 Mon Sep 17 00:00:00 2001 From: Philipp Tomsich Date: Sat, 10 Feb 2024 00:57:08 +0100 Subject: Revert "[AArch64] Add Ampere1B scheduling/pipeline model (#81338)" This reverts commit 014401158bbbc6899144905c1eb9e44fac86867e.
--- llvm/lib/Target/AArch64/AArch64.td | 3 +- llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td | 1061 ----------------------- 2 files changed, 1 insertion(+), 1063 deletions(-) delete mode 100644 llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 156c48e..e76204f 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -837,7 +837,6 @@ include "AArch64SchedA64FX.td" include "AArch64SchedThunderX3T110.td" include "AArch64SchedTSV110.td" include "AArch64SchedAmpere1.td" -include "AArch64SchedAmpere1B.td" include "AArch64SchedNeoverseN1.td" include "AArch64SchedNeoverseN2.td" include "AArch64SchedNeoverseV1.td" @@ -1723,7 +1722,7 @@ def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1, def : ProcessorModel<"ampere1a", Ampere1Model, ProcessorFeatures.Ampere1A, [TuneAmpere1A]>; -def : ProcessorModel<"ampere1b", Ampere1BModel, ProcessorFeatures.Ampere1B, +def : ProcessorModel<"ampere1b", Ampere1Model, ProcessorFeatures.Ampere1B, [TuneAmpere1B]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td deleted file mode 100644 index 43da762..0000000 --- a/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td +++ /dev/null @@ -1,1061 +0,0 @@ -//=- AArch64SchedAmpere1B.td - Ampere-1B scheduling def -----*- tablegen -*-=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the machine model for the Ampere Computing Ampere-1B to -// support instruction scheduling and other instruction cost heuristics. -// -//===----------------------------------------------------------------------===// - -// The Ampere-1 core is an out-of-order micro-architecture. The front -// end has branch prediction, with a 10-cycle recovery time from a -// mispredicted branch. Instructions coming out of the front end are -// decoded into internal micro-ops (uops). - -def Ampere1BModel : SchedMachineModel { - let IssueWidth = 4; // 4-way decode and dispatch - let MicroOpBufferSize = 192; // micro-op re-order buffer size - let LoadLatency = 3; // Optimistic load latency - let MispredictPenalty = 10; // Branch mispredict penalty - let LoopMicroOpBufferSize = 32; // Instruction queue size - let CompleteModel = 0; - - list UnsupportedFeatures = !listconcat(SVEUnsupported.F, - SMEUnsupported.F, - PAUnsupported.F); -} - -let SchedModel = Ampere1BModel in { - -//===----------------------------------------------------------------------===// -// Define each kind of processor resource and number available on Ampere-1. -// Ampere-1 has 12 pipelines that 8 independent scheduler (4 integer, 2 FP, -// and 2 memory) issue into. The integer and FP schedulers can each issue -// one uop per cycle, while the memory schedulers can each issue one load -// and one store address calculation per cycle. 
- -def Ampere1BUnitA : ProcResource<2>; // integer single-cycle, branch, and flags r/w -def Ampere1BUnitB : ProcResource<2>; // integer single-cycle, and complex shifts -def Ampere1BUnitBS : ProcResource<1>; // integer multi-cycle -def Ampere1BUnitL : ProcResource<2>; // load -def Ampere1BUnitS : ProcResource<2>; // store address calculation -def Ampere1BUnitX : ProcResource<1>; // FP and vector operations, and flag write -def Ampere1BUnitY : ProcResource<1>; // FP and vector operations, and crypto -def Ampere1BUnitZ : ProcResource<1>; // FP store data and FP-to-integer moves - -def Ampere1BUnitAB : ProcResGroup<[Ampere1BUnitA, Ampere1BUnitB]>; -def Ampere1BUnitXY : ProcResGroup<[Ampere1BUnitX, Ampere1BUnitY]>; - -//===----------------------------------------------------------------------===// -// Define customized scheduler read/write types specific to the Ampere-1. - -def Ampere1BWrite_1cyc_1A : SchedWriteRes<[Ampere1BUnitA]> { - let Latency = 1; - let NumMicroOps = 1; -} - -def Ampere1BWrite_1cyc_2A : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitA]> { - let Latency = 1; - let NumMicroOps = 2; -} - -def Ampere1BWrite_1cyc_1B : SchedWriteRes<[Ampere1BUnitB]> { - let Latency = 1; - let NumMicroOps = 1; -} - -def Ampere1BWrite_1cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { - let Latency = 1; - let NumMicroOps = 1; -} - -def Ampere1BWrite_1cyc_1BS_1B : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitB]> { - let Latency = 1; - let NumMicroOps = 2; -} - -def Ampere1BWrite_1cyc_1AB : SchedWriteRes<[Ampere1BUnitAB]> { - let Latency = 1; - let NumMicroOps = 1; -} - -def Ampere1BWrite_1cyc_1AB_1A : SchedWriteRes<[Ampere1BUnitAB, Ampere1BUnitA]> { - let Latency = 1; - let NumMicroOps = 2; -} - -def Ampere1BWrite_1cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { - let Latency = 1; - let NumMicroOps = 1; -} - -def Ampere1BWrite_1cyc_1S : SchedWriteRes<[Ampere1BUnitS]> { - let Latency = 1; - let NumMicroOps = 1; -} - -def Ampere1BWrite_1cyc_2S : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS]> { - let Latency = 1; - let NumMicroOps = 2; -} - -def Ampere1BWrite_2cyc_1Y : SchedWriteRes<[Ampere1BUnitY]> { - let Latency = 2; - let NumMicroOps = 1; -} - -def Ampere1BWrite_2cyc_2AB : SchedWriteRes<[Ampere1BUnitAB, Ampere1BUnitAB]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def Ampere1BWrite_2cyc_1B_1AB : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitAB]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def Ampere1BWrite_2cyc_1B_1S : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitS]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def Ampere1BWrite_2cyc_1B_1S_1AB : SchedWriteRes<[Ampere1BUnitB, - Ampere1BUnitS, - Ampere1BUnitAB]> { - let Latency = 2; - let NumMicroOps = 3; -} - -def Ampere1BWrite_2cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { - let Latency = 2; - let NumMicroOps = 1; -} - -def Ampere1BWrite_2cyc_1S_1Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitZ]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def Ampere1BWrite_3cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { - let Latency = 3; - let NumMicroOps = 1; -} - -def Ampere1BWrite_3cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { - let Latency = 3; - let NumMicroOps = 1; -} - -def Ampere1BWrite_3cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 3; - let NumMicroOps = 1; -} - -def Ampere1BWrite_3cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { - let Latency = 3; - let NumMicroOps = 1; -} - -def Ampere1BWrite_3cyc_1Z : SchedWriteRes<[Ampere1BUnitZ]> { - let Latency = 3; - let NumMicroOps = 1; -} - -def Ampere1BWrite_3cyc_1S_1Z : SchedWriteRes<[Ampere1BUnitS, - 
Ampere1BUnitZ]> { - let Latency = 3; - let NumMicroOps = 2; -} - -def Ampere1BWrite_3cyc_1S_2Z : SchedWriteRes<[Ampere1BUnitS, - Ampere1BUnitZ, Ampere1BUnitZ]> { - let Latency = 3; - let NumMicroOps = 3; -} - -def Ampere1BWrite_3cyc_2S_2Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS, - Ampere1BUnitZ, Ampere1BUnitZ]> { - let Latency = 3; - let NumMicroOps = 4; -} - -def Ampere1BWrite_4cyc_1BS_1AB : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitAB]> { - let Latency = 4; - let NumMicroOps = 2; -} - -def Ampere1BWrite_4cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { - let Latency = 4; - let NumMicroOps = 1; -} - -def Ampere1BWrite_4cyc_1L_1B : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitB]> { - let Latency = 4; - let NumMicroOps = 2; -} - -def Ampere1BWrite_4cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 4; - let NumMicroOps = 1; -} - -def Ampere1BWrite_4cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { - let Latency = 4; - let NumMicroOps = 1; -} - -def Ampere1BWrite_4cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { - let Latency = 4; - let NumMicroOps = 2; -} - -def Ampere1BWrite_5cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { - let Latency = 5; - let NumMicroOps = 1; -} - -def Ampere1BWrite_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1BUnitXY, - Ampere1BUnitS - Ampere1BUnitZ]> { - let Latency = 4; - let NumMicroOps = 3; -} - -def Ampere1BWrite_5cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { - let Latency = 5; - let NumMicroOps = 1; -} - -def Ampere1BWrite_5cyc_4S_4Z : SchedWriteRes<[Ampere1BUnitL, - Ampere1BUnitBS]> { - let Latency = 5; - let NumMicroOps = 8; -} - -def Ampere1BWrite_5cyc_1L_1BS : SchedWriteRes<[Ampere1BUnitL, - Ampere1BUnitBS]> { - let Latency = 5; - let NumMicroOps = 2; -} - -def Ampere1BWrite_5cyc_3L : SchedWriteRes<[Ampere1BUnitL, - Ampere1BUnitL, - Ampere1BUnitL]> { - let Latency = 5; - let NumMicroOps = 3; -} - -def Ampere1BWrite_5cyc_4L : SchedWriteRes<[Ampere1BUnitL, - Ampere1BUnitL, - Ampere1BUnitL, - Ampere1BUnitL]> { - let Latency = 5; - let NumMicroOps = 4; -} - -def Ampere1BWrite_5cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 5; - let NumMicroOps = 1; -} - -def Ampere1BWrite_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitS, Ampere1BUnitS, - Ampere1BUnitZ, Ampere1BUnitZ]> { - let Latency = 5; - let NumMicroOps = 6; -} - -def Ampere1BWrite_6cyc_1BS_1A : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitA]> { - let Latency = 6; - let NumMicroOps = 2; -} - -def Ampere1BWrite_6cyc_1BS_2A : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitA, - Ampere1BUnitA]> { - let Latency = 6; - let NumMicroOps = 3; -} - -def Ampere1BWrite_6cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 6; - let NumMicroOps = 2; -} - -def Ampere1BWrite_6cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { - let Latency = 6; - let NumMicroOps = 2; -} - -def Ampere1BWrite_6cyc_3XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitXY]> { - let Latency = 6; - let NumMicroOps = 3; -} - -def Ampere1BWrite_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitS, Ampere1BUnitS, - Ampere1BUnitZ, Ampere1BUnitZ]> { - let Latency = 6; - let NumMicroOps = 6; -} - -def Ampere1BWrite_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitS, Ampere1BUnitS, Ampere1BUnitS, - Ampere1BUnitZ, Ampere1BUnitZ, Ampere1BUnitZ]> { - let Latency = 6; - let NumMicroOps = 9; -} - -def Ampere1BWrite_7cyc_1BS_1XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY]> { - let Latency = 7; - let NumMicroOps = 2; -} - 
-def Ampere1BWrite_7cyc_1XY_1Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitZ]> { - let Latency = 7; - let NumMicroOps = 2; -} - -def Ampere1BWrite_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitS, Ampere1BUnitS, - Ampere1BUnitS, Ampere1BUnitS, - Ampere1BUnitZ, Ampere1BUnitZ, - Ampere1BUnitZ, Ampere1BUnitZ]> { - let Latency = 7; - let NumMicroOps = 12; -} - -def Ampere1BWrite_8cyc_1BS_1L : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitL]> { - let Latency = 8; - let NumMicroOps = 2; -} - -def Ampere1BWrite_8cyc_1BS_1XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY]> { - let Latency = 8; - let NumMicroOps = 2; -} - -def Ampere1BWrite_8cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { - let Latency = 8; - let NumMicroOps = 2; -} - -def Ampere1BWrite_8cyc_4XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitXY, Ampere1BUnitXY]> { - let Latency = 8; - let NumMicroOps = 4; -} - -def Ampere1BWrite_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitS, Ampere1BUnitS, - Ampere1BUnitS, Ampere1BUnitS, - Ampere1BUnitZ, Ampere1BUnitZ, - Ampere1BUnitZ, Ampere1BUnitZ]> { - let Latency = 9; - let NumMicroOps = 14; -} - -def Ampere1BWrite_9cyc_1A_1BS_1X : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitX]> { - let Latency = 9; - let NumMicroOps = 3; -} - -def Ampere1BWrite_9cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitXY]> { - let Latency = 9; - let NumMicroOps = 3; -} - -def Ampere1BWrite_9cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 9; - let NumMicroOps = 1; -} - -def Ampere1BWrite_9cyc_3XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY]> { - let Latency = 9; - let NumMicroOps = 3; -} - -def Ampere1BWrite_11cyc_1BS_2XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY, Ampere1BUnitXY]> { - let Latency = 11; - let NumMicroOps = 3; -} - -def Ampere1BWrite_12cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 12; - let NumMicroOps = 1; -} - -def Ampere1BWrite_13cyc_1BS_1X : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitX]> { - let Latency = 13; - let NumMicroOps = 2; -} - -def Ampere1BWrite_17cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 17; - let NumMicroOps = 1; -} - -def Ampere1BWrite_19cyc_2BS_1X : SchedWriteRes<[Ampere1BUnitBS, - Ampere1BUnitBS, - Ampere1BUnitX]> { - let Latency = 13; - let NumMicroOps = 3; -} - -def Ampere1BWrite_19cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 19; - let NumMicroOps = 1; -} - -def Ampere1BWrite_21cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 21; - let NumMicroOps = 1; -} - -def Ampere1BWrite_33cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 33; - let NumMicroOps = 1; -} - -def Ampere1BWrite_39cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 39; - let NumMicroOps = 1; -} - -def Ampere1BWrite_63cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 63; - let NumMicroOps = 1; -} - -// For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4), -// which are a single uop, and for extended registers, which have full flexibility -// across Unit A or B for both uops. 
-def Ampere1BWrite_Arith : SchedWriteVariant<[ - SchedVar, - SchedVar, - SchedVar]>; - -def Ampere1BWrite_ArithFlagsetting : SchedWriteVariant<[ - SchedVar, - SchedVar, - SchedVar]>; - -//===----------------------------------------------------------------------===// -// Map the target-defined scheduler read/write resources and latencies for Ampere-1. -// This provides a coarse model, which is then specialised below. - -def : WriteRes; // MOVN, MOVZ -def : WriteRes; // ALU -def : WriteRes { - let Latency = 2; - let NumMicroOps = 2; -} // ALU of Shifted-Reg -def : WriteRes { - let Latency = 2; - let NumMicroOps = 2; -} // ALU of Extended-Reg -def : WriteRes; // EXTR shifts a reg pair -def : WriteRes; // Shift/Scale -def : WriteRes { - let Latency = 13; -} // 32-bit Divide -def : WriteRes { - let Latency = 19; -} // 64-bit Divide -def : WriteRes { - let Latency = 3; -} // 32-bit Multiply -def : WriteRes { - let Latency = 3; -} // 64-bit Multiply -def : WriteRes; -def : WriteRes; -def : WriteRes { - let Latency = 3; -} // Load from base addr plus immediate offset -def : WriteRes { - let Latency = 1; -} // Store to base addr plus immediate offset -def : WriteRes { - let Latency = 1; - let NumMicroOps = 1; -} // Store a register pair. -def : WriteRes; -def : WriteRes { - let Latency = 3; - let NumMicroOps = 1; -} // Load from a register index (maybe scaled). -def : WriteRes { - let Latency = 1; - let NumMicroOps = 2; -} // Store to a register index (maybe scaled). -def : WriteRes { - let Latency = 2; -} // General floating-point ops. -def : WriteRes { - let Latency = 3; -} // Floating-point compare. -def : WriteRes { - let Latency = 3; -} // Float conversion. -def : WriteRes { -} // Float-int register copy. -def : WriteRes { - let Latency = 2; -} // Float-int register copy. -def : WriteRes { - let Latency = 4; -} // Floating-point multiply. -def : WriteRes { - let Latency = 19; -} // Floating-point division. -def : WriteRes { - let Latency = 3; -} // 64bit Vector D ops. -def : WriteRes { - let Latency = 3; -} // 128bit Vector Q ops. -def : WriteRes { - let Latency = 4; -} // Vector loads. -def : WriteRes { - let Latency = 2; -} // Vector stores. - -def : WriteRes { let Unsupported = 1; } - -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } - -def : WriteRes { - let Latency = 3; -} // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP - -// Forwarding logic. -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; - -//===----------------------------------------------------------------------===// -// Specialising the scheduling model further for Ampere-1B. 
- -def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs COPY)>; - -// Branch instructions -def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs Bcc, BL, RET)>; -def : InstRW<[Ampere1BWrite_1cyc_1A], - (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; -def : InstRW<[Ampere1BWrite_1cyc_2A], (instrs BLR)>; - -// Common Short Sequence Compression (CSSC) -def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs ABS)>; -def : InstRW<[Ampere1BWrite_1cyc_1BS], (instrs CNT)>; -def : InstRW<[Ampere1BWrite_1cyc_1AB_1A], (instrs SMAX, SMIN)>; -def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs CTZ)>; -def : InstRW<[Ampere1BWrite_1cyc_1AB_1A], (instrs UMAX, USMIN)>; - -// Cryptography instructions -// -- AES encryption/decryption -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^AES[DE]")>; -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^AESI?MC")>; -// -- Polynomial multiplication -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^PMUL", "^PMULL")>; -// -- SHA-256 hash -def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA256(H|H2)")>; -// -- SHA-256 schedule update -def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA256SU[01]")>; -// -- SHA-3 instructions -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>; -// -- SHA-512 hash -def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA512(H|H2)")>; -// -- SHA-512 schedule update -def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA512SU[01]")>; -// -- SHA1 choose/majority/parity -def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA1[CMP]")>; -// -- SHA1 hash/schedule update -def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA1SU[01]")>; -def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA1H")>; -// -- SM3 hash -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^SM3PARTW[12]$", "^SM3SS1$", "^SM3TT[12][AB]$"0)>; -def : InstRW<[Ampere1BWrite_4cyc_1X], (instrs SM4E, SM4ENCKEY)>; - -// FP and vector load instructions -// -- Load 1-element structure to one/all lanes -// ---- all lanes -def : InstRW<[Ampere1BWrite_6cyc_1L_1XY], - (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>; -// ---- one lane -def : InstRW<[Ampere1BWrite_6cyc_1L_1XY], - (instregex "^LD1i(8|16|32|64)")>; -// -- Load 1-element structure to one/all lanes, 1D size -def : InstRW<[Ampere1BWrite_4cyc_1L], - (instregex "^LD1Rv1d")>; -// -- Load 1-element structures to 1 register -def : InstRW<[Ampere1BWrite_4cyc_1L], - (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Load 1-element structures to 2 registers -def : InstRW<[Ampere1BWrite_4cyc_2L], - (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Load 1-element structures to 3 registers -def : InstRW<[Ampere1BWrite_5cyc_3L], - (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Load 1-element structures to 4 registers -def : InstRW<[Ampere1BWrite_5cyc_4L], - (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Load 2-element structure to all lanes of 2 registers, 1D size -def : InstRW<[Ampere1BWrite_4cyc_2L], - (instregex "^LD2Rv1d")>; -// -- Load 2-element structure to all lanes of 2 registers, other sizes -def : InstRW<[Ampere1BWrite_6cyc_2L_2XY], - (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>; -// -- Load 2-element structure to one lane of 2 registers -def : InstRW<[Ampere1BWrite_6cyc_2L_2XY], - (instregex "^LD2i(8|16|32|64)")>; -// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size -def : InstRW<[Ampere1BWrite_6cyc_2L_2XY], - (instregex "^LD2Twov(16b|8h|4s|2d)")>; -// -- Load 2-element structures to 2 registers, 8B/4H/2S size -def : 
InstRW<[Ampere1BWrite_8cyc_2L_3XY], - (instregex "^LD2Twov(8b|4h|2s)")>; -// -- Load 3-element structure to all lanes of 3 registers, 1D size -def : InstRW<[Ampere1BWrite_5cyc_3L], - (instregex "^LD3Rv1d")>; -// -- Load 3-element structure to all lanes of 3 registers, other sizes -def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], - (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>; -// -- Load 3-element structure to one lane of 3 registers -def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], - (instregex "^LD3i(8|16|32|64)")>; -// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes -def : InstRW<[Ampere1BWrite_8cyc_3L_3XY], - (instregex "^LD3Threev(16b|8h|4s)")>; -// -- Load 3-element structures to 3 registers, 2D size -def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], - (instregex "^LD3Threev2d")>; -// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes -def : InstRW<[Ampere1BWrite_9cyc_3L_3XY], - (instregex "^LD3Threev(8b|4h|2s)")>; -// -- Load 4-element structure to all lanes of 4 registers, 1D size -def : InstRW<[Ampere1BWrite_5cyc_4L], - (instregex "^LD4Rv1d")>; -// -- Load 4-element structure to all lanes of 4 registers, other sizes -def : InstRW<[Ampere1BWrite_7cyc_4L_4XY], - (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>; -// -- Load 4-element structure to one lane of 4 registers -def : InstRW<[Ampere1BWrite_7cyc_4L_4XY], - (instregex "^LD4i(8|16|32|64)")>; -// -- Load 4-element structures to 4 registers, 2D size -def : InstRW<[Ampere1BWrite_8cyc_4L_4XY], - (instregex "^LD4Fourv2d")>; -// -- Load 4-element structures to 4 registers, 2S size -def : InstRW<[Ampere1BWrite_11cyc_4L_8XY], - (instregex "^LD4Fourv2s")>; -// -- Load 4-element structures to 4 registers, other sizes -def : InstRW<[Ampere1BWrite_10cyc_4L_8XY], - (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>; -// -- Load pair, Q-form -def : InstRW<[Ampere1BWrite_4cyc_2L], (instregex "LDN?PQ")>; -// -- Load pair, S/D-form -def : InstRW<[Ampere1BWrite_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>; -// -- Load register -def : InstRW<[Ampere1BWrite_4cyc_1L], (instregex "LDU?R[BHSDQ]i")>; -// -- Load register, sign-extended register -def : InstRW<[Ampere1BWrite_4cyc_1L], (instregex "LDR[BHSDQ]ro(W|X)")>; - -// FP and vector store instructions -// -- Store 1-element structure from one lane of 1 register -def : InstRW<[Ampere1BWrite_4cyc_1XY_1S_1Z], - (instregex "^ST1i(8|16|32|64)")>; -// -- Store 1-element structures from 1 register -def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], - (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Store 1-element structures from 2 registers -def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], - (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Store 1-element structures from 3 registers -def : InstRW<[Ampere1BWrite_4cyc_3S_3Z], - (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Store 1-element structures from 4 registers -def : InstRW<[Ampere1BWrite_5cyc_4S_4Z], - (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Store 2-element structure from one lane of 2 registers -def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z], - (instregex "^ST2i(8|16|32|64)")>; -// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes -def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z], - (instregex "^ST2Twov(16b|8h|4s|2d)")>; -// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes -def : InstRW<[Ampere1BWrite_6cyc_2XY_2S_2Z], - (instregex "^ST2Twov(8b|4h|2s)")>; -// -- Store 3-element structure from one lane of 3 registers -def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z], - (instregex "^ST3i(8|16|32|64)")>; -// -- Store 
3-element structures from 3 registers -def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z], - (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Store 4-element structure from one lane of 4 registers -def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], - (instregex "^ST4i(8|16|32|64)")>; -// -- Store 4-element structures from 4 registers, 16B/8H/4S sizes -def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], - (instregex "^ST4Fourv(16b|8h|4s)")>; -// -- Store 4-element structures from 4 registers, 2D sizes -def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], - (instregex "^ST4Fourv2d")>; -// -- Store 4-element structures from 4 registers, 8B/4H/2S sizes -def : InstRW<[Ampere1BWrite_9cyc_6XY_4S_4Z], - (instregex "^ST4Fourv(8b|4h|2s)")>; -// -- Store pair, Q-form -def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], (instregex "^STN?PQ")>; -// -- Store pair, S/D-form -def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], (instregex "^STN?P[SD]")>; -// -- Store register -def : InstRW<[Ampere1BWrite_2cyc_1S_2Z], (instregex "^STU?R[BHSDQ](ui|i)")>; -// -- Store register, sign-extended register offset -def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>; - -// FP data processing, bfloat16 format -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFCVT)>; -def : InstRW<[Ampere1BWrite_8cyc_2XY], (instrs BFCVTN, BFCVTN2)>; -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFMMLA)>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^BFMLAL")>; - -// FP data processing, scalar/vector, half precision -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], - (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], - (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], - (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>; -def : InstRW<[Ampere1BWrite_3cyc_1X], - (instregex "^FCMPE?H")>; -def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X], - (instregex "^FCCMPE?H")>; -def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY], - (instregex "^FCSELH")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>; -// Convert FP to integer, H-form -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^[SUd]CVTFv.[fi]16")>; -// Convert to FP from GPR, H-form -def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]toH$")>; -// Convert to FP from GPR, fixed-point, H-form -def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX]Hri$")>; -def : InstRW<[Ampere1BWrite_9cyc_1X], (instrs FDIVHrr)>; -def : InstRW<[Ampere1BWrite_17cyc_1X], (instregex "^FDIVv.[if]16")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>; -def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>; -def : InstRW<[Ampere1BWrite_9cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if]16")>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX16)>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if]16")>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>; -// FP square root, H-form -def : InstRW<[Ampere1BWrite_21cyc_1X], (instrs FSQRTHr)>; -// FP square root, 
vector-form, F16 -def : InstRW<[Ampere1BWrite_39cyc_1X], (instregex "^FSQRTv.f16")>; - -// FP data processing, scalar/vector, single/double precision -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], - (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], - (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], - (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>; -def : InstRW<[Ampere1BWrite_3cyc_1X], - (instregex "^FCMPE?(S|D)")>; -def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X], - (instregex "^FCCMPE?(S|D)")>; -def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY], - (instregex "^FCSEL(S|D)")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>; -// Convert FP to integer, S/D-form -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^[SUd]CVTFv.[fi](32|64)")>; -// Convert to FP from GPR, S/D-form -def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]to[DS]$")>; -// Convert to FP from GPR, fixed-point, S/D-form -def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX][SD]ri$")>; -def : InstRW<[Ampere1BWrite_19cyc_1X], (instregex "^FDIVv.[if](64)", "FDIVD")>; -def : InstRW<[Ampere1BWrite_12cyc_1X], (instregex "^FDIVv.[if](32)", "FDIVS")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>; -def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX32, FMULX64)>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?MUL")>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT(32|64)")>; -def : InstRW<[Ampere1BWrite_63cyc_1X], (instregex "^FSQRTv.f64", "^FSQRTDr")>; -def : InstRW<[Ampere1BWrite_33cyc_1X], (instregex "^FSQRTv.f32", "^FSQRTSr")>; - -// FP miscellaneous instructions -def : InstRW<[Ampere1BWrite_7cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[HSD]Hr")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[HSD][SD]r")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVTLv")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT(N|XN)v")>; -def : InstRW<[Ampere1BWrite_7cyc_1X_1Z], (instrs FJCVTZS)>; -def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>; -def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>; -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>; -def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "^FMOVXDHighr")>; -def : InstRW<[Ampere1BWrite_3cyc_1Z], (instregex "^FMOV[WX][HSD]r")>; - -// Integer arithmetic and logical instructions -def : InstRW<[Ampere1BWrite_1cyc_1A], - (instregex "ADC(W|X)r", "SBC(W|X)r")>; -def : InstRW<[Ampere1BWrite_Arith], - (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[sx]")>; -def : InstRW<[Ampere1BWrite_1cyc_1AB], - (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[ri]")>; -def : 
InstRW<[Ampere1BWrite_ArithFlagsetting], - (instregex "(ADD|AND|BIC|SUB)S[WX]r[sx]")>; -def : InstRW<[Ampere1BWrite_1cyc_1A], - (instregex "(ADD|AND|BIC|SUB)S[WX]r[ri]")>; -def : InstRW<[Ampere1BWrite_1cyc_1A], - (instregex "(ADC|SBC)S[WX]r")>; -def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs RMIF)>; -def : InstRW<[Ampere1BWrite_1cyc_1A], - (instregex "(CCMN|CCMP)(X|W)")>; -def : InstRW<[Ampere1BWrite_1cyc_1A], - (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>; -def : InstRW<[Ampere1BWrite_13cyc_1BS_1X], (instrs SDIVWr, UDIVWr)>; -def : InstRW<[Ampere1BWrite_19cyc_2BS_1X], (instrs SDIVXr, UDIVXr)>; -def : InstRW<[Ampere1BWrite_3cyc_1BS], - (instregex "(S|U)MULHr")>; -def : InstRW<[Ampere1BWrite_4cyc_1BS_1AB], - (instregex "(S|U)?M(ADD|SUB)L?r")>; - -// Integer load instructions -def : InstRW<[Ampere1BWrite_3cyc_1L], - (instregex "(LDNP|LDP|LDPSW)(X|W)")>; -def : InstRW<[Ampere1BWrite_3cyc_1L], - (instregex "LDR(B|D|H|Q|S)ui")>; -def : InstRW<[Ampere1BWrite_3cyc_1L], - (instregex "LDR(D|Q|W|X)l")>; -def : InstRW<[Ampere1BWrite_3cyc_1L], - (instregex "LDTR(B|H|W|X)i")>; -def : InstRW<[Ampere1BWrite_3cyc_1L], - (instregex "LDTRS(BW|BX|HW|HX|W)i")>; -def : InstRW<[Ampere1BWrite_3cyc_1L], - (instregex "LDUR(BB|HH|X|W)i")>; -def : InstRW<[Ampere1BWrite_3cyc_1L], - (instregex "LDURS(BW|BX|HW|HX|W)i")>; -def : InstRW<[Ampere1BWrite_3cyc_1L], - (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>; -def : InstRW<[Ampere1BWrite_1cyc_1L], - (instrs PRFMl, PRFUMi, PRFUMi)>; -def : InstRW<[Ampere1BWrite_1cyc_1L], - (instrs PRFMroW, PRFMroX)>; - -// Integer miscellaneous instructions -def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs ADR, ADRP)>; -def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "EXTR(W|X)")>; -def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>; -def : InstRW<[Ampere1BWrite_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>; -def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "CLS(W|X)")>; -def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs SETF8, SETF16)>; -def : InstRW<[Ampere1BWrite_1cyc_1AB], - (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; -def : InstRW<[Ampere1BWrite_1cyc_1B], - (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>; -def : InstRW<[Ampere1BWrite_1cyc_1B], - (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>; - -// Integer store instructions -def : InstRW<[Ampere1BWrite_1cyc_2S], (instregex "STNP(X|W)i")>; -def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STPXi)>; -def : InstRW<[Ampere1BWrite_2cyc_1B_1S], (instrs STPWi)>; -def : InstRW<[Ampere1BWrite_2cyc_1B_1S_1AB], (instregex "STP(W|X)(pre|post)")>; -def : InstRW<[Ampere1BWrite_1cyc_1S], (instrs STTRBi, STTRHi, STTRWi, STTRXi)>; -def : InstRW<[Ampere1BWrite_1cyc_1S], (instregex "STUR(BB|HH|X|W)i", - "STR(X|W)ui", - "STUR(BB|HH|X|W)i")>; -def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STRWroX, STRXroX)>; -def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STRWroW, STRXroW)>; - -// Memory tagging - -// Insert Random Tags -def : InstRW<[Ampere1BWrite_1cyc_1BS_1B], (instrs IRG, IRGstack)>; -// Load allocation tag -def : InstRW<[Ampere1BWrite_4cyc_1L_1B], (instrs LDG, LDGM)>; -// Store allocation tags -def : InstRW<[Ampere1BWrite_1cyc_1S], - (instrs STGi, STGM, STGPreIndex, STGPostIndex)>; -// Store allocation tags and pair of registers -def : InstRW<[Ampere1BWrite_1cyc_2S], - (instrs STGPi, STGPpre, STGPpost)>; -// Store allocation tags and zero data -def : InstRW<[Ampere1BWrite_1cyc_1S], - (instrs STZGi, STZGM, STZGPreIndex, STZGPostIndex)>; -// Store two tags -def : InstRW<[Ampere1BWrite_1cyc_2S], - (instrs ST2Gi, ST2GPreIndex, ST2GPostIndex)>; -// 
Store two tags and zero data -def : InstRW<[Ampere1BWrite_1cyc_2S], - (instrs STZ2Gi, STZ2GPreIndex, STZ2GPostIndex)>; -// Subtract Pointer -def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBP)>; -// Subtract Pointer, flagset -def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBPS)>; -// Insert Tag Mask -def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs GMI)>; -// Arithmetic, immediate to logical address tag -def : InstRW<[Ampere1BWrite_1cyc_B], (instrs ADDG, SUBG)>; - -// Pointer authentication -def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^AUT")>; -def : InstRW<[Ampere1BWrite_6cyc_1BS_1A], - (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>; -def : InstRW<[Ampere1BWrite_6cyc_1BS_2A], - (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>; -def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^PAC")>; -def : InstRW<[Ampere1BWrite_8cyc_1BS_1L], (instregex "^LDRA(A|B)")>; -def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs XPACD, XPACI)>; - -// Vector integer instructions -// -- absolute difference -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv", - "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>; -// -- arithmetic -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD", - "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW", - "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>; -// -- arithmetic, horizontal, 16B -def : InstRW<[Ampere1BWrite_8cyc_4XY], - (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>; -def : InstRW<[Ampere1BWrite_8cyc_4XY], - (instregex "^[SU](MIN|MAX)Vv16i8v")>; -// -- arithmetic, horizontal, 4H/4S -def : InstRW<[Ampere1BWrite_4cyc_2XY], - (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>; -def : InstRW<[Ampere1BWrite_4cyc_2XY], - (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>; -// -- arithmetic, horizontal, 8B/8H -def : InstRW<[Ampere1BWrite_6cyc_3XY], - (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>; -def : InstRW<[Ampere1BWrite_6cyc_3XY], - (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>; -// -- arithmetic, narrowing -def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(ADD|SUB)HNv.*")>; -def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>; -// -- arithmetic, pairwise -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>; -// -- arithmetic, saturating -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>; -// -- bit count -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^(CLS|CLZ|CNT)v")>; -// -- compare -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv", - "^CMHIv", "^CMHSv")>; -// -- compare non-zero -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^CMTSTv")>; -// -- dot product -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>; -// -- fp reciprocal estimate -def : InstRW<[Ampere1BWrite_6cyc_1X], (instregex "^FRECPEv", "^FRSQRTEv")>; -// -- integer reciprocal estimate -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>; -// -- logical -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; -// -- logical, narrowing -def : InstRW<[Ampere1BWrite_6cyc_2XY], - (instregex "RSHRNv", - "SHRNv", "SQSHRNv", "SQSHRUNv", - "UQXTNv")>; -// -- matrix multiply -def : InstRW<[Ampere1BWrite_3cyc_1XY], - (instrs SMMLA, UMMLA, USMMLA)>; -// -- max/min -def : InstRW<[Ampere1Write_2cyc_1XY], - (instregex "^SMAXv", 
"^SMINv", "^UMAXv", "^UMINv")>; -def : InstRW<[Ampere1Write_2cyc_1XY], - (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; -// -- move immediate -def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>; -// -- multiply -def : InstRW<[Ampere1Write_3cyc_1XY], - (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>; -// -- multiply accumulate -def : InstRW<[Ampere1Write_3cyc_1XY], - (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>; -// -- negation, saturating -def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>; -// -- reverse bits/bytes -def : InstRW<[Ampere1Write_2cyc_1XY], - (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>; -// -- shift -def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; -// -- shift and accumulate -def : InstRW<[Ampere1Write_2cyc_1XY], - (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>; -// -- shift, saturating -def : InstRW<[Ampere1Write_2cyc_1XY], - (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU", - "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL", - "^UQSHL")>; - -// Vector miscellaneous instructions -// -- duplicate element -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^DUPv.+lane")>; -// -- duplicate from GPR -def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^DUPv.+gpr")>; -// -- extract narrow -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^XTNv")>; -// -- insert/extract element -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>; -// -- move FP immediate -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOVv")>; -// -- move element to GPR -def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "(S|U)MOVv")>; -// -- move from GPR to any element -def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>; -// -- table lookup -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>; -def : InstRW<[Ampere1BWrite_4cyc_2XY], - (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; -def : InstRW<[Ampere1BWrite_6cyc_3XY], - (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; -def : InstRW<[Ampere1BWrite_8cyc_4XY], - (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; -// -- transpose -def : InstRW<[Ampere1Write_2cyc_1XY], - (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; -// -- zip/unzip -def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>; - -} // SchedModel = Ampere1BModel -- cgit v1.1 From 8f23464a5d957242c89ca6f33d4379c42519cd81 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Sat, 10 Feb 2024 01:00:14 +0100 Subject: [llvm-lib][llvm-dlltool][Object] Add support for EXPORTAS name types. (#78772) EXPORTAS is a new name type in import libraries. It's used by default on ARM64EC, but it's allowed on other platforms as well. 
--- llvm/include/llvm/BinaryFormat/COFF.h | 5 +- llvm/include/llvm/Object/COFFImportFile.h | 4 ++ llvm/lib/Object/COFFImportFile.cpp | 66 ++++++++++++------- llvm/lib/Object/COFFModuleDefinition.cpp | 13 +++- llvm/test/tools/llvm-lib/exportas.test | 94 ++++++++++++++++++++++++++++ llvm/tools/llvm-readobj/COFFImportDumper.cpp | 3 + 6 files changed, 162 insertions(+), 23 deletions(-) create mode 100644 llvm/test/tools/llvm-lib/exportas.test diff --git a/llvm/include/llvm/BinaryFormat/COFF.h b/llvm/include/llvm/BinaryFormat/COFF.h index 522ee37..72461d0 100644 --- a/llvm/include/llvm/BinaryFormat/COFF.h +++ b/llvm/include/llvm/BinaryFormat/COFF.h @@ -716,7 +716,10 @@ enum ImportNameType : unsigned { IMPORT_NAME_NOPREFIX = 2, /// The import name is the public symbol name, but skipping the leading ?, /// @, or optionally _, and truncating at the first @. - IMPORT_NAME_UNDECORATE = 3 + IMPORT_NAME_UNDECORATE = 3, + /// The import name is specified as a separate string in the import library + /// object file. + IMPORT_NAME_EXPORTAS = 4 }; enum class GuardFlags : uint32_t { diff --git a/llvm/include/llvm/Object/COFFImportFile.h b/llvm/include/llvm/Object/COFFImportFile.h index 45a4a79..7c5846e9 100644 --- a/llvm/include/llvm/Object/COFFImportFile.h +++ b/llvm/include/llvm/Object/COFFImportFile.h @@ -92,6 +92,10 @@ struct COFFShortExport { /// file, this is "baz" in "EXPORTS\nfoo = bar == baz". std::string AliasTarget; + /// Specifies EXPORTAS name. In a .def file, this is "bar" in + /// "EXPORTS\nfoo EXPORTAS bar". + std::string ExportAs; + uint16_t Ordinal = 0; bool Noname = false; bool Data = false; diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp index d7d26f4..51e6274 100644 --- a/llvm/lib/Object/COFFImportFile.cpp +++ b/llvm/lib/Object/COFFImportFile.cpp @@ -71,6 +71,12 @@ StringRef COFFImportFile::getExportName() const { name = ltrim1(name, "?@_"); name = name.substr(0, name.find('@')); break; + case IMPORT_NAME_EXPORTAS: { + // Skip DLL name + name = Data.getBuffer().substr(sizeof(*hdr) + name.size() + 1); + name = name.split('\0').second.split('\0').first; + break; + } default: break; } @@ -209,6 +215,7 @@ public: // Library Format. NewArchiveMember createShortImport(StringRef Sym, uint16_t Ordinal, ImportType Type, ImportNameType NameType, + StringRef ExportName, MachineTypes Machine); // Create a weak external file which is described in PE/COFF Aux Format 3. 
@@ -500,12 +507,13 @@ NewArchiveMember ObjectFactory::createNullThunk(std::vector<uint8_t> &Buffer) { return {MemoryBufferRef{F, ImportName}}; } -NewArchiveMember ObjectFactory::createShortImport(StringRef Sym, - uint16_t Ordinal, - ImportType ImportType, - ImportNameType NameType, - MachineTypes Machine) { +NewArchiveMember +ObjectFactory::createShortImport(StringRef Sym, uint16_t Ordinal, + ImportType ImportType, ImportNameType NameType, + StringRef ExportName, MachineTypes Machine) { size_t ImpSize = ImportName.size() + Sym.size() + 2; // +2 for NULs + if (!ExportName.empty()) + ImpSize += ExportName.size() + 1; size_t Size = sizeof(coff_import_header) + ImpSize; char *Buf = Alloc.Allocate<char>(Size); memset(Buf, 0, Size); @@ -525,6 +533,10 @@ NewArchiveMember ObjectFactory::createShortImport(StringRef Sym, memcpy(P, Sym.data(), Sym.size()); P += Sym.size() + 1; memcpy(P, ImportName.data(), ImportName.size()); + if (!ExportName.empty()) { + P += ImportName.size() + 1; + memcpy(P, ExportName.data(), ExportName.size()); + } return {MemoryBufferRef(StringRef(Buf, Size), ImportName)}; } @@ -641,27 +653,39 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path, ImportType = IMPORT_CONST; StringRef SymbolName = E.SymbolName.empty() ? E.Name : E.SymbolName; - ImportNameType NameType = E.Noname - ? IMPORT_ORDINAL - : getNameType(SymbolName, E.Name, - Machine, MinGW); - Expected<std::string> Name = E.ExtName.empty() - ? std::string(SymbolName) - : replace(SymbolName, E.Name, E.ExtName); - - if (!Name) - return Name.takeError(); - - if (!E.AliasTarget.empty() && *Name != E.AliasTarget) { + std::string Name; + + if (E.ExtName.empty()) { + Name = std::string(SymbolName); + } else { + Expected<std::string> ReplacedName = + replace(SymbolName, E.Name, E.ExtName); + if (!ReplacedName) + return ReplacedName.takeError(); + Name.swap(*ReplacedName); + } + + if (!E.AliasTarget.empty() && Name != E.AliasTarget) { Members.push_back( - OF.createWeakExternal(E.AliasTarget, *Name, false, Machine)); + OF.createWeakExternal(E.AliasTarget, Name, false, Machine)); Members.push_back( - OF.createWeakExternal(E.AliasTarget, *Name, true, Machine)); + OF.createWeakExternal(E.AliasTarget, Name, true, Machine)); continue; } - Members.push_back( - OF.createShortImport(*Name, E.Ordinal, ImportType, NameType, Machine)); + ImportNameType NameType; + std::string ExportName; + if (E.Noname) { + NameType = IMPORT_ORDINAL; + } else if (!E.ExportAs.empty()) { + NameType = IMPORT_NAME_EXPORTAS; + ExportName = E.ExportAs; + } else { + NameType = getNameType(SymbolName, E.Name, Machine, MinGW); + } + + Members.push_back(OF.createShortImport(Name, E.Ordinal, ImportType, + NameType, ExportName, Machine)); } return writeArchive(Path, Members, SymtabWritingMode::NormalSymtab, diff --git a/llvm/lib/Object/COFFModuleDefinition.cpp b/llvm/lib/Object/COFFModuleDefinition.cpp index 35e6ab8..549348a 100644 --- a/llvm/lib/Object/COFFModuleDefinition.cpp +++ b/llvm/lib/Object/COFFModuleDefinition.cpp @@ -39,6 +39,7 @@ enum Kind { KwConstant, KwData, KwExports, + KwExportAs, KwHeapsize, KwLibrary, KwName, @@ -116,6 +117,7 @@ public: .Case("CONSTANT", KwConstant) .Case("DATA", KwData) .Case("EXPORTS", KwExports) + .Case("EXPORTAS", KwExportAs) .Case("HEAPSIZE", KwHeapsize) .Case("LIBRARY", KwLibrary) .Case("NAME", KwName) @@ -284,7 +286,16 @@ private: E.AliasTarget = std::string("_").append(E.AliasTarget); continue; } - unget(); + // EXPORTAS must be at the end of export definition + if (Tok.K == KwExportAs) { + read(); + if (Tok.K == Eof) + return createError( "unexpected
end of file, EXPORTAS identifier expected"); + E.ExportAs = std::string(Tok.Value); + } else { + unget(); + } Info.Exports.push_back(E); return Error::success(); } diff --git a/llvm/test/tools/llvm-lib/exportas.test b/llvm/test/tools/llvm-lib/exportas.test new file mode 100644 index 0000000..f6e845c --- /dev/null +++ b/llvm/test/tools/llvm-lib/exportas.test @@ -0,0 +1,94 @@ +Test EXPORTAS in importlibs. + +RUN: split-file %s %t.dir && cd %t.dir +RUN: llvm-lib -machine:amd64 -def:test.def -out:test.lib + +RUN: llvm-nm --print-armap test.lib | FileCheck --check-prefix=ARMAP %s + +ARMAP: Archive map +ARMAP-NEXT: __IMPORT_DESCRIPTOR_test in test.dll +ARMAP-NEXT: __NULL_IMPORT_DESCRIPTOR in test.dll +ARMAP-NEXT: __imp_func in test.dll +ARMAP-NEXT: __imp_func2 in test.dll +ARMAP-NEXT: __imp_func3 in test.dll +ARMAP-NEXT: __imp_mydata in test.dll +ARMAP-NEXT: func in test.dll +ARMAP-NEXT: func2 in test.dll +ARMAP-NEXT: func3 in test.dll +ARMAP-NEXT: test_NULL_THUNK_DATA in test.dll + +RUN: llvm-readobj test.lib | FileCheck --check-prefix=READOBJ %s + +READOBJ: File: test.lib(test.dll) +READOBJ-NEXT: Format: COFF-x86-64 +READOBJ-NEXT: Arch: x86_64 +READOBJ-NEXT: AddressSize: 64bit +READOBJ-EMPTY: +READOBJ-NEXT: File: test.lib(test.dll) +READOBJ-NEXT: Format: COFF-x86-64 +READOBJ-NEXT: Arch: x86_64 +READOBJ-NEXT: AddressSize: 64bit +READOBJ-EMPTY: +READOBJ-NEXT: File: test.lib(test.dll) +READOBJ-NEXT: Format: COFF-x86-64 +READOBJ-NEXT: Arch: x86_64 +READOBJ-NEXT: AddressSize: 64bit +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-x86-64 +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: expfunc +READOBJ-NEXT: Symbol: __imp_func +READOBJ-NEXT: Symbol: func +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-x86-64 +READOBJ-NEXT: Type: data +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: expdata +READOBJ-NEXT: Symbol: __imp_mydata +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-x86-64 +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: expfunc2 +READOBJ-NEXT: Symbol: __imp_func2 +READOBJ-NEXT: Symbol: func2 +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-x86-64 +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: expfunc3 +READOBJ-NEXT: Symbol: __imp_func3 +READOBJ-NEXT: Symbol: func3 + + +EXPORTAS must be at the end of entry declaration. 
+RUN: not llvm-lib -machine:amd64 -def:test2.def -out:test2.lib 2>&1 \ +RUN: | FileCheck --check-prefix=ERROR %s +RUN: not llvm-lib -machine:amd64 -def:test3.def -out:test3.lib 2>&1 \ +RUN: | FileCheck --check-prefix=ERROR %s +ERROR: Invalid data was encountered while parsing the file + + +#--- test.def +LIBRARY test.dll +EXPORTS + func EXPORTAS expfunc + mydata DATA EXPORTAS expdata + func2 = myfunc2 EXPORTAS expfunc2 + func3 = otherdll.otherfunc3 EXPORTAS expfunc3 + +#--- test2.def +LIBRARY test.dll +EXPORTS + func EXPORTAS expfunc + mydata EXPORTAS expdata DATA + +#--- test3.def +LIBRARY test.dll +EXPORTS + mydata EXPORTAS diff --git a/llvm/tools/llvm-readobj/COFFImportDumper.cpp b/llvm/tools/llvm-readobj/COFFImportDumper.cpp index 656ca32..0ab2a17 100644 --- a/llvm/tools/llvm-readobj/COFFImportDumper.cpp +++ b/llvm/tools/llvm-readobj/COFFImportDumper.cpp @@ -45,6 +45,9 @@ void dumpCOFFImportFile(const COFFImportFile *File, ScopedPrinter &Writer) { case COFF::IMPORT_NAME_UNDECORATE: Writer.printString("Name type", "undecorate"); break; + case COFF::IMPORT_NAME_EXPORTAS: + Writer.printString("Name type", "export as"); + break; } if (H->getNameType() != COFF::IMPORT_ORDINAL) -- cgit v1.1 From 224145ee882e32aaa1fae9ae88698cf1b07b22e4 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Fri, 9 Feb 2024 16:01:42 -0800 Subject: [DWARFDump][nfc] Fix incorrect comment (#81276) It claimed to dump all sections by default, but this hasn't been true since 2017: https://reviews.llvm.org/D37717 --- llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp index 559e7a6..8cdd84b 100644 --- a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp +++ b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp @@ -845,8 +845,9 @@ int main(int argc, char **argv) { bool OffsetRequested = false; - // Defaults to dumping all sections, unless brief mode is specified in which - // case only the .debug_info section in dumped. + // Defaults to dumping only debug_info, unless: A) verbose mode is specified, + // in which case all sections are dumped, or B) a specific section is + // requested. #define HANDLE_DWARF_SECTION(ENUM_NAME, ELF_NAME, CMDLINE_NAME, OPTION) \ if (Dump##ENUM_NAME.IsRequested) { \ DumpType |= DIDT_##ENUM_NAME; \ -- cgit v1.1 From 3a3302ef7b48f7907d0fb62b380d9d515a5f35e4 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 9 Feb 2024 16:10:10 -0800 Subject: [flang][test] Update driver-help*.f90 after 9397d23671f26ab8631e90f688ae2ea212f3c770 --- flang/test/Driver/driver-help-hidden.f90 | 2 +- flang/test/Driver/driver-help.f90 | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flang/test/Driver/driver-help-hidden.f90 b/flang/test/Driver/driver-help-hidden.f90 index 36b7d20..44dbac4 100644 --- a/flang/test/Driver/driver-help-hidden.f90 +++ b/flang/test/Driver/driver-help-hidden.f90 @@ -148,7 +148,7 @@ ! CHECK-NEXT: -Rpass= Report transformations performed by optimization passes whose name matches the given POSIX regular expression ! CHECK-NEXT: -R Enable the specified remark ! CHECK-NEXT: -save-temps= Save intermediate compilation results. -! CHECK-NEXT: -save-temps Save intermediate compilation results +! CHECK-NEXT: -save-temps Alias for --save-temps=cwd ! CHECK-NEXT: -std= Language standard to compile for ! CHECK-NEXT: -S Only run preprocess and compilation steps ! 
CHECK-NEXT: --target= Generate code for the given target diff --git a/flang/test/Driver/driver-help.f90 b/flang/test/Driver/driver-help.f90 index f69f437..b4280a4 100644 --- a/flang/test/Driver/driver-help.f90 +++ b/flang/test/Driver/driver-help.f90 @@ -134,7 +134,7 @@ ! HELP-NEXT: -Rpass= Report transformations performed by optimization passes whose name matches the given POSIX regular expression ! HELP-NEXT: -R Enable the specified remark ! HELP-NEXT: -save-temps= Save intermediate compilation results. -! HELP-NEXT: -save-temps Save intermediate compilation results +! HELP-NEXT: -save-temps Alias for --save-temps=cwd ! HELP-NEXT: -std= Language standard to compile for ! HELP-NEXT: -S Only run preprocess and compilation steps ! HELP-NEXT: --target= Generate code for the given target @@ -275,7 +275,7 @@ ! HELP-FC1-NEXT: -Rpass= Report transformations performed by optimization passes whose name matches the given POSIX regular expression ! HELP-FC1-NEXT: -R Enable the specified remark ! HELP-FC1-NEXT: -save-temps= Save intermediate compilation results. -! HELP-FC1-NEXT: -save-temps Save intermediate compilation results +! HELP-FC1-NEXT: -save-temps Alias for --save-temps=cwd ! HELP-FC1-NEXT: -std= Language standard to compile for ! HELP-FC1-NEXT: -S Only run preprocess and compilation steps ! HELP-FC1-NEXT: -target-cpu Target a specific cpu type -- cgit v1.1 From eb1b428750181ea742c547db0bc7136cd5b8f732 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Fri, 9 Feb 2024 16:52:25 -0800 Subject: [llvm][aarch64] Apple A16 & A17 had adrp-add fusion, but A14 did not (#81325) --- llvm/lib/Target/AArch64/AArch64.td | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index e76204f..3377fcf 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -1120,7 +1120,6 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", FeatureFuseArithmeticLogic, FeatureFuseCCSelect, FeatureFuseCryptoEOR, - FeatureFuseAdrpAdd, FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMove, @@ -1149,6 +1148,7 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFuseAddress, + FeatureFuseAdrpAdd, FeatureFuseAES, FeatureFuseArithmeticLogic, FeatureFuseCCSelect, @@ -1165,6 +1165,7 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFuseAddress, + FeatureFuseAdrpAdd, FeatureFuseAES, FeatureFuseArithmeticLogic, FeatureFuseCCSelect, -- cgit v1.1 From b7cc401df5ac714f5de0cbc64e6c7083d2c1d712 Mon Sep 17 00:00:00 2001 From: Enna1 Date: Sat, 10 Feb 2024 09:10:24 +0800 Subject: [hwasan] Call user provided callback function for both fatal and non-… (#80429) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …fatal error report This makes the behavior of HWASan match that of ASan: always call the user provided callback function for an error report, but only call `SetAbortMessage()` on Android when `flags()->halt_on_error` is true.
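A minimal usage sketch (an assumption for illustration, not part of the patch): user code receives these reports through the callback registered via the public hwasan interface header; the entry point name __hwasan_set_error_report_callback is assumed here.

  #include <sanitizer/hwasan_interface.h>
  #include <cstdio>

  // Receives the full textual report. With this change the callback
  // fires for both fatal and non-fatal errors, not only fatal ones.
  static void report_callback(const char *report) {
    std::fprintf(stderr, "hwasan report:\n%s", report);
  }

  int main() {
    __hwasan_set_error_report_callback(&report_callback);
    // ... exercise the application under HWASan ...
    return 0;
  }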
--- compiler-rt/lib/hwasan/hwasan_report.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index c3d260d..d3398ff 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -40,7 +40,7 @@ class ScopedReport { public: explicit ScopedReport(bool fatal) : fatal(fatal) { Lock lock(&error_message_lock_); - error_message_ptr_ = fatal ? &error_message_ : nullptr; + error_message_ptr_ = &error_message_; ++hwasan_report_count; } -- cgit v1.1 From c344953ae78b0e9545b7374a2bea35abaee18c38 Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Fri, 9 Feb 2024 17:24:27 -0800 Subject: Fix 01706e7 on 32-bit platforms Make the type match the printf format. --- llvm/tools/llvm-objdump/llvm-objdump.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 0e4f4e1..948a5d7 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -2949,7 +2949,8 @@ void Dumper::printSymbol(const SymbolRef &Symbol, outs() << '\t' << format(Fmt, ELFSymbolRef(Symbol).getSize()); else if (O.isWasm()) outs() << '\t' - << format(Fmt, cast<WasmObjectFile>(O).getSymbolSize(Symbol)); + << format(Fmt, static_cast<uint64_t>( + cast<WasmObjectFile>(O).getSymbolSize(Symbol))); if (O.isELF()) { if (!SymbolVersions.empty()) { -- cgit v1.1 From cc02e50e77419475fa958b2626600a48f8208098 Mon Sep 17 00:00:00 2001 From: Pete Steinfeld <47540744+psteinfeld@users.noreply.github.com> Date: Fri, 9 Feb 2024 18:04:53 -0800 Subject: Revert "[Flang] Update the fix of PR 80738 to cover generic interface… (#81321) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … inside modules (#81087)" This reverts commit 0802596df3d1ffd15f6b828a0f5c1e5b687a730f. See comments in PR #81087 for a test case that shows why I'm reverting. --- flang/lib/Semantics/resolve-names.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 2a42c791..36deab9 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -5648,10 +5648,9 @@ void DeclarationVisitor::Post(const parser::ProcDecl &x) { const auto &name{std::get<parser::Name>(x.t)}; const Symbol *procInterface{nullptr}; if (interfaceName_) { - Symbol *ultimate{&interfaceName_->symbol->GetUltimate()}; - procInterface = ultimate->has<GenericDetails>() - ? ultimate->get<GenericDetails>().specific() - : ultimate; + procInterface = interfaceName_->symbol->has<GenericDetails>() + ? interfaceName_->symbol->get<GenericDetails>().specific() + : interfaceName_->symbol; } auto attrs{HandleSaveName(name.source, GetAttrs())}; DerivedTypeDetails *dtDetails{nullptr}; -- cgit v1.1 From 637c37025d2a9747d440034fff7b4d549dead6f3 Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Fri, 9 Feb 2024 21:13:14 -0500 Subject: [libc][math] Add C23 math function frexpf128.
(#81337) --- libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/riscv/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/math/index.rst | 2 + libc/spec/stdc.td | 1 + libc/src/math/CMakeLists.txt | 1 + libc/src/math/frexpf128.h | 20 ++++++++++ libc/src/math/generic/CMakeLists.txt | 23 +++++++++--- libc/src/math/generic/frexpf128.cpp | 19 ++++++++++ libc/test/src/math/smoke/CMakeLists.txt | 18 ++++++--- libc/test/src/math/smoke/FrexpTest.h | 58 ++++++++++++++--------------- libc/test/src/math/smoke/frexp_test.cpp | 2 +- libc/test/src/math/smoke/frexpf128_test.cpp | 13 +++++++ libc/test/src/math/smoke/frexpf_test.cpp | 2 +- libc/test/src/math/smoke/frexpl_test.cpp | 2 +- 15 files changed, 119 insertions(+), 45 deletions(-) create mode 100644 libc/src/math/frexpf128.h create mode 100644 libc/src/math/generic/frexpf128.cpp create mode 100644 libc/test/src/math/smoke/frexpf128_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index f75b267..bc09f4881 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -386,6 +386,7 @@ if(LIBC_COMPILER_HAS_FLOAT128) libc.src.math.floorf128 libc.src.math.fmaxf128 libc.src.math.fminf128 + libc.src.math.frexpf128 libc.src.math.roundf128 libc.src.math.sqrtf128 libc.src.math.truncf128 diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 762beb9..02412e7 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -395,6 +395,7 @@ if(LIBC_COMPILER_HAS_FLOAT128) libc.src.math.floorf128 libc.src.math.fmaxf128 libc.src.math.fminf128 + libc.src.math.frexpf128 libc.src.math.roundf128 libc.src.math.sqrtf128 libc.src.math.truncf128 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 52a3ce0..8ca9375 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -414,6 +414,7 @@ if(LIBC_COMPILER_HAS_FLOAT128) libc.src.math.floorf128 libc.src.math.fmaxf128 libc.src.math.fminf128 + libc.src.math.frexpf128 libc.src.math.roundf128 libc.src.math.sqrtf128 libc.src.math.truncf128 diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 2758b42..9460449 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -176,6 +176,8 @@ Basic Operations +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | frexpl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ +| frexpf128 | |check| | |check| | | |check| | | | | | | | | | ++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | ilogb | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | ilogbf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 9c8b5e5..afddc77 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -401,6 +401,7 @@ def StdC : StandardSpec<"stdc"> { 
FunctionSpec<"frexp", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"frexpf", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"frexpl", RetValSpec, [ArgSpec, ArgSpec]>, + GuardedFunctionSpec<"frexpf128", RetValSpec, [ArgSpec, ArgSpec]], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"hypot", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"hypotf", RetValSpec, [ArgSpec, ArgSpec]>, diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 8cdd84a..985585c 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -137,6 +137,7 @@ add_math_entrypoint_object(fmodf) add_math_entrypoint_object(frexp) add_math_entrypoint_object(frexpf) add_math_entrypoint_object(frexpl) +add_math_entrypoint_object(frexpf128) add_math_entrypoint_object(hypot) add_math_entrypoint_object(hypotf) diff --git a/libc/src/math/frexpf128.h b/libc/src/math/frexpf128.h new file mode 100644 index 0000000..5d70860 --- /dev/null +++ b/libc/src/math/frexpf128.h @@ -0,0 +1,20 @@ +//===-- Implementation header for frexpf128 ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FREXPF128_H +#define LLVM_LIBC_SRC_MATH_FREXPF128_H + +#include "src/__support/macros/properties/float.h" + +namespace LIBC_NAMESPACE { + +float128 frexpf128(float128 x, int *exp); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_FREXPF128_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 3216ec3..fdf383f 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -916,10 +916,10 @@ add_entrypoint_object( frexp.cpp HDRS ../frexp.h + COMPILE_OPTIONS + -O3 DEPENDS libc.src.__support.FPUtil.manipulation_functions - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -928,10 +928,10 @@ add_entrypoint_object( frexpf.cpp HDRS ../frexpf.h + COMPILE_OPTIONS + -O3 DEPENDS libc.src.__support.FPUtil.manipulation_functions - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -940,10 +940,23 @@ add_entrypoint_object( frexpl.cpp HDRS ../frexpl.h + COMPILE_OPTIONS + -O3 DEPENDS libc.src.__support.FPUtil.manipulation_functions +) + +add_entrypoint_object( + frexpf128 + SRCS + frexpf128.cpp + HDRS + ../frexpf128.h COMPILE_OPTIONS - -O2 + -O3 + DEPENDS + libc.src.__support.macros.properties.float + libc.src.__support.FPUtil.manipulation_functions ) add_entrypoint_object( diff --git a/libc/src/math/generic/frexpf128.cpp b/libc/src/math/generic/frexpf128.cpp new file mode 100644 index 0000000..b50f37d --- /dev/null +++ b/libc/src/math/generic/frexpf128.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of frexpf128 function ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/frexpf128.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float128, frexpf128, (float128 x, int *exp)) { + return fputil::frexp(x, *exp); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 93ce0b7..0d55be5 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -779,9 +779,7 @@ add_fp_unittest( HDRS FrexpTest.h DEPENDS - libc.include.math libc.src.math.frexp - libc.src.__support.FPUtil.basic_operations ) add_fp_unittest( @@ -793,9 +791,7 @@ add_fp_unittest( HDRS FrexpTest.h DEPENDS - libc.include.math libc.src.math.frexpf - libc.src.__support.FPUtil.basic_operations ) add_fp_unittest( @@ -807,9 +803,19 @@ add_fp_unittest( HDRS FrexpTest.h DEPENDS - libc.include.math libc.src.math.frexpl - libc.src.__support.FPUtil.basic_operations +) + +add_fp_unittest( + frexpf128_test + SUITE + libc-math-smoke-tests + SRCS + frexpf128_test.cpp + HDRS + FrexpTest.h + DEPENDS + libc.src.math.frexpf128 ) # FIXME: These tests are currently broken for NVPTX. diff --git a/libc/test/src/math/smoke/FrexpTest.h b/libc/test/src/math/smoke/FrexpTest.h index 981872a..bf99a9a 100644 --- a/libc/test/src/math/smoke/FrexpTest.h +++ b/libc/test/src/math/smoke/FrexpTest.h @@ -10,81 +10,76 @@ #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" -#include - template class FrexpTest : public LIBC_NAMESPACE::testing::Test { DECLARE_SPECIAL_CONSTANTS(T) - static constexpr StorageType HIDDEN_BIT = - StorageType(1) << LIBC_NAMESPACE::fputil::FPBits::FRACTION_LEN; - public: typedef T (*FrexpFunc)(T, int *); void testSpecialNumbers(FrexpFunc func) { int exponent; - ASSERT_FP_EQ(aNaN, func(aNaN, &exponent)); - ASSERT_FP_EQ(inf, func(inf, &exponent)); - ASSERT_FP_EQ(neg_inf, func(neg_inf, &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(aNaN, &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(inf, func(inf, &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(neg_inf, &exponent)); - ASSERT_FP_EQ(0.0, func(0.0, &exponent)); - ASSERT_EQ(exponent, 0); + EXPECT_FP_EQ_ALL_ROUNDING(0.0, func(0.0, &exponent)); + EXPECT_EQ(exponent, 0); - ASSERT_FP_EQ(-0.0, func(-0.0, &exponent)); - ASSERT_EQ(exponent, 0); + EXPECT_FP_EQ_ALL_ROUNDING(-0.0, func(-0.0, &exponent)); + EXPECT_EQ(exponent, 0); } void testPowersOfTwo(FrexpFunc func) { int exponent; - EXPECT_FP_EQ(T(0.5), func(T(1.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), func(T(1.0), &exponent)); EXPECT_EQ(exponent, 1); - EXPECT_FP_EQ(T(-0.5), func(T(-1.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-1.0), &exponent)); EXPECT_EQ(exponent, 1); - EXPECT_FP_EQ(T(0.5), func(T(2.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), func(T(2.0), &exponent)); EXPECT_EQ(exponent, 2); - EXPECT_FP_EQ(T(-0.5), func(T(-2.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-2.0), &exponent)); EXPECT_EQ(exponent, 2); - EXPECT_FP_EQ(T(0.5), func(T(4.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), func(T(4.0), &exponent)); EXPECT_EQ(exponent, 3); - EXPECT_FP_EQ(T(-0.5), func(T(-4.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-4.0), &exponent)); EXPECT_EQ(exponent, 3); - EXPECT_FP_EQ(T(0.5), func(T(8.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), 
func(T(8.0), &exponent)); EXPECT_EQ(exponent, 4); - EXPECT_FP_EQ(T(-0.5), func(T(-8.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-8.0), &exponent)); EXPECT_EQ(exponent, 4); - EXPECT_FP_EQ(T(0.5), func(T(16.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), func(T(16.0), &exponent)); EXPECT_EQ(exponent, 5); - EXPECT_FP_EQ(T(-0.5), func(T(-16.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-16.0), &exponent)); EXPECT_EQ(exponent, 5); - EXPECT_FP_EQ(T(0.5), func(T(32.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), func(T(32.0), &exponent)); EXPECT_EQ(exponent, 6); - EXPECT_FP_EQ(T(-0.5), func(T(-32.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-32.0), &exponent)); EXPECT_EQ(exponent, 6); } void testSomeIntegers(FrexpFunc func) { int exponent; - EXPECT_FP_EQ(T(0.75), func(T(24.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.75), func(T(24.0), &exponent)); EXPECT_EQ(exponent, 5); - EXPECT_FP_EQ(T(-0.75), func(T(-24.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.75), func(T(-24.0), &exponent)); EXPECT_EQ(exponent, 5); - EXPECT_FP_EQ(T(0.625), func(T(40.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.625), func(T(40.0), &exponent)); EXPECT_EQ(exponent, 6); - EXPECT_FP_EQ(T(-0.625), func(T(-40.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.625), func(T(-40.0), &exponent)); EXPECT_EQ(exponent, 6); - EXPECT_FP_EQ(T(0.78125), func(T(800.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.78125), func(T(800.0), &exponent)); EXPECT_EQ(exponent, 10); - EXPECT_FP_EQ(T(-0.78125), func(T(-800.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.78125), func(T(-800.0), &exponent)); EXPECT_EQ(exponent, 10); } }; @@ -93,4 +88,5 @@ public: using LlvmLibcFrexpTest = FrexpTest; \ TEST_F(LlvmLibcFrexpTest, SpecialNumbers) { testSpecialNumbers(&func); } \ TEST_F(LlvmLibcFrexpTest, PowersOfTwo) { testPowersOfTwo(&func); } \ - TEST_F(LlvmLibcFrexpTest, SomeIntegers) { testSomeIntegers(&func); } + TEST_F(LlvmLibcFrexpTest, SomeIntegers) { testSomeIntegers(&func); } \ + static_assert(true, "Require semicolon.") diff --git a/libc/test/src/math/smoke/frexp_test.cpp b/libc/test/src/math/smoke/frexp_test.cpp index 4d078ba..79aa972 100644 --- a/libc/test/src/math/smoke/frexp_test.cpp +++ b/libc/test/src/math/smoke/frexp_test.cpp @@ -10,4 +10,4 @@ #include "src/math/frexp.h" -LIST_FREXP_TESTS(double, LIBC_NAMESPACE::frexp) +LIST_FREXP_TESTS(double, LIBC_NAMESPACE::frexp); diff --git a/libc/test/src/math/smoke/frexpf128_test.cpp b/libc/test/src/math/smoke/frexpf128_test.cpp new file mode 100644 index 0000000..a0df32f --- /dev/null +++ b/libc/test/src/math/smoke/frexpf128_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for frexpf128 -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
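// NOTE (annotation, not part of the patch): LIST_FREXP_TESTS, invoked at the
// bottom of this new file, now ends in `static_assert(true, "Require
// semicolon.")` (see the FrexpTest.h hunk above). That is a standard macro
// trick forcing call sites to supply a terminating semicolon, which is why
// the frexp/frexpf/frexpl test files in this patch gain trailing semicolons.
// A self-contained illustration of the pattern (names here are invented):
//
//   #define DECLARE_SUITE(T, func)      \
//     using SuiteAlias = FrexpTest<T>;  \
//     static_assert(true, "Require semicolon.")
//
//   DECLARE_SUITE(float128, LIBC_NAMESPACE::frexpf128); // ';' now required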
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FrexpTest.h" + +#include "src/math/frexpf128.h" + +LIST_FREXP_TESTS(float128, LIBC_NAMESPACE::frexpf128); diff --git a/libc/test/src/math/smoke/frexpf_test.cpp b/libc/test/src/math/smoke/frexpf_test.cpp index 577eb96..f2ae637e 100644 --- a/libc/test/src/math/smoke/frexpf_test.cpp +++ b/libc/test/src/math/smoke/frexpf_test.cpp @@ -10,4 +10,4 @@ #include "src/math/frexpf.h" -LIST_FREXP_TESTS(float, LIBC_NAMESPACE::frexpf) +LIST_FREXP_TESTS(float, LIBC_NAMESPACE::frexpf); diff --git a/libc/test/src/math/smoke/frexpl_test.cpp b/libc/test/src/math/smoke/frexpl_test.cpp index e5184cd..3e1f8b4 100644 --- a/libc/test/src/math/smoke/frexpl_test.cpp +++ b/libc/test/src/math/smoke/frexpl_test.cpp @@ -10,4 +10,4 @@ #include "src/math/frexpl.h" -LIST_FREXP_TESTS(long double, LIBC_NAMESPACE::frexpl) +LIST_FREXP_TESTS(long double, LIBC_NAMESPACE::frexpl); -- cgit v1.1 From c5cbfc5689a26651634e1990b430e917d1ae85da Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Fri, 9 Feb 2024 19:53:04 -0800 Subject: [clang-format] Rename option AlwaysBreakTemplateDeclarations (#81093) Drop the "Always" prefix to remove the self-contradiction. --- clang/docs/ClangFormatStyleOptions.rst | 117 +++++++++++++++-------------- clang/docs/ReleaseNotes.rst | 3 + clang/docs/tools/dump_format_style.py | 7 ++ clang/include/clang/Format/Format.h | 7 +- clang/lib/Format/Format.cpp | 6 +- clang/unittests/Format/ConfigParseTest.cpp | 13 ++++ 6 files changed, 94 insertions(+), 59 deletions(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 4ccdd2d..5deeff0 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -1659,62 +1659,8 @@ the configuration (without a prefix: ``Auto``). .. _AlwaysBreakTemplateDeclarations: -**AlwaysBreakTemplateDeclarations** (``BreakTemplateDeclarationsStyle``) :versionbadge:`clang-format 3.4` :ref:`¶ ` - The template declaration breaking style to use. - - Possible values: - - * ``BTDS_Leave`` (in configuration: ``Leave``) - Do not change the line breaking before the declaration. - - .. code-block:: c++ - - template - T foo() { - } - template T foo(int aaaaaaaaaaaaaaaaaaaaa, - int bbbbbbbbbbbbbbbbbbbbb) { - } - - * ``BTDS_No`` (in configuration: ``No``) - Do not force break before declaration. - ``PenaltyBreakTemplateDeclaration`` is taken into account. - - .. code-block:: c++ - - template T foo() { - } - template T foo(int aaaaaaaaaaaaaaaaaaaaa, - int bbbbbbbbbbbbbbbbbbbbb) { - } - - * ``BTDS_MultiLine`` (in configuration: ``MultiLine``) - Force break after template declaration only when the following - declaration spans multiple lines. - - .. code-block:: c++ - - template T foo() { - } - template - T foo(int aaaaaaaaaaaaaaaaaaaaa, - int bbbbbbbbbbbbbbbbbbbbb) { - } - - * ``BTDS_Yes`` (in configuration: ``Yes``) - Always break after template declaration. - - .. code-block:: c++ - - template - T foo() { - } - template - T foo(int aaaaaaaaaaaaaaaaaaaaa, - int bbbbbbbbbbbbbbbbbbbbb) { - } - - +**AlwaysBreakTemplateDeclarations** (``deprecated``) :versionbadge:`clang-format 3.4` :ref:`¶ ` + This option is renamed to ``BreakTemplateDeclarations``. .. _AttributeMacros: @@ -3014,6 +2960,65 @@ the configuration (without a prefix: ``Auto``). string x = "veryVeryVeryVeryVeryVeryVeryVeryVeryVeryVeryVeryLongString"; +.. 
_BreakTemplateDeclarations: + +**BreakTemplateDeclarations** (``BreakTemplateDeclarationsStyle``) :versionbadge:`clang-format 19` :ref:`¶ ` + The template declaration breaking style to use. + + Possible values: + + * ``BTDS_Leave`` (in configuration: ``Leave``) + Do not change the line breaking before the declaration. + + .. code-block:: c++ + + template + T foo() { + } + template T foo(int aaaaaaaaaaaaaaaaaaaaa, + int bbbbbbbbbbbbbbbbbbbbb) { + } + + * ``BTDS_No`` (in configuration: ``No``) + Do not force break before declaration. + ``PenaltyBreakTemplateDeclaration`` is taken into account. + + .. code-block:: c++ + + template T foo() { + } + template T foo(int aaaaaaaaaaaaaaaaaaaaa, + int bbbbbbbbbbbbbbbbbbbbb) { + } + + * ``BTDS_MultiLine`` (in configuration: ``MultiLine``) + Force break after template declaration only when the following + declaration spans multiple lines. + + .. code-block:: c++ + + template T foo() { + } + template + T foo(int aaaaaaaaaaaaaaaaaaaaa, + int bbbbbbbbbbbbbbbbbbbbb) { + } + + * ``BTDS_Yes`` (in configuration: ``Yes``) + Always break after template declaration. + + .. code-block:: c++ + + template + T foo() { + } + template + T foo(int aaaaaaaaaaaaaaaaaaaaa, + int bbbbbbbbbbbbbbbbbbbbb) { + } + + + .. _ColumnLimit: **ColumnLimit** (``Unsigned``) :versionbadge:`clang-format 3.7` :ref:`¶ ` diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 7631f3b..ece6013 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -291,6 +291,9 @@ AST Matchers clang-format ------------ +- ``AlwaysBreakTemplateDeclarations`` is deprecated and renamed to + ``BreakTemplateDeclarations``. + libclang -------- diff --git a/clang/docs/tools/dump_format_style.py b/clang/docs/tools/dump_format_style.py index e41891f..af0124b 100755 --- a/clang/docs/tools/dump_format_style.py +++ b/clang/docs/tools/dump_format_style.py @@ -308,6 +308,7 @@ class OptionsReader: enum = None nested_struct = None version = None + deprecated = False for line in self.header: self.lineno += 1 @@ -327,6 +328,8 @@ class OptionsReader: match = re.match(r"/// \\version\s*(?P[0-9.]+)*", line) if match: version = match.group("version") + elif line.startswith("/// @deprecated"): + deprecated = True elif line.startswith("///"): comment += self.__clean_comment_line(line) elif line.startswith("enum"): @@ -345,6 +348,9 @@ class OptionsReader: field_type, field_name = re.match( r"([<>:\w(,\s)]+)\s+(\w+);", line ).groups() + if deprecated: + field_type = "deprecated" + deprecated = False if not version: self.__warning(f"missing version for {field_name}", line) @@ -456,6 +462,7 @@ class OptionsReader: "std::vector", "std::vector", "std::optional", + "deprecated", ]: if option.type in enums: option.enum = enums[option.type] diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index cb14d98..b4969aa 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -1075,8 +1075,9 @@ struct FormatStyle { BTDS_Yes }; - /// The template declaration breaking style to use. + /// This option is renamed to ``BreakTemplateDeclarations``. /// \version 3.4 + /// @deprecated BreakTemplateDeclarationsStyle AlwaysBreakTemplateDeclarations; /// A vector of strings that should be interpreted as attributes/qualifiers @@ -2293,6 +2294,10 @@ struct FormatStyle { /// \version 7 BreakInheritanceListStyle BreakInheritanceList; + /// The template declaration breaking style to use. 
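  // NOTE (annotation, not part of the patch): after this change both YAML
  // keys parse into the same style field (still named
  // AlwaysBreakTemplateDeclarations here; the follow-up NFC commit below
  // renames the member itself, and only the old key is emitted for input
  // compatibility). A hedged .clang-format sketch:
  //
  //   # .clang-format
  //   BreakTemplateDeclarations: Yes          # new name, clang-format 19
  //   # AlwaysBreakTemplateDeclarations: Yes  # deprecated alias, still parsed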
+ /// \version 19 + // BreakTemplateDeclarationsStyle BreakTemplateDeclarations; + /// If ``true``, consecutive namespace declarations will be on the same /// line. If ``false``, each namespace is declared on a new line. /// \code diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index c5714af..c5a8949 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -877,6 +877,8 @@ template <> struct MappingTraits { if (!IO.outputting()) { IO.mapOptional("AlignEscapedNewlinesLeft", Style.AlignEscapedNewlines); IO.mapOptional("AllowAllConstructorInitializersOnNextLine", OnNextLine); + IO.mapOptional("AlwaysBreakTemplateDeclarations", + Style.AlwaysBreakTemplateDeclarations); IO.mapOptional("BreakBeforeInheritanceComma", BreakBeforeInheritanceComma); IO.mapOptional("BreakConstructorInitializersBeforeComma", @@ -943,8 +945,6 @@ template <> struct MappingTraits { Style.AlwaysBreakAfterReturnType); IO.mapOptional("AlwaysBreakBeforeMultilineStrings", Style.AlwaysBreakBeforeMultilineStrings); - IO.mapOptional("AlwaysBreakTemplateDeclarations", - Style.AlwaysBreakTemplateDeclarations); IO.mapOptional("AttributeMacros", Style.AttributeMacros); IO.mapOptional("BinPackArguments", Style.BinPackArguments); IO.mapOptional("BinPackParameters", Style.BinPackParameters); @@ -971,6 +971,8 @@ template <> struct MappingTraits { Style.BreakConstructorInitializers); IO.mapOptional("BreakInheritanceList", Style.BreakInheritanceList); IO.mapOptional("BreakStringLiterals", Style.BreakStringLiterals); + IO.mapOptional("BreakTemplateDeclarations", + Style.AlwaysBreakTemplateDeclarations); IO.mapOptional("ColumnLimit", Style.ColumnLimit); IO.mapOptional("CommentPragmas", Style.CommentPragmas); IO.mapOptional("CompactNamespaces", Style.CompactNamespaces); diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 7493b0a..22681a2 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -695,6 +695,19 @@ TEST(ConfigParseTest, ParsesConfiguration) { FormatStyle::RTBS_TopLevelDefinitions); Style.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes; + CHECK_PARSE("BreakTemplateDeclarations: Leave", + AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Leave); + CHECK_PARSE("BreakTemplateDeclarations: No", AlwaysBreakTemplateDeclarations, + FormatStyle::BTDS_No); + CHECK_PARSE("BreakTemplateDeclarations: MultiLine", + AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_MultiLine); + CHECK_PARSE("BreakTemplateDeclarations: Yes", AlwaysBreakTemplateDeclarations, + FormatStyle::BTDS_Yes); + CHECK_PARSE("BreakTemplateDeclarations: false", + AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_MultiLine); + CHECK_PARSE("BreakTemplateDeclarations: true", + AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Yes); + // For backward compatibility: CHECK_PARSE("AlwaysBreakTemplateDeclarations: Leave", AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Leave); CHECK_PARSE("AlwaysBreakTemplateDeclarations: No", -- cgit v1.1 From 7664ddf8811242295abb837640cad8dd8cefb5e8 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Fri, 9 Feb 2024 20:15:35 -0800 Subject: [clang-format][NFC] Drop "Always" in "AlwaysBreakTemplateDeclarations" --- clang/include/clang/Format/Format.h | 7 +++--- clang/lib/Format/ContinuationIndenter.cpp | 5 ++-- clang/lib/Format/Format.cpp | 10 ++++---- clang/lib/Format/TokenAnnotator.cpp | 6 ++--- clang/unittests/Format/ConfigParseTest.cpp | 38 +++++++++++++++--------------- 
clang/unittests/Format/FormatTest.cpp | 11 ++++----- 6 files changed, 37 insertions(+), 40 deletions(-) diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index b4969aa..ab56cc8 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -1078,7 +1078,7 @@ struct FormatStyle { /// This option is renamed to ``BreakTemplateDeclarations``. /// \version 3.4 /// @deprecated - BreakTemplateDeclarationsStyle AlwaysBreakTemplateDeclarations; + // BreakTemplateDeclarationsStyle AlwaysBreakTemplateDeclarations; /// A vector of strings that should be interpreted as attributes/qualifiers /// instead of identifiers. This can be useful for language extensions or @@ -2296,7 +2296,7 @@ struct FormatStyle { /// The template declaration breaking style to use. /// \version 19 - // BreakTemplateDeclarationsStyle BreakTemplateDeclarations; + BreakTemplateDeclarationsStyle BreakTemplateDeclarations; /// If ``true``, consecutive namespace declarations will be on the same /// line. If ``false``, each namespace is declared on a new line. @@ -4822,8 +4822,7 @@ struct FormatStyle { AlwaysBreakAfterReturnType == R.AlwaysBreakAfterReturnType && AlwaysBreakBeforeMultilineStrings == R.AlwaysBreakBeforeMultilineStrings && - AlwaysBreakTemplateDeclarations == - R.AlwaysBreakTemplateDeclarations && + BreakTemplateDeclarations == R.BreakTemplateDeclarations && AttributeMacros == R.AttributeMacros && BinPackArguments == R.BinPackArguments && BinPackParameters == R.BinPackParameters && diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 7fd04b2..0b2ef97 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -569,9 +569,8 @@ bool ContinuationIndenter::mustBreak(const LineState &State) { return true; } } - return Style.AlwaysBreakTemplateDeclarations != FormatStyle::BTDS_No && - (Style.AlwaysBreakTemplateDeclarations != - FormatStyle::BTDS_Leave || + return Style.BreakTemplateDeclarations != FormatStyle::BTDS_No && + (Style.BreakTemplateDeclarations != FormatStyle::BTDS_Leave || Current.NewlinesBefore > 0); } if (Previous.is(TT_FunctionAnnotationRParen) && diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index c5a8949..d2cc466 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -878,7 +878,7 @@ template <> struct MappingTraits { IO.mapOptional("AlignEscapedNewlinesLeft", Style.AlignEscapedNewlines); IO.mapOptional("AllowAllConstructorInitializersOnNextLine", OnNextLine); IO.mapOptional("AlwaysBreakTemplateDeclarations", - Style.AlwaysBreakTemplateDeclarations); + Style.BreakTemplateDeclarations); IO.mapOptional("BreakBeforeInheritanceComma", BreakBeforeInheritanceComma); IO.mapOptional("BreakConstructorInitializersBeforeComma", @@ -972,7 +972,7 @@ template <> struct MappingTraits { IO.mapOptional("BreakInheritanceList", Style.BreakInheritanceList); IO.mapOptional("BreakStringLiterals", Style.BreakStringLiterals); IO.mapOptional("BreakTemplateDeclarations", - Style.AlwaysBreakTemplateDeclarations); + Style.BreakTemplateDeclarations); IO.mapOptional("ColumnLimit", Style.ColumnLimit); IO.mapOptional("CommentPragmas", Style.CommentPragmas); IO.mapOptional("CompactNamespaces", Style.CompactNamespaces); @@ -1441,7 +1441,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.AlwaysBreakAfterReturnType = FormatStyle::RTBS_None; LLVMStyle.AlwaysBreakAfterDefinitionReturnType = 
FormatStyle::DRTBS_None; LLVMStyle.AlwaysBreakBeforeMultilineStrings = false; - LLVMStyle.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_MultiLine; + LLVMStyle.BreakTemplateDeclarations = FormatStyle::BTDS_MultiLine; LLVMStyle.AttributeMacros.push_back("__capability"); LLVMStyle.BitFieldColonSpacing = FormatStyle::BFCS_Both; LLVMStyle.BinPackArguments = true; @@ -1631,7 +1631,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) { FormatStyle::SIS_WithoutElse; GoogleStyle.AllowShortLoopsOnASingleLine = true; GoogleStyle.AlwaysBreakBeforeMultilineStrings = true; - GoogleStyle.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes; + GoogleStyle.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; GoogleStyle.DerivePointerAlignment = true; GoogleStyle.IncludeStyle.IncludeCategories = {{"^", 2, 0, false}, {"^<.*\\.h>", 1, 0, false}, @@ -1824,7 +1824,7 @@ FormatStyle getMozillaStyle() { MozillaStyle.AlwaysBreakAfterReturnType = FormatStyle::RTBS_TopLevel; MozillaStyle.AlwaysBreakAfterDefinitionReturnType = FormatStyle::DRTBS_TopLevel; - MozillaStyle.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes; + MozillaStyle.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; MozillaStyle.BinPackParameters = false; MozillaStyle.BinPackArguments = false; MozillaStyle.BreakBeforeBraces = FormatStyle::BS_Mozilla; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index cec56fa..b103400 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -5184,8 +5184,8 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, // concept ... if (Right.is(tok::kw_concept)) return Style.BreakBeforeConceptDeclarations == FormatStyle::BBCDS_Always; - return Style.AlwaysBreakTemplateDeclarations == FormatStyle::BTDS_Yes || - (Style.AlwaysBreakTemplateDeclarations == FormatStyle::BTDS_Leave && + return Style.BreakTemplateDeclarations == FormatStyle::BTDS_Yes || + (Style.BreakTemplateDeclarations == FormatStyle::BTDS_Leave && Right.NewlinesBefore > 0); } if (Left.ClosesRequiresClause && Right.isNot(tok::semi)) { @@ -5620,7 +5620,7 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, if (Right.is(TT_RequiresClause)) return true; if (Left.ClosesTemplateDeclaration) { - return Style.AlwaysBreakTemplateDeclarations != FormatStyle::BTDS_Leave || + return Style.BreakTemplateDeclarations != FormatStyle::BTDS_Leave || Right.NewlinesBefore > 0; } if (Left.is(TT_FunctionAnnotationRParen)) diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 22681a2..571e1eb 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -694,32 +694,32 @@ TEST(ConfigParseTest, ParsesConfiguration) { AlwaysBreakAfterReturnType, FormatStyle::RTBS_TopLevelDefinitions); - Style.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes; - CHECK_PARSE("BreakTemplateDeclarations: Leave", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Leave); - CHECK_PARSE("BreakTemplateDeclarations: No", AlwaysBreakTemplateDeclarations, + Style.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; + CHECK_PARSE("BreakTemplateDeclarations: Leave", BreakTemplateDeclarations, + FormatStyle::BTDS_Leave); + CHECK_PARSE("BreakTemplateDeclarations: No", BreakTemplateDeclarations, FormatStyle::BTDS_No); - CHECK_PARSE("BreakTemplateDeclarations: MultiLine", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_MultiLine); - CHECK_PARSE("BreakTemplateDeclarations: Yes", 
AlwaysBreakTemplateDeclarations, + CHECK_PARSE("BreakTemplateDeclarations: MultiLine", BreakTemplateDeclarations, + FormatStyle::BTDS_MultiLine); + CHECK_PARSE("BreakTemplateDeclarations: Yes", BreakTemplateDeclarations, + FormatStyle::BTDS_Yes); + CHECK_PARSE("BreakTemplateDeclarations: false", BreakTemplateDeclarations, + FormatStyle::BTDS_MultiLine); + CHECK_PARSE("BreakTemplateDeclarations: true", BreakTemplateDeclarations, FormatStyle::BTDS_Yes); - CHECK_PARSE("BreakTemplateDeclarations: false", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_MultiLine); - CHECK_PARSE("BreakTemplateDeclarations: true", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Yes); // For backward compatibility: CHECK_PARSE("AlwaysBreakTemplateDeclarations: Leave", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Leave); - CHECK_PARSE("AlwaysBreakTemplateDeclarations: No", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_No); + BreakTemplateDeclarations, FormatStyle::BTDS_Leave); + CHECK_PARSE("AlwaysBreakTemplateDeclarations: No", BreakTemplateDeclarations, + FormatStyle::BTDS_No); CHECK_PARSE("AlwaysBreakTemplateDeclarations: MultiLine", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_MultiLine); - CHECK_PARSE("AlwaysBreakTemplateDeclarations: Yes", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Yes); + BreakTemplateDeclarations, FormatStyle::BTDS_MultiLine); + CHECK_PARSE("AlwaysBreakTemplateDeclarations: Yes", BreakTemplateDeclarations, + FormatStyle::BTDS_Yes); CHECK_PARSE("AlwaysBreakTemplateDeclarations: false", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_MultiLine); + BreakTemplateDeclarations, FormatStyle::BTDS_MultiLine); CHECK_PARSE("AlwaysBreakTemplateDeclarations: true", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Yes); + BreakTemplateDeclarations, FormatStyle::BTDS_Yes); Style.AlwaysBreakAfterDefinitionReturnType = FormatStyle::DRTBS_All; CHECK_PARSE("AlwaysBreakAfterDefinitionReturnType: None", diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index b1a2247..7b65c8d 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -10638,7 +10638,7 @@ TEST_F(FormatTest, WrapsTemplateDeclarations) { " const typename aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaaaaa);"); FormatStyle AlwaysBreak = getLLVMStyle(); - AlwaysBreak.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes; + AlwaysBreak.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; verifyFormat("template \nclass C {};", AlwaysBreak); verifyFormat("template \nvoid f();", AlwaysBreak); verifyFormat("template \nvoid f() {}", AlwaysBreak); @@ -10667,7 +10667,7 @@ TEST_F(FormatTest, WrapsTemplateDeclarations) { "};"); FormatStyle NeverBreak = getLLVMStyle(); - NeverBreak.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_No; + NeverBreak.BreakTemplateDeclarations = FormatStyle::BTDS_No; verifyFormat("template class C {};", NeverBreak); verifyFormat("template void f();", NeverBreak); verifyFormat("template void f() {}", NeverBreak); @@ -10699,7 +10699,7 @@ TEST_F(FormatTest, WrapsTemplateDeclarations) { NeverBreak); auto Style = getLLVMStyle(); - Style.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Leave; + Style.BreakTemplateDeclarations = FormatStyle::BTDS_Leave; verifyNoChange("template \n" "class C {};", @@ -11297,7 +11297,7 @@ TEST_F(FormatTest, UnderstandsFunctionRefQualification) { verifyFormat("SomeType MemberFunction( const Deleted & ) &;", Spaces); FormatStyle BreakTemplate = getLLVMStyle(); - 
BreakTemplate.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes; + BreakTemplate.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; verifyFormat("struct f {\n" " template \n" @@ -11330,8 +11330,7 @@ TEST_F(FormatTest, UnderstandsFunctionRefQualification) { BreakTemplate); FormatStyle AlignLeftBreakTemplate = getLLVMStyle(); - AlignLeftBreakTemplate.AlwaysBreakTemplateDeclarations = - FormatStyle::BTDS_Yes; + AlignLeftBreakTemplate.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; AlignLeftBreakTemplate.PointerAlignment = FormatStyle::PAS_Left; verifyFormat("struct f {\n" -- cgit v1.1 From e165bea1d4ec2de96ee0548cece79d71a75ce8f8 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 9 Feb 2024 20:57:05 -0800 Subject: [lld] Fix test failures when running as root user (#81339) This makes it easier to run the tests in a containerized environment. --- lld/test/COFF/lto-cache-errors.ll | 2 +- lld/test/COFF/thinlto-emit-imports.ll | 2 +- lld/test/ELF/lto/resolution-err.ll | 2 +- lld/test/ELF/lto/thinlto-cant-write-index.ll | 2 +- lld/test/ELF/lto/thinlto-emit-imports.ll | 2 +- lld/test/MachO/invalid/invalid-lto-object-path.ll | 2 +- lld/test/MachO/thinlto-emit-imports.ll | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lld/test/COFF/lto-cache-errors.ll b/lld/test/COFF/lto-cache-errors.ll index 55244e5..a46190a 100644 --- a/lld/test/COFF/lto-cache-errors.ll +++ b/lld/test/COFF/lto-cache-errors.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ;; Not supported on windows since we use permissions to deny the creation ; UNSUPPORTED: system-windows diff --git a/lld/test/COFF/thinlto-emit-imports.ll b/lld/test/COFF/thinlto-emit-imports.ll index a9f22c1..b47a6cea 100644 --- a/lld/test/COFF/thinlto-emit-imports.ll +++ b/lld/test/COFF/thinlto-emit-imports.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ; Generate summary sections and test lld handling. ; RUN: opt -module-summary %s -o %t1.obj diff --git a/lld/test/ELF/lto/resolution-err.ll b/lld/test/ELF/lto/resolution-err.ll index 6dfa64b..f9855ab 100644 --- a/lld/test/ELF/lto/resolution-err.ll +++ b/lld/test/ELF/lto/resolution-err.ll @@ -1,5 +1,5 @@ ; UNSUPPORTED: system-windows -; REQUIRES: shell +; REQUIRES: shell, non-root-user ; RUN: llvm-as %s -o %t.bc ; RUN: touch %t.resolution.txt ; RUN: chmod u-w %t.resolution.txt diff --git a/lld/test/ELF/lto/thinlto-cant-write-index.ll b/lld/test/ELF/lto/thinlto-cant-write-index.ll index e664acb..286fcdd 100644 --- a/lld/test/ELF/lto/thinlto-cant-write-index.ll +++ b/lld/test/ELF/lto/thinlto-cant-write-index.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ; Basic ThinLTO tests. 
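; NOTE (annotation, not part of the patch): every test touched by this commit
; revokes a permission (for example `chmod u-w` on an output file or
; directory) and expects the subsequent write to fail. Root bypasses POSIX
; permission checks, so the failure never happens under uid 0; the new
; `non-root-user` REQUIRES keyword skips these tests in that environment.
; Illustrative shape only, not the literal RUN lines of this test:
;   RUN: chmod u-w %t.index.bc
;   RUN: not ld.lld ... 2>&1 | FileCheck %s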
; RUN: opt -module-summary %s -o %t1.o diff --git a/lld/test/ELF/lto/thinlto-emit-imports.ll b/lld/test/ELF/lto/thinlto-emit-imports.ll index 6d0e1e6..253ec08 100644 --- a/lld/test/ELF/lto/thinlto-emit-imports.ll +++ b/lld/test/ELF/lto/thinlto-emit-imports.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ;; Test a few properties not tested by thinlto-index-only.ll ; RUN: opt -module-summary %s -o %t1.o diff --git a/lld/test/MachO/invalid/invalid-lto-object-path.ll b/lld/test/MachO/invalid/invalid-lto-object-path.ll index 75c6a97..c862538 100644 --- a/lld/test/MachO/invalid/invalid-lto-object-path.ll +++ b/lld/test/MachO/invalid/invalid-lto-object-path.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ;; Creating read-only directories with `chmod 400` isn't supported on Windows ; UNSUPPORTED: system-windows diff --git a/lld/test/MachO/thinlto-emit-imports.ll b/lld/test/MachO/thinlto-emit-imports.ll index 47a612b..88f766f 100644 --- a/lld/test/MachO/thinlto-emit-imports.ll +++ b/lld/test/MachO/thinlto-emit-imports.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ; RUN: rm -rf %t; split-file %s %t ; Generate summary sections and test lld handling. -- cgit v1.1 From 7192c22ee43500b1a6313d1ade38e002463944a6 Mon Sep 17 00:00:00 2001 From: Mikhail Gudim Date: Sat, 10 Feb 2024 00:42:33 -0500 Subject: [GlobalISel][RISCV] Use constant pool for large integer constants. (#81101) We apply custom lowering to 64 bit constants where we use the same logic as in non-global isel: if materializing in registers is too expensive, we emit a load from constant pool. Later, during instruction selection, constant pool address is generated using `selectAddr`. --- llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 89 +++++++++++++++++++++- llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h | 4 + .../legalizer/legalize-bitreverse-rv64.mir | 33 ++++---- .../GlobalISel/legalizer/legalize-const-rv64.mir | 36 +++++++-- 4 files changed, 138 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index ae02e86..e852052 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -11,11 +11,14 @@ //===----------------------------------------------------------------------===// #include "RISCVLegalizerInfo.h" +#include "MCTargetDesc/RISCVMatInt.h" #include "RISCVMachineFunctionInfo.h" #include "RISCVSubtarget.h" +#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" @@ -182,7 +185,13 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) CTPOPActions.maxScalar(0, sXLen).scalarSameSizeAs(1, 0).lower(); } - getActionDefinitionsBuilder({G_CONSTANT, G_IMPLICIT_DEF}) + auto &ConstantActions = getActionDefinitionsBuilder(G_CONSTANT); + ConstantActions.legalFor({s32, p0}); + if (ST.is64Bit()) + ConstantActions.customFor({s64}); + ConstantActions.widenScalarToNextPow2(0).clampScalar(0, s32, sXLen); + + getActionDefinitionsBuilder(G_IMPLICIT_DEF) .legalFor({s32, sXLen, p0}) .widenScalarToNextPow2(0) .clampScalar(0, s32, sXLen); @@ -451,17 +460,95 @@ bool 
RISCVLegalizerInfo::legalizeVAStart(MachineInstr &MI, return true; } +bool RISCVLegalizerInfo::shouldBeInConstantPool(APInt APImm, + bool ShouldOptForSize) const { + unsigned BitWidth = APImm.getBitWidth(); + assert(BitWidth == 32 || BitWidth == 64); + int64_t Imm = APImm.getSExtValue(); + // All simm32 constants should be handled by isel. + // NOTE: The getMaxBuildIntsCost call below should return a value >= 2 making + // this check redundant, but small immediates are common so this check + // should have better compile time. + if (isInt<32>(Imm)) + return false; + + // We only need to cost the immediate, if constant pool lowering is enabled. + if (!STI.useConstantPoolForLargeInts()) + return false; + + RISCVMatInt::InstSeq Seq = RISCVMatInt::generateInstSeq(Imm, STI); + if (Seq.size() <= STI.getMaxBuildIntsCost()) + return false; + + // Optimizations below are disabled for opt size. If we're optimizing for + // size, use a constant pool. + if (ShouldOptForSize) + return true; + // + // Special case. See if we can build the constant as (ADD (SLLI X, C), X) do + // that if it will avoid a constant pool. + // It will require an extra temporary register though. + // If we have Zba we can use (ADD_UW X, (SLLI X, 32)) to handle cases where + // low and high 32 bits are the same and bit 31 and 63 are set. + unsigned ShiftAmt, AddOpc; + RISCVMatInt::InstSeq SeqLo = + RISCVMatInt::generateTwoRegInstSeq(Imm, STI, ShiftAmt, AddOpc); + return !(!SeqLo.empty() && (SeqLo.size() + 2) <= STI.getMaxBuildIntsCost()); +} + +// TODO: This is almost the same as LegalizerHelper::lowerFConstant and is +// target-independent. Should we move this to LegalizeHelper? +bool RISCVLegalizerInfo::emitLoadFromConstantPool( + Register DstReg, const Constant *ConstVal, + MachineIRBuilder &MIRBuilder) const { + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + MachineFunction &MF = MIRBuilder.getMF(); + const DataLayout &DL = MIRBuilder.getDataLayout(); + LLVMContext &Ctx = MF.getFunction().getContext(); + unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace(); + LLT AddrPtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace)); + LLT DstLLT = MRI.getType(DstReg); + + Align Alignment(DL.getABITypeAlign(getTypeForLLT(DstLLT, Ctx))); + + auto Addr = MIRBuilder.buildConstantPool( + AddrPtrTy, + MF.getConstantPool()->getConstantPoolIndex(ConstVal, Alignment)); + + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF), + MachineMemOperand::MOLoad, DstLLT, Alignment); + + MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, DstReg, Addr, *MMO); + return true; +} + bool RISCVLegalizerInfo::legalizeCustom( LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const { MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; GISelChangeObserver &Observer = Helper.Observer; + MachineFunction &MF = *MI.getParent()->getParent(); switch (MI.getOpcode()) { default: // No idea what to do. return false; case TargetOpcode::G_ABS: return Helper.lowerAbsToMaxNeg(MI); + // TODO: G_FCONSTANT + case TargetOpcode::G_CONSTANT: { + const Function &F = MF.getFunction(); + // TODO: if PSI and BFI are present, add " || + // llvm::shouldOptForSize(*CurMBB, PSI, BFI)". 
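  // NOTE (annotation, not part of the patch): condensed, the G_CONSTANT
  // lowering below is a cost comparison. The G_CONSTANT is kept whenever isel
  // can materialize the immediate cheaply; otherwise it becomes a load:
  //
  //   if (shouldBeInConstantPool(ConstVal->getValue(), ShouldOptForSize))
  //     emitLoadFromConstantPool(...);  // G_CONSTANT_POOL + G_LOAD
  //   else
  //     /* keep G_CONSTANT: the value is a simm32, or its            */
  //     /* RISCVMatInt::generateInstSeq cost is within               */
  //     /* STI.getMaxBuildIntsCost()                                 */;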
+ bool ShouldOptForSize = F.hasOptSize() || F.hasMinSize(); + const ConstantInt *ConstVal = MI.getOperand(1).getCImm(); + if (!shouldBeInConstantPool(ConstVal->getValue(), ShouldOptForSize)) + return true; + emitLoadFromConstantPool(MI.getOperand(0).getReg(), + MI.getOperand(1).getCImm(), MIRBuilder); + MI.eraseFromParent(); + return true; + } case TargetOpcode::G_SHL: case TargetOpcode::G_ASHR: case TargetOpcode::G_LSHR: diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h index f3ec6be..046555f 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h @@ -14,6 +14,7 @@ #define LLVM_LIB_TARGET_RISCV_RISCVMACHINELEGALIZER_H #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/Register.h" namespace llvm { @@ -36,6 +37,9 @@ public: MachineInstr &MI) const override; private: + bool shouldBeInConstantPool(APInt APImm, bool ShouldOptForSize) const; + bool emitLoadFromConstantPool(Register DstReg, const Constant *CPVal, + MachineIRBuilder &MIRBuilder) const; bool legalizeShlAshrLshr(MachineInstr &MI, MachineIRBuilder &MIRBuilder, GISelChangeObserver &Observer) const; diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir index f4a098d..d147350 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir @@ -220,25 +220,28 @@ body: | ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[LSHR3]], [[C5]] ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[OR5]], [[AND5]] ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1085102592571150096 - ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[OR6]], [[C8]] + ; CHECK-NEXT: [[CONSTANT_POOL:%[0-9]+]]:_(p0) = G_CONSTANT_POOL %const.2 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[CONSTANT_POOL]](p0) :: (load (s64) from constant-pool) + ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[OR6]], [[LOAD]] ; CHECK-NEXT: [[LSHR4:%[0-9]+]]:_(s64) = G_LSHR [[AND6]], [[C7]](s64) ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s64) = G_SHL [[OR6]], [[C7]](s64) - ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SHL4]], [[C8]] + ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SHL4]], [[LOAD]] ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s64) = G_OR [[LSHR4]], [[AND7]] - ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 -3689348814741910324 - ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s64) = G_AND [[OR7]], [[C10]] - ; CHECK-NEXT: [[LSHR5:%[0-9]+]]:_(s64) = G_LSHR [[AND8]], [[C9]](s64) - ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[OR7]], [[C9]](s64) - ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SHL5]], [[C10]] + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[CONSTANT_POOL1:%[0-9]+]]:_(p0) = G_CONSTANT_POOL %const.1 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[CONSTANT_POOL1]](p0) :: (load (s64) from constant-pool) + ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s64) = G_AND [[OR7]], [[LOAD1]] + ; CHECK-NEXT: [[LSHR5:%[0-9]+]]:_(s64) = G_LSHR [[AND8]], [[C8]](s64) + ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[OR7]], [[C8]](s64) + ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SHL5]], [[LOAD1]] ; CHECK-NEXT: [[OR8:%[0-9]+]]:_(s64) = G_OR [[LSHR5]], [[AND9]] - ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; 
CHECK-NEXT: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 -6148914691236517206 - ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[OR8]], [[C12]] - ; CHECK-NEXT: [[LSHR6:%[0-9]+]]:_(s64) = G_LSHR [[AND10]], [[C11]](s64) - ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[OR8]], [[C11]](s64) - ; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s64) = G_AND [[SHL6]], [[C12]] + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[CONSTANT_POOL2:%[0-9]+]]:_(p0) = G_CONSTANT_POOL %const.0 + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[CONSTANT_POOL2]](p0) :: (load (s64) from constant-pool) + ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[OR8]], [[LOAD2]] + ; CHECK-NEXT: [[LSHR6:%[0-9]+]]:_(s64) = G_LSHR [[AND10]], [[C9]](s64) + ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[OR8]], [[C9]](s64) + ; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s64) = G_AND [[SHL6]], [[LOAD2]] ; CHECK-NEXT: [[OR9:%[0-9]+]]:_(s64) = G_OR [[LSHR6]], [[AND11]] ; CHECK-NEXT: $x10 = COPY [[OR9]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir index fa57295..6389fd6 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir @@ -6,8 +6,9 @@ name: const_i8 body: | bb.0.entry: ; CHECK-LABEL: name: const_i8 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -127 - ; CHECK-NEXT: $x10 = COPY [[C]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -127 + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s8) = G_CONSTANT i8 129 %1:_(s64) = G_ANYEXT %0(s8) @@ -20,8 +21,9 @@ name: const_i15 body: | bb.0.entry: ; CHECK-LABEL: name: const_i15 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 - ; CHECK-NEXT: $x10 = COPY [[C]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s15) = G_CONSTANT i15 15 %1:_(s64) = G_ANYEXT %0(s15) @@ -34,8 +36,9 @@ name: const_i16 body: | bb.0.entry: ; CHECK-LABEL: name: const_i16 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 767 - ; CHECK-NEXT: $x10 = COPY [[C]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 767 + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s16) = G_CONSTANT i16 -64769 %1:_(s64) = G_ANYEXT %0(s16) @@ -48,8 +51,9 @@ name: const_i32 body: | bb.0.entry: ; CHECK-LABEL: name: const_i32 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -64769 - ; CHECK-NEXT: $x10 = COPY [[C]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -64769 + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s32) = G_CONSTANT i32 -64769 %1:_(s64) = G_ANYEXT %0(s32) @@ -180,3 +184,19 @@ body: | PseudoRET implicit $x10 ... + +... 
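# NOTE (annotation, not part of the patch): the new MIR test below exercises
# the constant-pool path directly. -1085102592571150096 is 0xF0F0F0F0F0F0F0F0,
# the nibble-swap mask also used by the bitreverse test above; it is not a
# simm32 and is expensive to build in registers, so legalization is expected
# to emit G_CONSTANT_POOL plus a G_LOAD instead of keeping the G_CONSTANT.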
+--- +name: constant_pool_i64 +body: | + bb.0.entry: + ; CHECK-LABEL: name: constant_pool_i64 + ; CHECK: [[CONSTANT_POOL:%[0-9]+]]:_(p0) = G_CONSTANT_POOL %const.0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[CONSTANT_POOL]](p0) :: (load (s64) from constant-pool) + ; CHECK-NEXT: $x10 = COPY [[LOAD]](s64) + ; CHECK-NEXT: PseudoRET implicit $x10 + %0:_(s64) = G_CONSTANT i64 -1085102592571150096 + $x10 = COPY %0(s64) + PseudoRET implicit $x10 + +... -- cgit v1.1 From c08b90c50bcac9f3f563c79491c8dbcbe7c3b574 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 9 Feb 2024 21:34:10 -0800 Subject: [RISCV] Lower the TransientStackAlignment to the ABI alignment for rv32e/rv64e. I don't think the transient alignment needs to be larger than the ABI alignment. --- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 8 +- llvm/test/CodeGen/RISCV/callee-saved-gprs.ll | 80 ++-- llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll | 208 +++++----- llvm/test/CodeGen/RISCV/rv64-legal-i32/vararg.ll | 226 +++++------ llvm/test/CodeGen/RISCV/vararg.ll | 470 +++++++++++------------ 5 files changed, 496 insertions(+), 496 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 60f92af..0de4785 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -36,10 +36,10 @@ static Align getABIStackAlignment(RISCVABI::ABI ABI) { } RISCVFrameLowering::RISCVFrameLowering(const RISCVSubtarget &STI) - : TargetFrameLowering(StackGrowsDown, - getABIStackAlignment(STI.getTargetABI()), - /*LocalAreaOffset=*/0, - /*TransientStackAlignment=*/Align(16)), + : TargetFrameLowering( + StackGrowsDown, getABIStackAlignment(STI.getTargetABI()), + /*LocalAreaOffset=*/0, + /*TransientStackAlignment=*/getABIStackAlignment(STI.getTargetABI())), STI(STI) {} static const MCPhysReg AllPopRegs[] = { diff --git a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll index 710b602..5e8ed45 100644 --- a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll +++ b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll @@ -150,24 +150,24 @@ define void @callee() nounwind { ; ; RV32I-ILP32E-LABEL: callee: ; RV32I-ILP32E: # %bb.0: -; RV32I-ILP32E-NEXT: addi sp, sp, -48 -; RV32I-ILP32E-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; RV32I-ILP32E-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; RV32I-ILP32E-NEXT: sw s1, 36(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: addi sp, sp, -36 +; RV32I-ILP32E-NEXT: sw ra, 32(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: sw s0, 28(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: sw s1, 24(sp) # 4-byte Folded Spill ; RV32I-ILP32E-NEXT: lui a6, %hi(var) ; RV32I-ILP32E-NEXT: lw a0, %lo(var)(a6) -; RV32I-ILP32E-NEXT: sw a0, 32(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: sw a0, 20(sp) # 4-byte Folded Spill ; RV32I-ILP32E-NEXT: lw a0, %lo(var+4)(a6) -; RV32I-ILP32E-NEXT: sw a0, 28(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: sw a0, 16(sp) # 4-byte Folded Spill ; RV32I-ILP32E-NEXT: lw a0, %lo(var+8)(a6) -; RV32I-ILP32E-NEXT: sw a0, 24(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; RV32I-ILP32E-NEXT: lw a0, %lo(var+12)(a6) -; RV32I-ILP32E-NEXT: sw a0, 20(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: sw a0, 8(sp) # 4-byte Folded Spill ; RV32I-ILP32E-NEXT: addi a5, a6, %lo(var) ; RV32I-ILP32E-NEXT: lw a0, 16(a5) -; RV32I-ILP32E-NEXT: sw a0, 16(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: sw a0, 4(sp) # 4-byte Folded Spill ; RV32I-ILP32E-NEXT: 
lw a0, 20(a5) -; RV32I-ILP32E-NEXT: sw a0, 12(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: sw a0, 0(sp) # 4-byte Folded Spill ; RV32I-ILP32E-NEXT: lw t0, 24(a5) ; RV32I-ILP32E-NEXT: lw t1, 28(a5) ; RV32I-ILP32E-NEXT: lw t2, 32(a5) @@ -220,22 +220,22 @@ define void @callee() nounwind { ; RV32I-ILP32E-NEXT: sw t2, 32(a5) ; RV32I-ILP32E-NEXT: sw t1, 28(a5) ; RV32I-ILP32E-NEXT: sw t0, 24(a5) -; RV32I-ILP32E-NEXT: lw a0, 12(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: lw a0, 0(sp) # 4-byte Folded Reload ; RV32I-ILP32E-NEXT: sw a0, 20(a5) -; RV32I-ILP32E-NEXT: lw a0, 16(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: lw a0, 4(sp) # 4-byte Folded Reload ; RV32I-ILP32E-NEXT: sw a0, 16(a5) -; RV32I-ILP32E-NEXT: lw a0, 20(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32I-ILP32E-NEXT: sw a0, %lo(var+12)(a6) -; RV32I-ILP32E-NEXT: lw a0, 24(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32I-ILP32E-NEXT: sw a0, %lo(var+8)(a6) -; RV32I-ILP32E-NEXT: lw a0, 28(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: lw a0, 16(sp) # 4-byte Folded Reload ; RV32I-ILP32E-NEXT: sw a0, %lo(var+4)(a6) -; RV32I-ILP32E-NEXT: lw a0, 32(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: lw a0, 20(sp) # 4-byte Folded Reload ; RV32I-ILP32E-NEXT: sw a0, %lo(var)(a6) -; RV32I-ILP32E-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; RV32I-ILP32E-NEXT: lw s0, 40(sp) # 4-byte Folded Reload -; RV32I-ILP32E-NEXT: lw s1, 36(sp) # 4-byte Folded Reload -; RV32I-ILP32E-NEXT: addi sp, sp, 48 +; RV32I-ILP32E-NEXT: lw ra, 32(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: lw s0, 28(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: lw s1, 24(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: addi sp, sp, 36 ; RV32I-ILP32E-NEXT: ret ; ; RV32I-WITH-FP-LABEL: callee: @@ -659,24 +659,24 @@ define void @callee() nounwind { ; ; RV64I-LP64E-LABEL: callee: ; RV64I-LP64E: # %bb.0: -; RV64I-LP64E-NEXT: addi sp, sp, -80 -; RV64I-LP64E-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; RV64I-LP64E-NEXT: sd s0, 64(sp) # 8-byte Folded Spill -; RV64I-LP64E-NEXT: sd s1, 56(sp) # 8-byte Folded Spill +; RV64I-LP64E-NEXT: addi sp, sp, -72 +; RV64I-LP64E-NEXT: sd ra, 64(sp) # 8-byte Folded Spill +; RV64I-LP64E-NEXT: sd s0, 56(sp) # 8-byte Folded Spill +; RV64I-LP64E-NEXT: sd s1, 48(sp) # 8-byte Folded Spill ; RV64I-LP64E-NEXT: lui a6, %hi(var) ; RV64I-LP64E-NEXT: lw a0, %lo(var)(a6) -; RV64I-LP64E-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-LP64E-NEXT: lw a0, %lo(var+4)(a6) ; RV64I-LP64E-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-LP64E-NEXT: lw a0, %lo(var+8)(a6) +; RV64I-LP64E-NEXT: lw a0, %lo(var+4)(a6) ; RV64I-LP64E-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-LP64E-NEXT: lw a0, %lo(var+12)(a6) +; RV64I-LP64E-NEXT: lw a0, %lo(var+8)(a6) ; RV64I-LP64E-NEXT: sd a0, 24(sp) # 8-byte Folded Spill +; RV64I-LP64E-NEXT: lw a0, %lo(var+12)(a6) +; RV64I-LP64E-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64I-LP64E-NEXT: addi a5, a6, %lo(var) ; RV64I-LP64E-NEXT: lw a0, 16(a5) -; RV64I-LP64E-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-LP64E-NEXT: lw a0, 20(a5) ; RV64I-LP64E-NEXT: sd a0, 8(sp) # 8-byte Folded Spill +; RV64I-LP64E-NEXT: lw a0, 20(a5) +; RV64I-LP64E-NEXT: sd a0, 0(sp) # 8-byte Folded Spill ; RV64I-LP64E-NEXT: lw t0, 24(a5) ; RV64I-LP64E-NEXT: lw t1, 28(a5) ; RV64I-LP64E-NEXT: lw t2, 32(a5) @@ -729,22 +729,22 @@ define void @callee() nounwind { ; RV64I-LP64E-NEXT: sw t2, 32(a5) ; RV64I-LP64E-NEXT: sw t1, 28(a5) ; RV64I-LP64E-NEXT: sw t0, 24(a5) -; 
RV64I-LP64E-NEXT: ld a0, 8(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: ld a0, 0(sp) # 8-byte Folded Reload ; RV64I-LP64E-NEXT: sw a0, 20(a5) -; RV64I-LP64E-NEXT: ld a0, 16(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64I-LP64E-NEXT: sw a0, 16(a5) -; RV64I-LP64E-NEXT: ld a0, 24(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64I-LP64E-NEXT: sw a0, %lo(var+12)(a6) -; RV64I-LP64E-NEXT: ld a0, 32(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: ld a0, 24(sp) # 8-byte Folded Reload ; RV64I-LP64E-NEXT: sw a0, %lo(var+8)(a6) -; RV64I-LP64E-NEXT: ld a0, 40(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: ld a0, 32(sp) # 8-byte Folded Reload ; RV64I-LP64E-NEXT: sw a0, %lo(var+4)(a6) -; RV64I-LP64E-NEXT: ld a0, 48(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: ld a0, 40(sp) # 8-byte Folded Reload ; RV64I-LP64E-NEXT: sw a0, %lo(var)(a6) -; RV64I-LP64E-NEXT: ld ra, 72(sp) # 8-byte Folded Reload -; RV64I-LP64E-NEXT: ld s0, 64(sp) # 8-byte Folded Reload -; RV64I-LP64E-NEXT: ld s1, 56(sp) # 8-byte Folded Reload -; RV64I-LP64E-NEXT: addi sp, sp, 80 +; RV64I-LP64E-NEXT: ld ra, 64(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: ld s0, 56(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: ld s1, 48(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: addi sp, sp, 72 ; RV64I-LP64E-NEXT: ret ; ; RV64I-WITH-FP-LABEL: callee: diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll index 0eb6391..5c55113 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll @@ -151,20 +151,20 @@ define i32 @callee_float_on_stack(i64 %a, i64 %b, i64 %c, i64 %d, float %e) { ; ; ILP32E-WITHFP-LABEL: callee_float_on_stack: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lw a0, 8(s0) ; ILP32E-WITHFP-NEXT: lw a1, 0(s0) ; ILP32E-WITHFP-NEXT: add a0, a1, a0 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_float_on_stack: @@ -298,18 +298,18 @@ define float @callee_tiny_scalar_ret() { ; ; ILP32E-WITHFP-LABEL: callee_tiny_scalar_ret: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, 
-8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lui a0, 260096 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_tiny_scalar_ret: @@ -543,13 +543,13 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 % ; ; ILP32E-WITHFP-LABEL: callee_aligned_stack: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lw a0, 0(a2) ; ILP32E-WITHFP-NEXT: lw a1, 12(s0) @@ -562,9 +562,9 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 % ; ILP32E-WITHFP-NEXT: add a0, a0, a1 ; ILP32E-WITHFP-NEXT: add a4, a5, a4 ; ILP32E-WITHFP-NEXT: add a0, a0, a4 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_aligned_stack: @@ -847,19 +847,19 @@ define double @callee_small_scalar_ret() { ; ; ILP32E-WITHFP-LABEL: callee_small_scalar_ret: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lui a1, 261888 ; ILP32E-WITHFP-NEXT: li a0, 0 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_small_scalar_ret: @@ -944,18 +944,18 @@ define i32 @callee_i64_in_regs(i32 %a, i64 %b) { ; ; ILP32E-WITHFP-LABEL: callee_i64_in_regs: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 
8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: add a0, a0, a1 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_i64_in_regs: @@ -1066,13 +1066,13 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i ; ; ILP32E-WITHFP-LABEL: callee_many_scalars: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lw a6, 12(s0) ; ILP32E-WITHFP-NEXT: lw a7, 0(s0) @@ -1091,9 +1091,9 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i ; ILP32E-WITHFP-NEXT: add a0, a0, a7 ; ILP32E-WITHFP-NEXT: add a0, a0, a6 ; ILP32E-WITHFP-NEXT: add a0, a1, a0 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_many_scalars: @@ -1287,13 +1287,13 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) { ; ; ILP32E-WITHFP-LABEL: callee_large_scalars: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lw a2, 0(a1) ; ILP32E-WITHFP-NEXT: lw a3, 0(a0) @@ -1311,9 +1311,9 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) { ; ILP32E-WITHFP-NEXT: or a0, a2, a0 ; ILP32E-WITHFP-NEXT: or a0, a0, a4 ; ILP32E-WITHFP-NEXT: seqz a0, a0 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; 
ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_scalars: @@ -1514,13 +1514,13 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; ; ILP32E-WITHFP-LABEL: callee_large_scalars_exhausted_regs: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lw a0, 12(s0) ; ILP32E-WITHFP-NEXT: lw a1, 4(s0) @@ -1540,9 +1540,9 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; ILP32E-WITHFP-NEXT: or a0, a2, a0 ; ILP32E-WITHFP-NEXT: or a0, a0, a4 ; ILP32E-WITHFP-NEXT: seqz a0, a0 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_scalars_exhausted_regs: @@ -1872,19 +1872,19 @@ define i32 @callee_small_coerced_struct([2 x i32] %a.coerce) { ; ; ILP32E-WITHFP-LABEL: callee_small_coerced_struct: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: xor a0, a0, a1 ; ILP32E-WITHFP-NEXT: seqz a0, a0 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_small_coerced_struct: @@ -1983,20 +1983,20 @@ define i32 @callee_large_struct(ptr byval(%struct.large) align 4 %a) { ; ; ILP32E-WITHFP-LABEL: callee_large_struct: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: 
.cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lw a1, 0(a0) ; ILP32E-WITHFP-NEXT: lw a0, 12(a0) ; ILP32E-WITHFP-NEXT: add a0, a1, a0 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_struct: @@ -2153,19 +2153,19 @@ define %struct.small @callee_small_struct_ret() { ; ; ILP32E-WITHFP-LABEL: callee_small_struct_ret: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: li a0, 1 ; ILP32E-WITHFP-NEXT: li a1, 0 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_small_struct_ret: @@ -2260,22 +2260,22 @@ define fp128 @callee_large_scalar_ret() { ; ; ILP32E-WITHFP-LABEL: callee_large_scalar_ret: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lui a1, 524272 ; ILP32E-WITHFP-NEXT: sw a1, 12(a0) ; ILP32E-WITHFP-NEXT: sw zero, 8(a0) ; ILP32E-WITHFP-NEXT: sw zero, 4(a0) ; ILP32E-WITHFP-NEXT: sw zero, 0(a0) -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_scalar_ret: @@ -2395,13 +2395,13 @@ define void @callee_large_struct_ret(ptr noalias sret(%struct.large) %agg.result ; ; ILP32E-WITHFP-LABEL: callee_large_struct_ret: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 
8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: li a1, 1 ; ILP32E-WITHFP-NEXT: sw a1, 0(a0) @@ -2411,9 +2411,9 @@ define void @callee_large_struct_ret(ptr noalias sret(%struct.large) %agg.result ; ILP32E-WITHFP-NEXT: sw a1, 8(a0) ; ILP32E-WITHFP-NEXT: li a1, 4 ; ILP32E-WITHFP-NEXT: sw a1, 12(a0) -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_struct_ret: diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/vararg.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/vararg.ll index 2fb674f..7fe67a0 100644 --- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/vararg.ll +++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/vararg.ll @@ -76,28 +76,28 @@ define i32 @va1(ptr %fmt, ...) { ; ; LP64E-FPELIM-LABEL: va1: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 -; LP64E-FPELIM-NEXT: .cfi_def_cfa_offset 64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 +; LP64E-FPELIM-NEXT: .cfi_def_cfa_offset 56 ; LP64E-FPELIM-NEXT: mv a0, a1 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 28 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 20 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va1: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: .cfi_def_cfa_offset 80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: .cfi_def_cfa_offset 72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill ; LP64E-WITHFP-NEXT: .cfi_offset ra, -56 ; LP64E-WITHFP-NEXT: .cfi_offset s0, -64 -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: .cfi_def_cfa s0, 48 ; LP64E-WITHFP-NEXT: mv a0, a1 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) @@ -107,9 +107,9 @@ define i32 @va1(ptr %fmt, ...) 
{ ; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: addi a1, s0, 12 ; LP64E-WITHFP-NEXT: sd a1, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -161,24 +161,24 @@ define i32 @va1_va_arg(ptr %fmt, ...) nounwind { ; ; LP64E-FPELIM-LABEL: va1_va_arg: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 ; LP64E-FPELIM-NEXT: mv a0, a1 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 32 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 24 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va1_va_arg: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: mv a0, a1 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) @@ -187,9 +187,9 @@ define i32 @va1_va_arg(ptr %fmt, ...) nounwind { ; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: addi a1, s0, 16 ; LP64E-WITHFP-NEXT: sd a1, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -435,24 +435,24 @@ define i64 @va2(ptr %fmt, ...) 
nounwind { ; ; LP64E-FPELIM-LABEL: va2: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 ; LP64E-FPELIM-NEXT: mv a0, a1 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 39 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 31 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va2: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: mv a0, a1 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) @@ -461,9 +461,9 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: addi a1, s0, 23 ; LP64E-WITHFP-NEXT: sd a1, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -521,24 +521,24 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind { ; ; LP64E-FPELIM-LABEL: va2_va_arg: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 ; LP64E-FPELIM-NEXT: mv a0, a1 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 32 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 24 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va2_va_arg: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: mv a0, a1 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) @@ -547,9 +547,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) 
nounwind { ; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: addi a1, s0, 16 ; LP64E-WITHFP-NEXT: sd a1, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -654,23 +654,23 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; ; LP64E-FPELIM-LABEL: va3: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -48 -; LP64E-FPELIM-NEXT: sd a5, 40(sp) -; LP64E-FPELIM-NEXT: sd a4, 32(sp) -; LP64E-FPELIM-NEXT: sd a3, 24(sp) -; LP64E-FPELIM-NEXT: sd a2, 16(sp) -; LP64E-FPELIM-NEXT: addi a3, sp, 31 +; LP64E-FPELIM-NEXT: addi sp, sp, -40 +; LP64E-FPELIM-NEXT: sd a5, 32(sp) +; LP64E-FPELIM-NEXT: sd a4, 24(sp) +; LP64E-FPELIM-NEXT: sd a3, 16(sp) +; LP64E-FPELIM-NEXT: sd a2, 8(sp) +; LP64E-FPELIM-NEXT: addi a3, sp, 23 ; LP64E-FPELIM-NEXT: add a0, a1, a2 -; LP64E-FPELIM-NEXT: sd a3, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 48 +; LP64E-FPELIM-NEXT: sd a3, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 40 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va3: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -64 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -56 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: sd a5, 24(s0) ; LP64E-WITHFP-NEXT: sd a4, 16(s0) ; LP64E-WITHFP-NEXT: sd a3, 8(s0) @@ -678,9 +678,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; LP64E-WITHFP-NEXT: addi a3, s0, 15 ; LP64E-WITHFP-NEXT: add a0, a1, a2 ; LP64E-WITHFP-NEXT: sd a3, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 64 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 56 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -737,23 +737,23 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) 
nounwind { ; ; LP64E-FPELIM-LABEL: va3_va_arg: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -48 -; LP64E-FPELIM-NEXT: sd a5, 40(sp) -; LP64E-FPELIM-NEXT: sd a4, 32(sp) -; LP64E-FPELIM-NEXT: sd a3, 24(sp) -; LP64E-FPELIM-NEXT: sd a2, 16(sp) -; LP64E-FPELIM-NEXT: addi a3, sp, 24 +; LP64E-FPELIM-NEXT: addi sp, sp, -40 +; LP64E-FPELIM-NEXT: sd a5, 32(sp) +; LP64E-FPELIM-NEXT: sd a4, 24(sp) +; LP64E-FPELIM-NEXT: sd a3, 16(sp) +; LP64E-FPELIM-NEXT: sd a2, 8(sp) +; LP64E-FPELIM-NEXT: addi a3, sp, 16 ; LP64E-FPELIM-NEXT: add a0, a1, a2 -; LP64E-FPELIM-NEXT: sd a3, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 48 +; LP64E-FPELIM-NEXT: sd a3, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 40 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va3_va_arg: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -64 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -56 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: sd a5, 24(s0) ; LP64E-WITHFP-NEXT: sd a4, 16(s0) ; LP64E-WITHFP-NEXT: sd a3, 8(s0) @@ -761,9 +761,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; LP64E-WITHFP-NEXT: addi a3, s0, 8 ; LP64E-WITHFP-NEXT: add a0, a1, a2 ; LP64E-WITHFP-NEXT: sd a3, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 64 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 56 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -1208,24 +1208,24 @@ define i32 @va6_no_fixed_args(...) nounwind { ; ; LP64E-FPELIM-LABEL: va6_no_fixed_args: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: sd a0, 16(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 24 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: sd a0, 8(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 16 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va6_no_fixed_args: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) ; LP64E-WITHFP-NEXT: sd a3, 24(s0) @@ -1234,9 +1234,9 @@ define i32 @va6_no_fixed_args(...) 
nounwind { ; LP64E-WITHFP-NEXT: sd a0, 0(s0) ; LP64E-WITHFP-NEXT: addi a1, s0, 8 ; LP64E-WITHFP-NEXT: sd a1, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) diff --git a/llvm/test/CodeGen/RISCV/vararg.ll b/llvm/test/CodeGen/RISCV/vararg.ll index 14afbae..621f549 100644 --- a/llvm/test/CodeGen/RISCV/vararg.ll +++ b/llvm/test/CodeGen/RISCV/vararg.ll @@ -111,28 +111,28 @@ define i32 @va1(ptr %fmt, ...) { ; ; ILP32E-FPELIM-LABEL: va1: ; ILP32E-FPELIM: # %bb.0: -; ILP32E-FPELIM-NEXT: addi sp, sp, -32 -; ILP32E-FPELIM-NEXT: .cfi_def_cfa_offset 32 +; ILP32E-FPELIM-NEXT: addi sp, sp, -28 +; ILP32E-FPELIM-NEXT: .cfi_def_cfa_offset 28 ; ILP32E-FPELIM-NEXT: mv a0, a1 -; ILP32E-FPELIM-NEXT: sw a5, 28(sp) -; ILP32E-FPELIM-NEXT: sw a4, 24(sp) -; ILP32E-FPELIM-NEXT: sw a3, 20(sp) -; ILP32E-FPELIM-NEXT: sw a2, 16(sp) -; ILP32E-FPELIM-NEXT: sw a1, 12(sp) -; ILP32E-FPELIM-NEXT: addi a1, sp, 16 -; ILP32E-FPELIM-NEXT: sw a1, 4(sp) -; ILP32E-FPELIM-NEXT: addi sp, sp, 32 +; ILP32E-FPELIM-NEXT: sw a5, 24(sp) +; ILP32E-FPELIM-NEXT: sw a4, 20(sp) +; ILP32E-FPELIM-NEXT: sw a3, 16(sp) +; ILP32E-FPELIM-NEXT: sw a2, 12(sp) +; ILP32E-FPELIM-NEXT: sw a1, 8(sp) +; ILP32E-FPELIM-NEXT: addi a1, sp, 12 +; ILP32E-FPELIM-NEXT: sw a1, 0(sp) +; ILP32E-FPELIM-NEXT: addi sp, sp, 28 ; ILP32E-FPELIM-NEXT: ret ; ; ILP32E-WITHFP-LABEL: va1: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -48 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 48 -; ILP32E-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -36 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 36 +; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -28 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -32 -; ILP32E-WITHFP-NEXT: addi s0, sp, 24 +; ILP32E-WITHFP-NEXT: addi s0, sp, 12 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 24 ; ILP32E-WITHFP-NEXT: mv a0, a1 ; ILP32E-WITHFP-NEXT: sw a5, 20(s0) @@ -142,9 +142,9 @@ define i32 @va1(ptr %fmt, ...) { ; ILP32E-WITHFP-NEXT: sw a1, 4(s0) ; ILP32E-WITHFP-NEXT: addi a1, s0, 8 ; ILP32E-WITHFP-NEXT: sw a1, -12(s0) -; ILP32E-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 48 +; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 36 ; ILP32E-WITHFP-NEXT: ret ; ; LP64-LP64F-LP64D-FPELIM-LABEL: va1: @@ -191,28 +191,28 @@ define i32 @va1(ptr %fmt, ...) 
{ ; ; LP64E-FPELIM-LABEL: va1: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 -; LP64E-FPELIM-NEXT: .cfi_def_cfa_offset 64 -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: addi a0, sp, 28 -; LP64E-FPELIM-NEXT: sd a0, 8(sp) -; LP64E-FPELIM-NEXT: lw a0, 24(sp) -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 +; LP64E-FPELIM-NEXT: .cfi_def_cfa_offset 56 +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: addi a0, sp, 20 +; LP64E-FPELIM-NEXT: sd a0, 0(sp) +; LP64E-FPELIM-NEXT: lw a0, 16(sp) +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va1: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: .cfi_def_cfa_offset 80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: .cfi_def_cfa_offset 72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill ; LP64E-WITHFP-NEXT: .cfi_offset ra, -56 ; LP64E-WITHFP-NEXT: .cfi_offset s0, -64 -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: .cfi_def_cfa s0, 48 ; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: addi a0, s0, 12 @@ -222,9 +222,9 @@ define i32 @va1(ptr %fmt, ...) { ; LP64E-WITHFP-NEXT: sd a4, 32(s0) ; LP64E-WITHFP-NEXT: sd a3, 24(s0) ; LP64E-WITHFP-NEXT: sd a2, 16(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -292,24 +292,24 @@ define i32 @va1_va_arg(ptr %fmt, ...) 
nounwind { ; ; ILP32E-FPELIM-LABEL: va1_va_arg: ; ILP32E-FPELIM: # %bb.0: -; ILP32E-FPELIM-NEXT: addi sp, sp, -32 +; ILP32E-FPELIM-NEXT: addi sp, sp, -28 ; ILP32E-FPELIM-NEXT: mv a0, a1 -; ILP32E-FPELIM-NEXT: sw a5, 28(sp) -; ILP32E-FPELIM-NEXT: sw a4, 24(sp) -; ILP32E-FPELIM-NEXT: sw a3, 20(sp) -; ILP32E-FPELIM-NEXT: sw a2, 16(sp) -; ILP32E-FPELIM-NEXT: sw a1, 12(sp) -; ILP32E-FPELIM-NEXT: addi a1, sp, 16 -; ILP32E-FPELIM-NEXT: sw a1, 4(sp) -; ILP32E-FPELIM-NEXT: addi sp, sp, 32 +; ILP32E-FPELIM-NEXT: sw a5, 24(sp) +; ILP32E-FPELIM-NEXT: sw a4, 20(sp) +; ILP32E-FPELIM-NEXT: sw a3, 16(sp) +; ILP32E-FPELIM-NEXT: sw a2, 12(sp) +; ILP32E-FPELIM-NEXT: sw a1, 8(sp) +; ILP32E-FPELIM-NEXT: addi a1, sp, 12 +; ILP32E-FPELIM-NEXT: sw a1, 0(sp) +; ILP32E-FPELIM-NEXT: addi sp, sp, 28 ; ILP32E-FPELIM-NEXT: ret ; ; ILP32E-WITHFP-LABEL: va1_va_arg: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -48 -; ILP32E-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: addi s0, sp, 24 +; ILP32E-WITHFP-NEXT: addi sp, sp, -36 +; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi s0, sp, 12 ; ILP32E-WITHFP-NEXT: mv a0, a1 ; ILP32E-WITHFP-NEXT: sw a5, 20(s0) ; ILP32E-WITHFP-NEXT: sw a4, 16(s0) @@ -318,9 +318,9 @@ define i32 @va1_va_arg(ptr %fmt, ...) nounwind { ; ILP32E-WITHFP-NEXT: sw a1, 4(s0) ; ILP32E-WITHFP-NEXT: addi a1, s0, 8 ; ILP32E-WITHFP-NEXT: sw a1, -12(s0) -; ILP32E-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 48 +; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 36 ; ILP32E-WITHFP-NEXT: ret ; ; LP64-LP64F-LP64D-FPELIM-LABEL: va1_va_arg: @@ -362,24 +362,24 @@ define i32 @va1_va_arg(ptr %fmt, ...) nounwind { ; ; LP64E-FPELIM-LABEL: va1_va_arg: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 ; LP64E-FPELIM-NEXT: mv a0, a1 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 32 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 24 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va1_va_arg: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: mv a0, a1 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) @@ -388,9 +388,9 @@ define i32 @va1_va_arg(ptr %fmt, ...) 
nounwind { ; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: addi a1, s0, 16 ; LP64E-WITHFP-NEXT: sd a1, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -879,29 +879,29 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; ; ILP32E-FPELIM-LABEL: va2: ; ILP32E-FPELIM: # %bb.0: -; ILP32E-FPELIM-NEXT: addi sp, sp, -32 -; ILP32E-FPELIM-NEXT: sw a5, 28(sp) -; ILP32E-FPELIM-NEXT: sw a4, 24(sp) -; ILP32E-FPELIM-NEXT: sw a3, 20(sp) -; ILP32E-FPELIM-NEXT: sw a2, 16(sp) -; ILP32E-FPELIM-NEXT: sw a1, 12(sp) -; ILP32E-FPELIM-NEXT: addi a0, sp, 12 -; ILP32E-FPELIM-NEXT: sw a0, 4(sp) -; ILP32E-FPELIM-NEXT: addi a0, sp, 19 +; ILP32E-FPELIM-NEXT: addi sp, sp, -28 +; ILP32E-FPELIM-NEXT: sw a5, 24(sp) +; ILP32E-FPELIM-NEXT: sw a4, 20(sp) +; ILP32E-FPELIM-NEXT: sw a3, 16(sp) +; ILP32E-FPELIM-NEXT: sw a2, 12(sp) +; ILP32E-FPELIM-NEXT: sw a1, 8(sp) +; ILP32E-FPELIM-NEXT: addi a0, sp, 8 +; ILP32E-FPELIM-NEXT: sw a0, 0(sp) +; ILP32E-FPELIM-NEXT: addi a0, sp, 15 ; ILP32E-FPELIM-NEXT: andi a0, a0, -8 -; ILP32E-FPELIM-NEXT: addi a1, sp, 27 -; ILP32E-FPELIM-NEXT: sw a1, 4(sp) +; ILP32E-FPELIM-NEXT: addi a1, sp, 23 +; ILP32E-FPELIM-NEXT: sw a1, 0(sp) ; ILP32E-FPELIM-NEXT: lw a1, 4(a0) ; ILP32E-FPELIM-NEXT: lw a0, 0(a0) -; ILP32E-FPELIM-NEXT: addi sp, sp, 32 +; ILP32E-FPELIM-NEXT: addi sp, sp, 28 ; ILP32E-FPELIM-NEXT: ret ; ; ILP32E-WITHFP-LABEL: va2: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -48 -; ILP32E-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: addi s0, sp, 24 +; ILP32E-WITHFP-NEXT: addi sp, sp, -36 +; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi s0, sp, 12 ; ILP32E-WITHFP-NEXT: sw a5, 20(s0) ; ILP32E-WITHFP-NEXT: sw a4, 16(s0) ; ILP32E-WITHFP-NEXT: sw a3, 12(s0) @@ -915,9 +915,9 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; ILP32E-WITHFP-NEXT: sw a1, -12(s0) ; ILP32E-WITHFP-NEXT: lw a1, 4(a0) ; ILP32E-WITHFP-NEXT: lw a0, 0(a0) -; ILP32E-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 48 +; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 36 ; ILP32E-WITHFP-NEXT: ret ; ; LP64-LP64F-LP64D-FPELIM-LABEL: va2: @@ -959,24 +959,24 @@ define i64 @va2(ptr %fmt, ...) 
nounwind { ; ; LP64E-FPELIM-LABEL: va2: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 ; LP64E-FPELIM-NEXT: mv a0, a1 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 39 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 31 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va2: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: mv a0, a1 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) @@ -985,9 +985,9 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: addi a1, s0, 23 ; LP64E-WITHFP-NEXT: sd a1, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -1076,29 +1076,29 @@ define i64 @va2_va_arg(ptr %fmt, ...) 
nounwind { ; ; ILP32E-FPELIM-LABEL: va2_va_arg: ; ILP32E-FPELIM: # %bb.0: -; ILP32E-FPELIM-NEXT: addi sp, sp, -32 -; ILP32E-FPELIM-NEXT: sw a5, 28(sp) -; ILP32E-FPELIM-NEXT: sw a4, 24(sp) -; ILP32E-FPELIM-NEXT: sw a3, 20(sp) -; ILP32E-FPELIM-NEXT: sw a2, 16(sp) -; ILP32E-FPELIM-NEXT: sw a1, 12(sp) -; ILP32E-FPELIM-NEXT: addi a0, sp, 19 +; ILP32E-FPELIM-NEXT: addi sp, sp, -28 +; ILP32E-FPELIM-NEXT: sw a5, 24(sp) +; ILP32E-FPELIM-NEXT: sw a4, 20(sp) +; ILP32E-FPELIM-NEXT: sw a3, 16(sp) +; ILP32E-FPELIM-NEXT: sw a2, 12(sp) +; ILP32E-FPELIM-NEXT: sw a1, 8(sp) +; ILP32E-FPELIM-NEXT: addi a0, sp, 15 ; ILP32E-FPELIM-NEXT: andi a1, a0, -8 ; ILP32E-FPELIM-NEXT: addi a0, a1, 4 -; ILP32E-FPELIM-NEXT: sw a0, 4(sp) +; ILP32E-FPELIM-NEXT: sw a0, 0(sp) ; ILP32E-FPELIM-NEXT: lw a0, 0(a1) ; ILP32E-FPELIM-NEXT: addi a2, a1, 8 -; ILP32E-FPELIM-NEXT: sw a2, 4(sp) +; ILP32E-FPELIM-NEXT: sw a2, 0(sp) ; ILP32E-FPELIM-NEXT: lw a1, 4(a1) -; ILP32E-FPELIM-NEXT: addi sp, sp, 32 +; ILP32E-FPELIM-NEXT: addi sp, sp, 28 ; ILP32E-FPELIM-NEXT: ret ; ; ILP32E-WITHFP-LABEL: va2_va_arg: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -48 -; ILP32E-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: addi s0, sp, 24 +; ILP32E-WITHFP-NEXT: addi sp, sp, -36 +; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi s0, sp, 12 ; ILP32E-WITHFP-NEXT: sw a5, 20(s0) ; ILP32E-WITHFP-NEXT: sw a4, 16(s0) ; ILP32E-WITHFP-NEXT: sw a3, 12(s0) @@ -1112,9 +1112,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind { ; ILP32E-WITHFP-NEXT: addi a2, a1, 8 ; ILP32E-WITHFP-NEXT: sw a2, -12(s0) ; ILP32E-WITHFP-NEXT: lw a1, 4(a1) -; ILP32E-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 48 +; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 36 ; ILP32E-WITHFP-NEXT: ret ; ; LP64-LP64F-LP64D-FPELIM-LABEL: va2_va_arg: @@ -1156,24 +1156,24 @@ define i64 @va2_va_arg(ptr %fmt, ...) 
nounwind { ; ; LP64E-FPELIM-LABEL: va2_va_arg: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 ; LP64E-FPELIM-NEXT: mv a0, a1 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 32 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 24 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va2_va_arg: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: mv a0, a1 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) @@ -1182,9 +1182,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind { ; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: addi a1, s0, 16 ; LP64E-WITHFP-NEXT: sd a1, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -1389,31 +1389,31 @@ define i64 @va3(i32 %a, i64 %b, ...) 
nounwind { ; ; ILP32E-FPELIM-LABEL: va3: ; ILP32E-FPELIM: # %bb.0: -; ILP32E-FPELIM-NEXT: addi sp, sp, -32 -; ILP32E-FPELIM-NEXT: sw a5, 28(sp) -; ILP32E-FPELIM-NEXT: sw a4, 24(sp) -; ILP32E-FPELIM-NEXT: sw a3, 20(sp) -; ILP32E-FPELIM-NEXT: addi a0, sp, 20 -; ILP32E-FPELIM-NEXT: sw a0, 12(sp) -; ILP32E-FPELIM-NEXT: addi a0, sp, 27 +; ILP32E-FPELIM-NEXT: addi sp, sp, -20 +; ILP32E-FPELIM-NEXT: sw a5, 16(sp) +; ILP32E-FPELIM-NEXT: sw a4, 12(sp) +; ILP32E-FPELIM-NEXT: sw a3, 8(sp) +; ILP32E-FPELIM-NEXT: addi a0, sp, 8 +; ILP32E-FPELIM-NEXT: sw a0, 0(sp) +; ILP32E-FPELIM-NEXT: addi a0, sp, 15 ; ILP32E-FPELIM-NEXT: andi a0, a0, -8 -; ILP32E-FPELIM-NEXT: addi a3, sp, 35 -; ILP32E-FPELIM-NEXT: sw a3, 12(sp) +; ILP32E-FPELIM-NEXT: addi a3, sp, 23 +; ILP32E-FPELIM-NEXT: sw a3, 0(sp) ; ILP32E-FPELIM-NEXT: lw a3, 4(a0) ; ILP32E-FPELIM-NEXT: lw a0, 0(a0) ; ILP32E-FPELIM-NEXT: add a2, a2, a3 ; ILP32E-FPELIM-NEXT: add a0, a1, a0 ; ILP32E-FPELIM-NEXT: sltu a1, a0, a1 ; ILP32E-FPELIM-NEXT: add a1, a2, a1 -; ILP32E-FPELIM-NEXT: addi sp, sp, 32 +; ILP32E-FPELIM-NEXT: addi sp, sp, 20 ; ILP32E-FPELIM-NEXT: ret ; ; ILP32E-WITHFP-LABEL: va3: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -32 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi sp, sp, -28 +; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi s0, sp, 12 ; ILP32E-WITHFP-NEXT: sw a5, 12(s0) ; ILP32E-WITHFP-NEXT: sw a4, 8(s0) ; ILP32E-WITHFP-NEXT: sw a3, 4(s0) @@ -1429,9 +1429,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; ILP32E-WITHFP-NEXT: add a0, a1, a0 ; ILP32E-WITHFP-NEXT: sltu a1, a0, a1 ; ILP32E-WITHFP-NEXT: add a1, a2, a1 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 32 +; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 28 ; ILP32E-WITHFP-NEXT: ret ; ; LP64-LP64F-LP64D-FPELIM-LABEL: va3: @@ -1471,23 +1471,23 @@ define i64 @va3(i32 %a, i64 %b, ...) 
nounwind { ; ; LP64E-FPELIM-LABEL: va3: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -48 -; LP64E-FPELIM-NEXT: sd a5, 40(sp) -; LP64E-FPELIM-NEXT: sd a4, 32(sp) -; LP64E-FPELIM-NEXT: sd a3, 24(sp) -; LP64E-FPELIM-NEXT: sd a2, 16(sp) -; LP64E-FPELIM-NEXT: addi a3, sp, 31 +; LP64E-FPELIM-NEXT: addi sp, sp, -40 +; LP64E-FPELIM-NEXT: sd a5, 32(sp) +; LP64E-FPELIM-NEXT: sd a4, 24(sp) +; LP64E-FPELIM-NEXT: sd a3, 16(sp) +; LP64E-FPELIM-NEXT: sd a2, 8(sp) +; LP64E-FPELIM-NEXT: addi a3, sp, 23 ; LP64E-FPELIM-NEXT: add a0, a1, a2 -; LP64E-FPELIM-NEXT: sd a3, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 48 +; LP64E-FPELIM-NEXT: sd a3, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 40 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va3: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -64 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -56 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: sd a5, 24(s0) ; LP64E-WITHFP-NEXT: sd a4, 16(s0) ; LP64E-WITHFP-NEXT: sd a3, 8(s0) @@ -1495,9 +1495,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; LP64E-WITHFP-NEXT: addi a3, s0, 15 ; LP64E-WITHFP-NEXT: add a0, a1, a2 ; LP64E-WITHFP-NEXT: sd a3, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 64 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 56 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -1593,31 +1593,31 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; ; ILP32E-FPELIM-LABEL: va3_va_arg: ; ILP32E-FPELIM: # %bb.0: -; ILP32E-FPELIM-NEXT: addi sp, sp, -32 -; ILP32E-FPELIM-NEXT: sw a5, 28(sp) -; ILP32E-FPELIM-NEXT: sw a4, 24(sp) -; ILP32E-FPELIM-NEXT: sw a3, 20(sp) -; ILP32E-FPELIM-NEXT: addi a0, sp, 27 +; ILP32E-FPELIM-NEXT: addi sp, sp, -20 +; ILP32E-FPELIM-NEXT: sw a5, 16(sp) +; ILP32E-FPELIM-NEXT: sw a4, 12(sp) +; ILP32E-FPELIM-NEXT: sw a3, 8(sp) +; ILP32E-FPELIM-NEXT: addi a0, sp, 15 ; ILP32E-FPELIM-NEXT: andi a0, a0, -8 ; ILP32E-FPELIM-NEXT: addi a3, a0, 4 -; ILP32E-FPELIM-NEXT: sw a3, 12(sp) +; ILP32E-FPELIM-NEXT: sw a3, 0(sp) ; ILP32E-FPELIM-NEXT: lw a3, 0(a0) ; ILP32E-FPELIM-NEXT: addi a4, a0, 8 -; ILP32E-FPELIM-NEXT: sw a4, 12(sp) +; ILP32E-FPELIM-NEXT: sw a4, 0(sp) ; ILP32E-FPELIM-NEXT: lw a4, 4(a0) ; ILP32E-FPELIM-NEXT: add a0, a1, a3 ; ILP32E-FPELIM-NEXT: sltu a1, a0, a1 ; ILP32E-FPELIM-NEXT: add a2, a2, a4 ; ILP32E-FPELIM-NEXT: add a1, a2, a1 -; ILP32E-FPELIM-NEXT: addi sp, sp, 32 +; ILP32E-FPELIM-NEXT: addi sp, sp, 20 ; ILP32E-FPELIM-NEXT: ret ; ; ILP32E-WITHFP-LABEL: va3_va_arg: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -32 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi sp, sp, -28 +; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi s0, sp, 12 ; ILP32E-WITHFP-NEXT: sw a5, 12(s0) ; ILP32E-WITHFP-NEXT: sw a4, 8(s0) ; ILP32E-WITHFP-NEXT: sw a3, 4(s0) @@ -1633,9 +1633,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) 
nounwind { ; ILP32E-WITHFP-NEXT: sltu a1, a0, a1 ; ILP32E-WITHFP-NEXT: add a2, a2, a4 ; ILP32E-WITHFP-NEXT: add a1, a2, a1 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 32 +; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 28 ; ILP32E-WITHFP-NEXT: ret ; ; LP64-LP64F-LP64D-FPELIM-LABEL: va3_va_arg: @@ -1675,23 +1675,23 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; ; LP64E-FPELIM-LABEL: va3_va_arg: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -48 -; LP64E-FPELIM-NEXT: sd a5, 40(sp) -; LP64E-FPELIM-NEXT: sd a4, 32(sp) -; LP64E-FPELIM-NEXT: sd a3, 24(sp) -; LP64E-FPELIM-NEXT: sd a2, 16(sp) -; LP64E-FPELIM-NEXT: addi a3, sp, 24 +; LP64E-FPELIM-NEXT: addi sp, sp, -40 +; LP64E-FPELIM-NEXT: sd a5, 32(sp) +; LP64E-FPELIM-NEXT: sd a4, 24(sp) +; LP64E-FPELIM-NEXT: sd a3, 16(sp) +; LP64E-FPELIM-NEXT: sd a2, 8(sp) +; LP64E-FPELIM-NEXT: addi a3, sp, 16 ; LP64E-FPELIM-NEXT: add a0, a1, a2 -; LP64E-FPELIM-NEXT: sd a3, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 48 +; LP64E-FPELIM-NEXT: sd a3, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 40 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va3_va_arg: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -64 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -56 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: sd a5, 24(s0) ; LP64E-WITHFP-NEXT: sd a4, 16(s0) ; LP64E-WITHFP-NEXT: sd a3, 8(s0) @@ -1699,9 +1699,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; LP64E-WITHFP-NEXT: addi a3, s0, 8 ; LP64E-WITHFP-NEXT: add a0, a1, a2 ; LP64E-WITHFP-NEXT: sd a3, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 64 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 56 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -2675,24 +2675,24 @@ define i32 @va6_no_fixed_args(...) 
nounwind { ; ; ILP32E-FPELIM-LABEL: va6_no_fixed_args: ; ILP32E-FPELIM: # %bb.0: -; ILP32E-FPELIM-NEXT: addi sp, sp, -32 -; ILP32E-FPELIM-NEXT: sw a5, 28(sp) -; ILP32E-FPELIM-NEXT: sw a4, 24(sp) -; ILP32E-FPELIM-NEXT: sw a3, 20(sp) -; ILP32E-FPELIM-NEXT: sw a2, 16(sp) -; ILP32E-FPELIM-NEXT: sw a1, 12(sp) -; ILP32E-FPELIM-NEXT: sw a0, 8(sp) -; ILP32E-FPELIM-NEXT: addi a1, sp, 12 -; ILP32E-FPELIM-NEXT: sw a1, 4(sp) -; ILP32E-FPELIM-NEXT: addi sp, sp, 32 +; ILP32E-FPELIM-NEXT: addi sp, sp, -28 +; ILP32E-FPELIM-NEXT: sw a5, 24(sp) +; ILP32E-FPELIM-NEXT: sw a4, 20(sp) +; ILP32E-FPELIM-NEXT: sw a3, 16(sp) +; ILP32E-FPELIM-NEXT: sw a2, 12(sp) +; ILP32E-FPELIM-NEXT: sw a1, 8(sp) +; ILP32E-FPELIM-NEXT: sw a0, 4(sp) +; ILP32E-FPELIM-NEXT: addi a1, sp, 8 +; ILP32E-FPELIM-NEXT: sw a1, 0(sp) +; ILP32E-FPELIM-NEXT: addi sp, sp, 28 ; ILP32E-FPELIM-NEXT: ret ; ; ILP32E-WITHFP-LABEL: va6_no_fixed_args: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -48 -; ILP32E-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: addi s0, sp, 24 +; ILP32E-WITHFP-NEXT: addi sp, sp, -36 +; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi s0, sp, 12 ; ILP32E-WITHFP-NEXT: sw a5, 20(s0) ; ILP32E-WITHFP-NEXT: sw a4, 16(s0) ; ILP32E-WITHFP-NEXT: sw a3, 12(s0) @@ -2701,9 +2701,9 @@ define i32 @va6_no_fixed_args(...) nounwind { ; ILP32E-WITHFP-NEXT: sw a0, 0(s0) ; ILP32E-WITHFP-NEXT: addi a1, s0, 4 ; ILP32E-WITHFP-NEXT: sw a1, -12(s0) -; ILP32E-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 48 +; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 36 ; ILP32E-WITHFP-NEXT: ret ; ; LP64-LP64F-LP64D-FPELIM-LABEL: va6_no_fixed_args: @@ -2745,24 +2745,24 @@ define i32 @va6_no_fixed_args(...) nounwind { ; ; LP64E-FPELIM-LABEL: va6_no_fixed_args: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: sd a0, 16(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 24 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: sd a0, 8(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 16 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va6_no_fixed_args: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) ; LP64E-WITHFP-NEXT: sd a3, 24(s0) @@ -2771,9 +2771,9 @@ define i32 @va6_no_fixed_args(...) 
nounwind { ; LP64E-WITHFP-NEXT: sd a0, 0(s0)
 ; LP64E-WITHFP-NEXT: addi a1, s0, 8
 ; LP64E-WITHFP-NEXT: sd a1, -24(s0)
-; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: addi sp, sp, 80
+; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: addi sp, sp, 72
 ; LP64E-WITHFP-NEXT: ret
 %va = alloca ptr
 call void @llvm.va_start(ptr %va)
@@ -2934,7 +2934,7 @@ define i32 @va_large_stack(ptr %fmt, ...) {
 ; ILP32E-WITHFP-NEXT: addi s0, sp, 2020
 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 24
 ; ILP32E-WITHFP-NEXT: lui a0, 24414
-; ILP32E-WITHFP-NEXT: addi a0, a0, -1740
+; ILP32E-WITHFP-NEXT: addi a0, a0, -1748
 ; ILP32E-WITHFP-NEXT: sub sp, sp, a0
 ; ILP32E-WITHFP-NEXT: mv a0, a1
 ; ILP32E-WITHFP-NEXT: sw a5, 20(s0)
@@ -2947,7 +2947,7 @@ define i32 @va_large_stack(ptr %fmt, ...) {
 ; ILP32E-WITHFP-NEXT: sub a2, s0, a2
 ; ILP32E-WITHFP-NEXT: sw a1, -272(a2)
 ; ILP32E-WITHFP-NEXT: lui a1, 24414
-; ILP32E-WITHFP-NEXT: addi a1, a1, -1740
+; ILP32E-WITHFP-NEXT: addi a1, a1, -1748
 ; ILP32E-WITHFP-NEXT: add sp, sp, a1
 ; ILP32E-WITHFP-NEXT: lw ra, 2016(sp) # 4-byte Folded Reload
 ; ILP32E-WITHFP-NEXT: lw s0, 2012(sp) # 4-byte Folded Reload
-- cgit v1.1

From 2a4a2558f1533a91519fcc4e7abf04f845f067bd Mon Sep 17 00:00:00 2001
From: Haojian Wu
Date: Sat, 10 Feb 2024 08:54:13 +0100
Subject: Fix -Wunused-variable warning in Release build.

--- llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index e852052..262e8e5 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -462,8 +462,7 @@ bool RISCVLegalizerInfo::legalizeVAStart(MachineInstr &MI,
 bool RISCVLegalizerInfo::shouldBeInConstantPool(APInt APImm,
                                                 bool ShouldOptForSize) const {
-  unsigned BitWidth = APImm.getBitWidth();
-  assert(BitWidth == 32 || BitWidth == 64);
+  assert(APImm.getBitWidth() == 32 || APImm.getBitWidth() == 64);
   int64_t Imm = APImm.getSExtValue();
   // All simm32 constants should be handled by isel.
   // NOTE: The getMaxBuildIntsCost call below should return a value >= 2 making
-- cgit v1.1

From 9308d6688c673606fee1625d777a52539ae72015 Mon Sep 17 00:00:00 2001
From: David Green
Date: Sat, 10 Feb 2024 08:19:49 +0000
Subject: [Flang] Correct initial limit value in float min/maxloc reductions. (#81260)

I was looking through the code to check whether NaN was being handled
correctly, and couldn't work out why simple cases were behaving differently
than they should. It turns out the initial limit value was backwards for
minloc/maxloc reductions in general. This fixes that; the bug was
introduced in #79469.
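For reference, a sketch of what the corrected initialization computes (the
f32 constants match the CHECK lines added to the tests below; isMax selects
the maxloc variant):

  // maxloc must start from the most negative value so that any element,
  // including negative ones, can replace it:
  llvm::APFloat::getLargest(sem, /*Negative=*/true);  // -3.40282347E+38 for f32
  // minloc starts from the largest positive value:
  llvm::APFloat::getLargest(sem, /*Negative=*/false); //  3.40282347E+38 for f32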
--- flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp | 2 +-
 flang/test/HLFIR/maxloc-elemental.fir | 1 +
 flang/test/HLFIR/minloc-elemental.fir | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
index b1165a5..523671f 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
@@ -854,7 +854,7 @@ public:
       const llvm::fltSemantics &sem = ty.getFloatSemantics();
       return builder.createRealConstant(
           loc, elementType,
-          llvm::APFloat::getLargest(sem, /*Negative=*/!isMax));
+          llvm::APFloat::getLargest(sem, /*Negative=*/isMax));
     }
     unsigned bits = elementType.getIntOrFloatBitWidth();
     int64_t limitInt =
diff --git a/flang/test/HLFIR/maxloc-elemental.fir b/flang/test/HLFIR/maxloc-elemental.fir
index 67cd9ee..b4a3ca0 100644
--- a/flang/test/HLFIR/maxloc-elemental.fir
+++ b/flang/test/HLFIR/maxloc-elemental.fir
@@ -110,6 +110,7 @@ func.func @_QPtest_float(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a
   return
 }
 // CHECK-LABEL: _QPtest_float
+// CHECK: %cst = arith.constant -3.40282347E+38 : f32
 // CHECK: %[[V11:.*]] = fir.do_loop %arg3 = %c0 to %[[V10:.*]] step %c1 iter_args(%arg4 = %cst) -> (f32) {
 // CHECK-NEXT: %[[V14:.*]] = arith.addi %arg3, %c1 : index
 // CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[V1:.*]]#0 (%[[V14]]) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
diff --git a/flang/test/HLFIR/minloc-elemental.fir b/flang/test/HLFIR/minloc-elemental.fir
index cb483d5..5cc608b 100644
--- a/flang/test/HLFIR/minloc-elemental.fir
+++ b/flang/test/HLFIR/minloc-elemental.fir
@@ -295,6 +295,7 @@ func.func @_QPtest_float(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a
   return
 }
 // CHECK-LABEL: _QPtest_float
+// CHECK: %cst = arith.constant 3.40282347E+38 : f32
 // CHECK: %[[V11:.*]] = fir.do_loop %arg3 = %c0 to %[[V10:.*]] step %c1 iter_args(%arg4 = %cst) -> (f32) {
 // CHECK-NEXT: %[[V14:.*]] = arith.addi %arg3, %c1 : index
 // CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[V1:.*]]#0 (%[[V14]]) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-- cgit v1.1

From d26b43ff4f7396f79de4b099160262c750d6aba7 Mon Sep 17 00:00:00 2001
From: Alexander Shaposhnikov <6532716+alexander-shaposhnikov@users.noreply.github.com>
Date: Sat, 10 Feb 2024 01:12:46 -0800
Subject: Add JumpTableToSwitch pass (#77709)

Add a pass to convert jump tables to switches.
The new pass replaces an indirect call with a switch + direct calls if all
the functions in the jump table are smaller than the provided threshold.
The pass is currently disabled by default
and can be enabled by -enable-jump-table-to-switch.
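For illustration, with the pass enabled a call through a constant table of
function pointers such as (taken from the basic.ll test added below)

  %gep = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index
  %func_ptr = load ptr, ptr %gep
  %result = call i32 %func_ptr()

is rewritten into a switch over %index with one direct call per jump-table
entry and an unreachable default case.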
Test plan: ninja check-all
--- .../llvm/Transforms/Scalar/JumpTableToSwitch.h | 24 +++
 llvm/lib/Passes/PassBuilder.cpp | 1 +
 llvm/lib/Passes/PassBuilderPipelines.cpp | 9 +
 llvm/lib/Passes/PassRegistry.def | 1 +
 llvm/lib/Transforms/Scalar/CMakeLists.txt | 1 +
 llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp | 190 +++++++++++++++++
 llvm/test/Other/new-pm-defaults.ll | 5 +
 llvm/test/Transforms/JumpTableToSwitch/basic.ll | 228 +++++++++++++++++++++
 .../JumpTableToSwitch/max_function_size.ll | 28 +++
 llvm/test/Transforms/JumpTableToSwitch/remarks.ll | 36 ++++
 llvm/test/Transforms/JumpTableToSwitch/skip.ll | 131 ++++++++++++
 llvm/test/Transforms/JumpTableToSwitch/stride.ll | 36 ++++
 llvm/test/Transforms/JumpTableToSwitch/struct.ll | 42 ++++
 13 files changed, 732 insertions(+)
 create mode 100644 llvm/include/llvm/Transforms/Scalar/JumpTableToSwitch.h
 create mode 100644 llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
 create mode 100644 llvm/test/Transforms/JumpTableToSwitch/basic.ll
 create mode 100644 llvm/test/Transforms/JumpTableToSwitch/max_function_size.ll
 create mode 100644 llvm/test/Transforms/JumpTableToSwitch/remarks.ll
 create mode 100644 llvm/test/Transforms/JumpTableToSwitch/skip.ll
 create mode 100644 llvm/test/Transforms/JumpTableToSwitch/stride.ll
 create mode 100644 llvm/test/Transforms/JumpTableToSwitch/struct.ll

diff --git a/llvm/include/llvm/Transforms/Scalar/JumpTableToSwitch.h b/llvm/include/llvm/Transforms/Scalar/JumpTableToSwitch.h
new file mode 100644
index 0000000..6178622
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Scalar/JumpTableToSwitch.h
@@ -0,0 +1,24 @@
+//===- JumpTableToSwitch.h - ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_JUMP_TABLE_TO_SWITCH_H
+#define LLVM_TRANSFORMS_SCALAR_JUMP_TABLE_TO_SWITCH_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class Function;
+
+struct JumpTableToSwitchPass : PassInfoMixin<JumpTableToSwitchPass> {
+  /// Run the pass over the function.
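+  /// Returns PreservedAnalyses::all() when no jump table was converted;
+  /// otherwise cached dominator and post-dominator trees are updated and
+  /// preserved.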
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_JUMP_TABLE_TO_SWITCH_H
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 007dc76..e3f2502 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -201,6 +201,7 @@
 #include "llvm/Transforms/Scalar/InferAddressSpaces.h"
 #include "llvm/Transforms/Scalar/InferAlignment.h"
 #include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/Transforms/Scalar/JumpTableToSwitch.h"
 #include "llvm/Transforms/Scalar/JumpThreading.h"
 #include "llvm/Transforms/Scalar/LICM.h"
 #include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 6ede863..4e233d9 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -91,6 +91,7 @@
 #include "llvm/Transforms/Scalar/IndVarSimplify.h"
 #include "llvm/Transforms/Scalar/InferAlignment.h"
 #include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/Transforms/Scalar/JumpTableToSwitch.h"
 #include "llvm/Transforms/Scalar/JumpThreading.h"
 #include "llvm/Transforms/Scalar/LICM.h"
 #include "llvm/Transforms/Scalar/LoopDeletion.h"
@@ -237,6 +238,10 @@
 static cl::opt<bool>
     EnableGVNSink("enable-gvn-sink",
                   cl::desc("Enable the GVN sinking pass (default = off)"));

+static cl::opt<bool> EnableJumpTableToSwitch(
+    "enable-jump-table-to-switch",
+    cl::desc("Enable JumpTableToSwitch pass (default = off)"));
+
 // This option is used in simplifying testing SampleFDO optimizations for
 // profile loading.
 static cl::opt<bool>
@@ -559,6 +564,10 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   FPM.addPass(JumpThreadingPass());
   FPM.addPass(CorrelatedValuePropagationPass());

+  // Jump table to switch conversion.
+  if (EnableJumpTableToSwitch)
+    FPM.addPass(JumpTableToSwitchPass());
+
   FPM.addPass(
       SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
   FPM.addPass(InstCombinePass());
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 6cb87fb..afa5a65 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -348,6 +348,7 @@ FUNCTION_PASS("interleaved-load-combine", InterleavedLoadCombinePass(TM))
 FUNCTION_PASS("invalidate", InvalidateAllAnalysesPass())
 FUNCTION_PASS("irce", IRCEPass())
 FUNCTION_PASS("jump-threading", JumpThreadingPass())
+FUNCTION_PASS("jump-table-to-switch", JumpTableToSwitchPass());
 FUNCTION_PASS("kcfi", KCFIPass())
 FUNCTION_PASS("lcssa", LCSSAPass())
 FUNCTION_PASS("libcalls-shrinkwrap", LibCallsShrinkWrapPass())
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
index 5527efa..ba09ebf 100644
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -25,6 +25,7 @@ add_llvm_component_library(LLVMScalarOpts
   InferAlignment.cpp
   InstSimplifyPass.cpp
   JumpThreading.cpp
+  JumpTableToSwitch.cpp
   LICM.cpp
   LoopAccessAnalysisPrinter.cpp
   LoopBoundSplit.cpp
diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
new file mode 100644
index 0000000..f9712db
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
@@ -0,0 +1,190 @@
+//===- JumpTableToSwitch.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/JumpTableToSwitch.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned>
+    JumpTableSizeThreshold("jump-table-to-switch-size-threshold", cl::Hidden,
+                           cl::desc("Only split jump tables with size less or "
+                                    "equal than JumpTableSizeThreshold."),
+                           cl::init(10));
+
+// TODO: Consider adding a cost model for profitability analysis of this
+// transformation. Currently we replace a jump table with a switch if all the
+// functions in the jump table are smaller than the provided threshold.
+static cl::opt<unsigned> FunctionSizeThreshold(
+    "jump-table-to-switch-function-size-threshold", cl::Hidden,
+    cl::desc("Only split jump tables containing functions whose sizes are less "
+             "or equal than this threshold."),
+    cl::init(50));
+
+#define DEBUG_TYPE "jump-table-to-switch"
+
+namespace {
+struct JumpTableTy {
+  Value *Index;
+  SmallVector<Function *, 10> Funcs;
+};
+} // anonymous namespace
+
+static std::optional<JumpTableTy> parseJumpTable(GetElementPtrInst *GEP,
+                                                 PointerType *PtrTy) {
+  Constant *Ptr = dyn_cast<Constant>(GEP->getPointerOperand());
+  if (!Ptr)
+    return std::nullopt;
+
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr);
+  if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
+    return std::nullopt;
+
+  Function &F = *GEP->getParent()->getParent();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  const unsigned BitWidth =
+      DL.getIndexSizeInBits(GEP->getPointerAddressSpace());
+  MapVector<Value *, APInt> VariableOffsets;
+  APInt ConstantOffset(BitWidth, 0);
+  if (!GEP->collectOffset(DL, BitWidth, VariableOffsets, ConstantOffset))
+    return std::nullopt;
+  if (VariableOffsets.size() != 1)
+    return std::nullopt;
+  // TODO: consider supporting more general patterns
+  if (!ConstantOffset.isZero())
+    return std::nullopt;
+  APInt StrideBytes = VariableOffsets.front().second;
+  const uint64_t JumpTableSizeBytes = DL.getTypeAllocSize(GV->getValueType());
+  if (JumpTableSizeBytes % StrideBytes.getZExtValue() != 0)
+    return std::nullopt;
+  const uint64_t N = JumpTableSizeBytes / StrideBytes.getZExtValue();
+  if (N > JumpTableSizeThreshold)
+    return std::nullopt;
+
+  JumpTableTy JumpTable;
+  JumpTable.Index = VariableOffsets.front().first;
+  JumpTable.Funcs.reserve(N);
+  for (uint64_t Index = 0; Index < N; ++Index) {
+    // ConstantOffset is zero.
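+    // Each table entry lives at Index * StrideBytes bytes from the start of
+    // the initializer, so the callee can be constant-folded out of it.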
+    APInt Offset = Index * StrideBytes;
+    Constant *C =
+        ConstantFoldLoadFromConst(GV->getInitializer(), PtrTy, Offset, DL);
+    auto *Func = dyn_cast_or_null<Function>(C);
+    if (!Func || Func->isDeclaration() ||
+        Func->getInstructionCount() > FunctionSizeThreshold)
+      return std::nullopt;
+    JumpTable.Funcs.push_back(Func);
+  }
+  return JumpTable;
+}
+
+static BasicBlock *expandToSwitch(CallBase *CB, const JumpTableTy &JT,
+                                  DomTreeUpdater &DTU,
+                                  OptimizationRemarkEmitter &ORE) {
+  const bool IsVoid = CB->getType() == Type::getVoidTy(CB->getContext());
+
+  SmallVector<DominatorTree::UpdateType> DTUpdates;
+  BasicBlock *BB = CB->getParent();
+  BasicBlock *Tail = SplitBlock(BB, CB, &DTU, nullptr, nullptr,
+                                BB->getName() + Twine(".tail"));
+  DTUpdates.push_back({DominatorTree::Delete, BB, Tail});
+  BB->getTerminator()->eraseFromParent();
+
+  Function &F = *BB->getParent();
+  BasicBlock *BBUnreachable = BasicBlock::Create(
+      F.getContext(), "default.switch.case.unreachable", &F, Tail);
+  IRBuilder<> BuilderUnreachable(BBUnreachable);
+  BuilderUnreachable.CreateUnreachable();
+
+  IRBuilder<> Builder(BB);
+  SwitchInst *Switch = Builder.CreateSwitch(JT.Index, BBUnreachable);
+  DTUpdates.push_back({DominatorTree::Insert, BB, BBUnreachable});
+
+  IRBuilder<> BuilderTail(CB);
+  PHINode *PHI =
+      IsVoid ? nullptr : BuilderTail.CreatePHI(CB->getType(), JT.Funcs.size());
+
+  for (auto [Index, Func] : llvm::enumerate(JT.Funcs)) {
+    BasicBlock *B = BasicBlock::Create(Func->getContext(),
+                                       "call." + Twine(Index), &F, Tail);
+    DTUpdates.push_back({DominatorTree::Insert, BB, B});
+    DTUpdates.push_back({DominatorTree::Insert, B, Tail});
+
+    CallBase *Call = cast<CallBase>(CB->clone());
+    Call->setCalledFunction(Func);
+    Call->insertInto(B, B->end());
+    Switch->addCase(
+        cast<ConstantInt>(ConstantInt::get(JT.Index->getType(), Index)), B);
+    BranchInst::Create(Tail, B);
+    if (PHI)
+      PHI->addIncoming(Call, B);
+  }
+  DTU.applyUpdates(DTUpdates);
+  ORE.emit([&]() {
+    return OptimizationRemark(DEBUG_TYPE, "ReplacedJumpTableWithSwitch", CB)
+           << "expanded indirect call into switch";
+  });
+  if (PHI)
+    CB->replaceAllUsesWith(PHI);
+  CB->eraseFromParent();
+  return Tail;
+}
+
+PreservedAnalyses JumpTableToSwitchPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  OptimizationRemarkEmitter &ORE =
+      AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  DominatorTree *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+  PostDominatorTree *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
+  DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy);
+  bool Changed = false;
+  for (BasicBlock &BB : make_early_inc_range(F)) {
+    BasicBlock *CurrentBB = &BB;
+    while (CurrentBB) {
+      BasicBlock *SplittedOutTail = nullptr;
+      for (Instruction &I : make_early_inc_range(*CurrentBB)) {
+        auto *Call = dyn_cast<CallBase>(&I);
+        if (!Call || Call->getCalledFunction() || Call->isMustTailCall())
+          continue;
+        auto *L = dyn_cast<LoadInst>(Call->getCalledOperand());
+        // Skip atomic or volatile loads.
+        if (!L || !L->isSimple())
+          continue;
+        auto *GEP = dyn_cast<GetElementPtrInst>(L->getPointerOperand());
+        if (!GEP)
+          continue;
+        auto *PtrTy = dyn_cast<PointerType>(L->getType());
+        assert(PtrTy && "call operand must be a pointer");
+        std::optional<JumpTableTy> JumpTable = parseJumpTable(GEP, PtrTy);
+        if (!JumpTable)
+          continue;
+        SplittedOutTail = expandToSwitch(Call, *JumpTable, DTU, ORE);
+        Changed = true;
+        break;
+      }
+      CurrentBB = SplittedOutTail ?
SplittedOutTail : nullptr;
+    }
+  }
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  if (DT)
+    PA.preserve<DominatorTreeAnalysis>();
+  if (PDT)
+    PA.preserve<PostDominatorTreeAnalysis>();
+  return PA;
+}
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index ecdb5a5..51fb93d 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -72,6 +72,10 @@
 ; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-OPTIMIZER-LAST,CHECK-O23SZ
 ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \
+; RUN:     -passes='default<O3>' -enable-jump-table-to-switch -S %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-JUMP-TABLE-TO-SWITCH,CHECK-O23SZ,%llvmcheckext
+
+; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \
 ; RUN:     -passes='default<O3>' -enable-matrix -S %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-MATRIX
@@ -151,6 +155,7 @@
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
+; CHECK-JUMP-TABLE-TO-SWITCH-NEXT: Running pass: JumpTableToSwitchPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass
diff --git a/llvm/test/Transforms/JumpTableToSwitch/basic.ll b/llvm/test/Transforms/JumpTableToSwitch/basic.ll
new file mode 100644
index 0000000..321f837
--- /dev/null
+++ b/llvm/test/Transforms/JumpTableToSwitch/basic.ll
@@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=jump-table-to-switch -verify-dom-info -S | FileCheck %s
+; RUN: opt < %s -passes=jump-table-to-switch -jump-table-to-switch-size-threshold=0 -verify-dom-info -S | FileCheck %s --check-prefix=THRESHOLD-0
+
+@func_array = constant [2 x ptr] [ptr @func0, ptr @func1]
+
+define i32 @func0() {
+  ret i32 1
+}
+
+define i32 @func1() {
+  ret i32 2
+}
+
+define i32 @function_with_jump_table(i32 %index) {
+; CHECK-LABEL: define i32 @function_with_jump_table(
+; CHECK-SAME: i32 [[INDEX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [
+; CHECK-NEXT: i32 0, label [[CALL_0:%.*]]
+; CHECK-NEXT: i32 1, label [[CALL_1:%.*]]
+; CHECK-NEXT: ]
+; CHECK: default.switch.case.unreachable:
+; CHECK-NEXT: unreachable
+; CHECK: call.0:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @func0()
+; CHECK-NEXT: br label [[DOTTAIL:%.*]]
+; CHECK: call.1:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @func1()
+; CHECK-NEXT: br label [[DOTTAIL]]
+; CHECK: .tail:
+; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP1]], [[CALL_0]] ], [ [[TMP2]], [[CALL_1]] ]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+; THRESHOLD-0-LABEL: define i32 @function_with_jump_table(
+; THRESHOLD-0-SAME: i32 [[INDEX:%.*]]) {
+; THRESHOLD-0-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]]
+; THRESHOLD-0-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; THRESHOLD-0-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]]()
+; THRESHOLD-0-NEXT: ret i32 [[RESULT]]
+;
+  %gep =
getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep + %result = call i32 %func_ptr() + ret i32 %result +} + +define i32 @basic_block_splitted_twice(i32 %index) { +; CHECK-LABEL: define i32 @basic_block_splitted_twice( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR1:%.*]] = load ptr, ptr [[GEP1]], align 8 +; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [ +; CHECK-NEXT: i32 0, label [[CALL_0:%.*]] +; CHECK-NEXT: i32 1, label [[CALL_1:%.*]] +; CHECK-NEXT: ] +; CHECK: default.switch.case.unreachable: +; CHECK-NEXT: unreachable +; CHECK: call.0: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @func0() +; CHECK-NEXT: br label [[DOTTAIL:%.*]] +; CHECK: call.1: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @func1() +; CHECK-NEXT: br label [[DOTTAIL]] +; CHECK: .tail: +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP1]], [[CALL_0]] ], [ [[TMP2]], [[CALL_1]] ] +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR2:%.*]] = load ptr, ptr [[GEP2]], align 8 +; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE1:%.*]] [ +; CHECK-NEXT: i32 0, label [[CALL_02:%.*]] +; CHECK-NEXT: i32 1, label [[CALL_13:%.*]] +; CHECK-NEXT: ] +; CHECK: default.switch.case.unreachable1: +; CHECK-NEXT: unreachable +; CHECK: call.02: +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @func0() +; CHECK-NEXT: br label [[DOTTAIL_TAIL:%.*]] +; CHECK: call.13: +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @func1() +; CHECK-NEXT: br label [[DOTTAIL_TAIL]] +; CHECK: .tail.tail: +; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[TMP4]], [[CALL_02]] ], [ [[TMP5]], [[CALL_13]] ] +; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[TMP3]], [[TMP6]] +; CHECK-NEXT: ret i32 [[RESULT]] +; +; THRESHOLD-0-LABEL: define i32 @basic_block_splitted_twice( +; THRESHOLD-0-SAME: i32 [[INDEX:%.*]]) { +; THRESHOLD-0-NEXT: [[GEP1:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]] +; THRESHOLD-0-NEXT: [[FUNC_PTR1:%.*]] = load ptr, ptr [[GEP1]], align 8 +; THRESHOLD-0-NEXT: [[RESULT1:%.*]] = call i32 [[FUNC_PTR1]]() +; THRESHOLD-0-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]] +; THRESHOLD-0-NEXT: [[FUNC_PTR2:%.*]] = load ptr, ptr [[GEP2]], align 8 +; THRESHOLD-0-NEXT: [[RESULT2:%.*]] = call i32 [[FUNC_PTR2]]() +; THRESHOLD-0-NEXT: [[RESULT:%.*]] = add i32 [[RESULT1]], [[RESULT2]] +; THRESHOLD-0-NEXT: ret i32 [[RESULT]] +; + %gep1 = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index + %func_ptr1 = load ptr, ptr %gep1 + %result1 = call i32 %func_ptr1() + %gep2 = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index + %func_ptr2 = load ptr, ptr %gep2 + %result2 = call i32 %func_ptr2() + %result = add i32 %result1, %result2 + ret i32 %result +} + +define void @void_func0() { + ret void +} + +define void @void_func1() { + ret void +} + +@void_func_array = constant [2 x ptr] [ptr @void_func0, ptr @void_func1] + +define void @void_function_with_jump_table(i32 %index) { +; CHECK-LABEL: define void @void_function_with_jump_table( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [ +; 
CHECK-NEXT: i32 0, label [[CALL_0:%.*]] +; CHECK-NEXT: i32 1, label [[CALL_1:%.*]] +; CHECK-NEXT: ] +; CHECK: default.switch.case.unreachable: +; CHECK-NEXT: unreachable +; CHECK: call.0: +; CHECK-NEXT: call void @void_func0() +; CHECK-NEXT: br label [[DOTTAIL:%.*]] +; CHECK: call.1: +; CHECK-NEXT: call void @void_func1() +; CHECK-NEXT: br label [[DOTTAIL]] +; CHECK: .tail: +; CHECK-NEXT: ret void +; +; THRESHOLD-0-LABEL: define void @void_function_with_jump_table( +; THRESHOLD-0-SAME: i32 [[INDEX:%.*]]) { +; THRESHOLD-0-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 [[INDEX]] +; THRESHOLD-0-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; THRESHOLD-0-NEXT: call void [[FUNC_PTR]]() +; THRESHOLD-0-NEXT: ret void +; + %gep = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep + call void %func_ptr() + ret void +} + +define void @void_function_with_jump_table_and_call_site_attr(i32 %index) { +; CHECK-LABEL: define void @void_function_with_jump_table_and_call_site_attr( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [ +; CHECK-NEXT: i32 0, label [[CALL_0:%.*]] +; CHECK-NEXT: i32 1, label [[CALL_1:%.*]] +; CHECK-NEXT: ] +; CHECK: default.switch.case.unreachable: +; CHECK-NEXT: unreachable +; CHECK: call.0: +; CHECK-NEXT: call void @void_func0() #[[ATTR0:[0-9]+]] +; CHECK-NEXT: br label [[DOTTAIL:%.*]] +; CHECK: call.1: +; CHECK-NEXT: call void @void_func1() #[[ATTR0]] +; CHECK-NEXT: br label [[DOTTAIL]] +; CHECK: .tail: +; CHECK-NEXT: ret void +; +; THRESHOLD-0-LABEL: define void @void_function_with_jump_table_and_call_site_attr( +; THRESHOLD-0-SAME: i32 [[INDEX:%.*]]) { +; THRESHOLD-0-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 [[INDEX]] +; THRESHOLD-0-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; THRESHOLD-0-NEXT: call void [[FUNC_PTR]]() #[[ATTR0:[0-9]+]] +; THRESHOLD-0-NEXT: ret void +; + %gep = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep + call void %func_ptr() nounwind + ret void +} + + +define i32 @func0_addrspace_42() addrspace(42) { + ret i32 1 +} + +define i32 @func1_addrspace_42() addrspace(42) { + ret i32 2 +} + +@func_array_addrspace_42 = addrspace(42) constant [2 x ptr addrspace(42)] [ptr addrspace(42) @func0_addrspace_42, ptr addrspace(42) @func1_addrspace_42] + +define i32 @function_with_jump_table_addrspace_42(i32 %index) addrspace(42) { +; CHECK-LABEL: define i32 @function_with_jump_table_addrspace_42( +; CHECK-SAME: i32 [[INDEX:%.*]]) addrspace(42) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr addrspace(42)], ptr addrspace(42) @func_array_addrspace_42, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr addrspace(42), ptr addrspace(42) [[GEP]], align 8 +; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [ +; CHECK-NEXT: i32 0, label [[CALL_0:%.*]] +; CHECK-NEXT: i32 1, label [[CALL_1:%.*]] +; CHECK-NEXT: ] +; CHECK: default.switch.case.unreachable: +; CHECK-NEXT: unreachable +; CHECK: call.0: +; CHECK-NEXT: [[TMP1:%.*]] = call addrspace(42) i32 @func0_addrspace_42() +; CHECK-NEXT: br label [[DOTTAIL:%.*]] +; CHECK: call.1: +; CHECK-NEXT: [[TMP2:%.*]] = call 
addrspace(42) i32 @func1_addrspace_42() +; CHECK-NEXT: br label [[DOTTAIL]] +; CHECK: .tail: +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP1]], [[CALL_0]] ], [ [[TMP2]], [[CALL_1]] ] +; CHECK-NEXT: ret i32 [[TMP3]] +; +; THRESHOLD-0-LABEL: define i32 @function_with_jump_table_addrspace_42( +; THRESHOLD-0-SAME: i32 [[INDEX:%.*]]) addrspace(42) { +; THRESHOLD-0-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr addrspace(42)], ptr addrspace(42) @func_array_addrspace_42, i32 0, i32 [[INDEX]] +; THRESHOLD-0-NEXT: [[FUNC_PTR:%.*]] = load ptr addrspace(42), ptr addrspace(42) [[GEP]], align 8 +; THRESHOLD-0-NEXT: [[RESULT:%.*]] = call addrspace(42) i32 [[FUNC_PTR]]() +; THRESHOLD-0-NEXT: ret i32 [[RESULT]] +; + %gep = getelementptr inbounds [2 x ptr addrspace(42)], ptr addrspace(42) @func_array_addrspace_42, i32 0, i32 %index + %func_ptr = load ptr addrspace(42), ptr addrspace(42) %gep, align 8 + %result = call addrspace(42) i32 %func_ptr() + ret i32 %result +} + diff --git a/llvm/test/Transforms/JumpTableToSwitch/max_function_size.ll b/llvm/test/Transforms/JumpTableToSwitch/max_function_size.ll new file mode 100644 index 0000000..f4e9911 --- /dev/null +++ b/llvm/test/Transforms/JumpTableToSwitch/max_function_size.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=jump-table-to-switch -jump-table-to-switch-function-size-threshold=1 -verify-dom-info -S | FileCheck %s + +@func_array0 = constant [2 x ptr] [ptr @func0, ptr @large_func] + +define i32 @func0() { + ret i32 1 +} + +define i32 @large_func() { + %x = add i32 1, 2 + ret i32 %x +} + +define i32 @function_with_jump_table_with_large_func(i32 %index) { +; CHECK-LABEL: define i32 @function_with_jump_table_with_large_func( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]]() +; CHECK-NEXT: ret i32 [[RESULT]] +; + %gep = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep, align 8 + %result = call i32 %func_ptr() + ret i32 %result +} + diff --git a/llvm/test/Transforms/JumpTableToSwitch/remarks.ll b/llvm/test/Transforms/JumpTableToSwitch/remarks.ll new file mode 100644 index 0000000..84d4c19 --- /dev/null +++ b/llvm/test/Transforms/JumpTableToSwitch/remarks.ll @@ -0,0 +1,36 @@ +; RUN: opt < %s -passes=jump-table-to-switch -pass-remarks=jump-table-to-switch -S -o /dev/null 2>&1 | FileCheck %s + +; CHECK: remark: /tmp/tmp.cc:2:20: expanded indirect call into switch + +@func_array = constant [2 x ptr] [ptr @func0, ptr @func1] + +define i32 @func0() { + ret i32 1 +} + +define i32 @func1() { + ret i32 2 +} + +define i32 @function_with_jump_table(i32 %index) { + %gep = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep + %result = call i32 %func_ptr(), !dbg !8 + ret i32 %result +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} +!llvm.ident = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 18.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2) +!1 = !DIFile(filename: "/tmp/tmp.cc", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"PIC Level", i32 2} +!5 = !{!"clang version 18.0.0 "} +!6 = distinct !DISubprogram(name: "success", 
scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) +!7 = !DISubroutineType(types: !2) +!8 = !DILocation(line: 2, column: 20, scope: !6) +!9 = !DILocation(line: 2, column: 21, scope: !6) +!10 = !DILocation(line: 2, column: 22, scope: !6) diff --git a/llvm/test/Transforms/JumpTableToSwitch/skip.ll b/llvm/test/Transforms/JumpTableToSwitch/skip.ll new file mode 100644 index 0000000..4504423 --- /dev/null +++ b/llvm/test/Transforms/JumpTableToSwitch/skip.ll @@ -0,0 +1,131 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=jump-table-to-switch -verify-dom-info -S | FileCheck %s + +@func_array0 = constant [2 x ptr] [ptr @func0, ptr @declared_only_func1] + +define i32 @func0() { + ret i32 1 +} + +declare i32 @declared_only_func1() + +define i32 @function_with_jump_table_with_a_declared_only_func(i32 %index) { +; CHECK-LABEL: define i32 @function_with_jump_table_with_a_declared_only_func( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]]() +; CHECK-NEXT: ret i32 [[RESULT]] +; + %gep = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep, align 8 + %result = call i32 %func_ptr() + ret i32 %result +} + +declare i32 @__gxx_personality_v0(...) + +define i32 @function_with_jump_table_invoke(i32 %index) personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: define i32 @function_with_jump_table_invoke( +; CHECK-SAME: i32 [[INDEX:%.*]]) personality ptr @__gxx_personality_v0 { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: [[RESULT:%.*]] = invoke i32 [[FUNC_PTR]]() +; CHECK-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTIONAL:%.*]] +; CHECK: normal: +; CHECK-NEXT: ret i32 [[RESULT]] +; CHECK: exceptional: +; CHECK-NEXT: [[LANDING_PAD:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: resume { ptr, i32 } [[LANDING_PAD]] +; + %gep = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep, align 8 + %result = invoke i32 %func_ptr() to label %normal unwind label %exceptional +normal: + ret i32 %result +exceptional: + %landing_pad = landingpad { ptr, i32 } catch ptr null + resume { ptr, i32 } %landing_pad +} + +@func_array1 = constant [1 x ptr] [ptr @func2] + +define i32 @func2(i32 %arg) { + ret i32 %arg +} + +define i32 @function_with_jump_table_musttail_call(i32 %index) { +; CHECK-LABEL: define i32 @function_with_jump_table_musttail_call( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: [[RESULT:%.*]] = musttail call i32 [[FUNC_PTR]](i32 [[INDEX]]) +; CHECK-NEXT: ret i32 [[RESULT]] +; + %gep = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep, align 8 + %result = musttail call i32 %func_ptr(i32 %index) + ret i32 %result +} + +define i32 @function_with_jump_table_and_volatile_load(i32 %index) { +; CHECK-LABEL: define i32 
@function_with_jump_table_and_volatile_load( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load volatile ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]](i32 [[INDEX]]) +; CHECK-NEXT: ret i32 [[RESULT]] +; + %gep = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 %index + %func_ptr = load volatile ptr, ptr %gep, align 8 + %result = call i32 %func_ptr(i32 %index) + ret i32 %result +} + +define i32 @function_with_jump_table_and_atomic_load(i32 %index) { +; CHECK-LABEL: define i32 @function_with_jump_table_and_atomic_load( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load atomic ptr, ptr [[GEP]] monotonic, align 8 +; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]](i32 [[INDEX]]) +; CHECK-NEXT: ret i32 [[RESULT]] +; + %gep = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 %index + %func_ptr = load atomic ptr, ptr %gep monotonic, align 8 + %result = call i32 %func_ptr(i32 %index) + ret i32 %result +} + +@func_array2 = global [1 x ptr] [ptr @func2] + +define i32 @function_with_nonconstant_jump_table(i32 %index) { +; CHECK-LABEL: define i32 @function_with_nonconstant_jump_table( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1 x ptr], ptr @func_array2, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]](i32 [[INDEX]]) +; CHECK-NEXT: ret i32 [[RESULT]] +; + %gep = getelementptr inbounds [1 x ptr], ptr @func_array2, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep, align 8 + %result = call i32 %func_ptr(i32 %index) + ret i32 %result +} + +@func_array3 = weak constant [1 x ptr] [ptr @func2] + +define i32 @function_with_constant_weak_jump_table(i32 %index) { +; CHECK-LABEL: define i32 @function_with_constant_weak_jump_table( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1 x ptr], ptr @func_array3, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]](i32 [[INDEX]]) +; CHECK-NEXT: ret i32 [[RESULT]] +; + %gep = getelementptr inbounds [1 x ptr], ptr @func_array3, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep, align 8 + %result = call i32 %func_ptr(i32 %index) + ret i32 %result +} + diff --git a/llvm/test/Transforms/JumpTableToSwitch/stride.ll b/llvm/test/Transforms/JumpTableToSwitch/stride.ll new file mode 100644 index 0000000..ef86e9d --- /dev/null +++ b/llvm/test/Transforms/JumpTableToSwitch/stride.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=jump-table-to-switch -verify-dom-info -S | FileCheck %s + +@func_array = constant [2 x ptr] [ptr @func0, ptr @func1] + +define i32 @func0() { + ret i32 1 +} + +define i32 @func1() { + ret i32 2 +} + +define i32 @check_stride(i32 %index) { +; CHECK-LABEL: define i32 @check_stride( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x { ptr, ptr }], ptr @func_array, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [ +; CHECK-NEXT: i32 0, 
label [[CALL_0:%.*]] +; CHECK-NEXT: ] +; CHECK: default.switch.case.unreachable: +; CHECK-NEXT: unreachable +; CHECK: call.0: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @func0() +; CHECK-NEXT: br label [[DOTTAIL:%.*]] +; CHECK: .tail: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[CALL_0]] ] +; CHECK-NEXT: ret i32 [[TMP2]] +; + %gep = getelementptr inbounds [2 x { ptr, ptr }], ptr @func_array, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep + %result = call i32 %func_ptr() + ret i32 %result +} + diff --git a/llvm/test/Transforms/JumpTableToSwitch/struct.ll b/llvm/test/Transforms/JumpTableToSwitch/struct.ll new file mode 100644 index 0000000..7aa709c --- /dev/null +++ b/llvm/test/Transforms/JumpTableToSwitch/struct.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=jump-table-to-switch -verify-dom-info -S | FileCheck %s + +%"struct_ty" = type { [2 x ptr] } + +@func_array = constant %"struct_ty" { [2 x ptr] [ptr @func0, ptr @func1] } + +define i32 @func0() { + ret i32 1 +} + +define i32 @func1() { + ret i32 2 +} + +define i32 @function_with_jump_table(i32 %index) { +; CHECK-LABEL: define i32 @function_with_jump_table( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [ +; CHECK-NEXT: i32 0, label [[CALL_0:%.*]] +; CHECK-NEXT: i32 1, label [[CALL_1:%.*]] +; CHECK-NEXT: ] +; CHECK: default.switch.case.unreachable: +; CHECK-NEXT: unreachable +; CHECK: call.0: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @func0() +; CHECK-NEXT: br label [[DOTTAIL:%.*]] +; CHECK: call.1: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @func1() +; CHECK-NEXT: br label [[DOTTAIL]] +; CHECK: .tail: +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP1]], [[CALL_0]] ], [ [[TMP2]], [[CALL_1]] ] +; CHECK-NEXT: ret i32 [[TMP3]] +; + %gep = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep + %result = call i32 %func_ptr() + ret i32 %result +} + -- cgit v1.1 From fd140d4283652ff7a906f4ebaaa75c8fcf00d39b Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Sat, 10 Feb 2024 09:13:09 +0000 Subject: [gn build] Port d26b43ff4f73 --- llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn index a1c0427..f080c06 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn @@ -35,6 +35,7 @@ static_library("Scalar") { "InferAddressSpaces.cpp", "InferAlignment.cpp", "InstSimplifyPass.cpp", + "JumpTableToSwitch.cpp", "JumpThreading.cpp", "LICM.cpp", "LoopAccessAnalysisPrinter.cpp", -- cgit v1.1 From f022aaf4e722eae9d0feaf7715a5d8960f4d017b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 10 Feb 2024 11:33:41 +0200 Subject: Revert "[InstCombine] Optimise x / sqrt(y / z) with fast-math pattern. (#76737)" This reverts commit bb5c3899d1936ebdf7ebf5ca4347ee2e057bee7f. 
That commit caused failed asserts like this:

$ cat repro.c
float a, b;
double sqrt();
void c() { b = a / sqrt(a); }
$ clang -target x86_64-linux-gnu -c -O2 -ffast-math repro.c
clang: ../lib/IR/Instruction.cpp:522: bool llvm::Instruction::hasAllowReassoc() const: Assertion `isa<FPMathOperator>(this) && "getting fast-math flag on invalid op"' failed.

--- .../InstCombine/InstCombineMulDivRem.cpp | 30 ----------------------
 llvm/test/Transforms/InstCombine/fdiv-sqrt.ll | 18 ++++++-------
 2 files changed, 9 insertions(+), 39 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 5918567..f9cee9d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -1709,33 +1709,6 @@ static Instruction *foldFDivPowDivisor(BinaryOperator &I,
   return BinaryOperator::CreateFMulFMF(Op0, Pow, &I);
 }

-/// Convert div to mul if we have an sqrt divisor iff sqrt's operand is a fdiv
-/// instruction.
-static Instruction *foldFDivSqrtDivisor(BinaryOperator &I,
-                                        InstCombiner::BuilderTy &Builder) {
-  // X / sqrt(Y / Z) --> X * sqrt(Z / Y)
-  if (!I.hasAllowReassoc() || !I.hasAllowReciprocal())
-    return nullptr;
-  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-  auto *II = dyn_cast<IntrinsicInst>(Op1);
-  if (!II || II->getIntrinsicID() != Intrinsic::sqrt || !II->hasOneUse() ||
-      !II->hasAllowReassoc() || !II->hasAllowReciprocal())
-    return nullptr;
-
-  Value *Y, *Z;
-  auto *DivOp = dyn_cast<Instruction>(II->getOperand(0));
-  if (!DivOp || !DivOp->hasAllowReassoc() || !I.hasAllowReciprocal() ||
-      !DivOp->hasOneUse())
-    return nullptr;
-  if (match(DivOp, m_FDiv(m_Value(Y), m_Value(Z)))) {
-    Value *SwapDiv = Builder.CreateFDivFMF(Z, Y, DivOp);
-    Value *NewSqrt =
-        Builder.CreateUnaryIntrinsic(II->getIntrinsicID(), SwapDiv, II);
-    return BinaryOperator::CreateFMulFMF(Op0, NewSqrt, &I);
-  }
-  return nullptr;
-}
-
 Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
   Module *M = I.getModule();
@@ -1843,9 +1816,6 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
   if (Instruction *Mul = foldFDivPowDivisor(I, Builder))
     return Mul;

-  if (Instruction *Mul = foldFDivSqrtDivisor(I, Builder))
-    return Mul;
-
   // pow(X, Y) / X --> pow(X, Y-1)
   if (I.hasAllowReassoc() &&
       match(Op0, m_OneUse(m_Intrinsic<Intrinsic::pow>(m_Specific(Op1),
diff --git a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll
index 361837e..346271b 100644
--- a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll
+++ b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll
@@ -6,9 +6,9 @@ declare double @llvm.sqrt.f64(double)
 define double @sqrt_div_fast(double %x, double %y, double %z) {
 ; CHECK-LABEL: @sqrt_div_fast(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = fdiv fast double [[Z:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = call fast double @llvm.sqrt.f64(double [[TMP0]])
-; CHECK-NEXT: [[DIV1:%.*]] = fmul fast double [[TMP1]], [[X:%.*]]
+; CHECK-NEXT: [[DIV:%.*]] = fdiv fast double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv fast double [[X:%.*]], [[SQRT]]
 ; CHECK-NEXT: ret double [[DIV1]]
 ;
 entry:
@@ -36,9 +36,9 @@ entry:
 define double @sqrt_div_reassoc_arcp(double %x, double %y, double %z) {
 ; CHECK-LABEL: @sqrt_div_reassoc_arcp(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc arcp double [[Z:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double
[[TMP0]])
-; CHECK-NEXT: [[DIV1:%.*]] = fmul reassoc arcp double [[TMP1]], [[X:%.*]]
+; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]]
 ; CHECK-NEXT: ret double [[DIV1]]
 ;
 entry:
@@ -96,9 +96,9 @@ entry:
 define double @sqrt_div_arcp_missing(double %x, double %y, double %z) {
 ; CHECK-LABEL: @sqrt_div_arcp_missing(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double [[Z:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[TMP0]])
-; CHECK-NEXT: [[DIV1:%.*]] = fmul reassoc arcp double [[TMP1]], [[X:%.*]]
+; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]]
 ; CHECK-NEXT: ret double [[DIV1]]
 ;
 entry:
-- cgit v1.1

From 8884ba43a8485bebef5c4d41e7ed457e3fa84f07 Mon Sep 17 00:00:00 2001
From: David CARLIER
Date: Sat, 10 Feb 2024 10:08:53 +0000
Subject: [lldb] Fix FreeBSD build. (#81353)

--- lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp b/lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp
index 997b590..abfbdb1 100644
--- a/lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp
+++ b/lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp
@@ -50,7 +50,7 @@ private:
 class ProcessFreeBSDKernelKVM : public ProcessFreeBSDKernel {
 public:
   ProcessFreeBSDKernelKVM(lldb::TargetSP target_sp, lldb::ListenerSP listener,
-                          kvm_t *fvc);
+                          kvm_t *fvc, const FileSpec &core_file);

   ~ProcessFreeBSDKernelKVM();
-- cgit v1.1

From 33c6b77d2a18862fb5b16160ef9d600382e93f19 Mon Sep 17 00:00:00 2001
From: Jacek Caban
Date: Sat, 10 Feb 2024 12:46:42 +0100
Subject: [llvm-lib][Object] Add support for EC importlib symbols. (#81059)

ARM64EC import libraries expose two additional symbols: a mangled thunk
symbol (like `#func`) and an auxiliary import symbol (like
`__imp_aux_func`). The main functional change in this patch is that those
symbols are properly added to the static library ECSYMBOLS map.
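For example, for an exported function funcexp, the import library now
exposes four symbols instead of two (taken from the updated
arm64ec-implib.test below):

  __imp_funcexp      (import pointer)
  funcexp            (thunk)
  __imp_aux_funcexp  (auxiliary import pointer)
  #funcexp           (mangled ARM64EC thunk)

A C++ export such as ?test_cpp_func@@YAHPEAX@Z likewise gets
?test_cpp_func@@$$hYAHPEAX@Z as its mangled thunk symbol.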
--- llvm/include/llvm/Object/COFF.h | 41 ++++++
 llvm/include/llvm/Object/COFFImportFile.h | 28 +++-
 llvm/lib/Object/COFFImportFile.cpp | 15 +++
 .../Target/AArch64/AArch64Arm64ECCallLowering.cpp | 2 +
 llvm/lib/Target/AArch64/AArch64MCInstLower.cpp | 2 +
 llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h | 28 ----
 llvm/test/tools/llvm-lib/arm64ec-implib.test | 141 ++++++++++++++++++++-
 7 files changed, 225 insertions(+), 32 deletions(-)

diff --git a/llvm/include/llvm/Object/COFF.h b/llvm/include/llvm/Object/COFF.h
index a548b2c..2a5c3d8 100644
--- a/llvm/include/llvm/Object/COFF.h
+++ b/llvm/include/llvm/Object/COFF.h
@@ -1362,6 +1362,47 @@ public:
   SectionStrippedError() { setErrorCode(object_error::section_stripped); }
 };

+inline std::optional<std::string>
+getArm64ECMangledFunctionName(StringRef Name) {
+  bool IsCppFn = Name[0] == '?';
+  if (IsCppFn && Name.find("$$h") != std::string::npos)
+    return std::nullopt;
+  if (!IsCppFn && Name[0] == '#')
+    return std::nullopt;
+
+  StringRef Prefix = "$$h";
+  size_t InsertIdx = 0;
+  if (IsCppFn) {
+    InsertIdx = Name.find("@@");
+    size_t ThreeAtSignsIdx = Name.find("@@@");
+    if (InsertIdx != std::string::npos && InsertIdx != ThreeAtSignsIdx) {
+      InsertIdx += 2;
+    } else {
+      InsertIdx = Name.find("@");
+      if (InsertIdx != std::string::npos)
+        InsertIdx++;
+    }
+  } else {
+    Prefix = "#";
+  }
+
+  return std::optional<std::string>(
+      (Name.substr(0, InsertIdx) + Prefix + Name.substr(InsertIdx)).str());
+}
+
+inline std::optional<std::string>
+getArm64ECDemangledFunctionName(StringRef Name) {
+  if (Name[0] == '#')
+    return std::string(Name.substr(1));
+  if (Name[0] != '?')
+    return std::nullopt;
+
+  std::pair<StringRef, StringRef> Pair = Name.split("$$h");
+  if (Pair.second.empty())
+    return std::nullopt;
+  return (Pair.first + Pair.second).str();
+}
+
 } // end namespace object
 } // end namespace llvm
diff --git a/llvm/include/llvm/Object/COFFImportFile.h b/llvm/include/llvm/Object/COFFImportFile.h
index 7c5846e9..46a982d 100644
--- a/llvm/include/llvm/Object/COFFImportFile.h
+++ b/llvm/include/llvm/Object/COFFImportFile.h
@@ -27,6 +27,9 @@ namespace llvm {
 namespace object {

 class COFFImportFile : public SymbolicFile {
+private:
+  enum SymbolIndex { ImpSymbol, ThunkSymbol, ECAuxSymbol, ECThunkSymbol };
+
 public:
   COFFImportFile(MemoryBufferRef Source)
       : SymbolicFile(ID_COFFImportFile, Source) {}
@@ -36,9 +39,23 @@ public:
   void moveSymbolNext(DataRefImpl &Symb) const override { ++Symb.p; }

   Error printSymbolName(raw_ostream &OS, DataRefImpl Symb) const override {
-    if (Symb.p == 0)
+    switch (Symb.p) {
+    case ImpSymbol:
       OS << "__imp_";
-    OS << StringRef(Data.getBufferStart() + sizeof(coff_import_header));
+      break;
+    case ECAuxSymbol:
+      OS << "__imp_aux_";
+      break;
+    }
+    const char *Name = Data.getBufferStart() + sizeof(coff_import_header);
+    if (Symb.p != ECThunkSymbol && COFF::isArm64EC(getMachine())) {
+      if (std::optional<std::string> DemangledName =
+              getArm64ECDemangledFunctionName(Name)) {
+        OS << StringRef(*DemangledName);
+        return Error::success();
+      }
+    }
+    OS << StringRef(Name);
     return Error::success();
   }

@@ -52,7 +69,12 @@ public:
   basic_symbol_iterator symbol_end() const override {
     DataRefImpl Symb;
-    Symb.p = isData() ?
1 : 2;
+    if (isData())
+      Symb.p = ImpSymbol + 1;
+    else if (COFF::isArm64EC(getMachine()))
+      Symb.p = ECThunkSymbol + 1;
+    else
+      Symb.p = ThunkSymbol + 1;
     return BasicSymbolRef(Symb, this);
   }

diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp
index 51e6274..a3e5e78 100644
--- a/llvm/lib/Object/COFFImportFile.cpp
+++ b/llvm/lib/Object/COFFImportFile.cpp
@@ -684,6 +684,21 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path,
       NameType = getNameType(SymbolName, E.Name, Machine, MinGW);
     }

+    // On ARM64EC, use EXPORTAS to import demangled name for mangled symbols.
+    if (ImportType == IMPORT_CODE && isArm64EC(Machine)) {
+      if (std::optional<std::string> MangledName =
+              getArm64ECMangledFunctionName(Name)) {
+        if (ExportName.empty()) {
+          NameType = IMPORT_NAME_EXPORTAS;
+          ExportName.swap(Name);
+        }
+        Name = std::move(*MangledName);
+      } else if (ExportName.empty()) {
+        NameType = IMPORT_NAME_EXPORTAS;
+        ExportName = std::move(*getArm64ECDemangledFunctionName(Name));
+      }
+    }
+
     Members.push_back(OF.createShortImport(Name, E.Ordinal, ImportType,
                                            NameType, ExportName, Machine));
   }
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 91b4f18..c62582a 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -24,11 +24,13 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Object/COFF.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/TargetParser/Triple.h"

 using namespace llvm;
+using namespace llvm::object;

 using OperandBundleDef = OperandBundleDefT<Value *>;

diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
index 1e12cf5..37d621c 100644
--- a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -23,11 +23,13 @@
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/Object/COFF.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"

 using namespace llvm;
+using namespace llvm::object;

 extern cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration;

diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index e3f1d25..ed8336a 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -248,34 +248,6 @@ static inline bool atomicBarrierDroppedOnZero(unsigned Opcode) {
   return false;
 }

-static inline std::optional<std::string>
-getArm64ECMangledFunctionName(std::string Name) {
-  bool IsCppFn = Name[0] == '?';
-  if (IsCppFn && Name.find("$$h") != std::string::npos)
-    return std::nullopt;
-  if (!IsCppFn && Name[0] == '#')
-    return std::nullopt;
-
-  StringRef Prefix = "$$h";
-  size_t InsertIdx = 0;
-  if (IsCppFn) {
-    InsertIdx = Name.find("@@");
-    size_t ThreeAtSignsIdx = Name.find("@@@");
-    if (InsertIdx != std::string::npos && InsertIdx != ThreeAtSignsIdx) {
-      InsertIdx += 2;
-    } else {
-      InsertIdx = Name.find("@");
-      if (InsertIdx != std::string::npos)
-        InsertIdx++;
-    }
-  } else {
-    Prefix = "#";
-  }
-
-  Name.insert(Name.begin() + InsertIdx, Prefix.begin(), Prefix.end());
-  return std::optional<std::string>(Name);
-}
-
 namespace AArch64CC {

 // The CondCodes constants map directly to the 4-bit
encoding of the condition diff --git a/llvm/test/tools/llvm-lib/arm64ec-implib.test b/llvm/test/tools/llvm-lib/arm64ec-implib.test index 4250c77..c583ef7 100644 --- a/llvm/test/tools/llvm-lib/arm64ec-implib.test +++ b/llvm/test/tools/llvm-lib/arm64ec-implib.test @@ -11,9 +11,23 @@ ARMAP-NEXT: __NULL_IMPORT_DESCRIPTOR in test.dll ARMAP-NEXT: test_NULL_THUNK_DATA in test.dll ARMAP-EMPTY: ARMAP-NEXT: Archive EC map +ARMAP-NEXT: #expname in test.dll +ARMAP-NEXT: #funcexp in test.dll +ARMAP-NEXT: #mangledfunc in test.dll +ARMAP-NEXT: ?test_cpp_func@@$$hYAHPEAX@Z in test.dll +ARMAP-NEXT: ?test_cpp_func@@YAHPEAX@Z in test.dll +ARMAP-NEXT: __imp_?test_cpp_func@@YAHPEAX@Z in test.dll +ARMAP-NEXT: __imp_aux_?test_cpp_func@@YAHPEAX@Z in test.dll +ARMAP-NEXT: __imp_aux_expname in test.dll +ARMAP-NEXT: __imp_aux_funcexp in test.dll +ARMAP-NEXT: __imp_aux_mangledfunc in test.dll ARMAP-NEXT: __imp_dataexp in test.dll +ARMAP-NEXT: __imp_expname in test.dll ARMAP-NEXT: __imp_funcexp in test.dll +ARMAP-NEXT: __imp_mangledfunc in test.dll +ARMAP-NEXT: expname in test.dll ARMAP-NEXT: funcexp in test.dll +ARMAP-NEXT: mangledfunc in test.dll RUN: llvm-readobj test.lib | FileCheck -check-prefix=READOBJ %s @@ -35,10 +49,42 @@ READOBJ-EMPTY: READOBJ-NEXT: File: test.dll READOBJ-NEXT: Format: COFF-import-file-ARM64EC READOBJ-NEXT: Type: code -READOBJ-NEXT: Name type: name +READOBJ-NEXT: Name type: export as READOBJ-NEXT: Export name: funcexp READOBJ-NEXT: Symbol: __imp_funcexp READOBJ-NEXT: Symbol: funcexp +READOBJ-NEXT: Symbol: __imp_aux_funcexp +READOBJ-NEXT: Symbol: #funcexp +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-ARM64EC +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: mangledfunc +READOBJ-NEXT: Symbol: __imp_mangledfunc +READOBJ-NEXT: Symbol: mangledfunc +READOBJ-NEXT: Symbol: __imp_aux_mangledfunc +READOBJ-NEXT: Symbol: #mangledfunc +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-ARM64EC +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: ?test_cpp_func@@YAHPEAX@Z +READOBJ-NEXT: Symbol: __imp_?test_cpp_func@@YAHPEAX@Z +READOBJ-NEXT: Symbol: ?test_cpp_func@@YAHPEAX@Z +READOBJ-NEXT: Symbol: __imp_aux_?test_cpp_func@@YAHPEAX@Z +READOBJ-NEXT: Symbol: ?test_cpp_func@@$$hYAHPEAX@Z +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-ARM64EC +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: expname +READOBJ-NEXT: Symbol: __imp_expname +READOBJ-NEXT: Symbol: expname +READOBJ-NEXT: Symbol: __imp_aux_expname +READOBJ-NEXT: Symbol: #expname READOBJ-EMPTY: READOBJ-NEXT: File: test.dll READOBJ-NEXT: Format: COFF-import-file-ARM64EC @@ -51,8 +97,101 @@ Creating a new lib containing the existing lib: RUN: llvm-lib -machine:arm64ec test.lib -out:test2.lib RUN: llvm-nm --print-armap test2.lib | FileCheck -check-prefix=ARMAP %s + +RUN: llvm-lib -machine:arm64ec -def:exportas.def -out:exportas.lib +RUN: llvm-nm --print-armap exportas.lib | FileCheck -check-prefix=EXPAS-ARMAP %s +RUN: llvm-readobj exportas.lib | FileCheck -check-prefix=EXPAS-READOBJ %s + +EXPAS-ARMAP: Archive EC map +EXPAS-ARMAP-NEXT: #func1 in test.dll +EXPAS-ARMAP-NEXT: #func2 in test.dll +EXPAS-ARMAP-NEXT: #func3 in test.dll +EXPAS-ARMAP-NEXT: #func4 in test.dll +EXPAS-ARMAP-NEXT: __imp_aux_func1 in test.dll +EXPAS-ARMAP-NEXT: __imp_aux_func2 in test.dll +EXPAS-ARMAP-NEXT: __imp_aux_func3 in test.dll +EXPAS-ARMAP-NEXT: 
__imp_aux_func4 in test.dll +EXPAS-ARMAP-NEXT: __imp_data1 in test.dll +EXPAS-ARMAP-NEXT: __imp_data2 in test.dll +EXPAS-ARMAP-NEXT: __imp_func1 in test.dll +EXPAS-ARMAP-NEXT: __imp_func2 in test.dll +EXPAS-ARMAP-NEXT: __imp_func3 in test.dll +EXPAS-ARMAP-NEXT: __imp_func4 in test.dll +EXPAS-ARMAP-NEXT: func1 in test.dll +EXPAS-ARMAP-NEXT: func2 in test.dll +EXPAS-ARMAP-NEXT: func3 in test.dll +EXPAS-ARMAP-NEXT: func4 in test.dll + +EXPAS-READOBJ: File: test.dll +EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC +EXPAS-READOBJ-NEXT: Type: code +EXPAS-READOBJ-NEXT: Name type: export as +EXPAS-READOBJ-NEXT: Export name: func1 +EXPAS-READOBJ-NEXT: Symbol: __imp_func1 +EXPAS-READOBJ-NEXT: Symbol: func1 +EXPAS-READOBJ-NEXT: Symbol: __imp_aux_func1 +EXPAS-READOBJ-NEXT: Symbol: #func1 +EXPAS-READOBJ-EMPTY: +EXPAS-READOBJ-NEXT: File: test.dll +EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC +EXPAS-READOBJ-NEXT: Type: code +EXPAS-READOBJ-NEXT: Name type: export as +EXPAS-READOBJ-NEXT: Export name: func2 +EXPAS-READOBJ-NEXT: Symbol: __imp_func2 +EXPAS-READOBJ-NEXT: Symbol: func2 +EXPAS-READOBJ-NEXT: Symbol: __imp_aux_func2 +EXPAS-READOBJ-NEXT: Symbol: #func2 +EXPAS-READOBJ-EMPTY: +EXPAS-READOBJ-NEXT: File: test.dll +EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC +EXPAS-READOBJ-NEXT: Type: code +EXPAS-READOBJ-NEXT: Name type: export as +EXPAS-READOBJ-NEXT: Export name: #func3 +EXPAS-READOBJ-NEXT: Symbol: __imp_func3 +EXPAS-READOBJ-NEXT: Symbol: func3 +EXPAS-READOBJ-NEXT: Symbol: __imp_aux_func3 +EXPAS-READOBJ-NEXT: Symbol: #func3 +EXPAS-READOBJ-EMPTY: +EXPAS-READOBJ-NEXT: File: test.dll +EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC +EXPAS-READOBJ-NEXT: Type: code +EXPAS-READOBJ-NEXT: Name type: export as +EXPAS-READOBJ-NEXT: Export name: #func4 +EXPAS-READOBJ-NEXT: Symbol: __imp_func4 +EXPAS-READOBJ-NEXT: Symbol: func4 +EXPAS-READOBJ-NEXT: Symbol: __imp_aux_func4 +EXPAS-READOBJ-NEXT: Symbol: #func4 +EXPAS-READOBJ-EMPTY: +EXPAS-READOBJ-NEXT: File: test.dll +EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC +EXPAS-READOBJ-NEXT: Type: data +EXPAS-READOBJ-NEXT: Name type: export as +EXPAS-READOBJ-NEXT: Export name: #data1 +EXPAS-READOBJ-NEXT: Symbol: __imp_data1 +EXPAS-READOBJ-EMPTY: +EXPAS-READOBJ-NEXT: File: test.dll +EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC +EXPAS-READOBJ-NEXT: Type: data +EXPAS-READOBJ-NEXT: Name type: export as +EXPAS-READOBJ-NEXT: Export name: data2 +EXPAS-READOBJ-NEXT: Symbol: __imp_data2 + + #--- test.def LIBRARY test.dll EXPORTS funcexp + #mangledfunc + ?test_cpp_func@@YAHPEAX@Z + expname=impname dataexp DATA + +#--- exportas.def +LIBRARY test.dll +EXPORTS + #func1 EXPORTAS func1 + func2 EXPORTAS func2 + func3 EXPORTAS #func3 + #func4 EXPORTAS #func4 + data1 DATA EXPORTAS #data1 + #data2 DATA EXPORTAS data2 -- cgit v1.1 From 7d9540ea96ecb1e83f19cc68a202e8fa697c513d Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sat, 10 Feb 2024 14:21:57 +0100 Subject: [libc++][chrono] Implements duration Rep constraints. (#80539) Applies LWG3050 to the constraints of operator*, operator/, and operator%. The changes to the constructor were done in https://reviews.llvm.org/D118902, but that patch did not identify the related LWG-issue, and only adjusted the constructor to the wording in the Standard. 
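For illustration, a minimal self-contained sketch of the scenario (hypothetical user code: the ConstOnlyRep name is invented, modeled on the RepConstConvertibleLWG3050 helper the tests below add, and it assumes a library with this change applied):

```
#include <cassert>
#include <chrono>

// A Rep-like type that is convertible to long only from a const object.
struct ConstOnlyRep {
  operator long() = delete;           // conversion from a non-const object
  operator long() const { return 2; } // conversion from a const object
};

// duration's heterogeneous operators need a common_type specialization.
namespace std {
template <> struct common_type<ConstOnlyRep, long> { using type = long; };
template <> struct common_type<long, ConstOnlyRep> { using type = long; };
} // namespace std

int main() {
  std::chrono::duration<long> d(5);
  ConstOnlyRep r;
  // Pre-LWG3050 the constraints tested is_convertible<Rep2, common_type>,
  // which is false here, so these overloads were SFINAE'd away even though
  // the implementations only ever convert a const object. With the
  // const-qualified constraint the calls are well-formed.
  assert((d * r).count() == 10);
  assert((d / r).count() == 2);
  assert((d % r).count() == 1);
}
```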
Implements: - LWG 3050: Conversion specification problem in chrono::duration constructor --------- Co-authored-by: h-vetinari --- libcxx/docs/Status/Cxx20Issues.csv | 2 +- libcxx/include/__chrono/duration.h | 8 +++--- libcxx/include/chrono | 2 +- libcxx/test/std/time/rep.h | 23 +++++++++++++++++ .../op_divide_duration.pass.cpp | 15 +++++++++-- .../op_mod_duration.pass.cpp | 15 +++++++++-- .../time.duration.nonmember/op_times_rep.pass.cpp | 30 ++++++++++++++++------ 7 files changed, 77 insertions(+), 18 deletions(-) diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index 316127f..f0e9c40 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -192,7 +192,7 @@ "`1203 `__","More useful rvalue stream insertion","Prague","|Complete|","12.0" "`2859 `__","Definition of *reachable* in [ptr.launder] misses pointer arithmetic from pointer-interconvertible object","Prague","","" "`3018 `__","``shared_ptr``\ of function type","Prague","","" -"`3050 `__","Conversion specification problem in ``chrono::duration``\ constructor","Prague","","","|chrono|" +"`3050 `__","Conversion specification problem in ``chrono::duration``\ constructor","Prague","|Complete|","19.0","|chrono|" "`3141 `__","``CopyConstructible``\ doesn't preserve source values","Prague","|Nothing to do|","" "`3150 `__","``UniformRandomBitGenerator``\ should validate ``min``\ and ``max``\ ","Prague","|Complete|","13.0","|ranges|" "`3175 `__","The ``CommonReference``\ requirement of concept ``SwappableWith``\ is not satisfied in the example","Prague","|Complete|","13.0" diff --git a/libcxx/include/__chrono/duration.h b/libcxx/include/__chrono/duration.h index 5693ee6..1e81420 100644 --- a/libcxx/include/__chrono/duration.h +++ b/libcxx/include/__chrono/duration.h @@ -412,7 +412,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR template ::type>::value, int> = 0> + __enable_if_t::type>::value, int> = 0> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration::type, _Period> operator*(const duration<_Rep1, _Period>& __d, const _Rep2& __s) { typedef typename common_type<_Rep1, _Rep2>::type _Cr; @@ -423,7 +423,7 @@ operator*(const duration<_Rep1, _Period>& __d, const _Rep2& __s) { template ::type>::value, int> = 0> + __enable_if_t::type>::value, int> = 0> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration::type, _Period> operator*(const _Rep1& __s, const duration<_Rep2, _Period>& __d) { return __d * __s; @@ -435,7 +435,7 @@ template ::value && - is_convertible<_Rep2, typename common_type<_Rep1, _Rep2>::type>::value, + is_convertible::type>::value, int> = 0> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration::type, _Period> operator/(const duration<_Rep1, _Period>& __d, const _Rep2& __s) { @@ -457,7 +457,7 @@ template ::value && - is_convertible<_Rep2, typename common_type<_Rep1, _Rep2>::type>::value, + is_convertible::type>::value, int> = 0> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration::type, _Period> operator%(const duration<_Rep1, _Period>& __d, const _Rep2& __s) { diff --git a/libcxx/include/chrono b/libcxx/include/chrono index c80fa78..f840741 100644 --- a/libcxx/include/chrono +++ b/libcxx/include/chrono @@ -58,7 +58,7 @@ public: constexpr explicit duration(const Rep2& r, typename enable_if < - is_convertible::value && + is_convertible::value && (treat_as_floating_point::value || !treat_as_floating_point::value && !treat_as_floating_point::value) >::type* = 0); diff --git a/libcxx/test/std/time/rep.h b/libcxx/test/std/time/rep.h index 
80a0e3c..ddb5c0b 100644 --- a/libcxx/test/std/time/rep.h +++ b/libcxx/test/std/time/rep.h @@ -10,6 +10,7 @@ #define REP_H #include "test_macros.h" +#include class Rep { @@ -29,6 +30,28 @@ public: struct NotARep {}; +#if TEST_STD_VER >= 11 +// Several duration operators take a Rep parameter. Before LWG3050 this +// parameter was constrained to be convertible from a non-const object, +// but the code always uses a const object. So the function was SFINAE'd +// away for this type. LWG3050 fixes the constraint to use a const +// object. +struct RepConstConvertibleLWG3050 { + operator long() = delete; + operator long() const { return 2; } +}; +namespace std { +template <> +struct common_type { + using type = long; +}; +template <> +struct common_type { + using type = long; +}; +} // namespace std +#endif // TEST_STD_VER >= 11 + // std::chrono:::duration has only '*', '/' and '%' taking a "Rep" parameter // Multiplication is commutative, division is not. diff --git a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_divide_duration.pass.cpp b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_divide_duration.pass.cpp index d580f4e..6cedd13 100644 --- a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_divide_duration.pass.cpp +++ b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_divide_duration.pass.cpp @@ -21,6 +21,7 @@ #include "test_macros.h" #include "truncate_fp.h" +#include "../../rep.h" int main(int, char**) { @@ -65,7 +66,17 @@ int main(int, char**) constexpr std::chrono::duration > s2(5); static_assert(s1 / s2 == 20./3, ""); } -#endif + { + std::chrono::duration d(5); + RepConstConvertibleLWG3050 x; + + { + auto r = d / x; + assert(r.count() == 2); + ASSERT_SAME_TYPE(std::chrono::duration, decltype(r)); + } + } +#endif // TEST_STD_VER >= 11 - return 0; + return 0; } diff --git a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_mod_duration.pass.cpp b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_mod_duration.pass.cpp index 8b8b50d..df637e1 100644 --- a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_mod_duration.pass.cpp +++ b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_mod_duration.pass.cpp @@ -18,6 +18,7 @@ #include #include #include +#include "../../rep.h" #include "test_macros.h" @@ -60,7 +61,17 @@ int main(int, char**) constexpr std::chrono::duration > r = s1 % s2; static_assert(r.count() == 24, ""); } -#endif + { + std::chrono::duration d(5); + RepConstConvertibleLWG3050 x; + + { + auto r = d % x; + assert(r.count() == 1); + ASSERT_SAME_TYPE(std::chrono::duration, decltype(r)); + } + } +#endif // TEST_STD_VER >= 11 - return 0; + return 0; } diff --git a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_times_rep.pass.cpp b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_times_rep.pass.cpp index c331032..d7c8c2d 100644 --- a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_times_rep.pass.cpp +++ b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_times_rep.pass.cpp @@ -26,28 +26,27 @@ #include "test_macros.h" #include "../../rep.h" -int main(int, char**) -{ - { +int main(int, char**) { + { std::chrono::nanoseconds ns(3); ns = ns * 5; assert(ns.count() == 15); ns = 6 * ns; assert(ns.count() == 90); - } + } #if TEST_STD_VER >= 11 - { + { constexpr std::chrono::nanoseconds ns(3); constexpr std::chrono::nanoseconds ns2 = ns * 5; static_assert(ns2.count() == 15, ""); constexpr std::chrono::nanoseconds ns3 = 6 * ns; 
static_assert(ns3.count() == 18, ""); - } + } #endif #if TEST_STD_VER >= 11 - { // This is related to PR#41130 + { // This is related to PR#41130 typedef std::chrono::nanoseconds Duration; Duration d(5); NotARep n; @@ -57,8 +56,23 @@ int main(int, char**) assert(d.count() == 5); d = n * d; assert(d.count() == 5); + } + { + std::chrono::duration d(8); + RepConstConvertibleLWG3050 x; + + { + auto r = d * x; + assert(r.count() == 16); + ASSERT_SAME_TYPE(std::chrono::duration, decltype(r)); } -#endif + { + auto r = x * d; + assert(r.count() == 16); + ASSERT_SAME_TYPE(std::chrono::duration, decltype(r)); + } + } +#endif // TEST_STD_VER >= 11 return 0; } -- cgit v1.1 From 9981f5a72e998e5334852695164731b01bf0307b Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 10 Feb 2024 13:25:53 +0000 Subject: [BasicAA] Add extra onevscale test for multiple dependent geps that lose the NSW flag. NFC --- llvm/test/Analysis/BasicAA/vscale.ll | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/llvm/test/Analysis/BasicAA/vscale.ll b/llvm/test/Analysis/BasicAA/vscale.ll index b2f5c66..895ae1e 100644 --- a/llvm/test/Analysis/BasicAA/vscale.ll +++ b/llvm/test/Analysis/BasicAA/vscale.ll @@ -469,11 +469,29 @@ define void @vscale_negativescale(ptr %p) vscale_range(1,16) { ret void } +; CHECK-LABEL: onevscale +; CHECK-DAG: MustAlias: * %vp161, * %vp162 +; CHECK-DAG: MayAlias: * %vp161, * %vp161b +; CHECK-DAG: MayAlias: * %vp161b, * %vp162 +define void @onevscale(ptr %p) vscale_range(1,16) { + %v1 = call i64 @llvm.vscale.i64() + %vp1 = mul nsw i64 %v1, 16 + %vp2 = mul nsw i64 %v1, 16 + %vp3 = mul nsw i64 %v1, 17 + %vp161 = getelementptr i8, ptr %p, i64 %vp1 + %vp162 = getelementptr i8, ptr %p, i64 %vp2 + %vp161b = getelementptr i8, ptr %vp161, i64 %vp3 + load , ptr %vp161 + load , ptr %vp162 + load , ptr %vp161b + ret void +} + ; CHECK-LABEL: twovscales ; CHECK-DAG: MayAlias: * %vp161, * %vp162 ; CHECK-DAG: MayAlias: * %vp161, * %vp161b ; CHECK-DAG: MayAlias: * %vp161b, * %vp162 -define void @twovscales(ptr %p) { +define void @twovscales(ptr %p) vscale_range(1,16) { %v1 = call i64 @llvm.vscale.i64() %v2 = call i64 @llvm.vscale.i64() %vp1 = mul nsw i64 %v1, 16 -- cgit v1.1 From 59037c0975de51ae29a5f9bd4260131ba3b7c22a Mon Sep 17 00:00:00 2001 From: Yeting Kuo <46629943+yetingk@users.noreply.github.com> Date: Sat, 10 Feb 2024 22:18:46 +0800 Subject: [RISCV] Add Zicfiss support to the shadow call stack implementation. (#68075) This patch enable hardware shadow stack with `Zicifss` and `mno-forced-sw-shadow-stack`. New feature forced-sw-shadow-stack disables hardware shadow stack even when `Zicfiss` enabled. --- clang/docs/ShadowCallStack.rst | 42 +++++---- clang/include/clang/Driver/Options.td | 4 + clang/test/Driver/riscv-features.c | 6 ++ llvm/lib/Target/RISCV/RISCVFeatures.td | 5 + llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 14 ++- llvm/test/CodeGen/RISCV/shadowcallstack.ll | 134 +++++++++++++++++++++++++++ 6 files changed, 187 insertions(+), 18 deletions(-) diff --git a/clang/docs/ShadowCallStack.rst b/clang/docs/ShadowCallStack.rst index 6e5192f..d7ece11 100644 --- a/clang/docs/ShadowCallStack.rst +++ b/clang/docs/ShadowCallStack.rst @@ -57,19 +57,25 @@ compiled application or the operating system. Integrating the runtime into the operating system should be preferred since otherwise all thread creation and destruction would need to be intercepted by the application. -The instrumentation makes use of the platform register ``x18`` on AArch64 and -``x3`` (``gp``) on RISC-V. 
For simplicity we will refer to this as the -``SCSReg``. On some platforms, ``SCSReg`` is reserved, and on others, it is -designated as a scratch register. This generally means that any code that may -run on the same thread as code compiled with ShadowCallStack must either target -one of the platforms whose ABI reserves ``SCSReg`` (currently Android, Darwin, -Fuchsia and Windows) or be compiled with a flag to reserve that register (e.g., -``-ffixed-x18``). If absolutely necessary, code compiled without reserving the -register may be run on the same thread as code that uses ShadowCallStack by -saving the register value temporarily on the stack (`example in Android`_) but -this should be done with care since it risks leaking the shadow call stack -address. - +The instrumentation makes use of the platform register ``x18`` on AArch64, +``x3`` (``gp``) on RISC-V with software shadow stack and ``ssp`` on RISC-V with +hardware shadow stack, which needs `Zicfiss`_ and ``-mno-forced-sw-shadow-stack`` +(default option). Note that with ``Zicfiss``_ the RISC-V backend will default to +the hardware based shadow call stack. Users can force the RISC-V backend to +generate the software shadow call stack with ``Zicfiss``_ by passing +``-mforced-sw-shadow-stack``. +For simplicity we will refer to this as the ``SCSReg``. On some platforms, +``SCSReg`` is reserved, and on others, it is designated as a scratch register. +This generally means that any code that may run on the same thread as code +compiled with ShadowCallStack must either target one of the platforms whose ABI +reserves ``SCSReg`` (currently Android, Darwin, Fuchsia and Windows) or be +compiled with a flag to reserve that register (e.g., ``-ffixed-x18``). If +absolutely necessary, code compiled without reserving the register may be run on +the same thread as code that uses ShadowCallStack by saving the register value +temporarily on the stack (`example in Android`_) but this should be done with +care since it risks leaking the shadow call stack address. + +.. _`Zicfiss`: https://github.com/riscv/riscv-cfi/blob/main/cfi_backward.adoc .. _`example in Android`: https://android-review.googlesource.com/c/platform/frameworks/base/+/803717 Because it requires a dedicated register, the ShadowCallStack feature is @@ -151,9 +157,13 @@ Usage To enable ShadowCallStack, just pass the ``-fsanitize=shadow-call-stack`` flag to both compile and link command lines. On aarch64, you also need to pass -``-ffixed-x18`` unless your target already reserves ``x18``. On RISC-V, ``x3`` -(``gp``) is always reserved. It is, however, important to disable GP relaxation -in the linker. This can be done with the ``--no-relax-gp`` flag in GNU ld. +``-ffixed-x18`` unless your target already reserves ``x18``. No additional flags +need to be passed on RISC-V because the software based shadow stack uses +``x3`` (``gp``), which is always reserved, and the hardware based shadow call +stack uses a dedicated register, ``ssp``. +However, it is important to disable GP relaxation in the linker when using the +software based shadow call stack on RISC-V. This can be done with the +``--no-relax-gp`` flag in GNU ld, and is off by default in LLD. 
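For concreteness, a hedged sketch of the driver usage on RISC-V: the exact ``-march`` spelling for the then-experimental extension (``zicfiss0p4`` below) is an assumption, while ``-fsanitize=shadow-call-stack``, ``-menable-experimental-extensions``, and the new ``-m[no-]forced-sw-shadow-stack`` flags are taken from this patch and the existing driver.

```
# Hardware shadow stack: the default once Zicfiss is enabled.
clang --target=riscv64-unknown-elf -menable-experimental-extensions \
      -march=rv64gc_zicfiss0p4 -fsanitize=shadow-call-stack -c f.c

# Force the software (gp-based) scheme even though Zicfiss is available.
clang --target=riscv64-unknown-elf -menable-experimental-extensions \
      -march=rv64gc_zicfiss0p4 -mforced-sw-shadow-stack \
      -fsanitize=shadow-call-stack -c f.c
```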
Low-level API ------------- diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 31503fc..7f00732 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4614,6 +4614,10 @@ def msave_restore : Flag<["-"], "msave-restore">, Group, HelpText<"Enable using library calls for save and restore">; def mno_save_restore : Flag<["-"], "mno-save-restore">, Group, HelpText<"Disable using library calls for save and restore">; +def mforced_sw_shadow_stack : Flag<["-"], "mforced-sw-shadow-stack">, Group, + HelpText<"Force using software shadow stack when shadow-stack enabled">; +def mno_forced_sw_shadow_stack : Flag<["-"], "mno-forced-sw-shadow-stack">, Group, + HelpText<"Not force using software shadow stack when shadow-stack enabled">; } // let Flags = [TargetSpecific] let Flags = [TargetSpecific] in { def menable_experimental_extensions : Flag<["-"], "menable-experimental-extensions">, Group, diff --git a/clang/test/Driver/riscv-features.c b/clang/test/Driver/riscv-features.c index d3700f7..a108383 100644 --- a/clang/test/Driver/riscv-features.c +++ b/clang/test/Driver/riscv-features.c @@ -27,6 +27,12 @@ // DEFAULT-NOT: "-target-feature" "-save-restore" // DEFAULT-NOT: "-target-feature" "+save-restore" +// RUN: %clang --target=riscv32-unknown-elf -### %s -mforced-sw-shadow-stack 2>&1 | FileCheck %s -check-prefix=FORCE-SW-SCS +// RUN: %clang --target=riscv32-unknown-elf -### %s -mno-forced-sw-shadow-stack 2>&1 | FileCheck %s -check-prefix=NO-FORCE-SW-SCS +// FORCE-SW-SCS: "-target-feature" "+forced-sw-shadow-stack" +// NO-FORCE-SW-SCS: "-target-feature" "-forced-sw-shadow-stack" +// DEFAULT-NOT: "-target-feature" "+forced-sw-shadow-stack" + // RUN: %clang --target=riscv32-unknown-elf -### %s -munaligned-access 2>&1 | FileCheck %s -check-prefix=FAST-UNALIGNED-ACCESS // RUN: %clang --target=riscv32-unknown-elf -### %s -mno-unaligned-access 2>&1 | FileCheck %s -check-prefix=NO-FAST-UNALIGNED-ACCESS // RUN: %clang --target=riscv32-unknown-elf -### %s -mno-strict-align 2>&1 | FileCheck %s -check-prefix=FAST-UNALIGNED-ACCESS diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 03e0980..5b8d51f 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1227,3 +1227,8 @@ def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", "AllowTaggedGlobals", "true", "Use an instruction sequence for taking the address of a global " "that allows a memory tag in the upper address bits">; + +def FeatureForcedSWShadowStack : SubtargetFeature< + "forced-sw-shadow-stack", "HasForcedSWShadowStack", "true", + "Implement shadow stack with software.">; +def HasForcedSWShadowStack : Predicate<"Subtarget->hasForcedSWShadowStack()">; diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 0de4785..37672dd 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -66,9 +66,14 @@ static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB, CSI, [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; })) return; + const RISCVInstrInfo *TII = STI.getInstrInfo(); + if (!STI.hasForcedSWShadowStack() && STI.hasStdExtZicfiss()) { + BuildMI(MBB, MI, DL, TII->get(RISCV::SSPUSH)).addReg(RAReg); + return; + } + Register SCSPReg = RISCVABI::getSCSPReg(); - const RISCVInstrInfo *TII = STI.getInstrInfo(); bool IsRV64 = 
STI.hasFeature(RISCV::Feature64Bit); int64_t SlotSize = STI.getXLen() / 8; // Store return address to shadow call stack @@ -121,9 +126,14 @@ static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB, CSI, [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; })) return; + const RISCVInstrInfo *TII = STI.getInstrInfo(); + if (!STI.hasForcedSWShadowStack() && STI.hasStdExtZicfiss()) { + BuildMI(MBB, MI, DL, TII->get(RISCV::SSPOPCHK)).addReg(RAReg); + return; + } + Register SCSPReg = RISCVABI::getSCSPReg(); - const RISCVInstrInfo *TII = STI.getInstrInfo(); bool IsRV64 = STI.hasFeature(RISCV::Feature64Bit); int64_t SlotSize = STI.getXLen() / 8; // Load return address from shadow call stack diff --git a/llvm/test/CodeGen/RISCV/shadowcallstack.ll b/llvm/test/CodeGen/RISCV/shadowcallstack.ll index b41b87a..a320b44 100644 --- a/llvm/test/CodeGen/RISCV/shadowcallstack.ll +++ b/llvm/test/CodeGen/RISCV/shadowcallstack.ll @@ -3,6 +3,14 @@ ; RUN: | FileCheck %s --check-prefix=RV32 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefix=RV64 +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicfiss < %s \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefix=RV32-ZICFISS +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicfiss < %s \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefix=RV64-ZICFISS +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicfiss,forced-sw-shadow-stack \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicfiss,forced-sw-shadow-stack \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64 define void @f1() shadowcallstack { ; RV32-LABEL: f1: @@ -12,6 +20,14 @@ define void @f1() shadowcallstack { ; RV64-LABEL: f1: ; RV64: # %bb.0: ; RV64-NEXT: ret +; +; RV32-ZICFISS-LABEL: f1: +; RV32-ZICFISS: # %bb.0: +; RV32-ZICFISS-NEXT: ret +; +; RV64-ZICFISS-LABEL: f1: +; RV64-ZICFISS: # %bb.0: +; RV64-ZICFISS-NEXT: ret ret void } @@ -25,6 +41,14 @@ define void @f2() shadowcallstack { ; RV64-LABEL: f2: ; RV64: # %bb.0: ; RV64-NEXT: tail foo +; +; RV32-ZICFISS-LABEL: f2: +; RV32-ZICFISS: # %bb.0: +; RV32-ZICFISS-NEXT: tail foo +; +; RV64-ZICFISS-LABEL: f2: +; RV64-ZICFISS: # %bb.0: +; RV64-ZICFISS-NEXT: tail foo tail call void @foo() ret void } @@ -65,6 +89,32 @@ define i32 @f3() shadowcallstack { ; RV64-NEXT: addi gp, gp, -8 ; RV64-NEXT: .cfi_restore gp ; RV64-NEXT: ret +; +; RV32-ZICFISS-LABEL: f3: +; RV32-ZICFISS: # %bb.0: +; RV32-ZICFISS-NEXT: sspush ra +; RV32-ZICFISS-NEXT: addi sp, sp, -16 +; RV32-ZICFISS-NEXT: .cfi_def_cfa_offset 16 +; RV32-ZICFISS-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-ZICFISS-NEXT: .cfi_offset ra, -4 +; RV32-ZICFISS-NEXT: call bar +; RV32-ZICFISS-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-ZICFISS-NEXT: addi sp, sp, 16 +; RV32-ZICFISS-NEXT: sspopchk ra +; RV32-ZICFISS-NEXT: ret +; +; RV64-ZICFISS-LABEL: f3: +; RV64-ZICFISS: # %bb.0: +; RV64-ZICFISS-NEXT: sspush ra +; RV64-ZICFISS-NEXT: addi sp, sp, -16 +; RV64-ZICFISS-NEXT: .cfi_def_cfa_offset 16 +; RV64-ZICFISS-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-ZICFISS-NEXT: .cfi_offset ra, -8 +; RV64-ZICFISS-NEXT: call bar +; RV64-ZICFISS-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-ZICFISS-NEXT: addi sp, sp, 16 +; RV64-ZICFISS-NEXT: sspopchk ra +; RV64-ZICFISS-NEXT: ret %res = call i32 @bar() %res1 = add i32 %res, 1 ret i32 %res @@ -140,6 +190,68 @@ define i32 @f4() shadowcallstack { ; RV64-NEXT: addi gp, gp, -8 ; RV64-NEXT: .cfi_restore 
gp ; RV64-NEXT: ret +; +; RV32-ZICFISS-LABEL: f4: +; RV32-ZICFISS: # %bb.0: +; RV32-ZICFISS-NEXT: sspush ra +; RV32-ZICFISS-NEXT: addi sp, sp, -16 +; RV32-ZICFISS-NEXT: .cfi_def_cfa_offset 16 +; RV32-ZICFISS-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-ZICFISS-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32-ZICFISS-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32-ZICFISS-NEXT: sw s2, 0(sp) # 4-byte Folded Spill +; RV32-ZICFISS-NEXT: .cfi_offset ra, -4 +; RV32-ZICFISS-NEXT: .cfi_offset s0, -8 +; RV32-ZICFISS-NEXT: .cfi_offset s1, -12 +; RV32-ZICFISS-NEXT: .cfi_offset s2, -16 +; RV32-ZICFISS-NEXT: call bar +; RV32-ZICFISS-NEXT: mv s0, a0 +; RV32-ZICFISS-NEXT: call bar +; RV32-ZICFISS-NEXT: mv s1, a0 +; RV32-ZICFISS-NEXT: call bar +; RV32-ZICFISS-NEXT: mv s2, a0 +; RV32-ZICFISS-NEXT: call bar +; RV32-ZICFISS-NEXT: add s0, s0, s1 +; RV32-ZICFISS-NEXT: add a0, s2, a0 +; RV32-ZICFISS-NEXT: add a0, s0, a0 +; RV32-ZICFISS-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-ZICFISS-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32-ZICFISS-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32-ZICFISS-NEXT: lw s2, 0(sp) # 4-byte Folded Reload +; RV32-ZICFISS-NEXT: addi sp, sp, 16 +; RV32-ZICFISS-NEXT: sspopchk ra +; RV32-ZICFISS-NEXT: ret +; +; RV64-ZICFISS-LABEL: f4: +; RV64-ZICFISS: # %bb.0: +; RV64-ZICFISS-NEXT: sspush ra +; RV64-ZICFISS-NEXT: addi sp, sp, -32 +; RV64-ZICFISS-NEXT: .cfi_def_cfa_offset 32 +; RV64-ZICFISS-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-ZICFISS-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64-ZICFISS-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64-ZICFISS-NEXT: sd s2, 0(sp) # 8-byte Folded Spill +; RV64-ZICFISS-NEXT: .cfi_offset ra, -8 +; RV64-ZICFISS-NEXT: .cfi_offset s0, -16 +; RV64-ZICFISS-NEXT: .cfi_offset s1, -24 +; RV64-ZICFISS-NEXT: .cfi_offset s2, -32 +; RV64-ZICFISS-NEXT: call bar +; RV64-ZICFISS-NEXT: mv s0, a0 +; RV64-ZICFISS-NEXT: call bar +; RV64-ZICFISS-NEXT: mv s1, a0 +; RV64-ZICFISS-NEXT: call bar +; RV64-ZICFISS-NEXT: mv s2, a0 +; RV64-ZICFISS-NEXT: call bar +; RV64-ZICFISS-NEXT: add s0, s0, s1 +; RV64-ZICFISS-NEXT: add a0, s2, a0 +; RV64-ZICFISS-NEXT: addw a0, s0, a0 +; RV64-ZICFISS-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-ZICFISS-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-ZICFISS-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64-ZICFISS-NEXT: ld s2, 0(sp) # 8-byte Folded Reload +; RV64-ZICFISS-NEXT: addi sp, sp, 32 +; RV64-ZICFISS-NEXT: sspopchk ra +; RV64-ZICFISS-NEXT: ret %res1 = call i32 @bar() %res2 = call i32 @bar() %res3 = call i32 @bar() @@ -176,6 +288,28 @@ define i32 @f5() shadowcallstack nounwind { ; RV64-NEXT: ld ra, -8(gp) ; RV64-NEXT: addi gp, gp, -8 ; RV64-NEXT: ret +; +; RV32-ZICFISS-LABEL: f5: +; RV32-ZICFISS: # %bb.0: +; RV32-ZICFISS-NEXT: sspush ra +; RV32-ZICFISS-NEXT: addi sp, sp, -16 +; RV32-ZICFISS-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-ZICFISS-NEXT: call bar +; RV32-ZICFISS-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-ZICFISS-NEXT: addi sp, sp, 16 +; RV32-ZICFISS-NEXT: sspopchk ra +; RV32-ZICFISS-NEXT: ret +; +; RV64-ZICFISS-LABEL: f5: +; RV64-ZICFISS: # %bb.0: +; RV64-ZICFISS-NEXT: sspush ra +; RV64-ZICFISS-NEXT: addi sp, sp, -16 +; RV64-ZICFISS-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-ZICFISS-NEXT: call bar +; RV64-ZICFISS-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-ZICFISS-NEXT: addi sp, sp, 16 +; RV64-ZICFISS-NEXT: sspopchk ra +; RV64-ZICFISS-NEXT: ret %res = call i32 @bar() %res1 = add i32 %res, 1 ret i32 %res -- cgit v1.1 From 30cd1838dc334775f7a29f57b581f2bdda3f0ea1 
Mon Sep 17 00:00:00 2001 From: Po-yao Chang Date: Sat, 10 Feb 2024 22:22:16 +0800 Subject: [libc++][modules] Fix disabling Unicode (#81294) -DLIBCXX_ENABLE_UNICODE=OFF or -D_LIBCPP_HAS_NO_UNICODE doesn't build without this change. --- libcxx/modules/std/ostream.inc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libcxx/modules/std/ostream.inc b/libcxx/modules/std/ostream.inc index 8fcbfb4..0e0e2d54 100644 --- a/libcxx/modules/std/ostream.inc +++ b/libcxx/modules/std/ostream.inc @@ -33,8 +33,10 @@ export namespace std { using std::println; using std::vprint_nonunicode; +# ifndef _LIBCPP_HAS_NO_UNICODE using std::vprint_unicode; -# endif // _LIBCPP_STD_VER >= 23 +# endif // _LIBCPP_HAS_NO_UNICODE +# endif // _LIBCPP_STD_VER >= 23 #endif // _LIBCPP_HAS_NO_LOCALIZATION } // namespace std -- cgit v1.1 From f66f44eb0c194f6bd0b6387d778624b303b6edc1 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sat, 10 Feb 2024 15:25:30 +0100 Subject: [libc++][modules] Regenerates files. After applying the review comments of https://github.com/llvm/llvm-project/pull/80478 I've forgotten to update the generated files. This fixes the issue and removes trailing whitespace. --- libcxx/modules/std.compat.cppm.in | 10 +++++----- libcxx/modules/std.cppm.in | 10 +++++----- libcxx/utils/generate_libcxx_cppm_in.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/libcxx/modules/std.compat.cppm.in b/libcxx/modules/std.compat.cppm.in index 1636371..b44dbab 100644 --- a/libcxx/modules/std.compat.cppm.in +++ b/libcxx/modules/std.compat.cppm.in @@ -47,11 +47,11 @@ module; // *** Headers not yet available *** // -// This validation is mainly to aid libc++ developers to add modules for new -// headers. On Windows the Windows SDK can be in the include path. This SDK -// contains the MSVC STL headers. This may give false positives when MSVC STL -// provides a header libc++ has not implemented yet. Therefore this validation -// is not done on Windows. +// This validation is mainly to catch when a new header is added but adding the +// corresponding .inc file is forgotten. However, the check based on __has_include +// alone doesn't work on Windows because the Windows SDK is on the include path, +// and that means the MSVC STL headers can be found as well, tricking __has_include +// into thinking that libc++ provides the header. // #ifndef _WIN32 # if __has_include() diff --git a/libcxx/modules/std.cppm.in b/libcxx/modules/std.cppm.in index 3b59c28..b8d8913 100644 --- a/libcxx/modules/std.cppm.in +++ b/libcxx/modules/std.cppm.in @@ -169,11 +169,11 @@ module; // *** Headers not yet available *** // -// This validation is mainly to aid libc++ developers to add modules for new -// headers. On Windows the Windows SDK can be in the include path. This SDK -// contains the MSVC STL headers. This may give false positives when MSVC STL -// provides a header libc++ has not implemented yet. Therefore this validation -// is not done on Windows. +// This validation is mainly to catch when a new header is added but adding the +// corresponding .inc file is forgotten. However, the check based on __has_include +// alone doesn't work on Windows because the Windows SDK is on the include path, +// and that means the MSVC STL headers can be found as well, tricking __has_include +// into thinking that libc++ provides the header. 
// #ifndef _WIN32 # if __has_include() diff --git a/libcxx/utils/generate_libcxx_cppm_in.py b/libcxx/utils/generate_libcxx_cppm_in.py index 0390ce5..e98ac1b 100644 --- a/libcxx/utils/generate_libcxx_cppm_in.py +++ b/libcxx/utils/generate_libcxx_cppm_in.py @@ -61,7 +61,7 @@ module; """ // *** Headers not yet available *** // -// This validation is mainly to catch when a new header is added but adding the +// This validation is mainly to catch when a new header is added but adding the // corresponding .inc file is forgotten. However, the check based on __has_include // alone doesn't work on Windows because the Windows SDK is on the include path, // and that means the MSVC STL headers can be found as well, tricking __has_include -- cgit v1.1 From a4ac099487d057dde8151700b3802eaeb69cead2 Mon Sep 17 00:00:00 2001 From: Daniel Chen Date: Sat, 10 Feb 2024 11:00:00 -0500 Subject: [Flang] Support passing a function that returns procedure pointer as actual corresponding to a procedure dummy. (#80891) Flang crashes on the following case. We missed handling the case where a reference to a function that returns a procedure pointer is passed as the actual argument corresponding to a procedure dummy. This PR fixes that.
```
PROGRAM main
  IMPLICIT NONE
  INTERFACE
    FUNCTION IntF(Arg)
      integer :: Arg, IntF
    END FUNCTION
  END INTERFACE
  INTERFACE
    FUNCTION RetPtr(Arg)
      IMPORT
      PROCEDURE(IntF) :: Arg
      PROCEDURE(IntF), POINTER :: RetPtr
    END FUNCTION
  END INTERFACE
  CALL ModSub(RetPtr(IntF))
contains
  SUBROUTINE ModSub(Fun1)
    PROCEDURE(IntF) :: Fun1
  END SUBROUTINE
END
```
--- flang/lib/Lower/ConvertCall.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp index f60cdbb..d8271b1 100644 --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -922,7 +922,8 @@ static PreparedDummyArgument preparePresentUserCallActualArgument( // Handle procedure arguments (procedure pointers should go through // prepareProcedurePointerActualArgument). if (hlfir::isFortranProcedureValue(dummyType)) { - // Procedure pointer actual to procedure dummy. + // Procedure pointer or function returns procedure pointer actual to + // procedure dummy. if (actual.isProcedurePointer()) { actual = hlfir::derefPointersAndAllocatables(loc, builder, actual); return PreparedDummyArgument{actual, /*cleanups=*/{}}; @@ -931,7 +932,11 @@ static PreparedDummyArgument preparePresentUserCallActualArgument( assert(actual.isProcedure()); // Do nothing if this is a procedure argument. It is already a // fir.boxproc/fir.tuple as it should. - if (actual.getType() != dummyType) + if (!actual.getType().isa<fir::BoxProcType>() && + actual.getType() != dummyType) + // The actual argument may be a procedure that returns character (a + // fir.tuple) while the dummy is not. Extract the tuple + // in that case. actual = fixProcedureDummyMismatch(loc, builder, actual, dummyType); return PreparedDummyArgument{actual, /*cleanups=*/{}}; } -- cgit v1.1 From 4fb7b3301bfbd439eb3d30d6a36c7cdb26941a0d Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sat, 10 Feb 2024 17:09:53 +0100 Subject: [libc++][print] Moves is_terminal to the dylib. (#80464) Having the check in the header requires including unistd.h on POSIX platforms. This header has other declarations which may conflict with code that uses named declarations provided by this header. For example, code using "int pipe;" would conflict with the function pipe declared in this header.
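A minimal sketch of that clash (hypothetical user code; it needs C++23 for <print> and compiles once this patch stops the header from pulling in unistd.h):

```
#include <print> // before this patch, transitively included <unistd.h> on POSIX

int pipe = 0; // previously ill-formed: redeclares the POSIX function
              // int pipe(int[2]) leaked into this translation unit

int main() { std::println("pipe = {}", pipe); }
```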
Moving the code to the dylib means std::print would not be available on Apple backdeployment targets. On POSIX platforms there is no transcoding required so a not Standard conforming implementation is still a useful and the observable differences are minimal. This behaviour has been done for print before https://github.com/llvm/llvm-project/pull/76293. Note questions have been raised in LWG4044 "Confusing requirements for std::print on POSIX platforms", whether or not the isatty check on POSIX platforms is required. When this LWG issue is resolved the backdeployment targets could become Standard compliant. This patch is intended to be backported to the LLVM-18 branch. Fixes: https://github.com/llvm/llvm-project/issues/79782 --- libcxx/include/print | 14 ++++++------ libcxx/lib/abi/CHANGELOG.TXT | 8 +++++++ ...in.libcxxabi.v1.stable.exceptions.nonew.abilist | 1 + ...21.libcxxabi.v1.stable.exceptions.nonew.abilist | 1 + ...ix.libcxxabi.v1.stable.exceptions.nonew.abilist | 1 + ...ix.libcxxabi.v1.stable.exceptions.nonew.abilist | 1 + ...in.libcxxabi.v1.stable.exceptions.nonew.abilist | 1 + ...21.libcxxabi.v1.stable.exceptions.nonew.abilist | 1 + ...sd.libcxxabi.v1.stable.exceptions.nonew.abilist | 1 + ...nu.libcxxabi.v1.stable.exceptions.nonew.abilist | 1 + ....libcxxabi.v1.stable.noexceptions.nonew.abilist | 1 + libcxx/src/print.cpp | 25 ++++++++++++++-------- 12 files changed, 40 insertions(+), 16 deletions(-) diff --git a/libcxx/include/print b/libcxx/include/print index 7f2b5ba..543a540 100644 --- a/libcxx/include/print +++ b/libcxx/include/print @@ -32,6 +32,7 @@ namespace std { */ #include <__assert> // all public C++ headers provide the assertion handler +#include <__availability> #include <__concepts/same_as.h> #include <__config> #include <__system_error/system_error.h> @@ -43,10 +44,6 @@ namespace std { #include #include -#if __has_include() -# include -#endif - #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif @@ -68,7 +65,8 @@ _LIBCPP_EXPORTED_FROM_ABI bool __is_windows_terminal(FILE* __stream); // Note the function is only implemented on the Windows platform. _LIBCPP_EXPORTED_FROM_ABI void __write_to_windows_console(FILE* __stream, wstring_view __view); # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS - +#elif __has_include() +_LIBCPP_EXPORTED_FROM_ABI bool __is_posix_terminal(FILE* __stream); #endif // _LIBCPP_WIN32API #if _LIBCPP_STD_VER >= 23 @@ -195,15 +193,17 @@ inline constexpr bool __use_unicode_execution_charset = _MSVC_EXECUTION_CHARACTE inline constexpr bool __use_unicode_execution_charset = true; # endif -_LIBCPP_HIDE_FROM_ABI inline bool __is_terminal(FILE* __stream) { +_LIBCPP_HIDE_FROM_ABI inline bool __is_terminal([[maybe_unused]] FILE* __stream) { // The macro _LIBCPP_TESTING_PRINT_IS_TERMINAL is used to change // the behavior in the test. This is not part of the public API. # ifdef _LIBCPP_TESTING_PRINT_IS_TERMINAL return _LIBCPP_TESTING_PRINT_IS_TERMINAL(__stream); +# elif _LIBCPP_AVAILABILITY_HAS_PRINT == 0 + return false; # elif defined(_LIBCPP_WIN32API) return std::__is_windows_terminal(__stream); # elif __has_include() - return isatty(fileno(__stream)); + return std::__is_posix_terminal(__stream); # else # error "Provide a way to determine whether a FILE* is a terminal" # endif diff --git a/libcxx/lib/abi/CHANGELOG.TXT b/libcxx/lib/abi/CHANGELOG.TXT index 1179c25..7ff6049 100644 --- a/libcxx/lib/abi/CHANGELOG.TXT +++ b/libcxx/lib/abi/CHANGELOG.TXT @@ -16,6 +16,14 @@ New entries should be added directly below the "Version" header. 
Version 18.0 ------------ +* [libc++] Moves is_terminal to the dylib + + The patch moves the POSIX implementation of is_terminal to the dylib. This is + needed to avoid using in public headers. + + All platforms + Symbol added: _ZNSt6__ndk119__is_posix_terminalEP7__sFILE + * [libc++abi] Implement __cxa_init_primary_exception and use it to optimize std::make_exception_ptr (#65534) This patch implements __cxa_init_primary_exception, an extension to the Itanium C++ ABI. diff --git a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index c2fea4d..2064f45 100644 --- a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1495,6 +1495,7 @@ {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '__ZNSt3__119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist index a60f099..fec3a45 100644 --- a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1176,6 +1176,7 @@ {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt6__ndk119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist index a159ff52..e52cf98 100644 --- a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -534,6 +534,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP4FILE', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': 
'_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist index 5749a75..52a0470 100644 --- a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -534,6 +534,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP4FILE', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index e827114..bced6b2 100644 --- a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1495,6 +1495,7 @@ {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '__ZNSt3__119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist index f4077ad..efa2189 100644 --- a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1176,6 +1176,7 @@ {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': 
'_ZNSt6__ndk119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist index e3d3fcb..ebda5b0 100644 --- a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1190,6 +1190,7 @@ {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist index 1692330..6432ad3 100644 --- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1188,6 +1188,7 @@ {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP8_IO_FILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist index 2380ffb..1fe84e1 100644 --- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist @@ -1159,6 +1159,7 @@ {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP8_IO_FILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/src/print.cpp 
b/libcxx/src/print.cpp index 3692187..8fa59fd 100644 --- a/libcxx/src/print.cpp +++ b/libcxx/src/print.cpp @@ -8,22 +8,26 @@ #include <__config> -#if defined(_LIBCPP_WIN32API) +#include +#include + +#include <__system_error/system_error.h> -# include -# include +#include "filesystem/error.h" +#if defined(_LIBCPP_WIN32API) # define WIN32_LEAN_AND_MEAN # define NOMINMAX # include # include - -# include <__system_error/system_error.h> - -# include "filesystem/error.h" +#elif __has_include() +# include +#endif _LIBCPP_BEGIN_NAMESPACE_STD +#if defined(_LIBCPP_WIN32API) + _LIBCPP_EXPORTED_FROM_ABI bool __is_windows_terminal(FILE* __stream) { // Note the Standard does this in one call, but it's unclear whether // an invalid handle is allowed when calling GetConsoleMode. @@ -52,6 +56,9 @@ __write_to_windows_console([[maybe_unused]] FILE* __stream, [[maybe_unused]] wst } # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS -_LIBCPP_END_NAMESPACE_STD +#elif __has_include() // !_LIBCPP_WIN32API -#endif // !_LIBCPP_WIN32API +_LIBCPP_EXPORTED_FROM_ABI bool __is_posix_terminal(FILE* __stream) { return isatty(fileno(__stream)); } +#endif + +_LIBCPP_END_NAMESPACE_STD -- cgit v1.1 From b4c6ab600f2ef6f3a842afee569dcf86bce7a43a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 9 Feb 2024 19:03:18 +0100 Subject: [clang][Interp][NFC] Don't use visitLocalInitializer in visitExpr We were unnecessarily getting the pointer of the local variable twice. --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index aaa8ac8..6993d75 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -2555,10 +2555,13 @@ bool ByteCodeExprGen::visitExpr(const Expr *E) { // For us, that means everything we don't // have a PrimType for. if (std::optional LocalOffset = this->allocateLocal(E)) { - if (!this->visitLocalInitializer(E, *LocalOffset)) + if (!this->emitGetPtrLocal(*LocalOffset, E)) return false; - if (!this->emitGetPtrLocal(*LocalOffset, E)) + if (!visitInitializer(E)) + return false; + + if (!this->emitInitPtr(E)) return false; return this->emitRetValue(E); } -- cgit v1.1 From d2e4a725da5b4cbef8b5c1446f29fed1487aeab0 Mon Sep 17 00:00:00 2001 From: Frederic Cambus Date: Sat, 10 Feb 2024 17:39:30 +0100 Subject: [clang] Update Clang version from 18 to 19 in scan-build.1. Similar to D110763. --- clang/tools/scan-build/man/scan-build.1 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/tools/scan-build/man/scan-build.1 b/clang/tools/scan-build/man/scan-build.1 index 29edbca..e2b37f6 100644 --- a/clang/tools/scan-build/man/scan-build.1 +++ b/clang/tools/scan-build/man/scan-build.1 @@ -2,9 +2,9 @@ .\" See https://llvm.org/LICENSE.txt for license information. .\" SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception .\" $Id$ -.Dd Sep 21, 2023 +.Dd Feb 10, 2024 .Dt SCAN-BUILD 1 -.Os "clang" "18" +.Os "clang" "19" .Sh NAME .Nm scan-build .Nd Clang static analyzer -- cgit v1.1 From dce77a357948709e335910ddc07f9c3f2eb2ac4b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 10 Feb 2024 18:11:17 +0000 Subject: [IndVars] Preserve flags of narrow IV inc if replacing with wider inc. (#80446) We are replacing a narrow IV increment with a wider one. If the original (narrow) increment did not wrap, the wider one should not wrap either. 
Set the flags to be the union of both wide increment and original increment; this ensures we preserve flags SCEV could infer for the wider increment. Fixes https://github.com/llvm/llvm-project/issues/71517. --- llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 21 ++++++ llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll | 4 +- llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll | 2 +- .../Transforms/IndVarSimplify/pr30806-phi-scev.ll | 2 +- .../preserve-nsw-during-expansion.ll | 2 +- .../Transforms/IndVarSimplify/widen-i32-i8ptr.ll | 2 +- llvm/test/Transforms/LoopFlatten/widen-iv.ll | 6 +- llvm/test/Transforms/LoopFlatten/widen-iv2.ll | 4 +- llvm/test/Transforms/LoopFlatten/widen-iv3.ll | 4 +- .../PhaseOrdering/AArch64/indvars-vectorization.ll | 81 ++++++++++++++++++++-- .../PhaseOrdering/AArch64/loopflatten.ll | 2 +- 11 files changed, 111 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index 1b142f1..5aa6df4 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -1985,7 +1985,28 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { // increment to the new (widened) increment. auto *OrigInc = cast(OrigPhi->getIncomingValueForBlock(LatchBlock)); + WideInc->setDebugLoc(OrigInc->getDebugLoc()); + // We are replacing a narrow IV increment with a wider IV increment. If + // the original (narrow) increment did not wrap, the wider increment one + // should not wrap either. Set the flags to be the union of both wide + // increment and original increment; this ensures we preserve flags SCEV + // could infer for the wider increment. Limit this only to cases where + // both increments directly increment the corresponding PHI nodes and have + // the same opcode. It is not safe to re-use the flags from the original + // increment, if it is more complex and SCEV expansion may have yielded a + // more simplified wider increment. 
+ bool MatchingOps = + match(OrigInc, m_c_BinOp(m_Specific(OrigPhi), m_Value())) && + match(WideInc, m_c_BinOp(m_Specific(WidePhi), m_Value())) && + OrigInc->getOpcode() == WideInc->getOpcode(); + if (MatchingOps && isa(OrigInc) && + isa(WideInc)) { + WideInc->setHasNoUnsignedWrap(WideInc->hasNoUnsignedWrap() || + OrigInc->hasNoUnsignedWrap()); + WideInc->setHasNoSignedWrap(WideInc->hasNoSignedWrap() || + OrigInc->hasNoSignedWrap()); + } } } diff --git a/llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll b/llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll index 6efe86d..b7d0700 100644 --- a/llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll @@ -11,7 +11,7 @@ define i32 @fn2() personality ptr @__CxxFrameHandler3 { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[INDVARS1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: invoke void @fn1(i64 [[INDVARS_IV]]) -; CHECK-NEXT: to label [[FOR_INC]] unwind label [[CATCH_DISPATCH:%.*]] +; CHECK-NEXT: to label [[FOR_INC]] unwind label [[CATCH_DISPATCH:%.*]] ; CHECK: catch.dispatch: ; CHECK-NEXT: [[C_0_LCSSA:%.*]] = phi i32 [ [[INDVARS1]], [[FOR_COND]] ] ; CHECK-NEXT: [[TMP0:%.*]] = catchswitch within none [label %catch] unwind to caller @@ -21,7 +21,7 @@ define i32 @fn2() personality ptr @__CxxFrameHandler3 { ; CHECK: exit: ; CHECK-NEXT: ret i32 [[C_0_LCSSA]] ; CHECK: for.inc: -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: br label [[FOR_COND]] ; entry: diff --git a/llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll b/llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll index 8aa698a..7409fc8 100644 --- a/llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll +++ b/llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll @@ -148,7 +148,7 @@ define void @guardedloop(ptr %matrix, ptr %vector, ; CHECK-NEXT: [[VECTORP:%.*]] = getelementptr inbounds [0 x double], ptr [[VECTOR:%.*]], i32 0, i64 [[INDVARS_IV2]] ; CHECK-NEXT: [[V2:%.*]] = load double, ptr [[VECTORP]], align 8 ; CHECK-NEXT: call void @use(double [[V2]]) -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP0]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], [[TMP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV2]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[RETURN_LOOPEXIT:%.*]] diff --git a/llvm/test/Transforms/IndVarSimplify/pr30806-phi-scev.ll b/llvm/test/Transforms/IndVarSimplify/pr30806-phi-scev.ll index b45f094..6a2bbfa 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr30806-phi-scev.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr30806-phi-scev.ll @@ -43,7 +43,7 @@ define void @foo(ptr %buf, i32 %denominator, ptr %flag) local_unnamed_addr { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_LR_PH]] ] ; CHECK-NEXT: [[BUF_ADDR_07:%.*]] = phi ptr [ [[BUF]], [[WHILE_BODY_LR_PH]] ], [ [[CALL:%.*]], [[WHILE_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[DIV]] to i64 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP2]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @theSize, align 4 ; CHECK-NEXT: store i32 [[TMP3]], ptr [[I]], align 4 ; CHECK-NEXT: call void @bar(ptr nonnull 
[[I]], i64 [[INDVARS_IV_NEXT]]) diff --git a/llvm/test/Transforms/IndVarSimplify/preserve-nsw-during-expansion.ll b/llvm/test/Transforms/IndVarSimplify/preserve-nsw-during-expansion.ll index 9c2237c..080bc9b 100644 --- a/llvm/test/Transforms/IndVarSimplify/preserve-nsw-during-expansion.ll +++ b/llvm/test/Transforms/IndVarSimplify/preserve-nsw-during-expansion.ll @@ -23,7 +23,7 @@ define void @test_s172(i32 noundef %xa, i32 noundef %xb, ptr nocapture noundef % ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], [[TMP1]] ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 32000 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: for.end.loopexit: diff --git a/llvm/test/Transforms/IndVarSimplify/widen-i32-i8ptr.ll b/llvm/test/Transforms/IndVarSimplify/widen-i32-i8ptr.ll index 17ce13d..35e6ca6 100644 --- a/llvm/test/Transforms/IndVarSimplify/widen-i32-i8ptr.ll +++ b/llvm/test/Transforms/IndVarSimplify/widen-i32-i8ptr.ll @@ -15,7 +15,7 @@ define dso_local void @Widen_i32_i8ptr() local_unnamed_addr { ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[GID_0]], i64 1 ; CHECK-NEXT: [[ARRAYIDX2115:%.*]] = getelementptr inbounds [15 x ptr], ptr [[PTRIDS]], i64 0, i64 [[INDVARS_IV]] ; CHECK-NEXT: store ptr [[GID_0]], ptr [[ARRAYIDX2115]], align 8 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: br label [[FOR_COND2106]] ; entry: diff --git a/llvm/test/Transforms/LoopFlatten/widen-iv.ll b/llvm/test/Transforms/LoopFlatten/widen-iv.ll index 2feca40..ac42acb9 100644 --- a/llvm/test/Transforms/LoopFlatten/widen-iv.ll +++ b/llvm/test/Transforms/LoopFlatten/widen-iv.ll @@ -36,7 +36,7 @@ define void @foo(ptr %A, i32 %N, i32 %M) { ; CHECK-NEXT: tail call void @f(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us: -; CHECK-NEXT: [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1 +; CHECK-NEXT: [[INDVAR_NEXT3]] = add nuw nsw i64 [[INDVAR2]], 1 ; CHECK-NEXT: [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]] ; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ; CHECK: for.cond.cleanup.loopexit: @@ -143,7 +143,7 @@ define void @foo2_sext(ptr nocapture readonly %A, i32 %N, i32 %M) { ; CHECK-NEXT: tail call void @g(i32 [[TMP2]]) ; CHECK-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us: -; CHECK-NEXT: [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1 +; CHECK-NEXT: [[INDVAR_NEXT3]] = add nuw nsw i64 [[INDVAR2]], 1 ; CHECK-NEXT: [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]] ; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ; CHECK: for.cond1.preheader: @@ -1005,7 +1005,7 @@ define void @foo_M_sext(ptr %A, i32 %N, i16 %M) { ; CHECK-NEXT: tail call void @f(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us: -; CHECK-NEXT: [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1 +; CHECK-NEXT: [[INDVAR_NEXT3]] = add 
nuw nsw i64 [[INDVAR2]], 1 ; CHECK-NEXT: [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]] ; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ; CHECK: for.cond.cleanup.loopexit: diff --git a/llvm/test/Transforms/LoopFlatten/widen-iv2.ll b/llvm/test/Transforms/LoopFlatten/widen-iv2.ll index 946b984..7b1caa7 100644 --- a/llvm/test/Transforms/LoopFlatten/widen-iv2.ll +++ b/llvm/test/Transforms/LoopFlatten/widen-iv2.ll @@ -45,12 +45,12 @@ define dso_local i32 @fn1() local_unnamed_addr #0 { ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP7]] ; CHECK-NEXT: store i32 32, ptr [[ARRAYIDX_US]], align 4 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: [[INDVAR_NEXT]] = add nuw nsw i64 [[INDVAR]], 1 ; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[J_014_US]], 1 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp slt i64 [[INDVAR_NEXT]], [[TMP1]] ; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY3_US]], label [[FOR_COND1_FOR_INC4_CRIT_EDGE_US]] ; CHECK: for.cond1.for.inc4_crit_edge.us: -; CHECK-NEXT: [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1 +; CHECK-NEXT: [[INDVAR_NEXT3]] = add nuw nsw i64 [[INDVAR2]], 1 ; CHECK-NEXT: [[INC5_US]] = add nuw nsw i32 [[I_016_US]], 1 ; CHECK-NEXT: [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[TMP3]] ; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_END6_LOOPEXIT:%.*]] diff --git a/llvm/test/Transforms/LoopFlatten/widen-iv3.ll b/llvm/test/Transforms/LoopFlatten/widen-iv3.ll index df8ee6f..6e6c045 100644 --- a/llvm/test/Transforms/LoopFlatten/widen-iv3.ll +++ b/llvm/test/Transforms/LoopFlatten/widen-iv3.ll @@ -25,7 +25,7 @@ define i16 @foo() { ; CHECK-NEXT: ret i16 [[ADD5_LCSSA_LCSSA]] ; CHECK: for.cond.cleanup3: ; CHECK-NEXT: [[ADD5_LCSSA]] = phi i16 [ [[ADD5:%.*]], [[FOR_BODY4]] ] -; CHECK-NEXT: [[INDVAR_NEXT3]] = add i32 [[INDVAR2]], 1 +; CHECK-NEXT: [[INDVAR_NEXT3]] = add nuw nsw i32 [[INDVAR2]], 1 ; CHECK-NEXT: [[INC7]] = add nuw nsw i16 [[I_013]], 1 ; CHECK-NEXT: [[EXITCOND14_NOT:%.*]] = icmp eq i32 [[INDVAR_NEXT3]], 4 ; CHECK-NEXT: br i1 [[EXITCOND14_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] @@ -39,7 +39,7 @@ define i16 @foo() { ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i16], ptr @v, i16 0, i16 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ADD5]] = add nsw i16 [[TMP4]], [[SUM_110]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 +; CHECK-NEXT: [[INDVAR_NEXT]] = add nuw nsw i32 [[INDVAR]], 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i16 [[J_011]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INDVAR_NEXT]], 16 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll index a7e8e15..af24a9a 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll @@ -14,18 +14,81 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef ; CHECK-NEXT: [[SUB:%.*]] = add i32 [[XA]], -1 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[SUB]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[XB]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[SMAX7:%.*]] = tail call 
i64 @llvm.smax.i64(i64 [[TMP2]], i64 32000) +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i64 [[TMP2]], 32000 +; CHECK-NEXT: [[UMIN8:%.*]] = zext i1 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP2]], [[UMIN8]] +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[SMAX7]], [[TMP4]] +; CHECK-NEXT: [[UMAX9:%.*]] = tail call i64 @llvm.umax.i64(i64 [[TMP1]], i64 1) +; CHECK-NEXT: [[TMP6:%.*]] = udiv i64 [[TMP5]], [[UMAX9]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], [[UMIN8]] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP8]], 23 +; CHECK-NEXT: [[IDENT_CHECK_NOT:%.*]] = icmp eq i32 [[XB]], 1 +; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[MIN_ITERS_CHECK]], [[IDENT_CHECK_NOT]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[VECTOR_MEMCHECK:%.*]], label [[FOR_BODY_PREHEADER13:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = add nsw i64 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP10]], i64 32000) +; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i64 [[TMP10]], 32000 +; CHECK-NEXT: [[UMIN:%.*]] = zext i1 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP10]], [[UMIN]] +; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[SMAX]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], [[UMIN]] +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP0]] +; CHECK-NEXT: [[TMP16:%.*]] = shl i64 [[TMP15]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 4 +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP17]] +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP6]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP5]], [[SCEVGEP4]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY_PREHEADER13]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP8]], -8 +; CHECK-NEXT: [[TMP18:%.*]] = mul nuw i64 [[N_VEC]], [[TMP1]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP18]], [[TMP0]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP19]], [[TMP0]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, ptr [[TMP21]], align 4, !alias.scope [[META0]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, ptr [[TMP23]], align 4, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: [[TMP24:%.*]] = add nsw <4 x i32> [[WIDE_LOAD11]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> 
[[WIDE_LOAD12]], [[WIDE_LOAD10]] +; CHECK-NEXT: store <4 x i32> [[TMP24]], ptr [[TMP22]], align 4, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store <4 x i32> [[TMP25]], ptr [[TMP23]], align 4, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER13]] +; CHECK: for.body.preheader13: +; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ [[TMP0]], [[VECTOR_MEMCHECK]] ], [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER13]] ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[L_A]], [[L_B]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_A]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], [[TMP1]] ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 32000 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -63,6 +126,14 @@ for.end: !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.mustprogress"} ;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[META0]] = !{[[META1:![0-9]+]]} +; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"} +; CHECK: [[META3]] = !{[[META4:![0-9]+]]} +; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]], [[META8:![0-9]+]]} +; CHECK: [[META6]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[META7]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META8]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META6]], [[META7]]} ;. 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll index 77f53ad..e514def 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll @@ -21,7 +21,7 @@ define dso_local void @_Z3fooPiii(ptr %A, i32 %N, i32 %M) #0 { ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVAR6]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4 ; CHECK-NEXT: tail call void @_Z1fi(i32 [[TMP2]]) -; CHECK-NEXT: [[INDVAR_NEXT7]] = add nuw nsw i64 [[INDVAR6]], 1 +; CHECK-NEXT: [[INDVAR_NEXT7]] = add nuw i64 [[INDVAR6]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVAR_NEXT7]], [[FLATTEN_TRIPCOUNT]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]] ; CHECK: for.cond.cleanup: -- cgit v1.1 From ba451c80ba67ab6834305f35d47e36b6b446ce83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Sat, 10 Feb 2024 17:28:48 +0100 Subject: [clang][Interp][NFC] Only set result invalid if empty This is currently NFC but required for later changes. A Ret op might fail and set the result to invalid, causing another setInvalid() call, which asserts that the result is still empty. --- clang/lib/AST/Interp/EvalEmitter.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/Interp/EvalEmitter.cpp b/clang/lib/AST/Interp/EvalEmitter.cpp index a60f893..945b78d7 100644 --- a/clang/lib/AST/Interp/EvalEmitter.cpp +++ b/clang/lib/AST/Interp/EvalEmitter.cpp @@ -36,7 +36,7 @@ EvalEmitter::~EvalEmitter() { EvaluationResult EvalEmitter::interpretExpr(const Expr *E) { EvalResult.setSource(E); - if (!this->visitExpr(E)) + if (!this->visitExpr(E) && EvalResult.empty()) EvalResult.setInvalid(); return std::move(this->EvalResult); @@ -45,7 +45,7 @@ EvaluationResult EvalEmitter::interpretExpr(const Expr *E) { EvaluationResult EvalEmitter::interpretDecl(const VarDecl *VD) { EvalResult.setSource(VD); - if (!this->visitDecl(VD)) + if (!this->visitDecl(VD) && EvalResult.empty()) EvalResult.setInvalid(); return std::move(this->EvalResult); -- cgit v1.1 From bc034baaff1f6ce4e18b68c20df3be45bfb5104f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Sat, 10 Feb 2024 17:42:36 +0100 Subject: [clang][Interp] Protect InitPtr from non-initializable pointers This can happen when an initializer returns a dummy pointer. 
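In reduced form, the change amounts to guarding the previously unconditional initialize() call. The sketch below uses a hypothetical, simplified Pointer type, not the interpreter's real classes:

    // Minimal sketch of the guard pattern, assuming a Pointer-like type.
    #include <cassert>

    struct Pointer {
      bool IsDummy = false;     // models "points at a dummy descriptor"
      bool Initialized = false;

      bool canBeInitialized() const { return !IsDummy; }
      void initialize() {
        assert(canBeInitialized() && "initializing a dummy pointer");
        Initialized = true;
      }
    };

    bool initPtr(Pointer &Ptr) {
      if (Ptr.canBeInitialized()) // the guard this patch introduces
        Ptr.initialize();
      return true;              // interpretation continues either way
    }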
---
 clang/lib/AST/Interp/Interp.h   |  7 +++++--
 clang/test/AST/Interp/complex.c | 14 ++++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/AST/Interp/complex.c

diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 290edc0..15c1370 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -1278,13 +1278,16 @@ inline bool GetPtrThisBase(InterpState &S, CodePtr OpPC, uint32_t Off) {
 
 inline bool InitPtrPop(InterpState &S, CodePtr OpPC) {
   const Pointer &Ptr = S.Stk.pop<Pointer>();
-  Ptr.initialize();
+  if (Ptr.canBeInitialized())
+    Ptr.initialize();
   return true;
 }
 
 inline bool InitPtr(InterpState &S, CodePtr OpPC) {
   const Pointer &Ptr = S.Stk.peek<Pointer>();
-  Ptr.initialize();
+
+  if (Ptr.canBeInitialized())
+    Ptr.initialize();
   return true;
 }
 
diff --git a/clang/test/AST/Interp/complex.c b/clang/test/AST/Interp/complex.c
new file mode 100644
index 0000000..b07d024
--- /dev/null
+++ b/clang/test/AST/Interp/complex.c
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both -Wno-unused-value %s
+// RUN: %clang_cc1 -verify=ref,both -Wno-unused-value %s
+
+// expected-no-diagnostics
+// ref-no-diagnostics
+
+void blah() {
+  __complex__ unsigned xx;
+  __complex__ signed yy;
+  __complex__ int result;
+
+  /// The following line calls into the constant interpreter.
+  result = xx * yy;
+}
-- 
cgit v1.1

From 0a255fcf4a90f9e864ae9321b28e4956f7c865fb Mon Sep 17 00:00:00 2001
From: David CARLIER
Date: Sat, 10 Feb 2024 19:14:28 +0000
Subject: [compiler-rt][profile] Fix InstrProfilingFile possible resource
 leak. (#81363)

close #79708
---
 compiler-rt/lib/profile/InstrProfilingFile.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index 867ae73..f3b457d 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -677,6 +677,7 @@ static void initializeProfileForContinuousMode(void) {
       PROF_ERR("Continuous counter sync mode is enabled, but raw profile is not"
                "page-aligned. CurrentFileOffset = %" PRIu64 ", pagesz = %u.\n",
                (uint64_t)CurrentFileOffset, PageSize);
+      fclose(File);
       return;
     }
     if (writeProfileWithFileObject(Filename, File) != 0) {
@@ -692,6 +693,8 @@ static void initializeProfileForContinuousMode(void) {
 
   if (doMerging()) {
     lprofUnlockFileHandle(File);
+  }
+  if (File != NULL) {
     fclose(File);
   }
 }
-- 
cgit v1.1

From 5e9eaf87b374c3f6638543682b523827834494a8 Mon Sep 17 00:00:00 2001
From: Mark de Wever
Date: Sat, 10 Feb 2024 20:44:14 +0100
Subject: [lldb][libc++] Adds valarray data formatters. (#80609)

The code is heavily based on the vector data formatter.
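The child count falls out of the same begin/end arithmetic the vector formatter uses: the byte distance between libc++'s __begin_ and __end_ members divided by the element size. A self-contained sketch of that computation, with plain integers standing in for the ValueObject machinery:

    #include <cstddef>
    #include <cstdint>

    // Mirrors CalculateNumChildren() in the new formatter below:
    // 0 means "empty or implausible layout".
    static std::size_t numChildren(std::uint64_t begin, std::uint64_t end,
                                   std::size_t elemSize) {
      if (begin == 0 || end == 0 || elemSize == 0)
        return 0;
      if (begin >= end)
        return 0;
      std::uint64_t bytes = end - begin;
      if (bytes % elemSize != 0) // a partial element means a bogus layout
        return 0;
      return static_cast<std::size_t>(bytes / elemSize);
    }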
--- .../Plugins/Language/CPlusPlus/CMakeLists.txt | 1 + .../Language/CPlusPlus/CPlusPlusLanguage.cpp | 9 ++ lldb/source/Plugins/Language/CPlusPlus/LibCxx.h | 4 + .../Plugins/Language/CPlusPlus/LibCxxValarray.cpp | 145 +++++++++++++++++++++ .../data-formatter-stl/libcxx/valarray/Makefile | 5 + .../valarray/TestDataFormatterLibcxxValarray.py | 78 +++++++++++ .../data-formatter-stl/libcxx/valarray/main.cpp | 17 +++ 7 files changed, 259 insertions(+) create mode 100644 lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp create mode 100644 lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/Makefile create mode 100644 lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py create mode 100644 lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp diff --git a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt index 21108b2..97fa894 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt +++ b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt @@ -17,6 +17,7 @@ add_lldb_library(lldbPluginCPlusPlusLanguage PLUGIN LibCxxTuple.cpp LibCxxUnorderedMap.cpp LibCxxVariant.cpp + LibCxxValarray.cpp LibCxxVector.cpp LibStdcpp.cpp LibStdcppTuple.cpp diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index 1dcda53..675ca38 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -752,6 +752,11 @@ static void LoadLibCxxFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { "^std::__[[:alnum:]]+::vector<.+>$", stl_deref_flags, true); AddCXXSynthetic( cpp_category_sp, + lldb_private::formatters::LibcxxStdValarraySyntheticFrontEndCreator, + "libc++ std::valarray synthetic children", + "^std::__[[:alnum:]]+::valarray<.+>$", stl_deref_flags, true); + AddCXXSynthetic( + cpp_category_sp, lldb_private::formatters::LibcxxStdForwardListSyntheticFrontEndCreator, "libc++ std::forward_list synthetic children", "^std::__[[:alnum:]]+::forward_list<.+>$", stl_synth_flags, true); @@ -871,6 +876,10 @@ static void LoadLibCxxFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { lldb_private::formatters::LibcxxContainerSummaryProvider, "libc++ std::vector summary provider", "^std::__[[:alnum:]]+::vector<.+>$", stl_summary_flags, true); + AddCXXSummary(cpp_category_sp, + lldb_private::formatters::LibcxxContainerSummaryProvider, + "libc++ std::valarray summary provider", + "^std::__[[:alnum:]]+::valarray<.+>$", stl_summary_flags, true); AddCXXSummary( cpp_category_sp, lldb_private::formatters::LibcxxContainerSummaryProvider, "libc++ std::list summary provider", diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h index cc8e13d..d823fbd 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h @@ -220,6 +220,10 @@ LibcxxStdVectorSyntheticFrontEndCreator(CXXSyntheticChildren *, lldb::ValueObjectSP); SyntheticChildrenFrontEnd * +LibcxxStdValarraySyntheticFrontEndCreator(CXXSyntheticChildren *, + lldb::ValueObjectSP); + +SyntheticChildrenFrontEnd * LibcxxStdListSyntheticFrontEndCreator(CXXSyntheticChildren *, lldb::ValueObjectSP); diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp 
b/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp
new file mode 100644
index 0000000..7c8fd25
--- /dev/null
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp
@@ -0,0 +1,145 @@
+//===-- LibCxxValarray.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LibCxx.h"
+
+#include "lldb/Core/ValueObject.h"
+#include "lldb/DataFormatters/FormattersHelpers.h"
+#include <optional>
+
+using namespace lldb;
+using namespace lldb_private;
+using namespace lldb_private::formatters;
+
+namespace lldb_private {
+namespace formatters {
+class LibcxxStdValarraySyntheticFrontEnd : public SyntheticChildrenFrontEnd {
+public:
+  LibcxxStdValarraySyntheticFrontEnd(lldb::ValueObjectSP valobj_sp);
+
+  ~LibcxxStdValarraySyntheticFrontEnd() override;
+
+  size_t CalculateNumChildren() override;
+
+  lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
+
+  lldb::ChildCacheState Update() override;
+
+  bool MightHaveChildren() override;
+
+  size_t GetIndexOfChildWithName(ConstString name) override;
+
+private:
+  /// A non-owning pointer to valarray's __begin_ member.
+  ValueObject *m_start = nullptr;
+  /// A non-owning pointer to valarray's __end_ member.
+  ValueObject *m_finish = nullptr;
+  /// The type of valarray's template argument T.
+  CompilerType m_element_type;
+  /// The sizeof valarray's template argument T.
+  uint32_t m_element_size = 0;
+};
+
+} // namespace formatters
+} // namespace lldb_private
+
+lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::
+    LibcxxStdValarraySyntheticFrontEnd(lldb::ValueObjectSP valobj_sp)
+    : SyntheticChildrenFrontEnd(*valobj_sp), m_element_type() {
+  if (valobj_sp)
+    Update();
+}
+
+lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::
+    ~LibcxxStdValarraySyntheticFrontEnd() {
+  // these need to stay around because they are child objects who will follow
+  // their parent's life cycle
+  // delete m_start;
+  // delete m_finish;
+}
+
+size_t lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::
+    CalculateNumChildren() {
+  if (!m_start || !m_finish)
+    return 0;
+  uint64_t start_val = m_start->GetValueAsUnsigned(0);
+  uint64_t finish_val = m_finish->GetValueAsUnsigned(0);
+
+  if (start_val == 0 || finish_val == 0)
+    return 0;
+
+  if (start_val >= finish_val)
+    return 0;
+
+  size_t num_children = (finish_val - start_val);
+  if (num_children % m_element_size)
+    return 0;
+  return num_children / m_element_size;
+}
+
+lldb::ValueObjectSP
+lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::GetChildAtIndex(
+    size_t idx) {
+  if (!m_start || !m_finish)
+    return lldb::ValueObjectSP();
+
+  uint64_t offset = idx * m_element_size;
+  offset = offset + m_start->GetValueAsUnsigned(0);
+  StreamString name;
+  name.Printf("[%" PRIu64 "]", (uint64_t)idx);
+  return CreateValueObjectFromAddress(name.GetString(), offset,
+                                      m_backend.GetExecutionContextRef(),
+                                      m_element_type);
+}
+
+lldb::ChildCacheState
+lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::Update() {
+  m_start = m_finish = nullptr;
+
+  CompilerType type = m_backend.GetCompilerType();
+  if (type.GetNumTemplateArguments() == 0)
+    return ChildCacheState::eRefetch;
+
+  m_element_type = type.GetTypeTemplateArgument(0);
+  if (std::optional<uint64_t> size = m_element_type.GetByteSize(nullptr))
+    m_element_size = *size;
+
+  if (m_element_size == 0)
+    return ChildCacheState::eRefetch;
+
+  ValueObjectSP start = m_backend.GetChildMemberWithName("__begin_");
+  ValueObjectSP finish = m_backend.GetChildMemberWithName("__end_");
+
+  if (!start || !finish)
+    return ChildCacheState::eRefetch;
+
+  m_start = start.get();
+  m_finish = finish.get();
+
+  return ChildCacheState::eRefetch;
+}
+
+bool lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::
+    MightHaveChildren() {
+  return true;
+}
+
+size_t lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::
+    GetIndexOfChildWithName(ConstString name) {
+  if (!m_start || !m_finish)
+    return std::numeric_limits<size_t>::max();
+  return ExtractIndexFromString(name.GetCString());
+}
+
+lldb_private::SyntheticChildrenFrontEnd *
+lldb_private::formatters::LibcxxStdValarraySyntheticFrontEndCreator(
+    CXXSyntheticChildren *, lldb::ValueObjectSP valobj_sp) {
+  if (!valobj_sp)
+    return nullptr;
+  return new LibcxxStdValarraySyntheticFrontEnd(valobj_sp);
+}
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/Makefile b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/Makefile
new file mode 100644
index 0000000..c5df567
--- /dev/null
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/Makefile
@@ -0,0 +1,5 @@
+CXX_SOURCES := main.cpp
+
+USE_LIBCPP := 1
+
+include Makefile.rules
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py
new file mode 100644
index 0000000..7b54b34
--- /dev/null
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py
@@ -0,0 +1,78 @@
+"""
+Test lldb data formatter subsystem.
+"""
+
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class LibcxxValarrayDataFormatterTestCase(TestBase):
+    @add_test_categories(["libc++"])
+    def test_with_run_command(self):
+        """Test that valarray contents display correctly."""
+        self.build()
+        (self.target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(
+            self, "break here", lldb.SBFileSpec("main.cpp", False)
+        )
+
+        self.expect(
+            "frame variable va_int",
+            substrs=[
+                "va_int = size=4",
+                "[0] = 0",
+                "[1] = 0",
+                "[2] = 0",
+                "[3] = 0",
+                "}",
+            ],
+        )
+
+        lldbutil.continue_to_breakpoint(process, bkpt)
+        self.expect(
+            "frame variable va_int",
+            substrs=[
+                "va_int = size=4",
+                "[0] = 1",
+                "[1] = 12",
+                "[2] = 123",
+                "[3] = 1234",
+                "}",
+            ],
+        )
+
+        # check access-by-index
+        self.expect("frame variable va_int[0]", substrs=["1"])
+        self.expect("frame variable va_int[1]", substrs=["12"])
+        self.expect("frame variable va_int[2]", substrs=["123"])
+        self.expect("frame variable va_int[3]", substrs=["1234"])
+        self.expect(
+            "frame variable va_int[4]",
+            error=True,
+            substrs=['array index 4 is not valid for "(valarray) va_int"'],
+        )
+
+        self.expect(
+            "frame variable va_double",
+            substrs=[
+                "va_double = size=4",
+                "[0] = 1",
+                "[1] = 0.5",
+                "[2] = 0.25",
+                "[3] = 0.125",
+                "}",
+            ],
+        )
+
+        # check access-by-index
+        self.expect("frame variable va_double[0]", substrs=["1"])
+        self.expect("frame variable va_double[1]", substrs=["0.5"])
+        self.expect("frame variable va_double[2]", substrs=["0.25"])
+        self.expect("frame variable va_double[3]", substrs=["0.125"])
+        self.expect(
+            "frame variable va_double[4]",
+            error=True,
+            substrs=['array index 4 is not valid for "(valarray) va_double"'],
+        )
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp
new file mode 100644
index 0000000..f32921e
--- /dev/null
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp
@@ -0,0 +1,17 @@
+#include <iostream>
+#include <valarray>
+
+int main() {
+
+  std::valarray<int> va_int(4);
+  std::cout << "break here";
+
+  va_int[0] = 1;
+  va_int[1] = 12;
+  va_int[2] = 123;
+  va_int[3] = 1234;
+
+  std::valarray<double> va_double({1.0, 0.5, 0.25, 0.125});
+
+  std::cout << "break here\n";
+}
-- 
cgit v1.1

From 2a51c56d8e0e410bf896be2c6bebe37344a996e1 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Sat, 10 Feb 2024 19:44:41 +0000
Subject: [gn build] Port 5e9eaf87b374

---
 llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
index 60562ef..6c667b2 100644
--- a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
@@ -37,6 +37,7 @@ static_library("CPlusPlus") {
     "LibCxxSpan.cpp",
     "LibCxxTuple.cpp",
     "LibCxxUnorderedMap.cpp",
+    "LibCxxValarray.cpp",
    "LibCxxVariant.cpp",
     "LibCxxVector.cpp",
     "LibStdcpp.cpp",
-- 
cgit v1.1

From 3a05e7651bc71b3c71757bb406f211645c1c1a37 Mon Sep 17 00:00:00 2001
From: Vlad Serebrennikov
Date: Sat, 10 Feb 2024 23:35:29 +0300
Subject: [clang][NFC] Annotate `Sema/DeclSpec.h` with `preferred_type`

This helps debuggers to display values in bit-fields in a more helpful
way.
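LLVM_PREFERRED_TYPE(T) expands to Clang's [[clang::preferred_type(T)]] attribute when the host compiler supports it, and to nothing otherwise, so the annotation costs nothing on other compilers. A reduced sketch of the pattern; the macro definition here approximates the one in llvm/Support/Compiler.h:

    #if defined(__has_attribute)
    #if __has_attribute(preferred_type)
    #define LLVM_PREFERRED_TYPE(T) [[clang::preferred_type(T)]]
    #endif
    #endif
    #ifndef LLVM_PREFERRED_TYPE
    #define LLVM_PREFERRED_TYPE(T)
    #endif

    enum class TSC { Unspecified, Imaginary, Complex };

    struct Bits {
      LLVM_PREFERRED_TYPE(TSC)
      unsigned TypeSpecComplex : 2; // debuggers can render this as a TSC value
      LLVM_PREFERRED_TYPE(bool)
      unsigned TypeSpecOwned : 1;   // rendered as true/false instead of 0/1
    };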
--- clang/include/clang/Sema/DeclSpec.h | 56 +++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h index 77638de..d161147 100644 --- a/clang/include/clang/Sema/DeclSpec.h +++ b/clang/include/clang/Sema/DeclSpec.h @@ -353,36 +353,57 @@ public: private: // storage-class-specifier - /*SCS*/unsigned StorageClassSpec : 3; - /*TSCS*/unsigned ThreadStorageClassSpec : 2; + LLVM_PREFERRED_TYPE(SCS) + unsigned StorageClassSpec : 3; + LLVM_PREFERRED_TYPE(TSCS) + unsigned ThreadStorageClassSpec : 2; + LLVM_PREFERRED_TYPE(bool) unsigned SCS_extern_in_linkage_spec : 1; // type-specifier - /*TypeSpecifierWidth*/ unsigned TypeSpecWidth : 2; - /*TSC*/unsigned TypeSpecComplex : 2; - /*TSS*/unsigned TypeSpecSign : 2; - /*TST*/unsigned TypeSpecType : 7; + LLVM_PREFERRED_TYPE(TypeSpecifierWidth) + unsigned TypeSpecWidth : 2; + LLVM_PREFERRED_TYPE(TSC) + unsigned TypeSpecComplex : 2; + LLVM_PREFERRED_TYPE(TypeSpecifierSign) + unsigned TypeSpecSign : 2; + LLVM_PREFERRED_TYPE(TST) + unsigned TypeSpecType : 7; + LLVM_PREFERRED_TYPE(bool) unsigned TypeAltiVecVector : 1; + LLVM_PREFERRED_TYPE(bool) unsigned TypeAltiVecPixel : 1; + LLVM_PREFERRED_TYPE(bool) unsigned TypeAltiVecBool : 1; + LLVM_PREFERRED_TYPE(bool) unsigned TypeSpecOwned : 1; + LLVM_PREFERRED_TYPE(bool) unsigned TypeSpecPipe : 1; + LLVM_PREFERRED_TYPE(bool) unsigned TypeSpecSat : 1; + LLVM_PREFERRED_TYPE(bool) unsigned ConstrainedAuto : 1; // type-qualifiers + LLVM_PREFERRED_TYPE(TQ) unsigned TypeQualifiers : 5; // Bitwise OR of TQ. // function-specifier + LLVM_PREFERRED_TYPE(bool) unsigned FS_inline_specified : 1; + LLVM_PREFERRED_TYPE(bool) unsigned FS_forceinline_specified: 1; + LLVM_PREFERRED_TYPE(bool) unsigned FS_virtual_specified : 1; + LLVM_PREFERRED_TYPE(bool) unsigned FS_noreturn_specified : 1; // friend-specifier + LLVM_PREFERRED_TYPE(bool) unsigned Friend_specified : 1; // constexpr-specifier + LLVM_PREFERRED_TYPE(ConstexprSpecKind) unsigned ConstexprSpecifier : 2; union { @@ -1246,6 +1267,7 @@ struct DeclaratorChunk { struct PointerTypeInfo { /// The type qualifiers: const/volatile/restrict/unaligned/atomic. + LLVM_PREFERRED_TYPE(DeclSpec::TQ) unsigned TypeQuals : 5; /// The location of the const-qualifier, if any. @@ -1279,12 +1301,15 @@ struct DeclaratorChunk { struct ArrayTypeInfo { /// The type qualifiers for the array: /// const/volatile/restrict/__unaligned/_Atomic. + LLVM_PREFERRED_TYPE(DeclSpec::TQ) unsigned TypeQuals : 5; /// True if this dimension included the 'static' keyword. + LLVM_PREFERRED_TYPE(bool) unsigned hasStatic : 1; /// True if this dimension was [*]. In this case, NumElts is null. + LLVM_PREFERRED_TYPE(bool) unsigned isStar : 1; /// This is the size of the array, or null if [] or [*] was specified. @@ -1331,28 +1356,35 @@ struct DeclaratorChunk { /// hasPrototype - This is true if the function had at least one typed /// parameter. If the function is () or (a,b,c), then it has no prototype, /// and is treated as a K&R-style function. + LLVM_PREFERRED_TYPE(bool) unsigned hasPrototype : 1; /// isVariadic - If this function has a prototype, and if that /// proto ends with ',...)', this is true. When true, EllipsisLoc /// contains the location of the ellipsis. + LLVM_PREFERRED_TYPE(bool) unsigned isVariadic : 1; /// Can this declaration be a constructor-style initializer? + LLVM_PREFERRED_TYPE(bool) unsigned isAmbiguous : 1; /// Whether the ref-qualifier (if any) is an lvalue reference. 
/// Otherwise, it's an rvalue reference. + LLVM_PREFERRED_TYPE(bool) unsigned RefQualifierIsLValueRef : 1; /// ExceptionSpecType - An ExceptionSpecificationType value. + LLVM_PREFERRED_TYPE(ExceptionSpecificationType) unsigned ExceptionSpecType : 4; /// DeleteParams - If this is true, we need to delete[] Params. + LLVM_PREFERRED_TYPE(bool) unsigned DeleteParams : 1; /// HasTrailingReturnType - If this is true, a trailing return type was /// specified. + LLVM_PREFERRED_TYPE(bool) unsigned HasTrailingReturnType : 1; /// The location of the left parenthesis in the source. @@ -1567,6 +1599,7 @@ struct DeclaratorChunk { struct BlockPointerTypeInfo { /// For now, sema will catch these as invalid. /// The type qualifiers: const/volatile/restrict/__unaligned/_Atomic. + LLVM_PREFERRED_TYPE(DeclSpec::TQ) unsigned TypeQuals : 5; void destroy() { @@ -1575,6 +1608,7 @@ struct DeclaratorChunk { struct MemberPointerTypeInfo { /// The type qualifiers: const/volatile/restrict/__unaligned/_Atomic. + LLVM_PREFERRED_TYPE(DeclSpec::TQ) unsigned TypeQuals : 5; /// Location of the '*' token. SourceLocation StarLoc; @@ -1767,6 +1801,7 @@ private: /// The bindings. Binding *Bindings; unsigned NumBindings : 31; + LLVM_PREFERRED_TYPE(bool) unsigned DeleteBindings : 1; friend class Declarator; @@ -1883,33 +1918,42 @@ private: SmallVector DeclTypeInfo; /// InvalidType - Set by Sema::GetTypeForDeclarator(). + LLVM_PREFERRED_TYPE(bool) unsigned InvalidType : 1; /// GroupingParens - Set by Parser::ParseParenDeclarator(). + LLVM_PREFERRED_TYPE(bool) unsigned GroupingParens : 1; /// FunctionDefinition - Is this Declarator for a function or member /// definition and, if so, what kind? /// /// Actually a FunctionDefinitionKind. + LLVM_PREFERRED_TYPE(FunctionDefinitionKind) unsigned FunctionDefinition : 2; /// Is this Declarator a redeclaration? + LLVM_PREFERRED_TYPE(bool) unsigned Redeclaration : 1; /// true if the declaration is preceded by \c __extension__. + LLVM_PREFERRED_TYPE(bool) unsigned Extension : 1; /// Indicates whether this is an Objective-C instance variable. + LLVM_PREFERRED_TYPE(bool) unsigned ObjCIvar : 1; /// Indicates whether this is an Objective-C 'weak' property. + LLVM_PREFERRED_TYPE(bool) unsigned ObjCWeakProperty : 1; /// Indicates whether the InlineParams / InlineBindings storage has been used. + LLVM_PREFERRED_TYPE(bool) unsigned InlineStorageUsed : 1; /// Indicates whether this declarator has an initializer. + LLVM_PREFERRED_TYPE(bool) unsigned HasInitializer : 1; /// Attributes attached to the declarator. -- cgit v1.1 From 76e3759d8d2dc5af755737a764b237ff04aaf7f4 Mon Sep 17 00:00:00 2001 From: Ikhlas Ajbar Date: Sat, 10 Feb 2024 14:42:50 -0600 Subject: [Hexagon] Order objects on the stack by their alignments (#81280) This patch sorts stack objects by their alignment value from the largest to the smallest. If two objects have the same alignment, then they are sorted by their size from the largest to the smallest. This minimizes padding and reduces run time stack size. 
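The policy is a stable sort keyed on (alignment, size), both descending: placing the most strictly aligned objects first means later objects usually land on suitable offsets without extra padding. A standalone sketch of the ordering (SlotInfo is a hypothetical record; the real pass additionally skips invalid and variable-sized objects):

    #include <algorithm>
    #include <tuple>
    #include <vector>

    struct SlotInfo {
      unsigned Index;     // frame index of the stack object
      unsigned Size;      // object size in bytes
      unsigned Alignment; // required alignment in bytes
    };

    // Largest alignment first; ties broken by largest size first.
    static void orderSlots(std::vector<SlotInfo> &Slots) {
      std::stable_sort(Slots.begin(), Slots.end(),
                       [](const SlotInfo &A, const SlotInfo &B) {
                         return std::make_tuple(A.Alignment, A.Size) >
                                std::make_tuple(B.Alignment, B.Size);
                       });
    }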
---
 llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp   | 64 ++++++++++++++++++++++
 llvm/lib/Target/Hexagon/HexagonFrameLowering.h     |  4 ++
 llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll     | 16 +++---
 llvm/test/CodeGen/Hexagon/order-stack-object.ll    | 42 ++++++++++++++
 .../test/CodeGen/Hexagon/store-imm-stack-object.ll | 12 ++--
 .../hexagon_generated_funcs.ll.generated.expected  | 34 ++++++------
 ...hexagon_generated_funcs.ll.nogenerated.expected | 34 ++++++------
 7 files changed, 158 insertions(+), 48 deletions(-)
 create mode 100644 llvm/test/CodeGen/Hexagon/order-stack-object.ll

diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 812e5f7..2326511 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -2688,3 +2688,67 @@ bool HexagonFrameLowering::mayOverflowFrameOffset(MachineFunction &MF) const {
 
   return false;
 }
+
+namespace {
+// Struct used by orderFrameObjects to help sort the stack objects.
+struct HexagonFrameSortingObject {
+  bool IsValid = false;
+  unsigned Index = 0; // Index of Object into MFI list.
+  unsigned Size = 0;
+  Align ObjectAlignment = Align(1); // Alignment of Object in bytes.
+};
+
+struct HexagonFrameSortingComparator {
+  inline bool operator()(const HexagonFrameSortingObject &A,
+                         const HexagonFrameSortingObject &B) const {
+    return std::make_tuple(!A.IsValid, A.ObjectAlignment, A.Size) <
+           std::make_tuple(!B.IsValid, B.ObjectAlignment, B.Size);
+  }
+};
+} // namespace
+
+// Sort objects on the stack by alignment value and then by size to minimize
+// padding.
+void HexagonFrameLowering::orderFrameObjects(
+    const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
+
+  if (ObjectsToAllocate.empty())
+    return;
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  int NObjects = ObjectsToAllocate.size();
+
+  // Create an array of all MFI objects.
+  SmallVector<HexagonFrameSortingObject> SortingObjects(
+      MFI.getObjectIndexEnd());
+
+  for (int i = 0, j = 0, e = MFI.getObjectIndexEnd(); i < e && j != NObjects;
+       ++i) {
+    if (i != ObjectsToAllocate[j])
+      continue;
+    j++;
+
+    // A variable size object has size equal to 0. Since Hexagon sets
+    // getUseLocalStackAllocationBlock() to true, a local block is allocated
+    // earlier. This case is not handled here for now.
+    int Size = MFI.getObjectSize(i);
+    if (Size == 0)
+      return;
+
+    SortingObjects[i].IsValid = true;
+    SortingObjects[i].Index = i;
+    SortingObjects[i].Size = Size;
+    SortingObjects[i].ObjectAlignment = MFI.getObjectAlign(i);
+  }
+
+  // Sort objects by alignment and then by size.
+  llvm::stable_sort(SortingObjects, HexagonFrameSortingComparator());
+
+  // Modify the original list to represent the final order.
+  int i = NObjects;
+  for (auto &Obj : SortingObjects) {
+    if (i == 0)
+      break;
+    ObjectsToAllocate[--i] = Obj.Index;
+  }
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
index b2222f0..98e69dc 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -35,6 +35,10 @@ public:
   explicit HexagonFrameLowering()
       : TargetFrameLowering(StackGrowsDown, Align(8), 0, Align(1), true) {}
 
+  void
+  orderFrameObjects(const MachineFunction &MF,
+                    SmallVectorImpl<int> &ObjectsToAllocate) const override;
+
   // All of the prolog/epilog functionality, including saving and restoring
   // callee-saved registers is handled in emitPrologue. This is to have the
   // logic for shrink-wrapping in one place.
diff --git a/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll b/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll index 6000b9b..9ca1b17 100644 --- a/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll +++ b/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll @@ -42,7 +42,7 @@ define dso_local void @f2() #0 { ; CHECK-NEXT: } // 8-byte Folded Spill ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vsplat(r16) -; CHECK-NEXT: vmem(r29+#6) = v0.new +; CHECK-NEXT: vmem(r29+#2) = v0.new ; CHECK-NEXT: } // 128-byte Folded Spill ; CHECK-NEXT: { ; CHECK-NEXT: q0 = vand(v0,r0) @@ -56,7 +56,7 @@ define dso_local void @f2() #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vand(q0,r17) ; CHECK-NEXT: r19 = ##g0+128 -; CHECK-NEXT: vmem(r29+#7) = v0.new +; CHECK-NEXT: vmem(r29+#1) = v0.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r20 = ##g0 @@ -78,15 +78,15 @@ define dso_local void @f2() #0 { ; CHECK-NEXT: vmem(r20+#0) = v30 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vmem(r29+#6) +; CHECK-NEXT: v0 = vmem(r29+#2) ; CHECK-NEXT: } // 128-byte Folded Reload ; CHECK-NEXT: { ; CHECK-NEXT: v1:0.h = vadd(v0.ub,v0.ub) ; CHECK-NEXT: r0 = ##g2 -; CHECK-NEXT: vmem(r29+#1) = v0.new +; CHECK-NEXT: vmem(r29+#6) = v0.new ; CHECK-NEXT: } // 256-byte Folded Spill ; CHECK-NEXT: { -; CHECK-NEXT: vmem(r29+#2) = v1 +; CHECK-NEXT: vmem(r29+#7) = v1 ; CHECK-NEXT: } // 256-byte Folded Spill ; CHECK-NEXT: { ; CHECK-NEXT: v1:0.uw = vrmpy(v1:0.ub,r17.ub,#0) @@ -98,10 +98,10 @@ define dso_local void @f2() #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r0 = ##2147483647 -; CHECK-NEXT: v0 = vmem(r29+#1) +; CHECK-NEXT: v0 = vmem(r29+#6) ; CHECK-NEXT: } // 256-byte Folded Reload ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vmem(r29+#2) +; CHECK-NEXT: v1 = vmem(r29+#7) ; CHECK-NEXT: } // 256-byte Folded Reload ; CHECK-NEXT: { ; CHECK-NEXT: v1:0.uw = vrmpy(v1:0.ub,r0.ub,#1) @@ -142,7 +142,7 @@ define dso_local void @f2() #0 { ; CHECK-NEXT: vmem(r20+#0) = v0 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vmem(r29+#6) +; CHECK-NEXT: v0 = vmem(r29+#2) ; CHECK-NEXT: } // 128-byte Folded Reload ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vmem(r29+#3) diff --git a/llvm/test/CodeGen/Hexagon/order-stack-object.ll b/llvm/test/CodeGen/Hexagon/order-stack-object.ll new file mode 100644 index 0000000..bdc16e9 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/order-stack-object.ll @@ -0,0 +1,42 @@ +; RUN: llc -march=hexagon -mattr=+hvxv68,+hvx-length128b < %s | FileCheck %s + +; Check that ordering objects on the stack from the largest to the smallest has +; decreased the space allocated on the stack by 512 Bytes. 
+ +; CHECK: allocframe(r29,#2432):raw + +define void @test(ptr nocapture readonly %arg, ptr nocapture writeonly %arg1, i32 %arg2) local_unnamed_addr { +bb: + %shl = shl i32 %arg2, 5 + br label %bb3 + +bb3: + %phi = phi i32 [ 0, %bb ], [ %add13, %bb3 ] + %add = add i32 %phi, %shl + %sext = sext i32 %add to i64 + %getelementptr = getelementptr float, ptr %arg, i64 %sext + %load = load <32 x float>, ptr %getelementptr, align 4 + %fmul = fmul <32 x float> %load, + %fmul4 = fmul <32 x float> %load, + %fmul5 = fmul <32 x float> %load, + %fmul6 = fmul <32 x float> %load, %fmul5 + %fmul7 = fmul <32 x float> %load, %fmul6 + %fadd = fadd <32 x float> %fmul4, %fmul7 + %fmul8 = fmul <32 x float> %fadd, + %call = tail call <32 x float> @llvm.exp.v32f32(<32 x float> %fmul8) + %fsub = fsub <32 x float> , %call + %fadd9 = fadd <32 x float> %call, + %fdiv = fdiv <32 x float> %fsub, %fadd9 + %fadd10 = fadd <32 x float> %fdiv, + %fmul11 = fmul <32 x float> %fmul, %fadd10 + %getelementptr12 = getelementptr float, ptr %arg1, i64 %sext + store <32 x float> %fmul11, ptr %getelementptr12, align 128 + %add13 = add nuw nsw i32 %phi, 128 + %icmp = icmp ult i32 %phi, 8064 + br i1 %icmp, label %bb3, label %bb14 + +bb14: + ret void +} + +declare <32 x float> @llvm.exp.v32f32(<32 x float>) diff --git a/llvm/test/CodeGen/Hexagon/store-imm-stack-object.ll b/llvm/test/CodeGen/Hexagon/store-imm-stack-object.ll index 8c5b11d..bb9f7cf 100644 --- a/llvm/test/CodeGen/Hexagon/store-imm-stack-object.ll +++ b/llvm/test/CodeGen/Hexagon/store-imm-stack-object.ll @@ -3,10 +3,10 @@ target triple = "hexagon" ; CHECK-LABEL: test1: -; CHECK-DAG: memw(r29+#4) = ##875770417 +; CHECK-DAG: memw(r29+#12) = ##875770417 ; CHECK-DAG: memw(r29+#8) = #51 -; CHECK-DAG: memh(r29+#12) = #50 -; CHECK-DAG: memb(r29+#15) = #49 +; CHECK-DAG: memh(r29+#6) = #50 +; CHECK-DAG: memb(r29+#5) = #49 define void @test1() { b0: %v1 = alloca [1 x i8], align 1 @@ -30,9 +30,9 @@ b0: } ; CHECK-LABEL: test2: -; CHECK-DAG: memw(r29+#208) = #51 -; CHECK-DAG: memh(r29+#212) = r{{[0-9]+}} -; CHECK-DAG: memb(r29+#215) = r{{[0-9]+}} +; CHECK-DAG: memw(r29+#8) = #51 +; CHECK-DAG: memh(r29+#6) = r{{[0-9]+}} +; CHECK-DAG: memb(r29+#5) = r{{[0-9]+}} define void @test2() { b0: %v1 = alloca [1 x i8], align 1 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected index 2ab769f..cd135ce 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected @@ -75,31 +75,31 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: .cfi_offset r31, -4 ; CHECK-NEXT: .cfi_offset r30, -8 ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#20) = #0 +; CHECK-NEXT: memw(r29+#4) = #0 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#16) = #0 -; CHECK-NEXT: memw(r29+#16) = #1 +; CHECK-NEXT: memw(r29+#8) = #0 +; CHECK-NEXT: memw(r29+#8) = #1 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r1 = memw(r29+#16) +; CHECK-NEXT: r1 = memw(r29+#8) ; CHECK-NEXT: memw(r29+#12) = #2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#8) = #3 -; CHECK-NEXT: memw(r29+#4) = #4 +; CHECK-NEXT: memw(r29+#16) = #3 +; CHECK-NEXT: memw(r29+#20) = #4 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: p0 = cmp.eq(r1,#0) -; CHECK-NEXT: if (p0.new) 
memw(r29+#8) = #3 +; CHECK-NEXT: if (p0.new) memw(r29+#16) = #3 ; CHECK-NEXT: if (p0.new) memw(r29+#12) = #2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: if (p0) memw(r29+#4) = #4 -; CHECK-NEXT: if (p0) memw(r29+#16) = #1 +; CHECK-NEXT: if (p0) memw(r29+#20) = #4 +; CHECK-NEXT: if (p0) memw(r29+#8) = #1 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: if (!p0) memw(r29+#8) = #1 +; CHECK-NEXT: if (!p0) memw(r29+#16) = #1 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r31:30 = dealloc_return(r30):raw @@ -116,27 +116,27 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: .cfi_offset r31, -4 ; CHECK-NEXT: .cfi_offset r30, -8 ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#20) = #0 +; CHECK-NEXT: memw(r29+#4) = #0 ; CHECK-NEXT: memw(r0+#0) = #1 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#16) = #1 +; CHECK-NEXT: memw(r29+#8) = #1 ; CHECK-NEXT: memw(r29+#12) = #2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#8) = #3 -; CHECK-NEXT: memw(r29+#4) = #4 +; CHECK-NEXT: memw(r29+#16) = #3 +; CHECK-NEXT: memw(r29+#20) = #4 ; CHECK-NEXT: } ; CHECK-NEXT: //# InlineAsm Start ; CHECK-NEXT: //# InlineAsm End ; CHECK-NEXT: { ; CHECK-NEXT: r0 = #0 -; CHECK-NEXT: memw(r29+#16) = #1 +; CHECK-NEXT: memw(r29+#8) = #1 ; CHECK-NEXT: memw(r29+#12) = #2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#8) = #3 -; CHECK-NEXT: memw(r29+#4) = #4 +; CHECK-NEXT: memw(r29+#16) = #3 +; CHECK-NEXT: memw(r29+#20) = #4 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r31:30 = dealloc_return(r30):raw diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected index 52dd5f1..833bf68 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected @@ -16,31 +16,31 @@ define dso_local i32 @check_boundaries() #0 { ; CHECK-NEXT: .cfi_offset r31, -4 ; CHECK-NEXT: .cfi_offset r30, -8 ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#20) = #0 +; CHECK-NEXT: memw(r29+#4) = #0 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#16) = #0 -; CHECK-NEXT: memw(r29+#16) = #1 +; CHECK-NEXT: memw(r29+#8) = #0 +; CHECK-NEXT: memw(r29+#8) = #1 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r1 = memw(r29+#16) +; CHECK-NEXT: r1 = memw(r29+#8) ; CHECK-NEXT: memw(r29+#12) = #2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#8) = #3 -; CHECK-NEXT: memw(r29+#4) = #4 +; CHECK-NEXT: memw(r29+#16) = #3 +; CHECK-NEXT: memw(r29+#20) = #4 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: p0 = cmp.eq(r1,#0) -; CHECK-NEXT: if (p0.new) memw(r29+#8) = #3 +; CHECK-NEXT: if (p0.new) memw(r29+#16) = #3 ; CHECK-NEXT: if (p0.new) memw(r29+#12) = #2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: if (p0) memw(r29+#4) = #4 -; CHECK-NEXT: if (p0) memw(r29+#16) = #1 +; CHECK-NEXT: if (p0) memw(r29+#20) = #4 +; CHECK-NEXT: if (p0) memw(r29+#8) = #1 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: if (!p0) memw(r29+#8) = #1 +; CHECK-NEXT: if (!p0) memw(r29+#16) = #1 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r31:30 = dealloc_return(r30):raw @@ -93,27 +93,27 @@ define dso_local i32 @main() #0 { ; CHECK-NEXT: .cfi_offset r31, -4 ; CHECK-NEXT: .cfi_offset r30, -8 ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#20) = #0 +; CHECK-NEXT: memw(r29+#4) = #0 ; CHECK-NEXT: memw(r0+#0) = 
#1 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#16) = #1 +; CHECK-NEXT: memw(r29+#8) = #1 ; CHECK-NEXT: memw(r29+#12) = #2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#8) = #3 -; CHECK-NEXT: memw(r29+#4) = #4 +; CHECK-NEXT: memw(r29+#16) = #3 +; CHECK-NEXT: memw(r29+#20) = #4 ; CHECK-NEXT: } ; CHECK-NEXT: //# InlineAsm Start ; CHECK-NEXT: //# InlineAsm End ; CHECK-NEXT: { ; CHECK-NEXT: r0 = #0 -; CHECK-NEXT: memw(r29+#16) = #1 +; CHECK-NEXT: memw(r29+#8) = #1 ; CHECK-NEXT: memw(r29+#12) = #2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#8) = #3 -; CHECK-NEXT: memw(r29+#4) = #4 +; CHECK-NEXT: memw(r29+#16) = #3 +; CHECK-NEXT: memw(r29+#20) = #4 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r31:30 = dealloc_return(r30):raw -- cgit v1.1 From 4e16a75902d5718f4932fae9b2a07c410cd0ba34 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sat, 10 Feb 2024 23:58:26 +0300 Subject: [clang][NFC] Annotate `Sema/ScopeInfo.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Sema/ScopeInfo.h | 37 ++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/clang/include/clang/Sema/ScopeInfo.h b/clang/include/clang/Sema/ScopeInfo.h index 6eaa7438..076dcaa 100644 --- a/clang/include/clang/Sema/ScopeInfo.h +++ b/clang/include/clang/Sema/ScopeInfo.h @@ -97,6 +97,8 @@ public: : PD(PD), Loc(Loc), Stmts(Stmts) {} }; +enum class FirstCoroutineStmtKind { co_return, co_await, co_yield }; + /// Retains information about a function, method, or block that is /// currently being parsed. class FunctionScopeInfo { @@ -170,6 +172,7 @@ public: /// An enumeration representing the kind of the first coroutine statement /// in the function. One of co_return, co_await, or co_yield. + LLVM_PREFERRED_TYPE(FirstCoroutineStmtKind) unsigned char FirstCoroutineStmtKind : 2; /// Whether we found an immediate-escalating expression. @@ -502,22 +505,30 @@ public: assert(FirstCoroutineStmtLoc.isInvalid() && "first coroutine statement location already set"); FirstCoroutineStmtLoc = Loc; - FirstCoroutineStmtKind = llvm::StringSwitch(Keyword) - .Case("co_return", 0) - .Case("co_await", 1) - .Case("co_yield", 2); + FirstCoroutineStmtKind = + llvm::StringSwitch(Keyword) + .Case("co_return", + llvm::to_underlying(FirstCoroutineStmtKind::co_return)) + .Case("co_await", + llvm::to_underlying(FirstCoroutineStmtKind::co_await)) + .Case("co_yield", + llvm::to_underlying(FirstCoroutineStmtKind::co_yield)); } StringRef getFirstCoroutineStmtKeyword() const { assert(FirstCoroutineStmtLoc.isValid() && "no coroutine statement available"); - switch (FirstCoroutineStmtKind) { - case 0: return "co_return"; - case 1: return "co_await"; - case 2: return "co_yield"; - default: - llvm_unreachable("FirstCoroutineStmtKind has an invalid value"); + auto Value = + static_cast(FirstCoroutineStmtKind); + switch (Value) { + case FirstCoroutineStmtKind::co_return: + return "co_return"; + case FirstCoroutineStmtKind::co_await: + return "co_await"; + case FirstCoroutineStmtKind::co_yield: + return "co_yield"; }; + llvm_unreachable("FirstCoroutineStmtKind has an invalid value"); } void setNeedsCoroutineSuspends(bool value = true) { @@ -582,25 +593,31 @@ class Capture { QualType CaptureType; /// The CaptureKind of this capture. + LLVM_PREFERRED_TYPE(CaptureKind) unsigned Kind : 2; /// Whether this is a nested capture (a capture of an enclosing capturing /// scope's capture). 
+ LLVM_PREFERRED_TYPE(bool) unsigned Nested : 1; /// Whether this is a capture of '*this'. + LLVM_PREFERRED_TYPE(bool) unsigned CapturesThis : 1; /// Whether an explicit capture has been odr-used in the body of the /// lambda. + LLVM_PREFERRED_TYPE(bool) unsigned ODRUsed : 1; /// Whether an explicit capture has been non-odr-used in the body of /// the lambda. + LLVM_PREFERRED_TYPE(bool) unsigned NonODRUsed : 1; /// Whether the capture is invalid (a capture was required but the entity is /// non-capturable). + LLVM_PREFERRED_TYPE(bool) unsigned Invalid : 1; public: -- cgit v1.1 From d2812d2d1a9b4edb64e95a9a86a2599a24bcb5ec Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 00:11:13 +0300 Subject: [clang][NFC] Annotate `Sema/Overload.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Sema/Overload.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index 6ccabad..9b342c0 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -278,40 +278,50 @@ class Sema; /// Whether this is the deprecated conversion of a /// string literal to a pointer to non-const character data /// (C++ 4.2p2). + LLVM_PREFERRED_TYPE(bool) unsigned DeprecatedStringLiteralToCharPtr : 1; /// Whether the qualification conversion involves a change in the /// Objective-C lifetime (for automatic reference counting). + LLVM_PREFERRED_TYPE(bool) unsigned QualificationIncludesObjCLifetime : 1; /// IncompatibleObjC - Whether this is an Objective-C conversion /// that we should warn about (if we actually use it). + LLVM_PREFERRED_TYPE(bool) unsigned IncompatibleObjC : 1; /// ReferenceBinding - True when this is a reference binding /// (C++ [over.ics.ref]). + LLVM_PREFERRED_TYPE(bool) unsigned ReferenceBinding : 1; /// DirectBinding - True when this is a reference binding that is a /// direct binding (C++ [dcl.init.ref]). + LLVM_PREFERRED_TYPE(bool) unsigned DirectBinding : 1; /// Whether this is an lvalue reference binding (otherwise, it's /// an rvalue reference binding). + LLVM_PREFERRED_TYPE(bool) unsigned IsLvalueReference : 1; /// Whether we're binding to a function lvalue. + LLVM_PREFERRED_TYPE(bool) unsigned BindsToFunctionLvalue : 1; /// Whether we're binding to an rvalue. + LLVM_PREFERRED_TYPE(bool) unsigned BindsToRvalue : 1; /// Whether this binds an implicit object argument to a /// non-static member function without a ref-qualifier. + LLVM_PREFERRED_TYPE(bool) unsigned BindsImplicitObjectArgumentWithoutRefQualifier : 1; /// Whether this binds a reference to an object with a different /// Objective-C lifetime qualifier. + LLVM_PREFERRED_TYPE(bool) unsigned ObjCLifetimeConversionBinding : 1; /// FromType - The type that this conversion is converting @@ -541,9 +551,11 @@ class Sema; }; /// ConversionKind - The kind of implicit conversion sequence. + LLVM_PREFERRED_TYPE(Kind) unsigned ConversionKind : 31; // Whether the initializer list was of an incomplete array. + LLVM_PREFERRED_TYPE(bool) unsigned InitializerListOfIncompleteArray : 1; /// When initializing an array or std::initializer_list from an @@ -878,6 +890,7 @@ class Sema; CallExpr::ADLCallKind IsADLCandidate : 1; /// Whether this is a rewritten candidate, and if so, of what kind? + LLVM_PREFERRED_TYPE(OverloadCandidateRewriteKind) unsigned RewriteKind : 2; /// FailureKind - The reason why this candidate is not viable. 
-- cgit v1.1 From 425fd3eb10f29e73d722b4c2bc9cb50798de18e8 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 00:15:25 +0300 Subject: [clang][NFC] Rename FirstCoroutineStmtKind enumerators So that they do not use coroutine keywords. Fixed buildbot failure https://lab.llvm.org/buildbot/#/builders/86/builds/74100 --- clang/include/clang/Sema/ScopeInfo.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/clang/include/clang/Sema/ScopeInfo.h b/clang/include/clang/Sema/ScopeInfo.h index 076dcaa..ca3d0a0 100644 --- a/clang/include/clang/Sema/ScopeInfo.h +++ b/clang/include/clang/Sema/ScopeInfo.h @@ -97,7 +97,7 @@ public: : PD(PD), Loc(Loc), Stmts(Stmts) {} }; -enum class FirstCoroutineStmtKind { co_return, co_await, co_yield }; +enum class FirstCoroutineStmtKind { CoReturn, CoAwait, CoYield }; /// Retains information about a function, method, or block that is /// currently being parsed. @@ -508,11 +508,11 @@ public: FirstCoroutineStmtKind = llvm::StringSwitch(Keyword) .Case("co_return", - llvm::to_underlying(FirstCoroutineStmtKind::co_return)) + llvm::to_underlying(FirstCoroutineStmtKind::CoReturn)) .Case("co_await", - llvm::to_underlying(FirstCoroutineStmtKind::co_await)) + llvm::to_underlying(FirstCoroutineStmtKind::CoAwait)) .Case("co_yield", - llvm::to_underlying(FirstCoroutineStmtKind::co_yield)); + llvm::to_underlying(FirstCoroutineStmtKind::CoYield)); } StringRef getFirstCoroutineStmtKeyword() const { @@ -521,11 +521,11 @@ public: auto Value = static_cast(FirstCoroutineStmtKind); switch (Value) { - case FirstCoroutineStmtKind::co_return: + case FirstCoroutineStmtKind::CoReturn: return "co_return"; - case FirstCoroutineStmtKind::co_await: + case FirstCoroutineStmtKind::CoAwait: return "co_await"; - case FirstCoroutineStmtKind::co_yield: + case FirstCoroutineStmtKind::CoYield: return "co_yield"; }; llvm_unreachable("FirstCoroutineStmtKind has an invalid value"); -- cgit v1.1 From 6a7cf806a66c67df01818fda01116a2dd2d90b0d Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 00:21:37 +0300 Subject: [clang][NFC] Annotate `Sema/ParsedAttr.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Sema/ParsedAttr.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h index 8c0edca..8c3ba39 100644 --- a/clang/include/clang/Sema/ParsedAttr.h +++ b/clang/include/clang/Sema/ParsedAttr.h @@ -82,7 +82,9 @@ struct AvailabilityData { struct TypeTagForDatatypeData { ParsedType MatchingCType; + LLVM_PREFERRED_TYPE(bool) unsigned LayoutCompatible : 1; + LLVM_PREFERRED_TYPE(bool) unsigned MustBeNull : 1; }; struct PropertyData { @@ -149,33 +151,41 @@ private: unsigned NumArgs : 16; /// True if already diagnosed as invalid. + LLVM_PREFERRED_TYPE(bool) mutable unsigned Invalid : 1; /// True if this attribute was used as a type attribute. + LLVM_PREFERRED_TYPE(bool) mutable unsigned UsedAsTypeAttr : 1; /// True if this has the extra information associated with an /// availability attribute. + LLVM_PREFERRED_TYPE(bool) unsigned IsAvailability : 1; /// True if this has extra information associated with a /// type_tag_for_datatype attribute. + LLVM_PREFERRED_TYPE(bool) unsigned IsTypeTagForDatatype : 1; /// True if this has extra information associated with a /// Microsoft __delcspec(property) attribute. 
+ LLVM_PREFERRED_TYPE(bool) unsigned IsProperty : 1; /// True if this has a ParsedType + LLVM_PREFERRED_TYPE(bool) unsigned HasParsedType : 1; /// True if the processing cache is valid. + LLVM_PREFERRED_TYPE(bool) mutable unsigned HasProcessingCache : 1; /// A cached value. mutable unsigned ProcessingCache : 8; /// True if the attribute is specified using '#pragma clang attribute'. + LLVM_PREFERRED_TYPE(bool) mutable unsigned IsPragmaClangAttribute : 1; /// The location of the 'unavailable' keyword in an -- cgit v1.1 From 0df8aed6c30f08ded526038a6bbb4daf113a31c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 10 Feb 2024 23:57:12 +0200 Subject: [LLD] [COFF] Pick timestamps from the SOURCE_DATE_EPOCH variable (#81326) The SOURCE_DATE_EPOCH environment variable can be set in order to get reproducible build. When linking PE/COFF modules with LLD, the timestamp field is set to the current time, unless either the /timestamp: or /Brepro option is set. If neither of them is set, check the SOURCE_DATE_EPOCH variable, before resorting to using the actual current date and time. See https://reproducible-builds.org/docs/source-date-epoch/ for reference on the use of this variable. --- lld/COFF/Driver.cpp | 10 +++++++++- lld/test/COFF/timestamp.test | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index e0afb6b..22ee2f1 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -1825,7 +1825,15 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { } } else { config->repro = false; - config->timestamp = time(nullptr); + if (std::optional epoch = + Process::GetEnv("SOURCE_DATE_EPOCH")) { + StringRef value(*epoch); + if (value.getAsInteger(0, config->timestamp)) + fatal(Twine("invalid SOURCE_DATE_EPOCH timestamp: ") + value + + ". 
Expected 32-bit integer"); + } else { + config->timestamp = time(nullptr); + } } // Handle /alternatename diff --git a/lld/test/COFF/timestamp.test b/lld/test/COFF/timestamp.test index fbdc5788..c0658d6 100644 --- a/lld/test/COFF/timestamp.test +++ b/lld/test/COFF/timestamp.test @@ -3,9 +3,19 @@ RUN: yaml2obj %p/Inputs/generic.yaml -o %t.obj RUN: lld-link %t.obj /debug /Brepro /entry:main /nodefaultlib /out:%t.1.exe RUN: lld-link %t.obj /debug /Brepro /entry:main /nodefaultlib /out:%t.2.exe RUN: lld-link %t.obj /debug /timestamp:0 /entry:main /nodefaultlib /out:%t.3.exe +RUN: env SOURCE_DATE_EPOCH=0 lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.4.exe +RUN: lld-link %t.obj /debug /timestamp:4294967295 /entry:main /nodefaultlib /out:%t.5.exe +RUN: env SOURCE_DATE_EPOCH=4294967295 lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.6.exe +RUN: env SOURCE_DATE_EPOCH=12345 lld-link %t.obj /debug /timestamp:0 /entry:main /nodefaultlib /out:%t.7.exe +RUN: env LLD_IN_TEST=1 not lld-link %t.obj /debug /timestamp:4294967296 /entry:main /nodefaultlib /out:%t.8.exe 2>&1 | FileCheck %s --check-prefix=ERROR +RUN: env SOURCE_DATE_EPOCH=4294967296 env LLD_IN_TEST=1 not lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.9.exe 2>&1 | FileCheck %s --check-prefix=ERROR2 RUN: llvm-readobj --file-headers --coff-debug-directory %t.1.exe | FileCheck %s --check-prefix=HASH RUN: llvm-readobj --file-headers --coff-debug-directory %t.2.exe | FileCheck %s --check-prefix=HASH RUN: llvm-readobj --file-headers --coff-debug-directory %t.3.exe | FileCheck %s --check-prefix=ZERO +RUN: llvm-readobj --file-headers --coff-debug-directory %t.4.exe | FileCheck %s --check-prefix=ZERO +RUN: llvm-readobj --file-headers --coff-debug-directory %t.5.exe | FileCheck %s --check-prefix=MAX +RUN: llvm-readobj --file-headers --coff-debug-directory %t.6.exe | FileCheck %s --check-prefix=MAX +RUN: llvm-readobj --file-headers --coff-debug-directory %t.7.exe | FileCheck %s --check-prefix=ZERO HASH: ImageFileHeader { HASH: TimeDateStamp: [[STAMP:.*]] @@ -16,3 +26,11 @@ ZERO: ImageFileHeader { ZERO: TimeDateStamp: 1970-01-01 00:00:00 (0x0) ZERO: DebugDirectory [ ZERO: TimeDateStamp: 1970-01-01 00:00:00 (0x0) + +MAX: ImageFileHeader { +MAX: TimeDateStamp: 2106-02-07 06:28:15 (0xFFFFFFFF) +MAX: DebugDirectory [ +MAX: TimeDateStamp: 2106-02-07 06:28:15 (0xFFFFFFFF) + +ERROR: error: invalid timestamp: 4294967296. Expected 32-bit integer +ERROR2: error: invalid SOURCE_DATE_EPOCH timestamp: 4294967296. Expected 32-bit integer -- cgit v1.1 From b17348c3b541d7fc7ec441c98db75c18d8959910 Mon Sep 17 00:00:00 2001 From: Kai Sasaki Date: Sun, 11 Feb 2024 07:35:19 +0900 Subject: [mlir][complex] Prevent underflow in complex.abs (#79786) (#81092) --- .../ComplexToStandard/ComplexToStandard.cpp | 58 +++++++--- .../ComplexToStandard/convert-to-standard.mlir | 125 +++++++++++++++++---- .../ComplexToStandard/full-conversion.mlir | 27 ++++- .../Dialect/Complex/CPU/correctness.mlir | 54 +++++++++ 4 files changed, 224 insertions(+), 40 deletions(-) diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp index 4c9dad9..cc31511 100644 --- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp +++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp @@ -26,29 +26,59 @@ namespace mlir { using namespace mlir; namespace { +// The algorithm is listed in https://dl.acm.org/doi/pdf/10.1145/363717.363780. 
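+//
+// In outline: rather than computing sqrt(re*re + im*im) directly, which can
+// overflow or underflow even when the exact magnitude is representable, the
+// pattern below forms the result in a scaled way:
+//   |z| = |re| * sqrt(1 + (im/re)^2)   in the branch selected when re > im
+//   |z| = |im| * sqrt(1 + (re/im)^2)   otherwise
+// and uses selects to handle the cases where either component is zero.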
struct AbsOpConversion : public OpConversionPattern<complex::AbsOp> {
   using OpConversionPattern<complex::AbsOp>::OpConversionPattern;
 
   LogicalResult
   matchAndRewrite(complex::AbsOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    auto loc = op.getLoc();
-    auto type = op.getType();
+    mlir::ImplicitLocOpBuilder b(op.getLoc(), rewriter);
 
     arith::FastMathFlagsAttr fmf = op.getFastMathFlagsAttr();
 
-    Value real =
-        rewriter.create<complex::ReOp>(loc, type, adaptor.getComplex());
-    Value imag =
-        rewriter.create<complex::ImOp>(loc, type, adaptor.getComplex());
-    Value realSqr =
-        rewriter.create<arith::MulFOp>(loc, real, real, fmf.getValue());
-    Value imagSqr =
-        rewriter.create<arith::MulFOp>(loc, imag, imag, fmf.getValue());
-    Value sqNorm =
-        rewriter.create<arith::AddFOp>(loc, realSqr, imagSqr, fmf.getValue());
-
-    rewriter.replaceOpWithNewOp<math::SqrtOp>(op, sqNorm);
+    Type elementType = op.getType();
+    Value arg = adaptor.getComplex();
+
+    Value zero =
+        b.create<arith::ConstantOp>(elementType, b.getZeroAttr(elementType));
+    Value one = b.create<arith::ConstantOp>(elementType,
+                                            b.getFloatAttr(elementType, 1.0));
+
+    Value real = b.create<complex::ReOp>(elementType, arg);
+    Value imag = b.create<complex::ImOp>(elementType, arg);
+
+    Value realIsZero =
+        b.create<arith::CmpFOp>(arith::CmpFPredicate::OEQ, real, zero);
+    Value imagIsZero =
+        b.create<arith::CmpFOp>(arith::CmpFPredicate::OEQ, imag, zero);
+
+    // Real > Imag
+    Value imagDivReal = b.create<arith::DivFOp>(imag, real, fmf.getValue());
+    Value imagSq =
+        b.create<arith::MulFOp>(imagDivReal, imagDivReal, fmf.getValue());
+    Value imagSqPlusOne = b.create<arith::AddFOp>(imagSq, one, fmf.getValue());
+    Value imagSqrt = b.create<math::SqrtOp>(imagSqPlusOne, fmf.getValue());
+    Value realAbs = b.create<math::AbsFOp>(real, fmf.getValue());
+    Value absImag = b.create<arith::MulFOp>(imagSqrt, realAbs, fmf.getValue());
+
+    // Real <= Imag
+    Value realDivImag = b.create<arith::DivFOp>(real, imag, fmf.getValue());
+    Value realSq =
+        b.create<arith::MulFOp>(realDivImag, realDivImag, fmf.getValue());
+    Value realSqPlusOne = b.create<arith::AddFOp>(realSq, one, fmf.getValue());
+    Value realSqrt = b.create<math::SqrtOp>(realSqPlusOne, fmf.getValue());
+    Value imagAbs = b.create<math::AbsFOp>(imag, fmf.getValue());
+    Value absReal = b.create<arith::MulFOp>(realSqrt, imagAbs, fmf.getValue());
+
+    rewriter.replaceOpWithNewOp<arith::SelectOp>(
+        op, realIsZero, imagAbs,
+        b.create<arith::SelectOp>(
+            imagIsZero, realAbs,
+            b.create<arith::SelectOp>(
+                b.create<arith::CmpFOp>(arith::CmpFPredicate::OGT, real, imag),
+                absImag, absReal)));
+
     return success();
   }
 };
diff --git a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
index 8fa29ea..1fe843b 100644
--- a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
+++ b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
@@ -7,13 +7,30 @@ func.func @complex_abs(%arg: complex<f32>) -> f32 {
   %abs = complex.abs %arg: complex<f32>
   return %abs : f32
 }
+
+// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32
 // CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex<f32>
 // CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex<f32>
-// CHECK-DAG: %[[REAL_SQ:.*]] = arith.mulf %[[REAL]], %[[REAL]] : f32
-// CHECK-DAG: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG]], %[[IMAG]] : f32
-// CHECK: %[[SQ_NORM:.*]] = arith.addf %[[REAL_SQ]], %[[IMAG_SQ]] : f32
-// CHECK: %[[NORM:.*]] = math.sqrt %[[SQ_NORM]] : f32
-// CHECK: return %[[NORM]] : f32
+// CHECK: %[[IS_REAL_ZERO:.*]] = arith.cmpf oeq, %[[REAL]], %[[ZERO]] : f32
+// CHECK: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32
+// CHECK: %[[IMAG_DIV_REAL:.*]] = arith.divf %[[IMAG]], %[[REAL]] : f32
+// CHECK: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] : f32
+// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = arith.addf %[[IMAG_SQ]], 
%[[ONE]] : f32 +// CHECK: %[[IMAG_SQRT:.*]] = math.sqrt %[[IMAG_SQ_PLUS_ONE]] : f32 +// CHECK: %[[REAL_ABS:.*]] = math.absf %[[REAL]] : f32 +// CHECK: %[[ABS_IMAG:.*]] = arith.mulf %[[IMAG_SQRT]], %[[REAL_ABS]] : f32 +// CHECK: %[[REAL_DIV_IMAG:.*]] = arith.divf %[[REAL]], %[[IMAG]] : f32 +// CHECK: %[[REAL_SQ:.*]] = arith.mulf %[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] : f32 +// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = arith.addf %[[REAL_SQ]], %[[ONE]] : f32 +// CHECK: %[[REAL_SQRT:.*]] = math.sqrt %[[REAL_SQ_PLUS_ONE]] : f32 +// CHECK: %[[IMAG_ABS:.*]] = math.absf %[[IMAG]] : f32 +// CHECK: %[[ABS_REAL:.*]] = arith.mulf %[[REAL_SQRT]], %[[IMAG_ABS]] : f32 +// CHECK: %[[REAL_GT_IMAG:.*]] = arith.cmpf ogt, %[[REAL]], %[[IMAG]] : f32 +// CHECK: %[[ABS1:.*]] = arith.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : f32 +// CHECK: %[[ABS2:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[REAL_ABS]], %[[ABS1]] : f32 +// CHECK: %[[ABS3:.*]] = arith.select %[[IS_REAL_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : f32 +// CHECK: return %[[ABS3]] : f32 // ----- @@ -241,12 +258,28 @@ func.func @complex_log(%arg: complex) -> complex { %log = complex.log %arg: complex return %log : complex } +// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32 // CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex // CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex -// CHECK: %[[SQR_REAL:.*]] = arith.mulf %[[REAL]], %[[REAL]] : f32 -// CHECK: %[[SQR_IMAG:.*]] = arith.mulf %[[IMAG]], %[[IMAG]] : f32 -// CHECK: %[[SQ_NORM:.*]] = arith.addf %[[SQR_REAL]], %[[SQR_IMAG]] : f32 -// CHECK: %[[NORM:.*]] = math.sqrt %[[SQ_NORM]] : f32 +// CHECK: %[[IS_REAL_ZERO:.*]] = arith.cmpf oeq, %[[REAL]], %[[ZERO]] : f32 +// CHECK: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32 +// CHECK: %[[IMAG_DIV_REAL:.*]] = arith.divf %[[IMAG]], %[[REAL]] : f32 +// CHECK: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] : f32 +// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = arith.addf %[[IMAG_SQ]], %[[ONE]] : f32 +// CHECK: %[[IMAG_SQRT:.*]] = math.sqrt %[[IMAG_SQ_PLUS_ONE]] : f32 +// CHECK: %[[REAL_ABS:.*]] = math.absf %[[REAL]] : f32 +// CHECK: %[[ABS_IMAG:.*]] = arith.mulf %[[IMAG_SQRT]], %[[REAL_ABS]] : f32 +// CHECK: %[[REAL_DIV_IMAG:.*]] = arith.divf %[[REAL]], %[[IMAG]] : f32 +// CHECK: %[[REAL_SQ:.*]] = arith.mulf %[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] : f32 +// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = arith.addf %[[REAL_SQ]], %[[ONE]] : f32 +// CHECK: %[[REAL_SQRT:.*]] = math.sqrt %[[REAL_SQ_PLUS_ONE]] : f32 +// CHECK: %[[IMAG_ABS:.*]] = math.absf %[[IMAG]] : f32 +// CHECK: %[[ABS_REAL:.*]] = arith.mulf %[[REAL_SQRT]], %[[IMAG_ABS]] : f32 +// CHECK: %[[REAL_GT_IMAG:.*]] = arith.cmpf ogt, %[[REAL]], %[[IMAG]] : f32 +// CHECK: %[[ABS1:.*]] = arith.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : f32 +// CHECK: %[[ABS2:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[REAL_ABS]], %[[ABS1]] : f32 +// CHECK: %[[NORM:.*]] = arith.select %[[IS_REAL_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : f32 // CHECK: %[[RESULT_REAL:.*]] = math.log %[[NORM]] : f32 // CHECK: %[[REAL2:.*]] = complex.re %[[ARG]] : complex // CHECK: %[[IMAG2:.*]] = complex.im %[[ARG]] : complex @@ -469,12 +502,28 @@ func.func @complex_sign(%arg: complex) -> complex { // CHECK: %[[REAL_IS_ZERO:.*]] = arith.cmpf oeq, %[[REAL]], %[[ZERO]] : f32 // CHECK: %[[IMAG_IS_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32 // CHECK: %[[IS_ZERO:.*]] = arith.andi %[[REAL_IS_ZERO]], %[[IMAG_IS_ZERO]] : i1 +// CHECK: %[[ZERO:.*]] = 
arith.constant 0.000000e+00 : f32 +// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32 // CHECK: %[[REAL2:.*]] = complex.re %[[ARG]] : complex // CHECK: %[[IMAG2:.*]] = complex.im %[[ARG]] : complex -// CHECK: %[[SQR_REAL:.*]] = arith.mulf %[[REAL2]], %[[REAL2]] : f32 -// CHECK: %[[SQR_IMAG:.*]] = arith.mulf %[[IMAG2]], %[[IMAG2]] : f32 -// CHECK: %[[SQ_NORM:.*]] = arith.addf %[[SQR_REAL]], %[[SQR_IMAG]] : f32 -// CHECK: %[[NORM:.*]] = math.sqrt %[[SQ_NORM]] : f32 +// CHECK: %[[IS_REAL_ZERO:.*]] = arith.cmpf oeq, %[[REAL2]], %[[ZERO]] : f32 +// CHECK: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG2]], %[[ZERO]] : f32 +// CHECK: %[[IMAG_DIV_REAL:.*]] = arith.divf %[[IMAG2]], %[[REAL2]] : f32 +// CHECK: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] : f32 +// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = arith.addf %[[IMAG_SQ]], %[[ONE]] : f32 +// CHECK: %[[IMAG_SQRT:.*]] = math.sqrt %[[IMAG_SQ_PLUS_ONE]] : f32 +// CHECK: %[[REAL_ABS:.*]] = math.absf %[[REAL2]] : f32 +// CHECK: %[[ABS_IMAG:.*]] = arith.mulf %[[IMAG_SQRT]], %[[REAL_ABS]] : f32 +// CHECK: %[[REAL_DIV_IMAG:.*]] = arith.divf %[[REAL2]], %[[IMAG2]] : f32 +// CHECK: %[[REAL_SQ:.*]] = arith.mulf %[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] : f32 +// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = arith.addf %[[REAL_SQ]], %[[ONE]] : f32 +// CHECK: %[[REAL_SQRT:.*]] = math.sqrt %[[REAL_SQ_PLUS_ONE]] : f32 +// CHECK: %[[IMAG_ABS:.*]] = math.absf %[[IMAG2]] : f32 +// CHECK: %[[ABS_REAL:.*]] = arith.mulf %[[REAL_SQRT]], %[[IMAG_ABS]] : f32 +// CHECK: %[[REAL_GT_IMAG:.*]] = arith.cmpf ogt, %[[REAL2]], %[[IMAG2]] : f32 +// CHECK: %[[ABS1:.*]] = arith.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : f32 +// CHECK: %[[ABS2:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[REAL_ABS]], %[[ABS1]] : f32 +// CHECK: %[[NORM:.*]] = arith.select %[[IS_REAL_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : f32 // CHECK: %[[REAL_SIGN:.*]] = arith.divf %[[REAL]], %[[NORM]] : f32 // CHECK: %[[IMAG_SIGN:.*]] = arith.divf %[[IMAG]], %[[NORM]] : f32 // CHECK: %[[SIGN:.*]] = complex.create %[[REAL_SIGN]], %[[IMAG_SIGN]] : complex @@ -716,13 +765,29 @@ func.func @complex_abs_with_fmf(%arg: complex) -> f32 { %abs = complex.abs %arg fastmath : complex return %abs : f32 } +// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32 // CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex // CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex -// CHECK-DAG: %[[REAL_SQ:.*]] = arith.mulf %[[REAL]], %[[REAL]] fastmath : f32 -// CHECK-DAG: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG]], %[[IMAG]] fastmath : f32 -// CHECK: %[[SQ_NORM:.*]] = arith.addf %[[REAL_SQ]], %[[IMAG_SQ]] fastmath : f32 -// CHECK: %[[NORM:.*]] = math.sqrt %[[SQ_NORM]] : f32 -// CHECK: return %[[NORM]] : f32 +// CHECK: %[[IS_REAL_ZERO:.*]] = arith.cmpf oeq, %[[REAL]], %[[ZERO]] : f32 +// CHECK: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32 +// CHECK: %[[IMAG_DIV_REAL:.*]] = arith.divf %[[IMAG]], %[[REAL]] fastmath : f32 +// CHECK: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] fastmath : f32 +// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = arith.addf %[[IMAG_SQ]], %[[ONE]] fastmath : f32 +// CHECK: %[[IMAG_SQRT:.*]] = math.sqrt %[[IMAG_SQ_PLUS_ONE]] fastmath : f32 +// CHECK: %[[REAL_ABS:.*]] = math.absf %[[REAL]] fastmath : f32 +// CHECK: %[[ABS_IMAG:.*]] = arith.mulf %[[IMAG_SQRT]], %[[REAL_ABS]] fastmath : f32 +// CHECK: %[[REAL_DIV_IMAG:.*]] = arith.divf %[[REAL]], %[[IMAG]] fastmath : f32 +// CHECK: %[[REAL_SQ:.*]] = arith.mulf 
%[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] fastmath : f32 +// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = arith.addf %[[REAL_SQ]], %[[ONE]] fastmath : f32 +// CHECK: %[[REAL_SQRT:.*]] = math.sqrt %[[REAL_SQ_PLUS_ONE]] fastmath : f32 +// CHECK: %[[IMAG_ABS:.*]] = math.absf %[[IMAG]] fastmath : f32 +// CHECK: %[[ABS_REAL:.*]] = arith.mulf %[[REAL_SQRT]], %[[IMAG_ABS]] fastmath : f32 +// CHECK: %[[REAL_GT_IMAG:.*]] = arith.cmpf ogt, %[[REAL]], %[[IMAG]] : f32 +// CHECK: %[[ABS1:.*]] = arith.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : f32 +// CHECK: %[[ABS2:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[REAL_ABS]], %[[ABS1]] : f32 +// CHECK: %[[ABS3:.*]] = arith.select %[[IS_REAL_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : f32 +// CHECK: return %[[ABS3]] : f32 // ----- @@ -807,12 +872,28 @@ func.func @complex_log_with_fmf(%arg: complex) -> complex { %log = complex.log %arg fastmath : complex return %log : complex } +// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32 // CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex // CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex -// CHECK: %[[SQR_REAL:.*]] = arith.mulf %[[REAL]], %[[REAL]] fastmath : f32 -// CHECK: %[[SQR_IMAG:.*]] = arith.mulf %[[IMAG]], %[[IMAG]] fastmath : f32 -// CHECK: %[[SQ_NORM:.*]] = arith.addf %[[SQR_REAL]], %[[SQR_IMAG]] fastmath : f32 -// CHECK: %[[NORM:.*]] = math.sqrt %[[SQ_NORM]] : f32 +// CHECK: %[[IS_REAL_ZERO:.*]] = arith.cmpf oeq, %[[REAL]], %[[ZERO]] : f32 +// CHECK: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32 +// CHECK: %[[IMAG_DIV_REAL:.*]] = arith.divf %[[IMAG]], %[[REAL]] fastmath : f32 +// CHECK: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] fastmath : f32 +// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = arith.addf %[[IMAG_SQ]], %[[ONE]] fastmath : f32 +// CHECK: %[[IMAG_SQRT:.*]] = math.sqrt %[[IMAG_SQ_PLUS_ONE]] fastmath : f32 +// CHECK: %[[REAL_ABS:.*]] = math.absf %[[REAL]] fastmath : f32 +// CHECK: %[[ABS_IMAG:.*]] = arith.mulf %[[IMAG_SQRT]], %[[REAL_ABS]] fastmath : f32 +// CHECK: %[[REAL_DIV_IMAG:.*]] = arith.divf %[[REAL]], %[[IMAG]] fastmath : f32 +// CHECK: %[[REAL_SQ:.*]] = arith.mulf %[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] fastmath : f32 +// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = arith.addf %[[REAL_SQ]], %[[ONE]] fastmath : f32 +// CHECK: %[[REAL_SQRT:.*]] = math.sqrt %[[REAL_SQ_PLUS_ONE]] fastmath : f32 +// CHECK: %[[IMAG_ABS:.*]] = math.absf %[[IMAG]] fastmath : f32 +// CHECK: %[[ABS_REAL:.*]] = arith.mulf %[[REAL_SQRT]], %[[IMAG_ABS]] fastmath : f32 +// CHECK: %[[REAL_GT_IMAG:.*]] = arith.cmpf ogt, %[[REAL]], %[[IMAG]] : f32 +// CHECK: %[[ABS1:.*]] = arith.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : f32 +// CHECK: %[[ABS2:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[REAL_ABS]], %[[ABS1]] : f32 +// CHECK: %[[NORM:.*]] = arith.select %[[IS_REAL_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : f32 // CHECK: %[[RESULT_REAL:.*]] = math.log %[[NORM]] fastmath : f32 // CHECK: %[[REAL2:.*]] = complex.re %[[ARG]] : complex // CHECK: %[[IMAG2:.*]] = complex.im %[[ARG]] : complex diff --git a/mlir/test/Conversion/ComplexToStandard/full-conversion.mlir b/mlir/test/Conversion/ComplexToStandard/full-conversion.mlir index 9983dd4..0f23e20 100644 --- a/mlir/test/Conversion/ComplexToStandard/full-conversion.mlir +++ b/mlir/test/Conversion/ComplexToStandard/full-conversion.mlir @@ -6,12 +6,31 @@ func.func @complex_abs(%arg: complex) -> f32 { %abs = complex.abs %arg: complex return %abs : f32 } +// CHECK: %[[ZERO:.*]] = 
llvm.mlir.constant(0.000000e+00 : f32) : f32 +// CHECK: %[[ONE:.*]] = llvm.mlir.constant(1.000000e+00 : f32) : f32 // CHECK: %[[REAL:.*]] = llvm.extractvalue %[[ARG]][0] : ![[C_TY]] // CHECK: %[[IMAG:.*]] = llvm.extractvalue %[[ARG]][1] : ![[C_TY]] -// CHECK-DAG: %[[REAL_SQ:.*]] = llvm.fmul %[[REAL]], %[[REAL]] : f32 -// CHECK-DAG: %[[IMAG_SQ:.*]] = llvm.fmul %[[IMAG]], %[[IMAG]] : f32 -// CHECK: %[[SQ_NORM:.*]] = llvm.fadd %[[REAL_SQ]], %[[IMAG_SQ]] : f32 -// CHECK: %[[NORM:.*]] = llvm.intr.sqrt(%[[SQ_NORM]]) : (f32) -> f32 +// CHECK: %[[REAL_IS_ZERO:.*]] = llvm.fcmp "oeq" %[[REAL]], %[[ZERO]] : f32 +// CHECK: %[[IMAG_IS_ZERO:.*]] = llvm.fcmp "oeq" %[[IMAG]], %[[ZERO]] : f32 + +// CHECK: %[[IMAG_DIV_REAL:.*]] = llvm.fdiv %[[IMAG]], %[[REAL]] : f32 +// CHECK: %[[IMAG_SQ:.*]] = llvm.fmul %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] : f32 +// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = llvm.fadd %[[IMAG_SQ]], %[[ONE]] : f32 +// CHECK: %[[IMAG_SQRT:.*]] = llvm.intr.sqrt(%[[IMAG_SQ_PLUS_ONE]]) : (f32) -> f32 +// CHECK: %[[REAL_ABS:.*]] = llvm.intr.fabs(%[[REAL]]) : (f32) -> f32 +// CHECK: %[[ABS_IMAG:.*]] = llvm.fmul %[[IMAG_SQRT]], %[[REAL_ABS]] : f32 + +// CHECK: %[[REAL_DIV_IMAG:.*]] = llvm.fdiv %[[REAL]], %[[IMAG]] : f32 +// CHECK: %[[REAL_SQ:.*]] = llvm.fmul %[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] : f32 +// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = llvm.fadd %[[REAL_SQ]], %[[ONE]] : f32 +// CHECK: %[[REAL_SQRT:.*]] = llvm.intr.sqrt(%[[REAL_SQ_PLUS_ONE]]) : (f32) -> f32 +// CHECK: %[[IMAG_ABS:.*]] = llvm.intr.fabs(%[[IMAG]]) : (f32) -> f32 +// CHECK: %[[ABS_REAL:.*]] = llvm.fmul %[[REAL_SQRT]], %[[IMAG_ABS]] : f32 + +// CHECK: %[[REAL_GT_IMAG:.*]] = llvm.fcmp "ogt" %[[REAL]], %[[IMAG]] : f32 +// CHECK: %[[ABS1:.*]] = llvm.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : i1, f32 +// CHECK: %[[ABS2:.*]] = llvm.select %[[IMAG_IS_ZERO]], %[[REAL_ABS]], %[[ABS1]] : i1, f32 +// CHECK: %[[NORM:.*]] = llvm.select %[[REAL_IS_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : i1, f32 // CHECK: llvm.return %[[NORM]] : f32 // CHECK-LABEL: llvm.func @complex_eq diff --git a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir index 349b92a..a42ed69 100644 --- a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir +++ b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir @@ -106,6 +106,27 @@ func.func @angle(%arg: complex) -> f32 { func.return %angle : f32 } +func.func @test_element_f64(%input: tensor>, + %func: (complex) -> f64) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %size = tensor.dim %input, %c0: tensor> + + scf.for %i = %c0 to %size step %c1 { + %elem = tensor.extract %input[%i]: tensor> + + %val = func.call_indirect %func(%elem) : (complex) -> f64 + vector.print %val : f64 + scf.yield + } + func.return +} + +func.func @abs(%arg: complex) -> f64 { + %abs = complex.abs %arg : complex + func.return %abs : f64 +} + func.func @entry() { // complex.sqrt test %sqrt_test = arith.constant dense<[ @@ -300,5 +321,38 @@ func.func @entry() { call @test_element(%angle_test_cast, %angle_func) : (tensor>, (complex) -> f32) -> () + // complex.abs test + %abs_test = arith.constant dense<[ + (1.0, 1.0), + // CHECK: 1.414 + (1.0e300, 1.0e300), + // CHECK-NEXT: 1.41421e+300 + (1.0e-300, 1.0e-300), + // CHECK-NEXT: 1.41421e-300 + (5.0, 0.0), + // CHECK-NEXT: 5 + (0.0, 6.0), + // CHECK-NEXT: 6 + (7.0, 8.0), + // CHECK-NEXT: 10.6301 + (-1.0, -1.0), + // CHECK-NEXT: 1.414 + (-1.0e300, -1.0e300), + // CHECK-NEXT: 1.41421e+300 + (-1.0, 0.0), + // 
CHECK-NOT: -1
+    // CHECK-NEXT: 1
+    (0.0, -1.0)
+    // CHECK-NOT: -1
+    // CHECK-NEXT: 1
+  ]> : tensor<10xcomplex<f64>>
+  %abs_test_cast = tensor.cast %abs_test
+      : tensor<10xcomplex<f64>> to tensor<?xcomplex<f64>>
+
+  %abs_func = func.constant @abs : (complex<f64>) -> f64
+
+  call @test_element_f64(%abs_test_cast, %abs_func)
+      : (tensor<?xcomplex<f64>>, (complex<f64>) -> f64) -> ()
+
   func.return
 }
-- cgit v1.1

From d70b1c1206d93b5cdf31fa330d5717eb73e8794a Mon Sep 17 00:00:00 2001
From: Po-yao Chang
Date: Sun, 11 Feb 2024 09:36:59 +0800
Subject: [LLDB][Docs] Replace LLDB_RELOCATABLE_PYTHON with LLDB_EMBED_PYTHON_HOME (#81310)

LLDB_RELOCATABLE_PYTHON was removed in LLVM 11
(https://github.com/llvm/llvm-project/commit/3ec3f62f0a0b1ac13230922c91ffc988c1b1e8d5).
---
 lldb/docs/resources/build.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst
index 0ccfef32..55fe73c 100644
--- a/lldb/docs/resources/build.rst
+++ b/lldb/docs/resources/build.rst
@@ -278,12 +278,12 @@ are commonly used on Windows.
   crash, rather than having to reproduce a failure or use a crash dump.
 * ``PYTHON_HOME`` (Required): Path to the folder where the Python distribution
   is installed. For example, ``C:\Python35``.
-* ``LLDB_RELOCATABLE_PYTHON`` (Default=0): When this is 0, LLDB will bind
+* ``LLDB_EMBED_PYTHON_HOME`` (Default=1 on Windows): When this is 1, LLDB will bind
   statically to the location specified in the ``PYTHON_HOME`` CMake variable,
   ignoring any value of ``PYTHONHOME`` set in the environment. This is most
   useful for developers who simply want to run LLDB after they build it. If you
   wish to move a build of LLDB to a different machine where Python will be in a
-  different location, setting ``LLDB_RELOCATABLE_PYTHON`` to 1 will cause
+  different location, setting ``LLDB_EMBED_PYTHON_HOME`` to 0 will cause
   Python to use its default mechanism for finding the python installation at
   runtime (looking for installed Pythons, or using the ``PYTHONHOME``
   environment variable if it is specified).
-- cgit v1.1

From d0f4663f488dee869ed797b684d4c3361539ac1c Mon Sep 17 00:00:00 2001
From: darkbuck
Date: Sat, 10 Feb 2024 21:44:05 -0500
Subject: [GlobalISel][Mips] Global ISel for `brcond`

- Mark `brcond` as equivalent to `G_BRCOND`.
- Remove the manual selection of `G_BRCOND` in Mips. Revise test cases.
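For the first bullet: `GINodeEquiv` is the TableGen class that declares a
generic MachineIR opcode and a SelectionDAG node to be equivalent, so that
imported SelectionDAG patterns can select the generic opcode directly.
Spelled out with its template arguments, the record added to
SelectionDAGCompat.td below should read `def : GINodeEquiv<G_BRCOND, brcond>;`.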
Reviewers: petar-avramovic, bcardosolopes, arsenm Reviewed By: arsenm Pull Request: https://github.com/llvm/llvm-project/pull/81306 --- .../llvm/Target/GlobalISel/SelectionDAGCompat.td | 1 + llvm/lib/Target/Mips/MipsInstructionSelector.cpp | 7 - .../Mips/GlobalISel/instruction-select/branch.mir | 2 +- .../instruction-select/jump_table_and_brjt.mir | 297 ++++++++++++--------- .../Mips/GlobalISel/instruction-select/phi.mir | 16 +- 5 files changed, 176 insertions(+), 147 deletions(-) diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index f792237..6bc1942 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -130,6 +130,7 @@ let IfConvergent = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS in { } def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp index 4478a57..654f29d 100644 --- a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp +++ b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp @@ -357,13 +357,6 @@ bool MipsInstructionSelector::select(MachineInstr &I) { .addImm(0); break; } - case G_BRCOND: { - MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::BNE)) - .add(I.getOperand(0)) - .addUse(Mips::ZERO) - .add(I.getOperand(1)); - break; - } case G_BRJT: { unsigned EntrySize = MF.getJumpTableInfo()->getEntrySize(MF.getDataLayout()); diff --git a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/branch.mir b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/branch.mir index 2de4096..1311632 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/branch.mir +++ b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/branch.mir @@ -77,7 +77,7 @@ body: | ; MIPS32-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1 ; MIPS32-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $a2 ; MIPS32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1 - ; MIPS32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32-NEXT: J %bb.2, implicit-def dead $at ; MIPS32-NEXT: {{ $}} ; MIPS32-NEXT: bb.1.if.then: diff --git a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/jump_table_and_brjt.mir b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/jump_table_and_brjt.mir index b8450ff..6022e7a 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/jump_table_and_brjt.mir +++ b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/jump_table_and_brjt.mir @@ -73,139 +73,174 @@ jumpTable: body: | ; MIPS32-LABEL: name: mod4_0_to_11 ; MIPS32: bb.0.entry: - ; MIPS32: successors: %bb.6(0x40000000), %bb.1(0x40000000) - ; MIPS32: liveins: $a0 - ; MIPS32: [[COPY:%[0-9]+]]:gpr32 = COPY $a0 - ; MIPS32: [[ORi:%[0-9]+]]:gpr32 = ORi $zero, 7 - ; MIPS32: [[ORi1:%[0-9]+]]:gpr32 = ORi $zero, 3 - ; MIPS32: [[ORi2:%[0-9]+]]:gpr32 = ORi $zero, 2 - ; MIPS32: [[ORi3:%[0-9]+]]:gpr32 = ORi $zero, 1 - ; MIPS32: [[ORi4:%[0-9]+]]:gpr32 = ORi $zero, 0 - ; MIPS32: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu $zero, 65535 - ; MIPS32: [[ORi5:%[0-9]+]]:gpr32 = ORi $zero, 0 - ; MIPS32: [[SUBu:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi5]] - ; MIPS32: [[SLTu:%[0-9]+]]:gpr32 = SLTu [[ORi]], [[SUBu]] - ; MIPS32: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[SLTu]], 1 - ; MIPS32: BNE [[ANDi]], $zero, %bb.6, implicit-def $at - ; MIPS32: bb.1.entry: - ; MIPS32: successors: %bb.2(0x20000000), %bb.3(0x20000000), 
%bb.4(0x20000000), %bb.5(0x20000000) - ; MIPS32: [[LUi:%[0-9]+]]:gpr32 = LUi target-flags(mips-abs-hi) %jump-table.0 - ; MIPS32: [[SLL:%[0-9]+]]:gpr32 = SLL [[SUBu]], 2 - ; MIPS32: [[ADDu:%[0-9]+]]:gpr32 = ADDu [[LUi]], [[SLL]] - ; MIPS32: [[LW:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-abs-lo) %jump-table.0 :: (load (s32)) - ; MIPS32: PseudoIndirectBranch [[LW]] - ; MIPS32: bb.2.sw.bb: - ; MIPS32: $v0 = COPY [[ORi4]] - ; MIPS32: RetRA implicit $v0 - ; MIPS32: bb.3.sw.bb1: - ; MIPS32: $v0 = COPY [[ORi3]] - ; MIPS32: RetRA implicit $v0 - ; MIPS32: bb.4.sw.bb2: - ; MIPS32: $v0 = COPY [[ORi2]] - ; MIPS32: RetRA implicit $v0 - ; MIPS32: bb.5.sw.bb3: - ; MIPS32: $v0 = COPY [[ORi1]] - ; MIPS32: RetRA implicit $v0 - ; MIPS32: bb.6.sw.default: - ; MIPS32: successors: %bb.7(0x80000000) - ; MIPS32: bb.7.sw.epilog: - ; MIPS32: successors: %bb.13(0x40000000), %bb.8(0x40000000) - ; MIPS32: [[ORi6:%[0-9]+]]:gpr32 = ORi $zero, 8 - ; MIPS32: [[SUBu1:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi6]] - ; MIPS32: [[SLTu1:%[0-9]+]]:gpr32 = SLTu [[ORi1]], [[SUBu1]] - ; MIPS32: [[ANDi1:%[0-9]+]]:gpr32 = ANDi [[SLTu1]], 1 - ; MIPS32: BNE [[ANDi1]], $zero, %bb.13, implicit-def $at - ; MIPS32: bb.8.sw.epilog: - ; MIPS32: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) - ; MIPS32: [[LUi1:%[0-9]+]]:gpr32 = LUi target-flags(mips-abs-hi) %jump-table.1 - ; MIPS32: [[SLL1:%[0-9]+]]:gpr32 = SLL [[SUBu1]], 2 - ; MIPS32: [[ADDu1:%[0-9]+]]:gpr32 = ADDu [[LUi1]], [[SLL1]] - ; MIPS32: [[LW1:%[0-9]+]]:gpr32 = LW [[ADDu1]], target-flags(mips-abs-lo) %jump-table.1 :: (load (s32)) - ; MIPS32: PseudoIndirectBranch [[LW1]] - ; MIPS32: bb.9.sw.bb4: - ; MIPS32: $v0 = COPY [[ORi4]] - ; MIPS32: RetRA implicit $v0 - ; MIPS32: bb.10.sw.bb5: - ; MIPS32: $v0 = COPY [[ORi3]] - ; MIPS32: RetRA implicit $v0 - ; MIPS32: bb.11.sw.bb6: - ; MIPS32: $v0 = COPY [[ORi2]] - ; MIPS32: RetRA implicit $v0 - ; MIPS32: bb.12.sw.bb7: - ; MIPS32: $v0 = COPY [[ORi1]] - ; MIPS32: RetRA implicit $v0 - ; MIPS32: bb.13.sw.default8: - ; MIPS32: $v0 = COPY [[ADDiu]] - ; MIPS32: RetRA implicit $v0 + ; MIPS32-NEXT: successors: %bb.6(0x40000000), %bb.1(0x40000000) + ; MIPS32-NEXT: liveins: $a0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $a0 + ; MIPS32-NEXT: [[ORi:%[0-9]+]]:gpr32 = ORi $zero, 7 + ; MIPS32-NEXT: [[ORi1:%[0-9]+]]:gpr32 = ORi $zero, 3 + ; MIPS32-NEXT: [[ORi2:%[0-9]+]]:gpr32 = ORi $zero, 2 + ; MIPS32-NEXT: [[ORi3:%[0-9]+]]:gpr32 = ORi $zero, 1 + ; MIPS32-NEXT: [[ORi4:%[0-9]+]]:gpr32 = ORi $zero, 0 + ; MIPS32-NEXT: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu $zero, 65535 + ; MIPS32-NEXT: [[ORi5:%[0-9]+]]:gpr32 = ORi $zero, 0 + ; MIPS32-NEXT: [[SUBu:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi5]] + ; MIPS32-NEXT: [[SLTu:%[0-9]+]]:gpr32 = SLTu [[ORi]], [[SUBu]] + ; MIPS32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[SLTu]], 1 + ; MIPS32-NEXT: BNE [[ANDi]], $zero, %bb.6, implicit-def dead $at + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.1.entry: + ; MIPS32-NEXT: successors: %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000), %bb.5(0x20000000) + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: [[LUi:%[0-9]+]]:gpr32 = LUi target-flags(mips-abs-hi) %jump-table.0 + ; MIPS32-NEXT: [[SLL:%[0-9]+]]:gpr32 = SLL [[SUBu]], 2 + ; MIPS32-NEXT: [[ADDu:%[0-9]+]]:gpr32 = ADDu [[LUi]], [[SLL]] + ; MIPS32-NEXT: [[LW:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-abs-lo) %jump-table.0 :: (load (s32)) + ; MIPS32-NEXT: PseudoIndirectBranch [[LW]] + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.2.sw.bb: + ; MIPS32-NEXT: $v0 = COPY [[ORi4]] + ; 
MIPS32-NEXT: RetRA implicit $v0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.3.sw.bb1: + ; MIPS32-NEXT: $v0 = COPY [[ORi3]] + ; MIPS32-NEXT: RetRA implicit $v0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.4.sw.bb2: + ; MIPS32-NEXT: $v0 = COPY [[ORi2]] + ; MIPS32-NEXT: RetRA implicit $v0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.5.sw.bb3: + ; MIPS32-NEXT: $v0 = COPY [[ORi1]] + ; MIPS32-NEXT: RetRA implicit $v0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.6.sw.default: + ; MIPS32-NEXT: successors: %bb.7(0x80000000) + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.7.sw.epilog: + ; MIPS32-NEXT: successors: %bb.13(0x40000000), %bb.8(0x40000000) + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: [[ORi6:%[0-9]+]]:gpr32 = ORi $zero, 8 + ; MIPS32-NEXT: [[SUBu1:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi6]] + ; MIPS32-NEXT: [[SLTu1:%[0-9]+]]:gpr32 = SLTu [[ORi1]], [[SUBu1]] + ; MIPS32-NEXT: [[ANDi1:%[0-9]+]]:gpr32 = ANDi [[SLTu1]], 1 + ; MIPS32-NEXT: BNE [[ANDi1]], $zero, %bb.13, implicit-def dead $at + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.8.sw.epilog: + ; MIPS32-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: [[LUi1:%[0-9]+]]:gpr32 = LUi target-flags(mips-abs-hi) %jump-table.1 + ; MIPS32-NEXT: [[SLL1:%[0-9]+]]:gpr32 = SLL [[SUBu1]], 2 + ; MIPS32-NEXT: [[ADDu1:%[0-9]+]]:gpr32 = ADDu [[LUi1]], [[SLL1]] + ; MIPS32-NEXT: [[LW1:%[0-9]+]]:gpr32 = LW [[ADDu1]], target-flags(mips-abs-lo) %jump-table.1 :: (load (s32)) + ; MIPS32-NEXT: PseudoIndirectBranch [[LW1]] + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.9.sw.bb4: + ; MIPS32-NEXT: $v0 = COPY [[ORi4]] + ; MIPS32-NEXT: RetRA implicit $v0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.10.sw.bb5: + ; MIPS32-NEXT: $v0 = COPY [[ORi3]] + ; MIPS32-NEXT: RetRA implicit $v0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.11.sw.bb6: + ; MIPS32-NEXT: $v0 = COPY [[ORi2]] + ; MIPS32-NEXT: RetRA implicit $v0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.12.sw.bb7: + ; MIPS32-NEXT: $v0 = COPY [[ORi1]] + ; MIPS32-NEXT: RetRA implicit $v0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.13.sw.default8: + ; MIPS32-NEXT: $v0 = COPY [[ADDiu]] + ; MIPS32-NEXT: RetRA implicit $v0 + ; ; MIPS32_PIC-LABEL: name: mod4_0_to_11 ; MIPS32_PIC: bb.0.entry: - ; MIPS32_PIC: successors: %bb.6(0x40000000), %bb.1(0x40000000) - ; MIPS32_PIC: liveins: $a0, $t9, $v0 - ; MIPS32_PIC: [[ADDu:%[0-9]+]]:gpr32 = ADDu $v0, $t9 - ; MIPS32_PIC: [[COPY:%[0-9]+]]:gpr32 = COPY $a0 - ; MIPS32_PIC: [[ORi:%[0-9]+]]:gpr32 = ORi $zero, 7 - ; MIPS32_PIC: [[ORi1:%[0-9]+]]:gpr32 = ORi $zero, 3 - ; MIPS32_PIC: [[ORi2:%[0-9]+]]:gpr32 = ORi $zero, 2 - ; MIPS32_PIC: [[ORi3:%[0-9]+]]:gpr32 = ORi $zero, 1 - ; MIPS32_PIC: [[ORi4:%[0-9]+]]:gpr32 = ORi $zero, 0 - ; MIPS32_PIC: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu $zero, 65535 - ; MIPS32_PIC: [[ORi5:%[0-9]+]]:gpr32 = ORi $zero, 0 - ; MIPS32_PIC: [[SUBu:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi5]] - ; MIPS32_PIC: [[SLTu:%[0-9]+]]:gpr32 = SLTu [[ORi]], [[SUBu]] - ; MIPS32_PIC: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[SLTu]], 1 - ; MIPS32_PIC: BNE [[ANDi]], $zero, %bb.6, implicit-def $at - ; MIPS32_PIC: bb.1.entry: - ; MIPS32_PIC: successors: %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000), %bb.5(0x20000000) - ; MIPS32_PIC: [[LW:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-got) %jump-table.0 :: (load (s32) from got) - ; MIPS32_PIC: [[SLL:%[0-9]+]]:gpr32 = SLL [[SUBu]], 2 - ; MIPS32_PIC: [[ADDu1:%[0-9]+]]:gpr32 = ADDu [[LW]], [[SLL]] - ; MIPS32_PIC: [[LW1:%[0-9]+]]:gpr32 = LW [[ADDu1]], target-flags(mips-abs-lo) 
%jump-table.0 :: (load (s32)) - ; MIPS32_PIC: [[ADDu2:%[0-9]+]]:gpr32 = ADDu [[LW1]], [[ADDu]] - ; MIPS32_PIC: PseudoIndirectBranch [[ADDu2]] - ; MIPS32_PIC: bb.2.sw.bb: - ; MIPS32_PIC: $v0 = COPY [[ORi4]] - ; MIPS32_PIC: RetRA implicit $v0 - ; MIPS32_PIC: bb.3.sw.bb1: - ; MIPS32_PIC: $v0 = COPY [[ORi3]] - ; MIPS32_PIC: RetRA implicit $v0 - ; MIPS32_PIC: bb.4.sw.bb2: - ; MIPS32_PIC: $v0 = COPY [[ORi2]] - ; MIPS32_PIC: RetRA implicit $v0 - ; MIPS32_PIC: bb.5.sw.bb3: - ; MIPS32_PIC: $v0 = COPY [[ORi1]] - ; MIPS32_PIC: RetRA implicit $v0 - ; MIPS32_PIC: bb.6.sw.default: - ; MIPS32_PIC: successors: %bb.7(0x80000000) - ; MIPS32_PIC: bb.7.sw.epilog: - ; MIPS32_PIC: successors: %bb.13(0x40000000), %bb.8(0x40000000) - ; MIPS32_PIC: [[ORi6:%[0-9]+]]:gpr32 = ORi $zero, 8 - ; MIPS32_PIC: [[SUBu1:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi6]] - ; MIPS32_PIC: [[SLTu1:%[0-9]+]]:gpr32 = SLTu [[ORi1]], [[SUBu1]] - ; MIPS32_PIC: [[ANDi1:%[0-9]+]]:gpr32 = ANDi [[SLTu1]], 1 - ; MIPS32_PIC: BNE [[ANDi1]], $zero, %bb.13, implicit-def $at - ; MIPS32_PIC: bb.8.sw.epilog: - ; MIPS32_PIC: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) - ; MIPS32_PIC: [[LW2:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-got) %jump-table.1 :: (load (s32) from got) - ; MIPS32_PIC: [[SLL1:%[0-9]+]]:gpr32 = SLL [[SUBu1]], 2 - ; MIPS32_PIC: [[ADDu3:%[0-9]+]]:gpr32 = ADDu [[LW2]], [[SLL1]] - ; MIPS32_PIC: [[LW3:%[0-9]+]]:gpr32 = LW [[ADDu3]], target-flags(mips-abs-lo) %jump-table.1 :: (load (s32)) - ; MIPS32_PIC: [[ADDu4:%[0-9]+]]:gpr32 = ADDu [[LW3]], [[ADDu]] - ; MIPS32_PIC: PseudoIndirectBranch [[ADDu4]] - ; MIPS32_PIC: bb.9.sw.bb4: - ; MIPS32_PIC: $v0 = COPY [[ORi4]] - ; MIPS32_PIC: RetRA implicit $v0 - ; MIPS32_PIC: bb.10.sw.bb5: - ; MIPS32_PIC: $v0 = COPY [[ORi3]] - ; MIPS32_PIC: RetRA implicit $v0 - ; MIPS32_PIC: bb.11.sw.bb6: - ; MIPS32_PIC: $v0 = COPY [[ORi2]] - ; MIPS32_PIC: RetRA implicit $v0 - ; MIPS32_PIC: bb.12.sw.bb7: - ; MIPS32_PIC: $v0 = COPY [[ORi1]] - ; MIPS32_PIC: RetRA implicit $v0 - ; MIPS32_PIC: bb.13.sw.default8: - ; MIPS32_PIC: $v0 = COPY [[ADDiu]] - ; MIPS32_PIC: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: successors: %bb.6(0x40000000), %bb.1(0x40000000) + ; MIPS32_PIC-NEXT: liveins: $a0, $t9, $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: [[ADDu:%[0-9]+]]:gpr32 = ADDu $v0, $t9 + ; MIPS32_PIC-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $a0 + ; MIPS32_PIC-NEXT: [[ORi:%[0-9]+]]:gpr32 = ORi $zero, 7 + ; MIPS32_PIC-NEXT: [[ORi1:%[0-9]+]]:gpr32 = ORi $zero, 3 + ; MIPS32_PIC-NEXT: [[ORi2:%[0-9]+]]:gpr32 = ORi $zero, 2 + ; MIPS32_PIC-NEXT: [[ORi3:%[0-9]+]]:gpr32 = ORi $zero, 1 + ; MIPS32_PIC-NEXT: [[ORi4:%[0-9]+]]:gpr32 = ORi $zero, 0 + ; MIPS32_PIC-NEXT: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu $zero, 65535 + ; MIPS32_PIC-NEXT: [[ORi5:%[0-9]+]]:gpr32 = ORi $zero, 0 + ; MIPS32_PIC-NEXT: [[SUBu:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi5]] + ; MIPS32_PIC-NEXT: [[SLTu:%[0-9]+]]:gpr32 = SLTu [[ORi]], [[SUBu]] + ; MIPS32_PIC-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[SLTu]], 1 + ; MIPS32_PIC-NEXT: BNE [[ANDi]], $zero, %bb.6, implicit-def dead $at + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.1.entry: + ; MIPS32_PIC-NEXT: successors: %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000), %bb.5(0x20000000) + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: [[LW:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-got) %jump-table.0 :: (load (s32) from got) + ; MIPS32_PIC-NEXT: [[SLL:%[0-9]+]]:gpr32 = SLL [[SUBu]], 2 + ; MIPS32_PIC-NEXT: [[ADDu1:%[0-9]+]]:gpr32 = ADDu [[LW]], [[SLL]] + ; MIPS32_PIC-NEXT: 
[[LW1:%[0-9]+]]:gpr32 = LW [[ADDu1]], target-flags(mips-abs-lo) %jump-table.0 :: (load (s32)) + ; MIPS32_PIC-NEXT: [[ADDu2:%[0-9]+]]:gpr32 = ADDu [[LW1]], [[ADDu]] + ; MIPS32_PIC-NEXT: PseudoIndirectBranch [[ADDu2]] + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.2.sw.bb: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi4]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.3.sw.bb1: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi3]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.4.sw.bb2: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi2]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.5.sw.bb3: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi1]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.6.sw.default: + ; MIPS32_PIC-NEXT: successors: %bb.7(0x80000000) + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.7.sw.epilog: + ; MIPS32_PIC-NEXT: successors: %bb.13(0x40000000), %bb.8(0x40000000) + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: [[ORi6:%[0-9]+]]:gpr32 = ORi $zero, 8 + ; MIPS32_PIC-NEXT: [[SUBu1:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi6]] + ; MIPS32_PIC-NEXT: [[SLTu1:%[0-9]+]]:gpr32 = SLTu [[ORi1]], [[SUBu1]] + ; MIPS32_PIC-NEXT: [[ANDi1:%[0-9]+]]:gpr32 = ANDi [[SLTu1]], 1 + ; MIPS32_PIC-NEXT: BNE [[ANDi1]], $zero, %bb.13, implicit-def dead $at + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.8.sw.epilog: + ; MIPS32_PIC-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: [[LW2:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-got) %jump-table.1 :: (load (s32) from got) + ; MIPS32_PIC-NEXT: [[SLL1:%[0-9]+]]:gpr32 = SLL [[SUBu1]], 2 + ; MIPS32_PIC-NEXT: [[ADDu3:%[0-9]+]]:gpr32 = ADDu [[LW2]], [[SLL1]] + ; MIPS32_PIC-NEXT: [[LW3:%[0-9]+]]:gpr32 = LW [[ADDu3]], target-flags(mips-abs-lo) %jump-table.1 :: (load (s32)) + ; MIPS32_PIC-NEXT: [[ADDu4:%[0-9]+]]:gpr32 = ADDu [[LW3]], [[ADDu]] + ; MIPS32_PIC-NEXT: PseudoIndirectBranch [[ADDu4]] + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.9.sw.bb4: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi4]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.10.sw.bb5: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi3]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.11.sw.bb6: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi2]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.12.sw.bb7: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi1]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.13.sw.default8: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ADDiu]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 bb.1.entry: liveins: $a0 diff --git a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/phi.mir b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/phi.mir index 77e5ee2..44d31d99 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/phi.mir +++ b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/phi.mir @@ -80,7 +80,7 @@ body: | ; MIPS32FP32-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1 ; MIPS32FP32-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $a2 ; MIPS32FP32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1 - ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32FP32-NEXT: J %bb.2, implicit-def dead $at ; MIPS32FP32-NEXT: 
{{ $}} ; MIPS32FP32-NEXT: bb.1.cond.true: @@ -105,7 +105,7 @@ body: | ; MIPS32FP64-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1 ; MIPS32FP64-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $a2 ; MIPS32FP64-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1 - ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32FP64-NEXT: J %bb.2, implicit-def dead $at ; MIPS32FP64-NEXT: {{ $}} ; MIPS32FP64-NEXT: bb.1.cond.true: @@ -166,7 +166,7 @@ body: | ; MIPS32FP32-NEXT: [[ADDiu1:%[0-9]+]]:gpr32 = ADDiu %fixed-stack.1, 0 ; MIPS32FP32-NEXT: [[LW1:%[0-9]+]]:gpr32 = LW [[ADDiu1]], 0 :: (load (s32) from %fixed-stack.1) ; MIPS32FP32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1 - ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32FP32-NEXT: J %bb.2, implicit-def dead $at ; MIPS32FP32-NEXT: {{ $}} ; MIPS32FP32-NEXT: bb.1.cond.true: @@ -197,7 +197,7 @@ body: | ; MIPS32FP64-NEXT: [[ADDiu1:%[0-9]+]]:gpr32 = ADDiu %fixed-stack.1, 0 ; MIPS32FP64-NEXT: [[LW1:%[0-9]+]]:gpr32 = LW [[ADDiu1]], 0 :: (load (s32) from %fixed-stack.1) ; MIPS32FP64-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1 - ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32FP64-NEXT: J %bb.2, implicit-def dead $at ; MIPS32FP64-NEXT: {{ $}} ; MIPS32FP64-NEXT: bb.1.cond.true: @@ -259,7 +259,7 @@ body: | ; MIPS32FP32-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1 ; MIPS32FP32-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $a2 ; MIPS32FP32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1 - ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32FP32-NEXT: J %bb.2, implicit-def dead $at ; MIPS32FP32-NEXT: {{ $}} ; MIPS32FP32-NEXT: bb.1.cond.true: @@ -284,7 +284,7 @@ body: | ; MIPS32FP64-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1 ; MIPS32FP64-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $a2 ; MIPS32FP64-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1 - ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32FP64-NEXT: J %bb.2, implicit-def dead $at ; MIPS32FP64-NEXT: {{ $}} ; MIPS32FP64-NEXT: bb.1.cond.true: @@ -341,7 +341,7 @@ body: | ; MIPS32FP32-NEXT: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu %fixed-stack.0, 0 ; MIPS32FP32-NEXT: [[LW:%[0-9]+]]:gpr32 = LW [[ADDiu]], 0 :: (load (s32) from %fixed-stack.0, align 8) ; MIPS32FP32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[LW]], 1 - ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32FP32-NEXT: J %bb.2, implicit-def dead $at ; MIPS32FP32-NEXT: {{ $}} ; MIPS32FP32-NEXT: bb.1.cond.true: @@ -367,7 +367,7 @@ body: | ; MIPS32FP64-NEXT: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu %fixed-stack.0, 0 ; MIPS32FP64-NEXT: [[LW:%[0-9]+]]:gpr32 = LW [[ADDiu]], 0 :: (load (s32) from %fixed-stack.0, align 8) ; MIPS32FP64-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[LW]], 1 - ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32FP64-NEXT: J %bb.2, implicit-def dead $at ; MIPS32FP64-NEXT: {{ $}} ; MIPS32FP64-NEXT: bb.1.cond.true: -- cgit v1.1 From 2c3ba9f6225612caf7d2d5ba6613ba1454d52dc3 Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Sat, 10 Feb 2024 20:01:14 -0800 Subject: 
[mlir][Linalg] Unrestrict redundant transfer hoisting from func.func (#79516)

All hoistRedundantVectorTransfers does is walk the target operation, so it
does not have to be restricted to func.func.
---
 mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h | 6 ++----
 mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp        | 6 +++---
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
index 921c3c3..186e83a 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
@@ -10,10 +10,8 @@
 #define MLIR_DIALECT_LINALG_TRANSFORMS_HOISTING_H_
 
 namespace mlir {
+class Operation;
 class RewriterBase;
-namespace func {
-class FuncOp;
-} // namespace func
 namespace scf {
 class ForOp;
 } // namespace scf
@@ -43,7 +41,7 @@ namespace linalg {
 ///
 /// WARNING: This hoisting does not model parallelism and is generally incorrect
 /// when used on distributed loops with memref semantics!
-void hoistRedundantVectorTransfers(func::FuncOp func);
+void hoistRedundantVectorTransfers(Operation *root);
 
 } // namespace linalg
 } // namespace mlir
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
index 80ce97e..34c9b2c 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
@@ -73,16 +73,16 @@ static bool noAliasingUseInLoop(vector::TransferReadOp transferRead,
   return true;
 }
 
-void mlir::linalg::hoistRedundantVectorTransfers(func::FuncOp func) {
+void mlir::linalg::hoistRedundantVectorTransfers(Operation *root) {
   bool changed = true;
   while (changed) {
     changed = false;
    // First move loop invariant ops outside of their loop. This needs to be
    // done before as we cannot move ops without interrupting the function walk.
-    func.walk(
+    root->walk(
         [&](LoopLikeOpInterface loopLike) { moveLoopInvariantCode(loopLike); });
 
-    func.walk([&](vector::TransferReadOp transferRead) {
+    root->walk([&](vector::TransferReadOp transferRead) {
       if (!isa<MemRefType>(transferRead.getShapedType()))
         return WalkResult::advance();
-- cgit v1.1

From c2f9885a8aa3a820eefdacccf3fcc6b9d87e3284 Mon Sep 17 00:00:00 2001
From: Koakuma
Date: Sun, 11 Feb 2024 14:04:18 +0700
Subject: [SPARC] Support reserving arbitrary general purpose registers (#74927)

This adds support for marking arbitrary general purpose registers - except
for those with special purpose (G0, I6-I7, O6-O7) - as reserved, as needed
by some software like the Linux kernel.
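Before the diff, a rough sketch of how a +reserve-<reg> subtarget feature
typically reaches the register allocator on the backend side. This is a
fragment for illustration only: the predicate name `isRegisterReserved` is
hypothetical and may not match the exact helper this patch introduces; see
the SparcRegisterInfo.cpp hunk in the diffstat for the real change.

    // Sketch only: the reserved-register hook on the SPARC backend.
    BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
      BitVector Reserved(getNumRegs());
      const SparcSubtarget &ST = MF.getSubtarget<SparcSubtarget>();
      // Registers with fixed roles (%g0, %sp, %fp, return address) stay
      // reserved unconditionally.
      for (unsigned Reg : {SP::G0, SP::O6, SP::I6, SP::I7})
        markSuperRegs(Reserved, Reg);
      // User-requested reservations, one per +reserve-<reg> feature.
      for (unsigned Reg : {SP::G1, SP::G2, SP::G3, SP::G4, SP::G5, SP::G6, SP::G7})
        if (ST.isRegisterReserved(Reg)) // hypothetical predicate
          markSuperRegs(Reserved, Reg);
      return Reserved;
    }

The driver side is mechanical: each -ffixed-<reg> option simply toggles the
corresponding +reserve-<reg> target feature, as the Sparc.cpp hunk below shows.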
--- clang/include/clang/Driver/Options.td | 12 ++ clang/lib/Driver/ToolChains/Arch/Sparc.cpp | 81 +++++++++ clang/test/Driver/sparc-fixed-register.c | 181 +++++++++++++++++++++ llvm/lib/Target/Sparc/Sparc.td | 14 ++ llvm/lib/Target/Sparc/SparcISelLowering.cpp | 43 +++++ llvm/lib/Target/Sparc/SparcRegisterInfo.cpp | 14 +- llvm/lib/Target/Sparc/SparcRegisterInfo.h | 1 + llvm/lib/Target/Sparc/SparcRegisterInfo.td | 4 + llvm/lib/Target/Sparc/SparcSubtarget.cpp | 1 + llvm/lib/Target/Sparc/SparcSubtarget.h | 10 ++ llvm/test/CodeGen/SPARC/reserved-arg-regs.ll | 25 +++ llvm/test/CodeGen/SPARC/reserved-regs-named.ll | 13 ++ .../CodeGen/SPARC/reserved-regs-unavailable.ll | 14 ++ llvm/test/CodeGen/SPARC/reserved-regs.ll | 17 ++ 14 files changed, 428 insertions(+), 2 deletions(-) create mode 100644 clang/test/Driver/sparc-fixed-register.c create mode 100644 llvm/test/CodeGen/SPARC/reserved-arg-regs.ll create mode 100644 llvm/test/CodeGen/SPARC/reserved-regs-named.ll create mode 100644 llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 7f00732..31e8571 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5829,6 +5829,18 @@ def mvis3 : Flag<["-"], "mvis3">, Group; def mno_vis3 : Flag<["-"], "mno-vis3">, Group; def mhard_quad_float : Flag<["-"], "mhard-quad-float">, Group; def msoft_quad_float : Flag<["-"], "msoft-quad-float">, Group; +foreach i = 1 ... 7 in + def ffixed_g#i : Flag<["-"], "ffixed-g"#i>, Group, + HelpText<"Reserve the G"#i#" register (SPARC only)">; +foreach i = 0 ... 5 in + def ffixed_o#i : Flag<["-"], "ffixed-o"#i>, Group, + HelpText<"Reserve the O"#i#" register (SPARC only)">; +foreach i = 0 ... 7 in + def ffixed_l#i : Flag<["-"], "ffixed-l"#i>, Group, + HelpText<"Reserve the L"#i#" register (SPARC only)">; +foreach i = 0 ... 
5 in + def ffixed_i#i : Flag<["-"], "ffixed-i"#i>, Group, + HelpText<"Reserve the I"#i#" register (SPARC only)">; } // let Flags = [TargetSpecific] // M68k features flags diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp index 22e5830..ae1a4ba 100644 --- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp @@ -178,4 +178,85 @@ void sparc::getSparcTargetFeatures(const Driver &D, const ArgList &Args, else Features.push_back("-hard-quad-float"); } + + if (Args.hasArg(options::OPT_ffixed_g1)) + Features.push_back("+reserve-g1"); + + if (Args.hasArg(options::OPT_ffixed_g2)) + Features.push_back("+reserve-g2"); + + if (Args.hasArg(options::OPT_ffixed_g3)) + Features.push_back("+reserve-g3"); + + if (Args.hasArg(options::OPT_ffixed_g4)) + Features.push_back("+reserve-g4"); + + if (Args.hasArg(options::OPT_ffixed_g5)) + Features.push_back("+reserve-g5"); + + if (Args.hasArg(options::OPT_ffixed_g6)) + Features.push_back("+reserve-g6"); + + if (Args.hasArg(options::OPT_ffixed_g7)) + Features.push_back("+reserve-g7"); + + if (Args.hasArg(options::OPT_ffixed_o0)) + Features.push_back("+reserve-o0"); + + if (Args.hasArg(options::OPT_ffixed_o1)) + Features.push_back("+reserve-o1"); + + if (Args.hasArg(options::OPT_ffixed_o2)) + Features.push_back("+reserve-o2"); + + if (Args.hasArg(options::OPT_ffixed_o3)) + Features.push_back("+reserve-o3"); + + if (Args.hasArg(options::OPT_ffixed_o4)) + Features.push_back("+reserve-o4"); + + if (Args.hasArg(options::OPT_ffixed_o5)) + Features.push_back("+reserve-o5"); + + if (Args.hasArg(options::OPT_ffixed_l0)) + Features.push_back("+reserve-l0"); + + if (Args.hasArg(options::OPT_ffixed_l1)) + Features.push_back("+reserve-l1"); + + if (Args.hasArg(options::OPT_ffixed_l2)) + Features.push_back("+reserve-l2"); + + if (Args.hasArg(options::OPT_ffixed_l3)) + Features.push_back("+reserve-l3"); + + if (Args.hasArg(options::OPT_ffixed_l4)) + Features.push_back("+reserve-l4"); + + if (Args.hasArg(options::OPT_ffixed_l5)) + Features.push_back("+reserve-l5"); + + if (Args.hasArg(options::OPT_ffixed_l6)) + Features.push_back("+reserve-l6"); + + if (Args.hasArg(options::OPT_ffixed_l7)) + Features.push_back("+reserve-l7"); + + if (Args.hasArg(options::OPT_ffixed_i0)) + Features.push_back("+reserve-i0"); + + if (Args.hasArg(options::OPT_ffixed_i1)) + Features.push_back("+reserve-i1"); + + if (Args.hasArg(options::OPT_ffixed_i2)) + Features.push_back("+reserve-i2"); + + if (Args.hasArg(options::OPT_ffixed_i3)) + Features.push_back("+reserve-i3"); + + if (Args.hasArg(options::OPT_ffixed_i4)) + Features.push_back("+reserve-i4"); + + if (Args.hasArg(options::OPT_ffixed_i5)) + Features.push_back("+reserve-i5"); } diff --git a/clang/test/Driver/sparc-fixed-register.c b/clang/test/Driver/sparc-fixed-register.c new file mode 100644 index 0000000..24880b9 --- /dev/null +++ b/clang/test/Driver/sparc-fixed-register.c @@ -0,0 +1,181 @@ +// RUN: %clang --target=sparc-none-gnu -ffixed-g1 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G1 < %t %s +// CHECK-FIXED-G1: "-target-feature" "+reserve-g1" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g2 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G2 < %t %s +// CHECK-FIXED-G2: "-target-feature" "+reserve-g2" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g3 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G3 < %t %s +// CHECK-FIXED-G3: "-target-feature" "+reserve-g3" + +// RUN: %clang --target=sparc-none-gnu 
-ffixed-g4 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G4 < %t %s +// CHECK-FIXED-G4: "-target-feature" "+reserve-g4" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g5 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G5 < %t %s +// CHECK-FIXED-G5: "-target-feature" "+reserve-g5" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g6 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G6 < %t %s +// CHECK-FIXED-G6: "-target-feature" "+reserve-g6" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g7 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G7 < %t %s +// CHECK-FIXED-G7: "-target-feature" "+reserve-g7" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o0 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O0 < %t %s +// CHECK-FIXED-O0: "-target-feature" "+reserve-o0" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o1 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O1 < %t %s +// CHECK-FIXED-O1: "-target-feature" "+reserve-o1" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o2 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O2 < %t %s +// CHECK-FIXED-O2: "-target-feature" "+reserve-o2" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o3 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O3 < %t %s +// CHECK-FIXED-O3: "-target-feature" "+reserve-o3" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o4 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O4 < %t %s +// CHECK-FIXED-O4: "-target-feature" "+reserve-o4" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o5 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O5 < %t %s +// CHECK-FIXED-O5: "-target-feature" "+reserve-o5" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l0 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L0 < %t %s +// CHECK-FIXED-L0: "-target-feature" "+reserve-l0" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l1 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L1 < %t %s +// CHECK-FIXED-L1: "-target-feature" "+reserve-l1" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l2 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L2 < %t %s +// CHECK-FIXED-L2: "-target-feature" "+reserve-l2" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l3 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L3 < %t %s +// CHECK-FIXED-L3: "-target-feature" "+reserve-l3" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l4 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L4 < %t %s +// CHECK-FIXED-L4: "-target-feature" "+reserve-l4" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l5 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L5 < %t %s +// CHECK-FIXED-L5: "-target-feature" "+reserve-l5" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l6 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L6 < %t %s +// CHECK-FIXED-L6: "-target-feature" "+reserve-l6" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l7 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L7 < %t %s +// CHECK-FIXED-L7: "-target-feature" "+reserve-l7" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i0 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I0 < %t %s +// CHECK-FIXED-I0: "-target-feature" "+reserve-i0" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i1 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I1 < %t %s +// CHECK-FIXED-I1: "-target-feature" "+reserve-i1" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i2 -### %s 2> %t +// RUN: 
FileCheck --check-prefix=CHECK-FIXED-I2 < %t %s +// CHECK-FIXED-I2: "-target-feature" "+reserve-i2" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i3 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I3 < %t %s +// CHECK-FIXED-I3: "-target-feature" "+reserve-i3" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i4 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I4 < %t %s +// CHECK-FIXED-I4: "-target-feature" "+reserve-i4" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i5 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I5 < %t %s +// CHECK-FIXED-I5: "-target-feature" "+reserve-i5" + +// Test multiple reserve-* options together. +// RUN: %clang --target=sparc-none-gnu \ +// RUN: -ffixed-g1 \ +// RUN: -ffixed-o2 \ +// RUN: -ffixed-l3 \ +// RUN: -ffixed-i4 \ +// RUN: -### %s 2> %t +// RUN: FileCheck \ +// RUN: --check-prefix=CHECK-FIXED-G1 \ +// RUN: --check-prefix=CHECK-FIXED-O2 \ +// RUN: --check-prefix=CHECK-FIXED-L3 \ +// RUN: --check-prefix=CHECK-FIXED-I4 \ +// RUN: < %t %s + +// Test all reserve-* options together. +// RUN: %clang --target=sparc-none-gnu \ +// RUN: -ffixed-g1 \ +// RUN: -ffixed-g2 \ +// RUN: -ffixed-g3 \ +// RUN: -ffixed-g4 \ +// RUN: -ffixed-g5 \ +// RUN: -ffixed-g6 \ +// RUN: -ffixed-g7 \ +// RUN: -ffixed-o0 \ +// RUN: -ffixed-o1 \ +// RUN: -ffixed-o2 \ +// RUN: -ffixed-o3 \ +// RUN: -ffixed-o4 \ +// RUN: -ffixed-o5 \ +// RUN: -ffixed-l0 \ +// RUN: -ffixed-l1 \ +// RUN: -ffixed-l2 \ +// RUN: -ffixed-l3 \ +// RUN: -ffixed-l4 \ +// RUN: -ffixed-l5 \ +// RUN: -ffixed-l6 \ +// RUN: -ffixed-l7 \ +// RUN: -ffixed-i0 \ +// RUN: -ffixed-i1 \ +// RUN: -ffixed-i2 \ +// RUN: -ffixed-i3 \ +// RUN: -ffixed-i4 \ +// RUN: -ffixed-i5 \ +// RUN: -### %s 2> %t +// RUN: FileCheck \ +// RUN: --check-prefix=CHECK-FIXED-G1 \ +// RUN: --check-prefix=CHECK-FIXED-G2 \ +// RUN: --check-prefix=CHECK-FIXED-G3 \ +// RUN: --check-prefix=CHECK-FIXED-G4 \ +// RUN: --check-prefix=CHECK-FIXED-G5 \ +// RUN: --check-prefix=CHECK-FIXED-G6 \ +// RUN: --check-prefix=CHECK-FIXED-G7 \ +// RUN: --check-prefix=CHECK-FIXED-O0 \ +// RUN: --check-prefix=CHECK-FIXED-O1 \ +// RUN: --check-prefix=CHECK-FIXED-O2 \ +// RUN: --check-prefix=CHECK-FIXED-O3 \ +// RUN: --check-prefix=CHECK-FIXED-O4 \ +// RUN: --check-prefix=CHECK-FIXED-O5 \ +// RUN: --check-prefix=CHECK-FIXED-L0 \ +// RUN: --check-prefix=CHECK-FIXED-L1 \ +// RUN: --check-prefix=CHECK-FIXED-L2 \ +// RUN: --check-prefix=CHECK-FIXED-L3 \ +// RUN: --check-prefix=CHECK-FIXED-L4 \ +// RUN: --check-prefix=CHECK-FIXED-L5 \ +// RUN: --check-prefix=CHECK-FIXED-L6 \ +// RUN: --check-prefix=CHECK-FIXED-L7 \ +// RUN: --check-prefix=CHECK-FIXED-I0 \ +// RUN: --check-prefix=CHECK-FIXED-I1 \ +// RUN: --check-prefix=CHECK-FIXED-I2 \ +// RUN: --check-prefix=CHECK-FIXED-I3 \ +// RUN: --check-prefix=CHECK-FIXED-I4 \ +// RUN: --check-prefix=CHECK-FIXED-I5 \ +// RUN: < %t %s diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td index 7b10339..38a59e6 100644 --- a/llvm/lib/Target/Sparc/Sparc.td +++ b/llvm/lib/Target/Sparc/Sparc.td @@ -72,6 +72,20 @@ def TuneSlowRDPC : SubtargetFeature<"slow-rdpc", "HasSlowRDPC", "true", //==== Features added predominantly for LEON subtarget support include "LeonFeatures.td" +//==== Register allocation tweaks needed by some low-level software +foreach i = 1 ... 7 in + def FeatureReserveG#i : SubtargetFeature<"reserve-g"#i, "ReserveRegister["#i#" + SP::G0]", "true", + "Reserve G"#i#", making it unavailable as a GPR">; +foreach i = 0 ... 
5 in + def FeatureReserveO#i : SubtargetFeature<"reserve-o"#i, "ReserveRegister["#i#" + SP::O0]", "true", + "Reserve O"#i#", making it unavailable as a GPR">; +foreach i = 0 ... 7 in + def FeatureReserveL#i : SubtargetFeature<"reserve-l"#i, "ReserveRegister["#i#" + SP::L0]", "true", + "Reserve L"#i#", making it unavailable as a GPR">; +foreach i = 0 ... 5 in + def FeatureReserveI#i : SubtargetFeature<"reserve-i"#i, "ReserveRegister["#i#" + SP::I0]", "true", + "Reserve I"#i#", making it unavailable as a GPR">; + //===----------------------------------------------------------------------===// // Register File, Calling Conv, Instruction Descriptions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 78bdf3a..bdefb08 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -13,6 +13,7 @@ #include "SparcISelLowering.h" #include "MCTargetDesc/SparcMCExpr.h" +#include "MCTargetDesc/SparcMCTargetDesc.h" #include "SparcMachineFunctionInfo.h" #include "SparcRegisterInfo.h" #include "SparcTargetMachine.h" @@ -28,6 +29,7 @@ #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/Support/ErrorHandling.h" @@ -729,6 +731,30 @@ SDValue SparcTargetLowering::LowerFormalArguments_64( return Chain; } +// Check whether any of the argument registers are reserved +static bool isAnyArgRegReserved(const SparcRegisterInfo *TRI, + const MachineFunction &MF) { + // The register window design means that outgoing parameters at O* + // will appear in the callee as I*. + // Be conservative and check both sides of the register names. + bool Outgoing = + llvm::any_of(SP::GPROutgoingArgRegClass, [TRI, &MF](MCPhysReg r) { + return TRI->isReservedReg(MF, r); + }); + bool Incoming = + llvm::any_of(SP::GPRIncomingArgRegClass, [TRI, &MF](MCPhysReg r) { + return TRI->isReservedReg(MF, r); + }); + return Outgoing || Incoming; +} + +static void emitReservedArgRegCallError(const MachineFunction &MF) { + const Function &F = MF.getFunction(); + F.getContext().diagnose(DiagnosticInfoUnsupported{ + F, ("SPARC doesn't support" + " function calls if any of the argument registers is reserved.")}); +} + SDValue SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { @@ -805,6 +831,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, bool &isTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool isVarArg = CLI.IsVarArg; + MachineFunction &MF = DAG.getMachineFunction(); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; @@ -1055,6 +1082,10 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, ((hasReturnsTwice) ? 
TRI->getRTCallPreservedMask(CallConv) : TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv)); + + if (isAnyArgRegReserved(TRI, MF)) + emitReservedArgRegCallError(MF); + assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -1125,6 +1156,13 @@ Register SparcTargetLowering::getRegisterByName(const char* RegName, LLT VT, .Case("g4", SP::G4).Case("g5", SP::G5).Case("g6", SP::G6).Case("g7", SP::G7) .Default(0); + // If we're directly referencing register names + // (e.g. in GCC C extension `register int r asm("g1");`), + // make sure that said register is in the reserve list. + const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo(); + if (!TRI->isReservedReg(MF, Reg)) + Reg = 0; + if (Reg) return Reg; @@ -1189,6 +1227,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, SDLoc DL = CLI.DL; SDValue Chain = CLI.Chain; auto PtrVT = getPointerTy(DAG.getDataLayout()); + MachineFunction &MF = DAG.getMachineFunction(); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; @@ -1372,6 +1411,10 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, ((hasReturnsTwice) ? TRI->getRTCallPreservedMask(CLI.CallConv) : TRI->getCallPreservedMask(DAG.getMachineFunction(), CLI.CallConv)); + + if (isAnyArgRegReserved(TRI, MF)) + emitReservedArgRegCallError(MF); + assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp b/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp index f97bf57..71a27f7 100644 --- a/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -12,10 +12,8 @@ #include "SparcRegisterInfo.h" #include "Sparc.h" -#include "SparcMachineFunctionInfo.h" #include "SparcSubtarget.h" #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -98,9 +96,21 @@ BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const { for (unsigned n = 0; n < 31; n++) Reserved.set(SP::ASR1 + n); + for (TargetRegisterClass::iterator i = SP::IntRegsRegClass.begin(); + i != SP::IntRegsRegClass.end(); ++i) { + if (MF.getSubtarget<SparcSubtarget>().isRegisterReserved(*i)) + markSuperRegs(Reserved, *i); + } + + assert(checkAllSuperRegsMarked(Reserved)); return Reserved; } +bool SparcRegisterInfo::isReservedReg(const MachineFunction &MF, + MCRegister Reg) const { + return getReservedRegs(MF)[Reg]; +} + const TargetRegisterClass* SparcRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.h b/llvm/lib/Target/Sparc/SparcRegisterInfo.h index 5b3c1a7..58c85f3 100644 --- a/llvm/lib/Target/Sparc/SparcRegisterInfo.h +++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.h @@ -30,6 +30,7 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo { const uint32_t* getRTCallPreservedMask(CallingConv::ID CC) const; BitVector getReservedRegs(const MachineFunction &MF) const override; + bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const; const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF, unsigned Kind) const override; diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.td b/llvm/lib/Target/Sparc/SparcRegisterInfo.td index d5ba746..d8319a8 100644 --- a/llvm/lib/Target/Sparc/SparcRegisterInfo.td +++ 
b/llvm/lib/Target/Sparc/SparcRegisterInfo.td @@ -370,6 +370,10 @@ def LowQFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 7)>; // Floating point control register classes. def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>; +// GPR argument registers. +def GPROutgoingArg : RegisterClass<"SP", [i32, i64], 32, (sequence "O%u", 0, 5)>; +def GPRIncomingArg : RegisterClass<"SP", [i32, i64], 32, (sequence "I%u", 0, 5)>; + let isAllocatable = 0 in { // Ancillary state registers // FIXME: TICK is special-cased here as it can be accessed diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/llvm/lib/Target/Sparc/SparcSubtarget.cpp index 6b09904..5b65e34 100644 --- a/llvm/lib/Target/Sparc/SparcSubtarget.cpp +++ b/llvm/lib/Target/Sparc/SparcSubtarget.cpp @@ -50,6 +50,7 @@ SparcSubtarget::SparcSubtarget(const StringRef &CPU, const StringRef &TuneCPU, const StringRef &FS, const TargetMachine &TM, bool is64Bit) : SparcGenSubtargetInfo(TM.getTargetTriple(), CPU, TuneCPU, FS), + ReserveRegister(TM.getMCRegisterInfo()->getNumRegs()), TargetTriple(TM.getTargetTriple()), Is64Bit(is64Bit), InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)), TLInfo(TM, *this), FrameLowering(*this) {} diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.h b/llvm/lib/Target/Sparc/SparcSubtarget.h index cdb210f..fe4aca5 100644 --- a/llvm/lib/Target/Sparc/SparcSubtarget.h +++ b/llvm/lib/Target/Sparc/SparcSubtarget.h @@ -13,12 +13,14 @@ #ifndef LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H #define LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H +#include "MCTargetDesc/SparcMCTargetDesc.h" #include "SparcFrameLowering.h" #include "SparcISelLowering.h" #include "SparcInstrInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" #include <string> @@ -29,6 +31,10 @@ namespace llvm { class StringRef; class SparcSubtarget : public SparcGenSubtargetInfo { + // ReserveRegister[i] - Register #i is not available as a general purpose + // register. + BitVector ReserveRegister; + Triple TargetTriple; virtual void anchor(); @@ -82,6 +88,10 @@ public: return is64Bit() ? 2047 : 0; } + bool isRegisterReserved(MCPhysReg PhysReg) const { + return ReserveRegister[PhysReg]; + } + /// Given an actual stack size as determined by FrameInfo, this function /// returns adjusted framesize which includes space for register window /// spills and arguments. diff --git a/llvm/test/CodeGen/SPARC/reserved-arg-regs.ll b/llvm/test/CodeGen/SPARC/reserved-arg-regs.ll new file mode 100644 index 0000000..3587ecb --- /dev/null +++ b/llvm/test/CodeGen/SPARC/reserved-arg-regs.ll @@ -0,0 +1,25 @@ +;; Test reserving argument registers. +; RUN: not llc < %s -mtriple=sparc-linux-gnu -mattr=+reserve-o0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-O0 +; RUN: not llc < %s -mtriple=sparc64-linux-gnu -mattr=+reserve-o0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-O0 +; RUN: not llc < %s -mtriple=sparc-linux-gnu -mattr=+reserve-i0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-I0 +; RUN: not llc < %s -mtriple=sparc64-linux-gnu -mattr=+reserve-i0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-I0 + +; CHECK-RESERVED-O0: error: +; CHECK-RESERVED-O0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved. +; CHECK-RESERVED-I0: error: +; CHECK-RESERVED-I0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved. 
+define void @call_function() { + call void @foo() + ret void +} +declare void @foo() + +; CHECK-RESERVED-O0: error: +; CHECK-RESERVED-O0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved. +; CHECK-RESERVED-I0: error: +; CHECK-RESERVED-I0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved. +define void @call_function_with_arg(i8 %in) { + call void @bar(i8 %in) + ret void +} +declare void @bar(i8) diff --git a/llvm/test/CodeGen/SPARC/reserved-regs-named.ll b/llvm/test/CodeGen/SPARC/reserved-regs-named.ll new file mode 100644 index 0000000..91808be --- /dev/null +++ b/llvm/test/CodeGen/SPARC/reserved-regs-named.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-l0 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-L0 + +;; Ensure explicit register references are caught as well. + +; CHECK-RESERVED-L0: %l0 +define void @set_reg(i32 zeroext %x) { +entry: + tail call void @llvm.write_register.i32(metadata !0, i32 %x) + ret void +} + +declare void @llvm.write_register.i32(metadata, i32) +!0 = !{!"l0"} diff --git a/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll b/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll new file mode 100644 index 0000000..53ca045 --- /dev/null +++ b/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll @@ -0,0 +1,14 @@ +; RUN: not --crash llc -mtriple=sparc64-linux-gnu -o - %s 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-L0 + +;; Ensure explicit register references for non-reserved registers +;; are caught properly. + +; CHECK-RESERVED-L0: LLVM ERROR: Invalid register name global variable +define void @set_reg(i32 zeroext %x) { +entry: + tail call void @llvm.write_register.i32(metadata !0, i32 %x) + ret void +} + +declare void @llvm.write_register.i32(metadata, i32) +!0 = !{!"l0"} diff --git a/llvm/test/CodeGen/SPARC/reserved-regs.ll b/llvm/test/CodeGen/SPARC/reserved-regs.ll index 27ebf47..bf46177 100644 --- a/llvm/test/CodeGen/SPARC/reserved-regs.ll +++ b/llvm/test/CodeGen/SPARC/reserved-regs.ll @@ -1,5 +1,14 @@ ; RUN: llc -march=sparc -verify-machineinstrs < %s | FileCheck %s +;; Test reserve-* options. +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-g1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-G1 +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-o1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-O1 +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-l1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-L1 +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-i1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-I1 + +;; Test multiple reserve-* options together. 
+; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-g1 -mattr=+reserve-o1 -mattr=+reserve-l1 -mattr=+reserve-i1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-G1,CHECK-RESERVED-O1,CHECK-RESERVED-L1,CHECK-RESERVED-I1 + @g = common global [32 x i32] zeroinitializer, align 16 @h = common global [16 x i64] zeroinitializer, align 16 @@ -16,6 +25,10 @@ ; CHECK-NOT: %o6 ; CHECK-NOT: %i6 ; CHECK-NOT: %i7 +; CHECK-RESERVED-G1-NOT: %g1 +; CHECK-RESERVED-O1-NOT: %o1 +; CHECK-RESERVED-L1-NOT: %l1 +; CHECK-RESERVED-I1-NOT: %i1 ; CHECK: ret define void @use_all_i32_regs() { entry: @@ -100,6 +113,10 @@ entry: ; CHECK-NOT: %o7 ; CHECK-NOT: %i6 ; CHECK-NOT: %i7 +; CHECK-RESERVED-G1-NOT: %g1 +; CHECK-RESERVED-O1-NOT: %o1 +; CHECK-RESERVED-L1-NOT: %l1 +; CHECK-RESERVED-I1-NOT: %i1 ; CHECK: ret define void @use_all_i64_regs() { entry: -- cgit v1.1 From d9124332aa3b95725b149617066fdd1f525b530d Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 10:40:05 +0300 Subject: [clang][NFC] Annotate `Sema/DelayedDiagnostic.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Sema/DelayedDiagnostic.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/include/clang/Sema/DelayedDiagnostic.h b/clang/include/clang/Sema/DelayedDiagnostic.h index 9de7131..0105089 100644 --- a/clang/include/clang/Sema/DelayedDiagnostic.h +++ b/clang/include/clang/Sema/DelayedDiagnostic.h @@ -111,7 +111,9 @@ public: } private: + LLVM_PREFERRED_TYPE(AccessSpecifier) unsigned Access : 2; + LLVM_PREFERRED_TYPE(bool) unsigned IsMember : 1; NamedDecl *Target; CXXRecordDecl *NamingClass; -- cgit v1.1 From c0ed1b2c08ab3b75e79d90fcda7e949ca50400a5 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 10:47:45 +0300 Subject: [clang][NFC] Annotate `Basic/Visibility.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Basic/Visibility.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/include/clang/Basic/Visibility.h b/clang/include/clang/Basic/Visibility.h index 1e19630..b9693e6 100644 --- a/clang/include/clang/Basic/Visibility.h +++ b/clang/include/clang/Basic/Visibility.h @@ -51,8 +51,11 @@ inline Visibility minVisibility(Visibility L, Visibility R) { } class LinkageInfo { + LLVM_PREFERRED_TYPE(Linkage) uint8_t linkage_ : 3; + LLVM_PREFERRED_TYPE(Visibility) uint8_t visibility_ : 2; + LLVM_PREFERRED_TYPE(bool) uint8_t explicit_ : 1; void setVisibility(Visibility V, bool E) { visibility_ = V; explicit_ = E; } -- cgit v1.1 From 07ec9a3799fa1e80888f8bd0c1101ad6dd546842 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 10:58:03 +0300 Subject: [clang][NFC] Partially annotate `CGFunctionInfo.h` with `preferred_type` `CallingConvention` and `EffectiveCallingConvention` bit-fields that hold `llvm::CallingConv` are impossible to annotate at the moment, as `llvm::CallingConv` is actually a namespace with an unnamed enum inside. --- clang/include/clang/CodeGen/CGFunctionInfo.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/clang/include/clang/CodeGen/CGFunctionInfo.h b/clang/include/clang/CodeGen/CGFunctionInfo.h index e388901..811f334 100644 --- a/clang/include/clang/CodeGen/CGFunctionInfo.h +++ b/clang/include/clang/CodeGen/CGFunctionInfo.h @@ -564,35 +564,45 @@ class CGFunctionInfo final unsigned EffectiveCallingConvention : 8; /// The clang::CallingConv that this was originally created with. 
+ LLVM_PREFERRED_TYPE(CallingConv) unsigned ASTCallingConvention : 6; /// Whether this is an instance method. + LLVM_PREFERRED_TYPE(bool) unsigned InstanceMethod : 1; /// Whether this is a chain call. + LLVM_PREFERRED_TYPE(bool) unsigned ChainCall : 1; /// Whether this function is called by forwarding arguments. /// This doesn't support inalloca or varargs. + LLVM_PREFERRED_TYPE(bool) unsigned DelegateCall : 1; /// Whether this function is a CMSE nonsecure call + LLVM_PREFERRED_TYPE(bool) unsigned CmseNSCall : 1; /// Whether this function is noreturn. + LLVM_PREFERRED_TYPE(bool) unsigned NoReturn : 1; /// Whether this function is returns-retained. + LLVM_PREFERRED_TYPE(bool) unsigned ReturnsRetained : 1; /// Whether this function saved caller registers. + LLVM_PREFERRED_TYPE(bool) unsigned NoCallerSavedRegs : 1; /// How many arguments to pass inreg. + LLVM_PREFERRED_TYPE(bool) unsigned HasRegParm : 1; unsigned RegParm : 3; /// Whether this function has nocf_check attribute. + LLVM_PREFERRED_TYPE(bool) unsigned NoCfCheck : 1; /// Log 2 of the maximum vector width. @@ -604,6 +614,7 @@ class CGFunctionInfo final /// passing non-trivial types with inalloca. Not part of the profile. llvm::StructType *ArgStruct; unsigned ArgStructAlign : 31; + LLVM_PREFERRED_TYPE(bool) unsigned HasExtParameterInfos : 1; unsigned NumArgs; -- cgit v1.1 From c112f963ce2b2efc8da765a1161402cebfa379b8 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:00:28 +0300 Subject: [clang][NFC] Annotate `AnalysisBasedWarning.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Sema/AnalysisBasedWarnings.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/include/clang/Sema/AnalysisBasedWarnings.h b/clang/include/clang/Sema/AnalysisBasedWarnings.h index 020ddd3..aafe227 100644 --- a/clang/include/clang/Sema/AnalysisBasedWarnings.h +++ b/clang/include/clang/Sema/AnalysisBasedWarnings.h @@ -34,9 +34,13 @@ public: class Policy { friend class AnalysisBasedWarnings; // The warnings to run. + LLVM_PREFERRED_TYPE(bool) unsigned enableCheckFallThrough : 1; + LLVM_PREFERRED_TYPE(bool) unsigned enableCheckUnreachable : 1; + LLVM_PREFERRED_TYPE(bool) unsigned enableThreadSafetyAnalysis : 1; + LLVM_PREFERRED_TYPE(bool) unsigned enableConsumedAnalysis : 1; public: Policy(); -- cgit v1.1 From 0764254e014db8783a31e84a322636c651bc7d6d Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:04:12 +0300 Subject: [clang][NFC] Annotate `StmtOpenMP.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/AST/StmtOpenMP.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h index 6216433..3cb3c10 100644 --- a/clang/include/clang/AST/StmtOpenMP.h +++ b/clang/include/clang/AST/StmtOpenMP.h @@ -2974,6 +2974,7 @@ class OMPAtomicDirective : public OMPExecutableDirective { /// This field is 1 for the first form of the expression and 0 for the /// second. Required for correct codegen of non-associative operations (like /// << or >>). + LLVM_PREFERRED_TYPE(bool) uint8_t IsXLHSInRHSPart : 1; /// Used for 'atomic update' or 'atomic capture' constructs. They may /// have atomic expressions of forms: @@ -2983,9 +2984,11 @@ class OMPAtomicDirective : public OMPExecutableDirective { /// \endcode /// This field is 1 for the first(postfix) form of the expression and 0 /// otherwise. 
+ LLVM_PREFERRED_TYPE(bool) uint8_t IsPostfixUpdate : 1; /// 1 if 'v' is updated only when the condition is false (compare capture /// only). + LLVM_PREFERRED_TYPE(bool) uint8_t IsFailOnly : 1; } Flags; -- cgit v1.1 From c8a12ed413aae2c2602c880395270acbdbb15e70 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:09:34 +0300 Subject: [clang][NFC] Annotate `CommentCommandTraits.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/AST/CommentCommandTraits.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/clang/include/clang/AST/CommentCommandTraits.h b/clang/include/clang/AST/CommentCommandTraits.h index 83a29a5..0c3254d 100644 --- a/clang/include/clang/AST/CommentCommandTraits.h +++ b/clang/include/clang/AST/CommentCommandTraits.h @@ -50,52 +50,65 @@ struct CommandInfo { unsigned NumArgs : 4; /// True if this command is an inline command (of any kind). + LLVM_PREFERRED_TYPE(bool) unsigned IsInlineCommand : 1; /// True if this command is a block command (of any kind). + LLVM_PREFERRED_TYPE(bool) unsigned IsBlockCommand : 1; /// True if this command is introducing a brief documentation /// paragraph (\or an alias). + LLVM_PREFERRED_TYPE(bool) unsigned IsBriefCommand : 1; /// True if this command is \\returns or an alias. + LLVM_PREFERRED_TYPE(bool) unsigned IsReturnsCommand : 1; /// True if this command is introducing documentation for a function /// parameter (\\param or an alias). + LLVM_PREFERRED_TYPE(bool) unsigned IsParamCommand : 1; /// True if this command is introducing documentation for /// a template parameter (\\tparam or an alias). + LLVM_PREFERRED_TYPE(bool) unsigned IsTParamCommand : 1; /// True if this command is \\throws or an alias. + LLVM_PREFERRED_TYPE(bool) unsigned IsThrowsCommand : 1; /// True if this command is \\deprecated or an alias. + LLVM_PREFERRED_TYPE(bool) unsigned IsDeprecatedCommand : 1; /// True if this is a \\headerfile-like command. + LLVM_PREFERRED_TYPE(bool) unsigned IsHeaderfileCommand : 1; /// True if we don't want to warn about this command being passed an empty /// paragraph. Meaningful only for block commands. + LLVM_PREFERRED_TYPE(bool) unsigned IsEmptyParagraphAllowed : 1; /// True if this command is a verbatim-like block command. /// /// A verbatim-like block command eats every character (except line starting /// decorations) until matching end command is seen or comment end is hit. + LLVM_PREFERRED_TYPE(bool) unsigned IsVerbatimBlockCommand : 1; /// True if this command is an end command for a verbatim-like block. + LLVM_PREFERRED_TYPE(bool) unsigned IsVerbatimBlockEndCommand : 1; /// True if this command is a verbatim line command. /// /// A verbatim-like line command eats everything until a newline is seen or /// comment end is hit. + LLVM_PREFERRED_TYPE(bool) unsigned IsVerbatimLineCommand : 1; /// True if this command contains a declaration for the entity being @@ -105,20 +118,25 @@ struct CommandInfo { /// \code /// \fn void f(int a); /// \endcode + LLVM_PREFERRED_TYPE(bool) unsigned IsDeclarationCommand : 1; /// True if verbatim-like line command is a function declaration. + LLVM_PREFERRED_TYPE(bool) unsigned IsFunctionDeclarationCommand : 1; /// True if block command is further describing a container API; such /// as \@coclass, \@classdesign, etc. + LLVM_PREFERRED_TYPE(bool) unsigned IsRecordLikeDetailCommand : 1; /// True if block command is a container API; such as \@interface. 
+ LLVM_PREFERRED_TYPE(bool) unsigned IsRecordLikeDeclarationCommand : 1; /// True if this command is unknown. This \c CommandInfo object was /// created during parsing. + LLVM_PREFERRED_TYPE(bool) unsigned IsUnknownCommand : 1; }; -- cgit v1.1 From ab2cef5391cc3434bc54b755810c51b55e9a04fc Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:16:58 +0300 Subject: [clang][NFC] Annotate `Analysis/CFG.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Analysis/CFG.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/include/clang/Analysis/CFG.h b/clang/include/clang/Analysis/CFG.h index 9f776ca..a7ff38c 100644 --- a/clang/include/clang/Analysis/CFG.h +++ b/clang/include/clang/Analysis/CFG.h @@ -879,6 +879,7 @@ private: /// /// Optimization Note: This bit could be profitably folded with Terminator's /// storage if the memory usage of CFGBlock becomes an issue. + LLVM_PREFERRED_TYPE(bool) unsigned HasNoReturnElement : 1; /// The parent CFG that owns this CFGBlock. @@ -1007,7 +1008,9 @@ public: class FilterOptions { public: + LLVM_PREFERRED_TYPE(bool) unsigned IgnoreNullPredecessors : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IgnoreDefaultsWithCoveredEnums : 1; FilterOptions() -- cgit v1.1 From 535da10842c7309e9eeaf9828cf6bb034fecaf16 Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Sun, 11 Feb 2024 08:17:53 +0000 Subject: [lldb] Check whether lldb can trace/attach/set a breakpoint in a process or load a file to debug on FreeBSD. Before emitting the generic EINVAL message, we check whether the `security.bsd.unprivileged_proc_debug` sysctl allows process debugging. Closes #79634 --- .../Process/FreeBSD/NativeProcessFreeBSD.cpp | 39 +++++++++++++++++++--- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp index 19e0986..9c620e4 100644 --- a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp +++ b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp @@ -48,20 +48,38 @@ static Status EnsureFDFlags(int fd, int flags) { return error; } +static Status CanTrace() { + int proc_debug, ret; + size_t len = sizeof(proc_debug); + ret = ::sysctlbyname("security.bsd.unprivileged_proc_debug", &proc_debug, + &len, nullptr, 0); + if (ret != 0) + return Status("sysctlbyname() security.bsd.unprivileged_proc_debug failed"); + + if (proc_debug < 1) + return Status( + "process debug disabled by security.bsd.unprivileged_proc_debug oid"); + + return {}; +} + // Public Static Methods llvm::Expected<std::unique_ptr<NativeProcessProtocol>> NativeProcessFreeBSD::Manager::Launch(ProcessLaunchInfo &launch_info, NativeDelegate &native_delegate) { Log *log = GetLog(POSIXLog::Process); - Status status; + ::pid_t pid = ProcessLauncherPosixFork() .LaunchProcess(launch_info, status) .GetProcessId(); LLDB_LOG(log, "pid = {0:x}", pid); if (status.Fail()) { + auto error = CanTrace(); LLDB_LOG(log, "failed to launch process: {0}", status); + if (error.Fail()) + return error.ToError(); return status.ToError(); } @@ -392,8 +410,11 @@ Status NativeProcessFreeBSD::PtraceWrapper(int req, lldb::pid_t pid, void *addr, ret = ptrace(req, static_cast<::pid_t>(pid), static_cast<caddr_t>(addr), data); - if (ret == -1) - error.SetErrorToErrno(); + if (ret == -1) { + error = CanTrace(); + if (error.Success()) + error.SetErrorToErrno(); + } if (result) *result = ret; @@ -707,8 +728,12 @@ Status NativeProcessFreeBSD::SetBreakpoint(lldb::addr_t addr, uint32_t size, 
Status NativeProcessFreeBSD::GetLoadedModuleFileSpec(const char *module_path, FileSpec &file_spec) { Status error = PopulateMemoryRegionCache(); - if (error.Fail()) + if (error.Fail()) { + auto status = CanTrace(); + if (status.Fail()) + return status; return error; + } FileSpec module_file_spec(module_path); FileSystem::Instance().Resolve(module_file_spec); @@ -729,8 +754,12 @@ NativeProcessFreeBSD::GetFileLoadAddress(const llvm::StringRef &file_name, lldb::addr_t &load_addr) { load_addr = LLDB_INVALID_ADDRESS; Status error = PopulateMemoryRegionCache(); - if (error.Fail()) + if (error.Fail()) { + auto status = CanTrace(); + if (status.Fail()) + return status; return error; + } FileSpec file(file_name); for (const auto &it : m_mem_region_cache) { -- cgit v1.1 From 6496948a427fc8f815f7c21cd068acd046873cca Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:28:37 +0300 Subject: [clang][NFC] Partially annotate `APINotes/Types.h` with `preferred_type` `RawRetainCountConversion` bit-field requires a new enumerator in `RetainCountConventionKind` to be annotated. --- clang/include/clang/APINotes/Types.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/clang/include/clang/APINotes/Types.h b/clang/include/clang/APINotes/Types.h index 1d116be..93bb045 100644 --- a/clang/include/clang/APINotes/Types.h +++ b/clang/include/clang/APINotes/Types.h @@ -55,16 +55,20 @@ public: std::string UnavailableMsg; /// Whether this entity is marked unavailable. + LLVM_PREFERRED_TYPE(bool) unsigned Unavailable : 1; /// Whether this entity is marked unavailable in Swift. + LLVM_PREFERRED_TYPE(bool) unsigned UnavailableInSwift : 1; private: /// Whether SwiftPrivate was specified. + LLVM_PREFERRED_TYPE(bool) unsigned SwiftPrivateSpecified : 1; /// Whether this entity is considered "private" to a Swift overlay. + LLVM_PREFERRED_TYPE(bool) unsigned SwiftPrivate : 1; public: @@ -191,18 +195,25 @@ inline bool operator!=(const CommonTypeInfo &LHS, const CommonTypeInfo &RHS) { /// Describes API notes data for an Objective-C class or protocol. class ObjCContextInfo : public CommonTypeInfo { /// Whether this class has a default nullability. + LLVM_PREFERRED_TYPE(bool) unsigned HasDefaultNullability : 1; /// The default nullability. + LLVM_PREFERRED_TYPE(NullabilityKind) unsigned DefaultNullability : 2; /// Whether this class has designated initializers recorded. + LLVM_PREFERRED_TYPE(bool) unsigned HasDesignatedInits : 1; + LLVM_PREFERRED_TYPE(bool) unsigned SwiftImportAsNonGenericSpecified : 1; + LLVM_PREFERRED_TYPE(bool) unsigned SwiftImportAsNonGeneric : 1; + LLVM_PREFERRED_TYPE(bool) unsigned SwiftObjCMembersSpecified : 1; + LLVM_PREFERRED_TYPE(bool) unsigned SwiftObjCMembers : 1; public: @@ -298,10 +309,12 @@ inline bool operator!=(const ObjCContextInfo &LHS, const ObjCContextInfo &RHS) { /// API notes for a variable/property. class VariableInfo : public CommonEntityInfo { /// Whether this property has been audited for nullability. + LLVM_PREFERRED_TYPE(bool) unsigned NullabilityAudited : 1; /// The kind of nullability for this property. Only valid if the nullability /// has been audited. + LLVM_PREFERRED_TYPE(NullabilityKind) unsigned Nullable : 2; /// The C type of the variable, as a string. @@ -352,7 +365,9 @@ inline bool operator!=(const VariableInfo &LHS, const VariableInfo &RHS) { /// Describes API notes data for an Objective-C property. 
class ObjCPropertyInfo : public VariableInfo { + LLVM_PREFERRED_TYPE(bool) unsigned SwiftImportAsAccessorsSpecified : 1; + LLVM_PREFERRED_TYPE(bool) unsigned SwiftImportAsAccessors : 1; public: @@ -409,9 +424,11 @@ inline bool operator!=(const ObjCPropertyInfo &LHS, /// Describes a function or method parameter. class ParamInfo : public VariableInfo { /// Whether noescape was specified. + LLVM_PREFERRED_TYPE(bool) unsigned NoEscapeSpecified : 1; /// Whether the this parameter has the 'noescape' attribute. + LLVM_PREFERRED_TYPE(bool) unsigned NoEscape : 1; /// A biased RetainCountConventionKind, where 0 means "unspecified". @@ -488,6 +505,7 @@ public: // unknown nullability. /// Whether the signature has been audited with respect to nullability. + LLVM_PREFERRED_TYPE(bool) unsigned NullabilityAudited : 1; /// Number of types whose nullability is encoded with the NullabilityPayload. @@ -597,9 +615,11 @@ inline bool operator!=(const FunctionInfo &LHS, const FunctionInfo &RHS) { class ObjCMethodInfo : public FunctionInfo { public: /// Whether this is a designated initializer of its class. + LLVM_PREFERRED_TYPE(bool) unsigned DesignatedInit : 1; /// Whether this is a required initializer. + LLVM_PREFERRED_TYPE(bool) unsigned RequiredInit : 1; ObjCMethodInfo() : DesignatedInit(false), RequiredInit(false) {} @@ -650,7 +670,9 @@ public: /// Describes API notes data for a tag. class TagInfo : public CommonTypeInfo { + LLVM_PREFERRED_TYPE(bool) unsigned HasFlagEnum : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IsFlagEnum : 1; public: -- cgit v1.1 From fcd21624b082b0c42777f6047cdfbc8a59057001 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:44:02 +0300 Subject: [clang][NFC] Annotate `Driver.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Driver/Driver.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index 3ee1bcf..908bc87 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -232,10 +232,12 @@ public: bool IsDXCMode() const { return Mode == DXCMode; } /// Only print tool bindings, don't build any jobs. + LLVM_PREFERRED_TYPE(bool) unsigned CCCPrintBindings : 1; /// Set CC_PRINT_OPTIONS mode, which is like -v but logs the commands to /// CCPrintOptionsFilename or to stderr. + LLVM_PREFERRED_TYPE(bool) unsigned CCPrintOptions : 1; /// The format of the header information that is emitted. If CC_PRINT_HEADERS @@ -252,17 +254,21 @@ public: /// Set CC_LOG_DIAGNOSTICS mode, which causes the frontend to log diagnostics /// to CCLogDiagnosticsFilename or to stderr, in a stable machine readable /// format. + LLVM_PREFERRED_TYPE(bool) unsigned CCLogDiagnostics : 1; /// Whether the driver is generating diagnostics for debugging purposes. + LLVM_PREFERRED_TYPE(bool) unsigned CCGenDiagnostics : 1; /// Set CC_PRINT_PROC_STAT mode, which causes the driver to dump /// performance report to CC_PRINT_PROC_STAT_FILE or to stdout. + LLVM_PREFERRED_TYPE(bool) unsigned CCPrintProcessStats : 1; /// Set CC_PRINT_INTERNAL_STAT mode, which causes the driver to dump internal /// performance report to CC_PRINT_INTERNAL_STAT_FILE or to stdout. + LLVM_PREFERRED_TYPE(bool) unsigned CCPrintInternalStats : 1; /// Pointer to the ExecuteCC1Tool function, if available. @@ -303,9 +309,11 @@ private: /// Whether to check that input files exist when constructing compilation /// jobs. 
+ LLVM_PREFERRED_TYPE(bool) unsigned CheckInputsExist : 1; /// Whether to probe for PCH files on disk, in order to upgrade /// -include foo.h to -include-pch foo.h.pch. + LLVM_PREFERRED_TYPE(bool) unsigned ProbePrecompiled : 1; public: @@ -319,6 +327,7 @@ public: private: /// Certain options suppress the 'no input files' warning. + LLVM_PREFERRED_TYPE(bool) unsigned SuppressMissingInputWarning : 1; /// Cache of all the ToolChains in use by the driver. -- cgit v1.1 From 956722698172a806652ca8e2dba0a783a1c3d593 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:46:51 +0300 Subject: [clang][NFC] Annotate `CodeCompletionConsumer.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Sema/CodeCompleteConsumer.h | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/include/clang/Sema/CodeCompleteConsumer.h b/clang/include/clang/Sema/CodeCompleteConsumer.h index 274eaac..a2028e4 100644 --- a/clang/include/clang/Sema/CodeCompleteConsumer.h +++ b/clang/include/clang/Sema/CodeCompleteConsumer.h @@ -581,6 +581,7 @@ private: unsigned Priority : 16; /// The availability of this code-completion result. + LLVM_PREFERRED_TYPE(CXAvailabilityKind) unsigned Availability : 2; /// The name of the parent context. -- cgit v1.1 From 04812c72dee2c1c211306a4dd6d51e783f0c5015 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:49:20 +0300 Subject: [clang][NFC] Annotate `CodeCompletionOptions.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Sema/CodeCompleteOptions.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/clang/include/clang/Sema/CodeCompleteOptions.h b/clang/include/clang/Sema/CodeCompleteOptions.h index a3403b0..d8dc386 100644 --- a/clang/include/clang/Sema/CodeCompleteOptions.h +++ b/clang/include/clang/Sema/CodeCompleteOptions.h @@ -9,18 +9,23 @@ #ifndef LLVM_CLANG_SEMA_CODECOMPLETEOPTIONS_H #define LLVM_CLANG_SEMA_CODECOMPLETEOPTIONS_H +#include "llvm/Support/Compiler.h" + namespace clang { /// Options controlling the behavior of code completion. class CodeCompleteOptions { public: /// Show macros in code completion results. + LLVM_PREFERRED_TYPE(bool) unsigned IncludeMacros : 1; /// Show code patterns in code completion results. + LLVM_PREFERRED_TYPE(bool) unsigned IncludeCodePatterns : 1; /// Show top-level decls in code completion results. + LLVM_PREFERRED_TYPE(bool) unsigned IncludeGlobals : 1; /// Show decls in namespace (including the global namespace) in code @@ -29,18 +34,22 @@ public: /// Currently, this only works when completing qualified IDs (i.e. /// `Sema::CodeCompleteQualifiedId`). /// FIXME: consider supporting more completion cases with this option. + LLVM_PREFERRED_TYPE(bool) unsigned IncludeNamespaceLevelDecls : 1; /// Show brief documentation comments in code completion results. + LLVM_PREFERRED_TYPE(bool) unsigned IncludeBriefComments : 1; /// Hint whether to load data from the external AST to provide full results. /// If false, namespace-level declarations and macros from the preamble may be /// omitted. + LLVM_PREFERRED_TYPE(bool) unsigned LoadExternal : 1; /// Include results after corrections (small fix-its), e.g. change '.' to '->' /// on member access, etc. 
+ LLVM_PREFERRED_TYPE(bool) unsigned IncludeFixIts : 1; CodeCompleteOptions() -- cgit v1.1 From 1ee81076388078cb0cb1fbc90ad374fceafd0c98 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:52:10 +0300 Subject: [clang][NFC] Annotate `ASTReader.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Serialization/ASTReader.h | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index a4c7f54..2002bf2 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -721,6 +721,7 @@ private: unsigned ID; /// Whether this is a wildcard export. + LLVM_PREFERRED_TYPE(bool) unsigned IsWildcard : 1; /// String data. -- cgit v1.1 From 3bf89e5883ff0ea82ca4ad5cd511b77826b7bb71 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:55:07 +0300 Subject: [clang][NFC] Annotate `AST/Linkage.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/AST/Linkage.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/lib/AST/Linkage.h b/clang/lib/AST/Linkage.h index 31f384e..e4dcb5e 100644 --- a/clang/lib/AST/Linkage.h +++ b/clang/lib/AST/Linkage.h @@ -29,12 +29,15 @@ namespace clang { struct LVComputationKind { /// The kind of entity whose visibility is ultimately being computed; /// visibility computations for types and non-types follow different rules. + LLVM_PREFERRED_TYPE(bool) unsigned ExplicitKind : 1; /// Whether explicit visibility attributes should be ignored. When set, /// visibility may only be restricted by the visibility of template arguments. + LLVM_PREFERRED_TYPE(bool) unsigned IgnoreExplicitVisibility : 1; /// Whether all visibility should be ignored. When set, we're only interested /// in computing linkage. + LLVM_PREFERRED_TYPE(bool) unsigned IgnoreAllVisibility : 1; enum { NumLVComputationKindBits = 3 }; -- cgit v1.1 From 1366e4f594bdb4cd429423a1e07509e984838fa0 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:59:03 +0300 Subject: [clang][NFC] Annotate `Interp/Descriptor.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/AST/Interp/Descriptor.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h index 6cca9d5..6a53205 100644 --- a/clang/lib/AST/Interp/Descriptor.h +++ b/clang/lib/AST/Interp/Descriptor.h @@ -59,17 +59,22 @@ struct InlineDescriptor { /// Flag indicating if the storage is constant or not. /// Relevant for primitive fields. + LLVM_PREFERRED_TYPE(bool) unsigned IsConst : 1; /// For primitive fields, it indicates if the field was initialized. /// Primitive fields in static storage are always initialized. /// Arrays are always initialized, even though their elements might not be. /// Base classes are initialized after the constructor is invoked. + LLVM_PREFERRED_TYPE(bool) unsigned IsInitialized : 1; /// Flag indicating if the field is an embedded base class. + LLVM_PREFERRED_TYPE(bool) unsigned IsBase : 1; /// Flag indicating if the field is the active member of a union. + LLVM_PREFERRED_TYPE(bool) unsigned IsActive : 1; /// Flag indicating if the field is mutable (if in a record). 
+ LLVM_PREFERRED_TYPE(bool) unsigned IsFieldMutable : 1; const Descriptor *Desc; -- cgit v1.1 From ee56d494974311049e055c73e4feb2e4098f1da8 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 12:02:26 +0300 Subject: [clang][NFC] Annotate `Targets/ARM.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/Basic/Targets/ARM.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/clang/lib/Basic/Targets/ARM.h b/clang/lib/Basic/Targets/ARM.h index 9802eb0..71322a0 100644 --- a/clang/lib/Basic/Targets/ARM.h +++ b/clang/lib/Basic/Targets/ARM.h @@ -61,26 +61,43 @@ class LLVM_LIBRARY_VISIBILITY ARMTargetInfo : public TargetInfo { llvm::ARM::ProfileKind ArchProfile; unsigned ArchVersion; + LLVM_PREFERRED_TYPE(FPUMode) unsigned FPU : 5; + LLVM_PREFERRED_TYPE(MVEMode) unsigned MVE : 2; + LLVM_PREFERRED_TYPE(bool) unsigned IsAAPCS : 1; + LLVM_PREFERRED_TYPE(HWDivMode) unsigned HWDiv : 2; // Initialized via features. + LLVM_PREFERRED_TYPE(bool) unsigned SoftFloat : 1; + LLVM_PREFERRED_TYPE(bool) unsigned SoftFloatABI : 1; + LLVM_PREFERRED_TYPE(bool) unsigned CRC : 1; + LLVM_PREFERRED_TYPE(bool) unsigned Crypto : 1; + LLVM_PREFERRED_TYPE(bool) unsigned SHA2 : 1; + LLVM_PREFERRED_TYPE(bool) unsigned AES : 1; + LLVM_PREFERRED_TYPE(bool) unsigned DSP : 1; + LLVM_PREFERRED_TYPE(bool) unsigned Unaligned : 1; + LLVM_PREFERRED_TYPE(bool) unsigned DotProd : 1; + LLVM_PREFERRED_TYPE(bool) unsigned HasMatMul : 1; + LLVM_PREFERRED_TYPE(bool) unsigned FPRegsDisabled : 1; + LLVM_PREFERRED_TYPE(bool) unsigned HasPAC : 1; + LLVM_PREFERRED_TYPE(bool) unsigned HasBTI : 1; enum { -- cgit v1.1 From ba0d35181cef094209306207dc6e3fa816ddde36 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 12:04:55 +0300 Subject: [clang][NFC] Annotate `CGCall.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/CodeGen/CGCall.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/lib/CodeGen/CGCall.h b/clang/lib/CodeGen/CGCall.h index 1c0d15d..1bd48a0 100644 --- a/clang/lib/CodeGen/CGCall.h +++ b/clang/lib/CodeGen/CGCall.h @@ -357,8 +357,11 @@ class ReturnValueSlot { Address Addr = Address::invalid(); // Return value slot flags + LLVM_PREFERRED_TYPE(bool) unsigned IsVolatile : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IsUnused : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IsExternallyDestructed : 1; public: -- cgit v1.1 From fd80304763a41f86b877c91b750551d7e6bd852d Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 12:07:27 +0300 Subject: [clang][NFC] Annotate `CGCUDARuntime.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/CodeGen/CGCUDARuntime.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h index c7af8f1..8030d63 100644 --- a/clang/lib/CodeGen/CGCUDARuntime.h +++ b/clang/lib/CodeGen/CGCUDARuntime.h @@ -54,10 +54,15 @@ public: }; private: + LLVM_PREFERRED_TYPE(DeviceVarKind) unsigned Kind : 2; + LLVM_PREFERRED_TYPE(bool) unsigned Extern : 1; + LLVM_PREFERRED_TYPE(bool) unsigned Constant : 1; // Constant variable. + LLVM_PREFERRED_TYPE(bool) unsigned Managed : 1; // Managed variable. + LLVM_PREFERRED_TYPE(bool) unsigned Normalized : 1; // Normalized texture. int SurfTexType; // Type of surface/texture. 
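A note on the annotation these commits keep adding: `LLVM_PREFERRED_TYPE` lives in `llvm/Support/Compiler.h` (one of the patches above adds that include for exactly this reason) and, on compilers that support it, expands to the `[[clang::preferred_type(T)]]` attribute so the debug info records which type a bit-field is meant to hold. Here is a minimal sketch of the effect, using a hypothetical `Color` enum rather than any type from these patches, with a stand-in macro definition assumed to match the real one:

```cpp
// Stand-in for the real macro from llvm/Support/Compiler.h (assumed definition).
#ifndef __has_attribute
#define __has_attribute(x) 0
#endif
#if __has_attribute(preferred_type)
#define LLVM_PREFERRED_TYPE(T) [[clang::preferred_type(T)]]
#else
#define LLVM_PREFERRED_TYPE(T)
#endif

enum class Color : unsigned { Red, Green, Blue };

struct Pixel {
  // A debugger can now render the stored bits as Color::Green
  // instead of the bare unsigned value 1.
  LLVM_PREFERRED_TYPE(Color)
  unsigned Hue : 2;
  // Likewise, one-bit flags display as true/false instead of 1/0.
  LLVM_PREFERRED_TYPE(bool)
  unsigned Visible : 1;
};
```

Without the annotation the bit-field's declared type (`unsigned`) is all the debugger sees, which is why these NFC commits touch so many flag-heavy classes.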
-- cgit v1.1 From 35737beaef1452b6ecdb0e6d7a359d48c8e9236a Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 12:11:49 +0300 Subject: [clang][NFC] Annotate `CodeGenFunction.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/CodeGen/CodeGenFunction.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 143ad64..fc9b328 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -203,6 +203,7 @@ template <> struct DominatingValue { llvm::Value *Value; llvm::Type *ElementType; + LLVM_PREFERRED_TYPE(Kind) unsigned K : 3; unsigned Align : 29; saved_type(llvm::Value *v, llvm::Type *e, Kind k, unsigned a = 0) @@ -650,9 +651,11 @@ public: struct LifetimeExtendedCleanupHeader { /// The size of the following cleanup object. unsigned Size; - /// The kind of cleanup to push: a value from the CleanupKind enumeration. + /// The kind of cleanup to push. + LLVM_PREFERRED_TYPE(CleanupKind) unsigned Kind : 31; /// Whether this is a conditional cleanup. + LLVM_PREFERRED_TYPE(bool) unsigned IsConditional : 1; size_t getSize() const { return Size; } -- cgit v1.1 From 866e073c2851bd4180cc0c64ce5a3d7f109e21dc Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 12:14:31 +0300 Subject: [clang][NFC] Annotate `CGRecordLayout.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/CodeGen/CGRecordLayout.h | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/CodeGen/CGRecordLayout.h b/clang/lib/CodeGen/CGRecordLayout.h index d5ea749..6c06ad2 100644 --- a/clang/lib/CodeGen/CGRecordLayout.h +++ b/clang/lib/CodeGen/CGRecordLayout.h @@ -71,6 +71,7 @@ struct CGBitFieldInfo { unsigned Size : 15; /// Whether the bit-field is signed. + LLVM_PREFERRED_TYPE(bool) unsigned IsSigned : 1; /// The storage size in bits which should be used when accessing this -- cgit v1.1 From 1ed37606ca4bda4659b33a7f570d273b5afd16ea Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 12:20:34 +0300 Subject: [clang][NFC] Annotate `CGCleanup.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/CodeGen/CGCleanup.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CGCleanup.h b/clang/lib/CodeGen/CGCleanup.h index fcfbf41..7a7344c 100644 --- a/clang/lib/CodeGen/CGCleanup.h +++ b/clang/lib/CodeGen/CGCleanup.h @@ -40,6 +40,10 @@ struct CatchTypeInfo { /// A protected scope for zero-cost EH handling. class EHScope { +public: + enum Kind { Cleanup, Catch, Terminate, Filter }; + +private: llvm::BasicBlock *CachedLandingPad; llvm::BasicBlock *CachedEHDispatchBlock; @@ -47,6 +51,7 @@ class EHScope { class CommonBitFields { friend class EHScope; + LLVM_PREFERRED_TYPE(Kind) unsigned Kind : 3; }; enum { NumCommonBits = 3 }; @@ -64,21 +69,27 @@ protected: unsigned : NumCommonBits; /// Whether this cleanup needs to be run along normal edges. + LLVM_PREFERRED_TYPE(bool) unsigned IsNormalCleanup : 1; /// Whether this cleanup needs to be run along exception edges. + LLVM_PREFERRED_TYPE(bool) unsigned IsEHCleanup : 1; /// Whether this cleanup is currently active. 
+ LLVM_PREFERRED_TYPE(bool) unsigned IsActive : 1; /// Whether this cleanup is a lifetime marker + LLVM_PREFERRED_TYPE(bool) unsigned IsLifetimeMarker : 1; /// Whether the normal cleanup should test the activation flag. + LLVM_PREFERRED_TYPE(bool) unsigned TestFlagInNormalCleanup : 1; /// Whether the EH cleanup should test the activation flag. + LLVM_PREFERRED_TYPE(bool) unsigned TestFlagInEHCleanup : 1; /// The amount of extra storage needed by the Cleanup. @@ -101,8 +112,6 @@ protected: }; public: - enum Kind { Cleanup, Catch, Terminate, Filter }; - EHScope(Kind kind, EHScopeStack::stable_iterator enclosingEHScope) : CachedLandingPad(nullptr), CachedEHDispatchBlock(nullptr), EnclosingEHScope(enclosingEHScope) { -- cgit v1.1 From bf571059f3bcf50bf8d3b39dc6aadeb14ede14bf Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 12:57:42 +0300 Subject: [clang][NFC] Annotate `LangOptions.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Basic/LangOptions.h | 48 ++++++++++++++++----------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index c1cc554..862952d 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -30,27 +30,6 @@ namespace clang { -/// Bitfields of LangOptions, split out from LangOptions in order to ensure that -/// this large collection of bitfields is a trivial class type. -class LangOptionsBase { - friend class CompilerInvocation; - friend class CompilerInvocationBase; - -public: - // Define simple language options (with no accessors). -#define LANGOPT(Name, Bits, Default, Description) unsigned Name : Bits; -#define ENUM_LANGOPT(Name, Type, Bits, Default, Description) -#include "clang/Basic/LangOptions.def" - -protected: - // Define language options of enumeration type. These are private, and will - // have accessors (below). -#define LANGOPT(Name, Bits, Default, Description) -#define ENUM_LANGOPT(Name, Type, Bits, Default, Description) \ - unsigned Name : Bits; -#include "clang/Basic/LangOptions.def" -}; - /// In the Microsoft ABI, this controls the placement of virtual displacement /// members used to implement virtual inheritance. enum class MSVtorDispMode { Never, ForVBaseOverride, ForVFTable }; @@ -78,9 +57,12 @@ enum class ShaderStage { Invalid, }; -/// Keeps track of the various options that can be -/// enabled, which controls the dialect of C or C++ that is accepted. -class LangOptions : public LangOptionsBase { +/// Bitfields of LangOptions, split out from LangOptions in order to ensure that +/// this large collection of bitfields is a trivial class type. +class LangOptionsBase { + friend class CompilerInvocation; + friend class CompilerInvocationBase; + public: using Visibility = clang::Visibility; using RoundingMode = llvm::RoundingMode; @@ -416,6 +398,24 @@ public: enum ComplexRangeKind { CX_Full, CX_Limited, CX_Fortran, CX_None }; + // Define simple language options (with no accessors). +#define LANGOPT(Name, Bits, Default, Description) unsigned Name : Bits; +#define ENUM_LANGOPT(Name, Type, Bits, Default, Description) +#include "clang/Basic/LangOptions.def" + +protected: + // Define language options of enumeration type. These are private, and will + // have accessors (below). 
+#define LANGOPT(Name, Bits, Default, Description) +#define ENUM_LANGOPT(Name, Type, Bits, Default, Description) \ + LLVM_PREFERRED_TYPE(Type) \ + unsigned Name : Bits; +#include "clang/Basic/LangOptions.def" +}; + +/// Keeps track of the various options that can be +/// enabled, which controls the dialect of C or C++ that is accepted. +class LangOptions : public LangOptionsBase { public: /// The used language standard. LangStandard::Kind LangStd; -- cgit v1.1 From 4502dc416f40e0165ef988ded7db2673ac35670e Mon Sep 17 00:00:00 2001 From: Andrzej Warzynski Date: Sun, 11 Feb 2024 10:04:29 +0000 Subject: [mlir][nfc] Remove leftover print stmt in a test --- mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir | 1 - 1 file changed, 1 deletion(-) diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir index 1739341..22cf15d 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir @@ -98,7 +98,6 @@ module attributes {transform.with_named_sequence} { // Step 1: Tile %tiled_matmul, %loops:3 = transform.structured.tile_using_for %matmul [2, [4], 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - transform.print %tiled_matmul {name = "matmul lal"}: !transform.any_op // Step 2: Vectorize transform.structured.vectorize %tiled_matmul vector_sizes [2, [4], 1] : !transform.any_op -- cgit v1.1 From 5aec9392674572fa5a06283173a6a739742d261d Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sun, 11 Feb 2024 02:14:22 -0800 Subject: [clang-format][NFC] Keep Operator== sorted in Format.h --- clang/include/clang/Format/Format.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index ab56cc8..d9c18e5 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -4822,7 +4822,6 @@ struct FormatStyle { AlwaysBreakAfterReturnType == R.AlwaysBreakAfterReturnType && AlwaysBreakBeforeMultilineStrings == R.AlwaysBreakBeforeMultilineStrings && - BreakTemplateDeclarations == R.BreakTemplateDeclarations && AttributeMacros == R.AttributeMacros && BinPackArguments == R.BinPackArguments && BinPackParameters == R.BinPackParameters && @@ -4840,6 +4839,7 @@ struct FormatStyle { BreakConstructorInitializers == R.BreakConstructorInitializers && BreakInheritanceList == R.BreakInheritanceList && BreakStringLiterals == R.BreakStringLiterals && + BreakTemplateDeclarations == R.BreakTemplateDeclarations && ColumnLimit == R.ColumnLimit && CommentPragmas == R.CommentPragmas && CompactNamespaces == R.CompactNamespaces && ConstructorInitializerIndentWidth == -- cgit v1.1 From 5932fcc47855fdd209784f38820422d2369b84b2 Mon Sep 17 00:00:00 2001 From: Quentin Dian Date: Sun, 11 Feb 2024 18:24:59 +0800 Subject: [InlineCost] Consider the default branch when calculating cost (#77856) First step in fixing #76772. This PR considers the default branch as a case branch. This will give the unreachable default branch fair consideration. 
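In outline, the change boils down to the following (a condensed sketch of the diff below, not a verbatim excerpt; `addCost` and `InstrCost` are the cost model's existing helpers):

    // New SwitchInst query: the default edge can never actually be taken if
    // its destination's first real instruction is `unreachable`.
    bool defaultDestUndefined() const {
      return isa<UnreachableInst>(getDefaultDest()->getFirstNonPHIOrDbg());
    }

    // Cost finalization: only charge for the default branch when it is a
    // genuinely reachable destination.
    void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                          bool DefaultDestUndefined) {
      if (!DefaultDestUndefined)
        addCost(2 * InstrCost);
      // ... jump-table and case-cluster costs as before ...
    }
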
--- .../include/llvm/Analysis/InlineModelFeatureMaps.h | 2 + llvm/include/llvm/IR/Instructions.h | 7 + llvm/lib/Analysis/InlineCost.cpp | 21 +- .../Transforms/Inline/inline-switch-default-2.ll | 317 +++++++++++++++++++++ .../Transforms/Inline/inline-switch-default.ll | 216 ++++++++++++++ 5 files changed, 555 insertions(+), 8 deletions(-) create mode 100644 llvm/test/Transforms/Inline/inline-switch-default-2.ll create mode 100644 llvm/test/Transforms/Inline/inline-switch-default.ll diff --git a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h index ca9bb724..d62ec9c 100644 --- a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h +++ b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h @@ -39,6 +39,8 @@ namespace llvm { M(int64_t, {1}, jump_table_penalty, "Accumulation of costs for jump tables") \ M(int64_t, {1}, case_cluster_penalty, \ "Accumulation of costs for case clusters") \ + M(int64_t, {1}, switch_default_dest_penalty, \ + "Accumulation of costs for switch default destination") \ M(int64_t, {1}, switch_penalty, \ "Accumulation of costs for switch statements") \ M(int64_t, {1}, unsimplified_common_instructions, \ diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 4b5a442..1db4ff2 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -49,6 +49,7 @@ class DataLayout; class StringRef; class Type; class Value; +class UnreachableInst; //===----------------------------------------------------------------------===// // AllocaInst Class @@ -3505,6 +3506,12 @@ public: return cast<BasicBlock>(getOperand(1)); } + /// Returns true if the default branch must result in immediate undefined + /// behavior, false otherwise. + bool defaultDestUndefined() const { + return isa<UnreachableInst>(getDefaultDest()->getFirstNonPHIOrDbg()); + } + void setDefaultDest(BasicBlock *DefaultCase) { setOperand(1, reinterpret_cast<Value*>(DefaultCase)); } diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index 5b780b5..e55eaa5 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -336,8 +336,8 @@ protected: /// Called at the end of processing a switch instruction, with the given /// number of case clusters. - virtual void onFinalizeSwitch(unsigned JumpTableSize, - unsigned NumCaseCluster) {} + virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, + bool DefaultDestUndefined) {} /// Called to account for any other instruction not specifically accounted /// for. @@ -699,15 +699,16 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { CallPenalty)); } - void onFinalizeSwitch(unsigned JumpTableSize, - unsigned NumCaseCluster) override { + void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, + bool DefaultDestUndefined) override { + if (!DefaultDestUndefined) + addCost(2 * InstrCost); // If suitable for a jump table, consider the cost for the table size and // branch to destination. // Maximum valid cost increased in this function. if (JumpTableSize) { int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost + 4 * InstrCost; - addCost(JTCost); return; } @@ -1153,6 +1154,7 @@ private: // heuristics in the ML inliner. 
static constexpr int JTCostMultiplier = 4; static constexpr int CaseClusterCostMultiplier = 2; + static constexpr int SwitchDefaultDestCostMultiplier = 2; static constexpr int SwitchCostMultiplier = 2; // FIXME: These are taken from the heuristic-based cost visitor: we should @@ -1231,8 +1233,11 @@ private: } } - void onFinalizeSwitch(unsigned JumpTableSize, - unsigned NumCaseCluster) override { + void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, + bool DefaultDestUndefined) override { + if (!DefaultDestUndefined) + increment(InlineCostFeatureIndex::switch_default_dest_penalty, + SwitchDefaultDestCostMultiplier * InstrCost); if (JumpTableSize) { int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost + @@ -2461,7 +2466,7 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { unsigned NumCaseCluster = TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI); - onFinalizeSwitch(JumpTableSize, NumCaseCluster); + onFinalizeSwitch(JumpTableSize, NumCaseCluster, SI.defaultDestUndefined()); return false; } diff --git a/llvm/test/Transforms/Inline/inline-switch-default-2.ll b/llvm/test/Transforms/Inline/inline-switch-default-2.ll new file mode 100644 index 0000000..8d3e24c --- /dev/null +++ b/llvm/test/Transforms/Inline/inline-switch-default-2.ll @@ -0,0 +1,317 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt %s -S -passes=inline -inline-threshold=21 | FileCheck %s + +; Check for scenarios without TTI. + +define i64 @foo1(i64 %a) { +; LOOKUPTABLE-LABEL: define i64 @foo1( +; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) { +; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH_I:%.*]] [ +; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0_I:%.*]] +; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2_I:%.*]] +; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4_I:%.*]] +; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6_I:%.*]] +; LOOKUPTABLE-NEXT: ] +; LOOKUPTABLE: branch_0.i: +; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT:%.*]] +; LOOKUPTABLE: branch_2.i: +; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]] +; LOOKUPTABLE: branch_4.i: +; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]] +; LOOKUPTABLE: branch_6.i: +; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]] +; LOOKUPTABLE: default_branch.i: +; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]] +; LOOKUPTABLE: bar1.exit: +; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ], [ 3, [[DEFAULT_BRANCH_I]] ] +; LOOKUPTABLE-NEXT: ret i64 [[TMP2]] +; +; SWITCH-LABEL: define i64 @foo1( +; SWITCH-SAME: i64 [[TMP0:%.*]]) { +; SWITCH-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH_I:%.*]] [ +; SWITCH-NEXT: i64 0, label [[BRANCH_0_I:%.*]] +; SWITCH-NEXT: i64 2, label [[BRANCH_2_I:%.*]] +; SWITCH-NEXT: i64 4, label [[BRANCH_4_I:%.*]] +; SWITCH-NEXT: i64 6, label [[BRANCH_6_I:%.*]] +; SWITCH-NEXT: ] +; SWITCH: branch_0.i: +; SWITCH-NEXT: br label [[BAR1_EXIT:%.*]] +; SWITCH: branch_2.i: +; SWITCH-NEXT: br label [[BAR1_EXIT]] +; SWITCH: branch_4.i: +; SWITCH-NEXT: br label [[BAR1_EXIT]] +; SWITCH: branch_6.i: +; SWITCH-NEXT: br label [[BAR1_EXIT]] +; SWITCH: default_branch.i: +; SWITCH-NEXT: br label [[BAR1_EXIT]] +; SWITCH: bar1.exit: +; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ], [ 3, [[DEFAULT_BRANCH_I]] ] +; SWITCH-NEXT: ret i64 [[TMP2]] +; +; CHECK-LABEL: define i64 @foo1( +; CHECK-SAME: i64 [[A:%.*]]) { +; CHECK-NEXT: [[B:%.*]] = call i64 @bar1(i64 [[A]]) +; 
CHECK-NEXT: ret i64 [[B]] +; + %b = call i64 @bar1(i64 %a) + ret i64 %b +} + +define i64 @foo2(i64 %a) { +; LOOKUPTABLE-LABEL: define i64 @foo2( +; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) { +; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT_I:%.*]] [ +; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0_I:%.*]] +; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2_I:%.*]] +; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4_I:%.*]] +; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6_I:%.*]] +; LOOKUPTABLE-NEXT: ] +; LOOKUPTABLE: branch_0.i: +; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT:%.*]] +; LOOKUPTABLE: branch_2.i: +; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] +; LOOKUPTABLE: branch_4.i: +; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] +; LOOKUPTABLE: branch_6.i: +; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] +; LOOKUPTABLE: unreachabledefault.i: +; LOOKUPTABLE-NEXT: unreachable +; LOOKUPTABLE: bar2.exit: +; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ] +; LOOKUPTABLE-NEXT: ret i64 [[TMP2]] +; +; SWITCH-LABEL: define i64 @foo2( +; SWITCH-SAME: i64 [[TMP0:%.*]]) { +; SWITCH-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT_I:%.*]] [ +; SWITCH-NEXT: i64 0, label [[BRANCH_0_I:%.*]] +; SWITCH-NEXT: i64 2, label [[BRANCH_2_I:%.*]] +; SWITCH-NEXT: i64 4, label [[BRANCH_4_I:%.*]] +; SWITCH-NEXT: i64 6, label [[BRANCH_6_I:%.*]] +; SWITCH-NEXT: ] +; SWITCH: branch_0.i: +; SWITCH-NEXT: br label [[BAR2_EXIT:%.*]] +; SWITCH: branch_2.i: +; SWITCH-NEXT: br label [[BAR2_EXIT]] +; SWITCH: branch_4.i: +; SWITCH-NEXT: br label [[BAR2_EXIT]] +; SWITCH: branch_6.i: +; SWITCH-NEXT: br label [[BAR2_EXIT]] +; SWITCH: unreachabledefault.i: +; SWITCH-NEXT: unreachable +; SWITCH: bar2.exit: +; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ] +; SWITCH-NEXT: ret i64 [[TMP2]] +; +; CHECK-LABEL: define i64 @foo2( +; CHECK-SAME: i64 [[A:%.*]]) { +; CHECK-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT_I:%.*]] [ +; CHECK-NEXT: i64 0, label [[BRANCH_0_I:%.*]] +; CHECK-NEXT: i64 2, label [[BRANCH_2_I:%.*]] +; CHECK-NEXT: i64 4, label [[BRANCH_4_I:%.*]] +; CHECK-NEXT: i64 6, label [[BRANCH_6_I:%.*]] +; CHECK-NEXT: ] +; CHECK: branch_0.i: +; CHECK-NEXT: br label [[BAR2_EXIT:%.*]] +; CHECK: branch_2.i: +; CHECK-NEXT: br label [[BAR2_EXIT]] +; CHECK: branch_4.i: +; CHECK-NEXT: br label [[BAR2_EXIT]] +; CHECK: branch_6.i: +; CHECK-NEXT: br label [[BAR2_EXIT]] +; CHECK: unreachabledefault.i: +; CHECK-NEXT: unreachable +; CHECK: bar2.exit: +; CHECK-NEXT: [[B_I:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ] +; CHECK-NEXT: ret i64 [[B_I]] +; + %b = call i64 @bar2(i64 %a) + ret i64 %b +} + +define i64 @bar1(i64 %a) { +; LOOKUPTABLE-LABEL: define i64 @bar1( +; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) { +; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH:%.*]] [ +; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0:%.*]] +; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2:%.*]] +; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4:%.*]] +; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6:%.*]] +; LOOKUPTABLE-NEXT: ] +; LOOKUPTABLE: branch_0: +; LOOKUPTABLE-NEXT: br label [[EXIT:%.*]] +; LOOKUPTABLE: branch_2: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: branch_4: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: branch_6: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: default_branch: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; 
LOOKUPTABLE: exit: +; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ] +; LOOKUPTABLE-NEXT: ret i64 [[TMP2]] +; +; SWITCH-LABEL: define i64 @bar1( +; SWITCH-SAME: i64 [[TMP0:%.*]]) { +; SWITCH-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH:%.*]] [ +; SWITCH-NEXT: i64 0, label [[BRANCH_0:%.*]] +; SWITCH-NEXT: i64 2, label [[BRANCH_2:%.*]] +; SWITCH-NEXT: i64 4, label [[BRANCH_4:%.*]] +; SWITCH-NEXT: i64 6, label [[BRANCH_6:%.*]] +; SWITCH-NEXT: ] +; SWITCH: branch_0: +; SWITCH-NEXT: br label [[EXIT:%.*]] +; SWITCH: branch_2: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: branch_4: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: branch_6: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: default_branch: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: exit: +; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ] +; SWITCH-NEXT: ret i64 [[TMP2]] +; +; CHECK-LABEL: define i64 @bar1( +; CHECK-SAME: i64 [[A:%.*]]) { +; CHECK-NEXT: switch i64 [[A]], label [[DEFAULT_BRANCH:%.*]] [ +; CHECK-NEXT: i64 0, label [[BRANCH_0:%.*]] +; CHECK-NEXT: i64 2, label [[BRANCH_2:%.*]] +; CHECK-NEXT: i64 4, label [[BRANCH_4:%.*]] +; CHECK-NEXT: i64 6, label [[BRANCH_6:%.*]] +; CHECK-NEXT: ] +; CHECK: branch_0: +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: branch_2: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: branch_4: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: branch_6: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: default_branch: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ] +; CHECK-NEXT: ret i64 [[B]] +; + switch i64 %a, label %default_branch [ + i64 0, label %branch_0 + i64 2, label %branch_2 + i64 4, label %branch_4 + i64 6, label %branch_6 + ] + +branch_0: + br label %exit + +branch_2: + br label %exit + +branch_4: + br label %exit + +branch_6: + br label %exit + +default_branch: + br label %exit + +exit: + %b = phi i64 [ 5, %branch_0 ], [ 9, %branch_2 ], [ 2, %branch_4 ], [ 7, %branch_6 ], [ 3, %default_branch ] + ret i64 %b +} + +define i64 @bar2(i64 %a) { +; LOOKUPTABLE-LABEL: define i64 @bar2( +; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) { +; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT:%.*]] [ +; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0:%.*]] +; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2:%.*]] +; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4:%.*]] +; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6:%.*]] +; LOOKUPTABLE-NEXT: ] +; LOOKUPTABLE: branch_0: +; LOOKUPTABLE-NEXT: br label [[EXIT:%.*]] +; LOOKUPTABLE: branch_2: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: branch_4: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: branch_6: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: unreachabledefault: +; LOOKUPTABLE-NEXT: unreachable +; LOOKUPTABLE: exit: +; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ] +; LOOKUPTABLE-NEXT: ret i64 [[TMP2]] +; +; SWITCH-LABEL: define i64 @bar2( +; SWITCH-SAME: i64 [[TMP0:%.*]]) { +; SWITCH-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT:%.*]] [ +; SWITCH-NEXT: i64 0, label [[BRANCH_0:%.*]] +; SWITCH-NEXT: i64 2, label [[BRANCH_2:%.*]] +; SWITCH-NEXT: i64 4, label [[BRANCH_4:%.*]] +; SWITCH-NEXT: i64 6, label [[BRANCH_6:%.*]] +; SWITCH-NEXT: 
] +; SWITCH: branch_0: +; SWITCH-NEXT: br label [[EXIT:%.*]] +; SWITCH: branch_2: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: branch_4: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: branch_6: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: unreachabledefault: +; SWITCH-NEXT: unreachable +; SWITCH: exit: +; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ] +; SWITCH-NEXT: ret i64 [[TMP2]] +; +; CHECK-LABEL: define i64 @bar2( +; CHECK-SAME: i64 [[A:%.*]]) { +; CHECK-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT:%.*]] [ +; CHECK-NEXT: i64 0, label [[BRANCH_0:%.*]] +; CHECK-NEXT: i64 2, label [[BRANCH_2:%.*]] +; CHECK-NEXT: i64 4, label [[BRANCH_4:%.*]] +; CHECK-NEXT: i64 6, label [[BRANCH_6:%.*]] +; CHECK-NEXT: ] +; CHECK: branch_0: +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: branch_2: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: branch_4: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: branch_6: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: unreachabledefault: +; CHECK-NEXT: unreachable +; CHECK: exit: +; CHECK-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ] +; CHECK-NEXT: ret i64 [[B]] +; + switch i64 %a, label %unreachabledefault [ + i64 0, label %branch_0 + i64 2, label %branch_2 + i64 4, label %branch_4 + i64 6, label %branch_6 + ] + +branch_0: + br label %exit + +branch_2: + br label %exit + +branch_4: + br label %exit + +branch_6: + br label %exit + +unreachabledefault: + unreachable + +exit: + %b = phi i64 [ 5, %branch_0 ], [ 9, %branch_2 ], [ 2, %branch_4 ], [ 7, %branch_6 ] + ret i64 %b +} diff --git a/llvm/test/Transforms/Inline/inline-switch-default.ll b/llvm/test/Transforms/Inline/inline-switch-default.ll new file mode 100644 index 0000000..44f1304 --- /dev/null +++ b/llvm/test/Transforms/Inline/inline-switch-default.ll @@ -0,0 +1,216 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt %s -S -passes=inline -inline-threshold=26 -min-jump-table-entries=4 | FileCheck %s -check-prefix=LOOKUPTABLE +; RUN: opt %s -S -passes=inline -inline-threshold=21 -min-jump-table-entries=5 | FileCheck %s -check-prefix=SWITCH + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; The `bar1` should not be inlined since there is a default branch. 
+ +define i64 @foo1(i64 %a) { +; LOOKUPTABLE-LABEL: define i64 @foo1( +; LOOKUPTABLE-SAME: i64 [[A:%.*]]) { +; LOOKUPTABLE-NEXT: [[B:%.*]] = call i64 @bar1(i64 [[A]]) +; LOOKUPTABLE-NEXT: ret i64 [[B]] +; +; SWITCH-LABEL: define i64 @foo1( +; SWITCH-SAME: i64 [[A:%.*]]) { +; SWITCH-NEXT: [[B:%.*]] = call i64 @bar1(i64 [[A]]) +; SWITCH-NEXT: ret i64 [[B]] +; + %b = call i64 @bar1(i64 %a) + ret i64 %b +} + +define i64 @foo2(i64 %a) { +; LOOKUPTABLE-LABEL: define i64 @foo2( +; LOOKUPTABLE-SAME: i64 [[A:%.*]]) { +; LOOKUPTABLE-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT_I:%.*]] [ +; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0_I:%.*]] +; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2_I:%.*]] +; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4_I:%.*]] +; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6_I:%.*]] +; LOOKUPTABLE-NEXT: ] +; LOOKUPTABLE: branch_0.i: +; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT:%.*]] +; LOOKUPTABLE: branch_2.i: +; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] +; LOOKUPTABLE: branch_4.i: +; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] +; LOOKUPTABLE: branch_6.i: +; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] +; LOOKUPTABLE: unreachabledefault.i: +; LOOKUPTABLE-NEXT: unreachable +; LOOKUPTABLE: bar2.exit: +; LOOKUPTABLE-NEXT: [[B_I:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ] +; LOOKUPTABLE-NEXT: ret i64 [[B_I]] +; +; SWITCH-LABEL: define i64 @foo2( +; SWITCH-SAME: i64 [[A:%.*]]) { +; SWITCH-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT_I:%.*]] [ +; SWITCH-NEXT: i64 0, label [[BRANCH_0_I:%.*]] +; SWITCH-NEXT: i64 2, label [[BRANCH_2_I:%.*]] +; SWITCH-NEXT: i64 4, label [[BRANCH_4_I:%.*]] +; SWITCH-NEXT: i64 6, label [[BRANCH_6_I:%.*]] +; SWITCH-NEXT: ] +; SWITCH: branch_0.i: +; SWITCH-NEXT: br label [[BAR2_EXIT:%.*]] +; SWITCH: branch_2.i: +; SWITCH-NEXT: br label [[BAR2_EXIT]] +; SWITCH: branch_4.i: +; SWITCH-NEXT: br label [[BAR2_EXIT]] +; SWITCH: branch_6.i: +; SWITCH-NEXT: br label [[BAR2_EXIT]] +; SWITCH: unreachabledefault.i: +; SWITCH-NEXT: unreachable +; SWITCH: bar2.exit: +; SWITCH-NEXT: [[B_I:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ] +; SWITCH-NEXT: ret i64 [[B_I]] +; + %b = call i64 @bar2(i64 %a) + ret i64 %b +} + +define i64 @bar1(i64 %a) { +; LOOKUPTABLE-LABEL: define i64 @bar1( +; LOOKUPTABLE-SAME: i64 [[A:%.*]]) { +; LOOKUPTABLE-NEXT: switch i64 [[A]], label [[DEFAULT_BRANCH:%.*]] [ +; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0:%.*]] +; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2:%.*]] +; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4:%.*]] +; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6:%.*]] +; LOOKUPTABLE-NEXT: ] +; LOOKUPTABLE: branch_0: +; LOOKUPTABLE-NEXT: br label [[EXIT:%.*]] +; LOOKUPTABLE: branch_2: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: branch_4: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: branch_6: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: default_branch: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: exit: +; LOOKUPTABLE-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ] +; LOOKUPTABLE-NEXT: ret i64 [[B]] +; +; SWITCH-LABEL: define i64 @bar1( +; SWITCH-SAME: i64 [[A:%.*]]) { +; SWITCH-NEXT: switch i64 [[A]], label [[DEFAULT_BRANCH:%.*]] [ +; SWITCH-NEXT: i64 0, label [[BRANCH_0:%.*]] +; SWITCH-NEXT: i64 2, label [[BRANCH_2:%.*]] +; SWITCH-NEXT: i64 4, label [[BRANCH_4:%.*]] +; SWITCH-NEXT: i64 6, label [[BRANCH_6:%.*]] +; 
SWITCH-NEXT: ] +; SWITCH: branch_0: +; SWITCH-NEXT: br label [[EXIT:%.*]] +; SWITCH: branch_2: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: branch_4: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: branch_6: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: default_branch: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: exit: +; SWITCH-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ] +; SWITCH-NEXT: ret i64 [[B]] +; + switch i64 %a, label %default_branch [ + i64 0, label %branch_0 + i64 2, label %branch_2 + i64 4, label %branch_4 + i64 6, label %branch_6 + ] + +branch_0: + br label %exit + +branch_2: + br label %exit + +branch_4: + br label %exit + +branch_6: + br label %exit + +default_branch: + br label %exit + +exit: + %b = phi i64 [ 5, %branch_0 ], [ 9, %branch_2 ], [ 2, %branch_4 ], [ 7, %branch_6 ], [ 3, %default_branch ] + ret i64 %b +} + +define i64 @bar2(i64 %a) { +; LOOKUPTABLE-LABEL: define i64 @bar2( +; LOOKUPTABLE-SAME: i64 [[A:%.*]]) { +; LOOKUPTABLE-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT:%.*]] [ +; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0:%.*]] +; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2:%.*]] +; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4:%.*]] +; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6:%.*]] +; LOOKUPTABLE-NEXT: ] +; LOOKUPTABLE: branch_0: +; LOOKUPTABLE-NEXT: br label [[EXIT:%.*]] +; LOOKUPTABLE: branch_2: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: branch_4: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: branch_6: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: unreachabledefault: +; LOOKUPTABLE-NEXT: unreachable +; LOOKUPTABLE: exit: +; LOOKUPTABLE-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ] +; LOOKUPTABLE-NEXT: ret i64 [[B]] +; +; SWITCH-LABEL: define i64 @bar2( +; SWITCH-SAME: i64 [[A:%.*]]) { +; SWITCH-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT:%.*]] [ +; SWITCH-NEXT: i64 0, label [[BRANCH_0:%.*]] +; SWITCH-NEXT: i64 2, label [[BRANCH_2:%.*]] +; SWITCH-NEXT: i64 4, label [[BRANCH_4:%.*]] +; SWITCH-NEXT: i64 6, label [[BRANCH_6:%.*]] +; SWITCH-NEXT: ] +; SWITCH: branch_0: +; SWITCH-NEXT: br label [[EXIT:%.*]] +; SWITCH: branch_2: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: branch_4: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: branch_6: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: unreachabledefault: +; SWITCH-NEXT: unreachable +; SWITCH: exit: +; SWITCH-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ] +; SWITCH-NEXT: ret i64 [[B]] +; + switch i64 %a, label %unreachabledefault [ + i64 0, label %branch_0 + i64 2, label %branch_2 + i64 4, label %branch_4 + i64 6, label %branch_6 + ] + +branch_0: + br label %exit + +branch_2: + br label %exit + +branch_4: + br label %exit + +branch_6: + br label %exit + +unreachabledefault: + unreachable + +exit: + %b = phi i64 [ 5, %branch_0 ], [ 9, %branch_2 ], [ 2, %branch_4 ], [ 7, %branch_6 ] + ret i64 %b +} -- cgit v1.1 From c3dfbb6f49845edd4b953055f5fe14257fad6b58 Mon Sep 17 00:00:00 2001 From: David Green Date: Sun, 11 Feb 2024 11:20:11 +0000 Subject: [AArch64][GlobalISel] Add commute_constant_to_rhs to post legalizer combiners (#81103) This helps the fp reductions, moving the constant operands to the RHS which in turn helps simplify away fadd -0.0 and fmul 1.0. 
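The combine itself is a simple canonicalization; roughly (a sketch of the idea only, not the combiner's actual MIR-matching code, and the `Operand` type here is invented for illustration):

    #include <utility>

    struct Operand { bool IsConstant; /* ... value fields ... */ };

    // For a commutative operation, move a constant LHS over to the RHS so
    // that later folds which only match "x op constant" (x + -0.0 ==> x,
    // x * 1.0 ==> x) get a chance to fire.
    void commuteConstantToRhs(Operand &LHS, Operand &RHS) {
      if (LHS.IsConstant && !RHS.IsConstant)
        std::swap(LHS, RHS);
    }
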
--- llvm/lib/Target/AArch64/AArch64Combine.td | 3 +- .../CodeGen/AArch64/GlobalISel/arm64-atomic.ll | 12 +- llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll | 214 +++++++-------------- llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll | 112 ++++------- 4 files changed, 105 insertions(+), 236 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 1daa7d5..fdea974 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -288,5 +288,6 @@ def AArch64PostLegalizerCombiner constant_fold_binops, identity_combines, ptr_add_immed_chain, overlapping_and, split_store_zero_128, undef_combines, - select_to_minmax, or_to_bsp]> { + select_to_minmax, or_to_bsp, + commute_constant_to_rhs]> { } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll index 0e9c126..458c2cb 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll @@ -2146,8 +2146,7 @@ define i8 @atomicrmw_and_i8(ptr %ptr, i8 %rhs) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: mov x2, x0 -; CHECK-OUTLINE-O1-NEXT: mov w8, #-1 ; =0xffffffff -; CHECK-OUTLINE-O1-NEXT: eor w0, w8, w1 +; CHECK-OUTLINE-O1-NEXT: mvn w0, w1 ; CHECK-OUTLINE-O1-NEXT: mov x1, x2 ; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldclr1_rel ; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload @@ -3202,8 +3201,7 @@ define i16 @atomicrmw_and_i16(ptr %ptr, i16 %rhs) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: mov x2, x0 -; CHECK-OUTLINE-O1-NEXT: mov w8, #-1 ; =0xffffffff -; CHECK-OUTLINE-O1-NEXT: eor w0, w8, w1 +; CHECK-OUTLINE-O1-NEXT: mvn w0, w1 ; CHECK-OUTLINE-O1-NEXT: mov x1, x2 ; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldclr2_rel ; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload @@ -4255,8 +4253,7 @@ define i32 @atomicrmw_and_i32(ptr %ptr, i32 %rhs) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: mov x2, x0 -; CHECK-OUTLINE-O1-NEXT: mov w8, #-1 ; =0xffffffff -; CHECK-OUTLINE-O1-NEXT: eor w0, w8, w1 +; CHECK-OUTLINE-O1-NEXT: mvn w0, w1 ; CHECK-OUTLINE-O1-NEXT: mov x1, x2 ; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldclr4_rel ; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload @@ -5276,8 +5273,7 @@ define i64 @atomicrmw_and_i64(ptr %ptr, i64 %rhs) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: mov x2, x0 -; CHECK-OUTLINE-O1-NEXT: mov x8, #-1 ; =0xffffffffffffffff -; CHECK-OUTLINE-O1-NEXT: eor x0, x8, x1 +; CHECK-OUTLINE-O1-NEXT: mvn x0, x1 ; CHECK-OUTLINE-O1-NEXT: mov x1, x2 ; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldclr8_rel ; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll index 2023770..de95943 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll @@ -13,11 +13,7 @@ define float @add_HalfS(<2 x float> %bin.rdx) { ; ; CHECK-GI-LABEL: add_HalfS: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.2s, #128, lsl #24 -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s2, v0.s[1] -; 
CHECK-GI-NEXT: fadd s0, s1, s0 -; CHECK-GI-NEXT: fadd s0, s0, s2 +; CHECK-GI-NEXT: faddp s0, v0.2s ; CHECK-GI-NEXT: ret %r = call float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %bin.rdx) ret float %r @@ -82,15 +78,12 @@ define half @add_HalfH(<4 x half> %bin.rdx) { ; ; CHECK-GI-FP16-LABEL: add_HalfH: ; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: adrp x8, .LCPI1_0 ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov h2, v0.h[1] -; CHECK-GI-FP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-FP16-NEXT: ldr h1, [x8, :lo12:.LCPI1_0] -; CHECK-GI-FP16-NEXT: fadd h1, h1, h0 +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: fadd h1, h0, h1 ; CHECK-GI-FP16-NEXT: mov h0, v0.h[3] ; CHECK-GI-FP16-NEXT: fadd h1, h1, h2 -; CHECK-GI-FP16-NEXT: fadd h1, h1, h3 ; CHECK-GI-FP16-NEXT: fadd h0, h1, h0 ; CHECK-GI-FP16-NEXT: ret %r = call half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx) @@ -202,22 +195,18 @@ define half @add_H(<8 x half> %bin.rdx) { ; ; CHECK-GI-FP16-LABEL: add_H: ; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: adrp x8, .LCPI2_0 -; CHECK-GI-FP16-NEXT: mov h2, v0.h[1] -; CHECK-GI-FP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-FP16-NEXT: ldr h1, [x8, :lo12:.LCPI2_0] -; CHECK-GI-FP16-NEXT: fadd h1, h1, h0 -; CHECK-GI-FP16-NEXT: fadd h1, h1, h2 -; CHECK-GI-FP16-NEXT: mov h2, v0.h[3] +; CHECK-GI-FP16-NEXT: mov h1, v0.h[2] +; CHECK-GI-FP16-NEXT: faddp h2, v0.2h +; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] +; CHECK-GI-FP16-NEXT: fadd h1, h2, h1 +; CHECK-GI-FP16-NEXT: mov h2, v0.h[4] ; CHECK-GI-FP16-NEXT: fadd h1, h1, h3 -; CHECK-GI-FP16-NEXT: mov h3, v0.h[4] +; CHECK-GI-FP16-NEXT: mov h3, v0.h[5] ; CHECK-GI-FP16-NEXT: fadd h1, h1, h2 -; CHECK-GI-FP16-NEXT: mov h2, v0.h[5] -; CHECK-GI-FP16-NEXT: fadd h1, h1, h3 -; CHECK-GI-FP16-NEXT: mov h3, v0.h[6] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[6] ; CHECK-GI-FP16-NEXT: mov h0, v0.h[7] -; CHECK-GI-FP16-NEXT: fadd h1, h1, h2 ; CHECK-GI-FP16-NEXT: fadd h1, h1, h3 +; CHECK-GI-FP16-NEXT: fadd h1, h1, h2 ; CHECK-GI-FP16-NEXT: fadd h0, h1, h0 ; CHECK-GI-FP16-NEXT: ret %r = call half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %bin.rdx) @@ -225,44 +214,23 @@ define half @add_H(<8 x half> %bin.rdx) { } define float @add_S(<4 x float> %bin.rdx) { -; CHECK-SD-LABEL: add_S: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov s1, v0.s[2] -; CHECK-SD-NEXT: faddp s2, v0.2s -; CHECK-SD-NEXT: mov s0, v0.s[3] -; CHECK-SD-NEXT: fadd s1, s2, s1 -; CHECK-SD-NEXT: fadd s0, s1, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: add_S: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.2s, #128, lsl #24 -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov s3, v0.s[2] -; CHECK-GI-NEXT: fadd s1, s1, s0 -; CHECK-GI-NEXT: mov s0, v0.s[3] -; CHECK-GI-NEXT: fadd s1, s1, s2 -; CHECK-GI-NEXT: fadd s1, s1, s3 -; CHECK-GI-NEXT: fadd s0, s1, s0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: add_S: +; CHECK: // %bb.0: +; CHECK-NEXT: mov s1, v0.s[2] +; CHECK-NEXT: faddp s2, v0.2s +; CHECK-NEXT: mov s0, v0.s[3] +; CHECK-NEXT: fadd s1, s2, s1 +; CHECK-NEXT: fadd s0, s1, s0 +; CHECK-NEXT: ret %r = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %bin.rdx) ret float %r } define double @add_D(<2 x double> %bin.rdx) { -; CHECK-SD-LABEL: add_D: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: faddp d0, v0.2d -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: add_D: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 -; CHECK-GI-NEXT: mov d2, v0.d[1] -; 
CHECK-GI-NEXT: fmov d1, x8 -; CHECK-GI-NEXT: fadd d0, d1, d0 -; CHECK-GI-NEXT: fadd d0, d0, d2 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: add_D: +; CHECK: // %bb.0: +; CHECK-NEXT: faddp d0, v0.2d +; CHECK-NEXT: ret %r = call double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %bin.rdx) ret double %r } @@ -464,23 +432,19 @@ define half @add_2H(<16 x half> %bin.rdx) { ; ; CHECK-GI-FP16-LABEL: add_2H: ; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: adrp x8, .LCPI5_0 -; CHECK-GI-FP16-NEXT: mov h3, v0.h[1] -; CHECK-GI-FP16-NEXT: mov h4, v0.h[2] -; CHECK-GI-FP16-NEXT: ldr h2, [x8, :lo12:.LCPI5_0] -; CHECK-GI-FP16-NEXT: fadd h2, h2, h0 -; CHECK-GI-FP16-NEXT: fadd h2, h2, h3 -; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: faddp h3, v0.2h +; CHECK-GI-FP16-NEXT: mov h4, v0.h[3] +; CHECK-GI-FP16-NEXT: fadd h2, h3, h2 +; CHECK-GI-FP16-NEXT: mov h3, v0.h[4] ; CHECK-GI-FP16-NEXT: fadd h2, h2, h4 -; CHECK-GI-FP16-NEXT: mov h4, v0.h[4] +; CHECK-GI-FP16-NEXT: mov h4, v0.h[5] ; CHECK-GI-FP16-NEXT: fadd h2, h2, h3 -; CHECK-GI-FP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-FP16-NEXT: fadd h2, h2, h4 -; CHECK-GI-FP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-FP16-NEXT: mov h3, v0.h[6] ; CHECK-GI-FP16-NEXT: mov h0, v0.h[7] +; CHECK-GI-FP16-NEXT: fadd h2, h2, h4 ; CHECK-GI-FP16-NEXT: fadd h2, h2, h3 ; CHECK-GI-FP16-NEXT: mov h3, v1.h[2] -; CHECK-GI-FP16-NEXT: fadd h2, h2, h4 ; CHECK-GI-FP16-NEXT: fadd h0, h2, h0 ; CHECK-GI-FP16-NEXT: mov h2, v1.h[1] ; CHECK-GI-FP16-NEXT: fadd h0, h0, h1 @@ -502,95 +466,51 @@ define half @add_2H(<16 x half> %bin.rdx) { } define float @add_2S(<8 x float> %bin.rdx) { -; CHECK-SD-LABEL: add_2S: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov s2, v0.s[2] -; CHECK-SD-NEXT: faddp s3, v0.2s -; CHECK-SD-NEXT: mov s0, v0.s[3] -; CHECK-SD-NEXT: fadd s2, s3, s2 -; CHECK-SD-NEXT: mov s3, v1.s[2] -; CHECK-SD-NEXT: fadd s0, s2, s0 -; CHECK-SD-NEXT: mov s2, v1.s[1] -; CHECK-SD-NEXT: fadd s0, s0, s1 -; CHECK-SD-NEXT: mov s1, v1.s[3] -; CHECK-SD-NEXT: fadd s0, s0, s2 -; CHECK-SD-NEXT: fadd s0, s0, s3 -; CHECK-SD-NEXT: fadd s0, s0, s1 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: add_2S: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2s, #128, lsl #24 -; CHECK-GI-NEXT: mov s3, v0.s[1] -; CHECK-GI-NEXT: mov s4, v0.s[2] -; CHECK-GI-NEXT: fadd s2, s2, s0 -; CHECK-GI-NEXT: mov s0, v0.s[3] -; CHECK-GI-NEXT: fadd s2, s2, s3 -; CHECK-GI-NEXT: mov s3, v1.s[2] -; CHECK-GI-NEXT: fadd s2, s2, s4 -; CHECK-GI-NEXT: fadd s0, s2, s0 -; CHECK-GI-NEXT: mov s2, v1.s[1] -; CHECK-GI-NEXT: fadd s0, s0, s1 -; CHECK-GI-NEXT: mov s1, v1.s[3] -; CHECK-GI-NEXT: fadd s0, s0, s2 -; CHECK-GI-NEXT: fadd s0, s0, s3 -; CHECK-GI-NEXT: fadd s0, s0, s1 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: add_2S: +; CHECK: // %bb.0: +; CHECK-NEXT: mov s2, v0.s[2] +; CHECK-NEXT: faddp s3, v0.2s +; CHECK-NEXT: mov s0, v0.s[3] +; CHECK-NEXT: fadd s2, s3, s2 +; CHECK-NEXT: mov s3, v1.s[2] +; CHECK-NEXT: fadd s0, s2, s0 +; CHECK-NEXT: mov s2, v1.s[1] +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: mov s1, v1.s[3] +; CHECK-NEXT: fadd s0, s0, s2 +; CHECK-NEXT: fadd s0, s0, s3 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ret %r = call float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %bin.rdx) ret float %r } define double @add_2D(<4 x double> %bin.rdx) { -; CHECK-SD-LABEL: add_2D: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: faddp d0, v0.2d -; CHECK-SD-NEXT: mov d2, v1.d[1] -; CHECK-SD-NEXT: fadd d0, d0, d1 -; CHECK-SD-NEXT: fadd d0, d0, d2 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: add_2D: -; 
CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 -; CHECK-GI-NEXT: mov d3, v0.d[1] -; CHECK-GI-NEXT: fmov d2, x8 -; CHECK-GI-NEXT: fadd d0, d2, d0 -; CHECK-GI-NEXT: mov d2, v1.d[1] -; CHECK-GI-NEXT: fadd d0, d0, d3 -; CHECK-GI-NEXT: fadd d0, d0, d1 -; CHECK-GI-NEXT: fadd d0, d0, d2 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: add_2D: +; CHECK: // %bb.0: +; CHECK-NEXT: faddp d0, v0.2d +; CHECK-NEXT: mov d2, v1.d[1] +; CHECK-NEXT: fadd d0, d0, d1 +; CHECK-NEXT: fadd d0, d0, d2 +; CHECK-NEXT: ret %r = call double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %bin.rdx) ret double %r } ; Added at least one test where the start value is not -0.0. define float @add_S_init_42(<4 x float> %bin.rdx) { -; CHECK-SD-LABEL: add_S_init_42: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #1109917696 // =0x42280000 -; CHECK-SD-NEXT: mov s2, v0.s[1] -; CHECK-SD-NEXT: mov s3, v0.s[2] -; CHECK-SD-NEXT: fmov s1, w8 -; CHECK-SD-NEXT: fadd s1, s0, s1 -; CHECK-SD-NEXT: mov s0, v0.s[3] -; CHECK-SD-NEXT: fadd s1, s1, s2 -; CHECK-SD-NEXT: fadd s1, s1, s3 -; CHECK-SD-NEXT: fadd s0, s1, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: add_S_init_42: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #1109917696 // =0x42280000 -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov s3, v0.s[2] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: fadd s1, s1, s0 -; CHECK-GI-NEXT: mov s0, v0.s[3] -; CHECK-GI-NEXT: fadd s1, s1, s2 -; CHECK-GI-NEXT: fadd s1, s1, s3 -; CHECK-GI-NEXT: fadd s0, s1, s0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: add_S_init_42: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1109917696 // =0x42280000 +; CHECK-NEXT: mov s2, v0.s[1] +; CHECK-NEXT: mov s3, v0.s[2] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fadd s1, s0, s1 +; CHECK-NEXT: mov s0, v0.s[3] +; CHECK-NEXT: fadd s1, s1, s2 +; CHECK-NEXT: fadd s1, s1, s3 +; CHECK-NEXT: fadd s0, s1, s0 +; CHECK-NEXT: ret %r = call float @llvm.vector.reduce.fadd.f32.v4f32(float 42.0, <4 x float> %bin.rdx) ret float %r } @@ -604,5 +524,3 @@ declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>) declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>) -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll index 32ce4d6..7b93e60 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll @@ -13,11 +13,9 @@ define float @mul_HalfS(<2 x float> %bin.rdx) { ; ; CHECK-GI-LABEL: mul_HalfS: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov s1, #1.00000000 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: fmul s0, s1, s0 -; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 ; CHECK-GI-NEXT: ret %r = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx) ret float %r @@ -80,14 +78,12 @@ define half @mul_HalfH(<4 x half> %bin.rdx) { ; ; CHECK-GI-FP16-LABEL: mul_HalfH: ; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: fmov h1, #1.00000000 ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov h2, v0.h[1] -; CHECK-GI-FP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-FP16-NEXT: fmul h1, h1, h0 +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: fmul h1, h0, h1 ; CHECK-GI-FP16-NEXT: mov h0, v0.h[3] ; CHECK-GI-FP16-NEXT: fmul h1, h1, h2 -; CHECK-GI-FP16-NEXT: fmul h1, h1, h3 ; CHECK-GI-FP16-NEXT: fmul h0, h1, h0 ; CHECK-GI-FP16-NEXT: ret %r = call half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx) @@ -193,9 +189,7 @@ define half @mul_H(<8 x half> %bin.rdx) { ; ; CHECK-GI-FP16-LABEL: mul_H: ; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: fmov h1, #1.00000000 -; CHECK-GI-FP16-NEXT: fmul h1, h1, h0 -; CHECK-GI-FP16-NEXT: fmul h1, h1, v0.h[1] +; CHECK-GI-FP16-NEXT: fmul h1, h0, v0.h[1] ; CHECK-GI-FP16-NEXT: fmul h1, h1, v0.h[2] ; CHECK-GI-FP16-NEXT: fmul h1, h1, v0.h[3] ; CHECK-GI-FP16-NEXT: fmul h1, h1, v0.h[4] @@ -208,37 +202,21 @@ define half @mul_H(<8 x half> %bin.rdx) { } define float @mul_S(<4 x float> %bin.rdx) { -; CHECK-SD-LABEL: mul_S: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fmul s1, s0, v0.s[1] -; CHECK-SD-NEXT: fmul s1, s1, v0.s[2] -; CHECK-SD-NEXT: fmul s0, s1, v0.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: mul_S: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov s1, #1.00000000 -; CHECK-GI-NEXT: fmul s1, s1, s0 -; CHECK-GI-NEXT: fmul s1, s1, v0.s[1] -; CHECK-GI-NEXT: fmul s1, s1, v0.s[2] -; CHECK-GI-NEXT: fmul s0, s1, v0.s[3] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: mul_S: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul s1, s0, v0.s[1] +; CHECK-NEXT: fmul s1, s1, v0.s[2] +; CHECK-NEXT: fmul s0, s1, v0.s[3] +; CHECK-NEXT: ret %r = call float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %bin.rdx) ret float %r } define double @mul_D(<2 x double> %bin.rdx) { -; CHECK-SD-LABEL: mul_D: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fmul d0, d0, v0.d[1] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: mul_D: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov d1, #1.00000000 -; CHECK-GI-NEXT: fmul d1, d1, d0 -; CHECK-GI-NEXT: fmul d0, d1, v0.d[1] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: mul_D: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul d0, d0, v0.d[1] +; CHECK-NEXT: ret %r = call double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %bin.rdx) ret double %r } @@ -427,9 +405,7 @@ define half @mul_2H(<16 x half> %bin.rdx) { ; ; CHECK-GI-FP16-LABEL: mul_2H: ; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: fmov h2, #1.00000000 -; CHECK-GI-FP16-NEXT: fmul h2, h2, h0 -; 
CHECK-GI-FP16-NEXT: fmul h2, h2, v0.h[1] +; CHECK-GI-FP16-NEXT: fmul h2, h0, v0.h[1] ; CHECK-GI-FP16-NEXT: fmul h2, h2, v0.h[2] ; CHECK-GI-FP16-NEXT: fmul h2, h2, v0.h[3] ; CHECK-GI-FP16-NEXT: fmul h2, h2, v0.h[4] @@ -450,49 +426,27 @@ define half @mul_2H(<16 x half> %bin.rdx) { } define float @mul_2S(<8 x float> %bin.rdx) { -; CHECK-SD-LABEL: mul_2S: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fmul s2, s0, v0.s[1] -; CHECK-SD-NEXT: fmul s2, s2, v0.s[2] -; CHECK-SD-NEXT: fmul s0, s2, v0.s[3] -; CHECK-SD-NEXT: fmul s0, s0, s1 -; CHECK-SD-NEXT: fmul s0, s0, v1.s[1] -; CHECK-SD-NEXT: fmul s0, s0, v1.s[2] -; CHECK-SD-NEXT: fmul s0, s0, v1.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: mul_2S: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov s2, #1.00000000 -; CHECK-GI-NEXT: fmul s2, s2, s0 -; CHECK-GI-NEXT: fmul s2, s2, v0.s[1] -; CHECK-GI-NEXT: fmul s2, s2, v0.s[2] -; CHECK-GI-NEXT: fmul s0, s2, v0.s[3] -; CHECK-GI-NEXT: fmul s0, s0, s1 -; CHECK-GI-NEXT: fmul s0, s0, v1.s[1] -; CHECK-GI-NEXT: fmul s0, s0, v1.s[2] -; CHECK-GI-NEXT: fmul s0, s0, v1.s[3] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: mul_2S: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul s2, s0, v0.s[1] +; CHECK-NEXT: fmul s2, s2, v0.s[2] +; CHECK-NEXT: fmul s0, s2, v0.s[3] +; CHECK-NEXT: fmul s0, s0, s1 +; CHECK-NEXT: fmul s0, s0, v1.s[1] +; CHECK-NEXT: fmul s0, s0, v1.s[2] +; CHECK-NEXT: fmul s0, s0, v1.s[3] +; CHECK-NEXT: ret %r = call float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %bin.rdx) ret float %r } define double @mul_2D(<4 x double> %bin.rdx) { -; CHECK-SD-LABEL: mul_2D: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fmul d0, d0, v0.d[1] -; CHECK-SD-NEXT: fmul d0, d0, d1 -; CHECK-SD-NEXT: fmul d0, d0, v1.d[1] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: mul_2D: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov d2, #1.00000000 -; CHECK-GI-NEXT: fmul d2, d2, d0 -; CHECK-GI-NEXT: fmul d0, d2, v0.d[1] -; CHECK-GI-NEXT: fmul d0, d0, d1 -; CHECK-GI-NEXT: fmul d0, d0, v1.d[1] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: mul_2D: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul d0, d0, v0.d[1] +; CHECK-NEXT: fmul d0, d0, d1 +; CHECK-NEXT: fmul d0, d0, v1.d[1] +; CHECK-NEXT: ret %r = call double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %bin.rdx) ret double %r } -- cgit v1.1 From 887ed6d2876156ade8a382e521130feae4b91b82 Mon Sep 17 00:00:00 2001 From: David Green Date: Sun, 11 Feb 2024 11:20:53 +0000 Subject: [AArch64][GlobalISel] Remove mulh c++ lowering (#81105) I believe these should be selectable via tablegen patterns nowadays. --- .../AArch64/GISel/AArch64InstructionSelector.cpp | 28 ---------------------- 1 file changed, 28 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 2515991..9d51a7f 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -3020,34 +3020,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_INDEXED_STORE: return selectIndexedStore(cast<GIndexedStore>(I), MRI); - case TargetOpcode::G_SMULH: - case TargetOpcode::G_UMULH: { - // Reject the various things we don't support yet. 
- if (unsupportedBinOp(I, RBI, MRI, TRI)) - return false; - - const Register DefReg = I.getOperand(0).getReg(); - const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); - - if (RB.getID() != AArch64::GPRRegBankID) { - LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n"); - return false; - } - - if (Ty != LLT::scalar(64)) { - LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty - << ", expected: " << LLT::scalar(64) << '\n'); - return false; - } - - unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr - : AArch64::UMULHrr; - I.setDesc(TII.get(NewOpc)); - - // Now that we selected an opcode, we need to constrain the register - // operands to use appropriate classes. - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } case TargetOpcode::G_LSHR: case TargetOpcode::G_ASHR: if (MRI.getType(I.getOperand(0).getReg()).isVector()) -- cgit v1.1 From b985d4179a882892ce009fb3668cdc917e27f5d5 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 14:59:33 +0300 Subject: [clang][NFC] Annotate `ExprConstant.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/AST/ExprConstant.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 02e153f..33ad94e 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -240,15 +240,19 @@ namespace { /// True if the subobject was named in a manner not supported by C++11. Such /// lvalues can still be folded, but they are not core constant expressions /// and we cannot perform lvalue-to-rvalue conversions on them. + LLVM_PREFERRED_TYPE(bool) unsigned Invalid : 1; /// Is this a pointer one past the end of an object? + LLVM_PREFERRED_TYPE(bool) unsigned IsOnePastTheEnd : 1; /// Indicator of whether the first entry is an unsized array. + LLVM_PREFERRED_TYPE(bool) unsigned FirstEntryIsAnUnsizedArray : 1; /// Indicator of whether the most-derived object is an array element. + LLVM_PREFERRED_TYPE(bool) unsigned MostDerivedIsArrayElement : 1; /// The length of the path to the most-derived object of which this is a -- cgit v1.1 From 63b414e4977d6e19f05947c88f57cd127fa328e3 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:01:18 +0300 Subject: [clang][NFC] Annotate `RecordLayoutBuilder.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/AST/RecordLayoutBuilder.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp index 6dfaadd..a3b7431 100644 --- a/clang/lib/AST/RecordLayoutBuilder.cpp +++ b/clang/lib/AST/RecordLayoutBuilder.cpp @@ -602,21 +602,28 @@ protected: /// Whether the external AST source has provided a layout for this /// record. + LLVM_PREFERRED_TYPE(bool) unsigned UseExternalLayout : 1; /// Whether we need to infer alignment, even when we have an /// externally-provided layout. + LLVM_PREFERRED_TYPE(bool) unsigned InferAlignment : 1; /// Packed - Whether the record is packed or not. 
+ LLVM_PREFERRED_TYPE(bool) unsigned Packed : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IsUnion : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IsMac68kAlign : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IsNaturalAlign : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IsMsStruct : 1; /// UnfilledBitsInLastUnit - If the last field laid out was a bitfield, -- cgit v1.1 From eaff01f4fc1b3f1ccdc5fc6dafb39af959d00f6d Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:03:03 +0300 Subject: [clang][NFC] Annotate `CGExprCXX.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/CodeGen/CGExprCXX.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index d136bfc..2adbef6 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -1423,6 +1423,7 @@ namespace { }; unsigned NumPlacementArgs : 31; + LLVM_PREFERRED_TYPE(bool) unsigned PassAlignmentToPlacementDelete : 1; const FunctionDecl *OperatorDelete; ValueTy Ptr; -- cgit v1.1 From bcc4c8231fbee46f1b16f8b9db7d9926745db9bb Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:04:28 +0300 Subject: [clang][NFC] Annotate `CGObjC.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/CodeGen/CGObjC.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/lib/CodeGen/CGObjC.cpp b/clang/lib/CodeGen/CGObjC.cpp index 03fc0ec..f3a948c 100644 --- a/clang/lib/CodeGen/CGObjC.cpp +++ b/clang/lib/CodeGen/CGObjC.cpp @@ -899,9 +899,13 @@ namespace { const ObjCPropertyImplDecl *propImpl); private: + LLVM_PREFERRED_TYPE(StrategyKind) unsigned Kind : 8; + LLVM_PREFERRED_TYPE(bool) unsigned IsAtomic : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IsCopy : 1; + LLVM_PREFERRED_TYPE(bool) unsigned HasStrong : 1; CharUnits IvarSize; -- cgit v1.1 From 6884657de8da3024b50d8737219c1f24ab075c4c Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:06:15 +0300 Subject: [clang][NFC] Annotate `SemaChecking.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/Sema/SemaChecking.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index f8b73c7..71e6e72 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -16652,6 +16652,7 @@ class SequenceChecker : public ConstEvaluatedExprVisitor<SequenceChecker> { struct Value { explicit Value(unsigned Parent) : Parent(Parent), Merged(false) {} unsigned Parent : 31; + LLVM_PREFERRED_TYPE(bool) unsigned Merged : 1; }; SmallVector<Value, 8> Values; -- cgit v1.1 From f0b2bcfe91e70816b33973bc50a2cb63144ba77a Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:07:14 +0300 Subject: [clang][NFC] Annotate `SemaStmt.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. 
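For reference, the annotation pattern used throughout this series looks like the following (an illustrative sketch rather than an excerpt from any one file; the struct and enum here are made up). `LLVM_PREFERRED_TYPE(T)` is defined in llvm/Support/Compiler.h and, under a compiler that supports it, expands to `[[clang::preferred_type(T)]]`, telling the debugger to present the raw bit-field through `T`:

    enum class Tone { Light, Medium, Dark }; // example enum, not from the patches

    struct Flags {
      LLVM_PREFERRED_TYPE(bool)
      unsigned Enabled : 1; // displayed as true/false rather than 0/1
      LLVM_PREFERRED_TYPE(Tone)
      unsigned Shade : 2;   // displayed as Tone::Dark rather than 2
    };
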
--- clang/lib/Sema/SemaStmt.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index 5ab2534..d9aaea8 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -4381,6 +4381,7 @@ Sema::ActOnObjCAutoreleasePoolStmt(SourceLocation AtLoc, Stmt *Body) { namespace { class CatchHandlerType { QualType QT; + LLVM_PREFERRED_TYPE(bool) unsigned IsPointer : 1; // This is a special constructor to be used only with DenseMapInfo's -- cgit v1.1 From 83269a04def26fe9890036857d3e1a8c6c1f770d Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:08:58 +0300 Subject: [clang][NFC] Annotate `cc1as_main.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/tools/driver/cc1as_main.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp index bc398fa..a55e0650 100644 --- a/clang/tools/driver/cc1as_main.cpp +++ b/clang/tools/driver/cc1as_main.cpp @@ -89,10 +89,15 @@ struct AssemblerInvocation { /// @{ std::vector<std::string> IncludePaths; + LLVM_PREFERRED_TYPE(bool) unsigned NoInitialTextSection : 1; + LLVM_PREFERRED_TYPE(bool) unsigned SaveTemporaryLabels : 1; + LLVM_PREFERRED_TYPE(bool) unsigned GenDwarfForAssembly : 1; + LLVM_PREFERRED_TYPE(bool) unsigned RelaxELFRelocations : 1; + LLVM_PREFERRED_TYPE(bool) unsigned Dwarf64 : 1; unsigned DwarfVersion; std::string DwarfDebugFlags; @@ -117,7 +122,9 @@ struct AssemblerInvocation { FT_Obj ///< Object file output. }; FileType OutputType; + LLVM_PREFERRED_TYPE(bool) unsigned ShowHelp : 1; + LLVM_PREFERRED_TYPE(bool) unsigned ShowVersion : 1; /// @} /// @{ unsigned OutputAsmVariant; + LLVM_PREFERRED_TYPE(bool) unsigned ShowEncoding : 1; + LLVM_PREFERRED_TYPE(bool) unsigned ShowInst : 1; /// @} /// @name Assembler Options /// @{ + LLVM_PREFERRED_TYPE(bool) unsigned RelaxAll : 1; + LLVM_PREFERRED_TYPE(bool) unsigned NoExecStack : 1; + LLVM_PREFERRED_TYPE(bool) unsigned FatalWarnings : 1; + LLVM_PREFERRED_TYPE(bool) unsigned NoWarn : 1; + LLVM_PREFERRED_TYPE(bool) unsigned NoTypeCheck : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IncrementalLinkerCompatible : 1; + LLVM_PREFERRED_TYPE(bool) unsigned EmbedBitcode : 1; /// Whether to emit DWARF unwind info. @@ -145,6 +161,7 @@ struct AssemblerInvocation { // Whether to emit compact-unwind for non-canonical entries. // Note: maybe overriden by other constraints. + LLVM_PREFERRED_TYPE(bool) unsigned EmitCompactUnwindNonCanonical : 1; /// The name of the relocation model to use. -- cgit v1.1 From bc1d61cbf8759f5144217af50d2309b5dddd5538 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:27:21 +0300 Subject: [clang][NFC] Annotate `SourceManagerTest.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. 
--- clang/unittests/Basic/SourceManagerTest.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/unittests/Basic/SourceManagerTest.cpp b/clang/unittests/Basic/SourceManagerTest.cpp index 5572814..45840f5 100644 --- a/clang/unittests/Basic/SourceManagerTest.cpp +++ b/clang/unittests/Basic/SourceManagerTest.cpp @@ -530,6 +530,7 @@ struct MacroAction { SourceLocation Loc; std::string Name; + LLVM_PREFERRED_TYPE(Kind) unsigned MAKind : 3; MacroAction(SourceLocation Loc, StringRef Name, unsigned K) -- cgit v1.1 From 23bdca2c6737f25f1d184f03021f61157bac6196 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:41:49 +0300 Subject: [clang][NFC] Annotate `RISCVVEmitter.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/utils/TableGen/RISCVVEmitter.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/utils/TableGen/RISCVVEmitter.cpp b/clang/utils/TableGen/RISCVVEmitter.cpp index 9f6ed39..8513174 100644 --- a/clang/utils/TableGen/RISCVVEmitter.cpp +++ b/clang/utils/TableGen/RISCVVEmitter.cpp @@ -67,7 +67,9 @@ struct SemaRecord { bool HasMaskPolicy : 1; bool HasFRMRoundModeOp : 1; bool IsTuple : 1; + LLVM_PREFERRED_TYPE(PolicyScheme) uint8_t UnMaskedPolicyScheme : 2; + LLVM_PREFERRED_TYPE(PolicyScheme) uint8_t MaskedPolicyScheme : 2; }; -- cgit v1.1 From 4bbae068d704752acbd7c5d8652c11b0954742be Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:43:35 +0300 Subject: [clang][NFC] Annotate `RISCVVIntrinsicUtils.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Support/RISCVVIntrinsicUtils.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/include/clang/Support/RISCVVIntrinsicUtils.h b/clang/include/clang/Support/RISCVVIntrinsicUtils.h index 30bf36e..ef9d6c1 100644 --- a/clang/include/clang/Support/RISCVVIntrinsicUtils.h +++ b/clang/include/clang/Support/RISCVVIntrinsicUtils.h @@ -554,7 +554,9 @@ struct RVVIntrinsicRecord { bool HasMaskPolicy : 1; bool HasFRMRoundModeOp : 1; bool IsTuple : 1; + LLVM_PREFERRED_TYPE(PolicyScheme) uint8_t UnMaskedPolicyScheme : 2; + LLVM_PREFERRED_TYPE(PolicyScheme) uint8_t MaskedPolicyScheme : 2; }; -- cgit v1.1 From 803374994602910aae2cb483d03bcbdb294b21bb Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:48:52 +0300 Subject: [clang][NFC] Annotate `DiagnosticID.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/Basic/DiagnosticIDs.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp index 6c7bd50..b353a66 100644 --- a/clang/lib/Basic/DiagnosticIDs.cpp +++ b/clang/lib/Basic/DiagnosticIDs.cpp @@ -100,7 +100,7 @@ const uint32_t StaticDiagInfoDescriptionOffsets[] = { }; // Diagnostic classes. 
-enum { +enum DiagnosticClass { CLASS_NOTE = 0x01, CLASS_REMARK = 0x02, CLASS_WARNING = 0x03, @@ -110,15 +110,22 @@ enum { struct StaticDiagInfoRec { uint16_t DiagID; + LLVM_PREFERRED_TYPE(diag::Severity) uint8_t DefaultSeverity : 3; + LLVM_PREFERRED_TYPE(DiagnosticClass) uint8_t Class : 3; + LLVM_PREFERRED_TYPE(DiagnosticIDs::SFINAEResponse) uint8_t SFINAE : 2; uint8_t Category : 6; + LLVM_PREFERRED_TYPE(bool) uint8_t WarnNoWerror : 1; + LLVM_PREFERRED_TYPE(bool) uint8_t WarnShowInSystemHeader : 1; + LLVM_PREFERRED_TYPE(bool) uint8_t WarnShowInSystemMacro : 1; uint16_t OptionGroupIndex : 15; + LLVM_PREFERRED_TYPE(bool) uint16_t Deferrable : 1; uint16_t DescriptionLen; -- cgit v1.1 From fe0d277f31d3369de1fd92ad8dd8044f5b1d4ed7 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sun, 11 Feb 2024 13:53:59 +0100 Subject: [libc++][ratio] Avoids accepting unrelated types. (#80491) The arithmetic and comparison operators are ill-formed when R1 or R2 is not a std::ratio. Fixes: https://github.com/llvm/llvm-project/issues/63753 --- libcxx/include/ratio | 42 +++++++++-- .../ratio.arithmetic/R1_R2_requirement.verify.cpp | 56 +++++++++++++++ .../ratio.comparison/R1_R2_requirement.verify.cpp | 81 ++++++++++++++++++++++ .../R1_R2_requirement_v.verify.cpp | 69 ++++++++++++++++++ 4 files changed, 242 insertions(+), 6 deletions(-) create mode 100644 libcxx/test/std/utilities/ratio/ratio.arithmetic/R1_R2_requirement.verify.cpp create mode 100644 libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement.verify.cpp create mode 100644 libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement_v.verify.cpp diff --git a/libcxx/include/ratio b/libcxx/include/ratio index 3b11a2a..de656f3 100644 --- a/libcxx/include/ratio +++ b/libcxx/include/ratio @@ -289,6 +289,9 @@ private: static const intmax_t __gcd_n1_d2 = __static_gcd<_R1::num, _R2::den>::value; static const intmax_t __gcd_d1_n2 = __static_gcd<_R1::den, _R2::num>::value; + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); + public: typedef typename ratio< __ll_mul<_R1::num / __gcd_n1_d2, _R2::num / __gcd_d1_n2>::value, __ll_mul<_R2::den / __gcd_n1_d2, _R1::den / __gcd_d1_n2>::value >::type type; @@ -312,6 +315,9 @@ private: static const intmax_t __gcd_n1_n2 = __static_gcd<_R1::num, _R2::num>::value; static const intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value; + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); + public: typedef typename ratio< __ll_mul<_R1::num / __gcd_n1_n2, _R2::den / __gcd_d1_d2>::value, __ll_mul<_R2::num / __gcd_n1_n2, _R1::den / __gcd_d1_d2>::value >::type type; @@ -335,6 +341,9 @@ private: static const intmax_t __gcd_n1_n2 = __static_gcd<_R1::num, _R2::num>::value; static const intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value; + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); + public: typedef typename ratio_multiply< ratio<__gcd_n1_n2, _R1::den / __gcd_d1_d2>, @@ -361,6 +370,9 @@ private: static const intmax_t __gcd_n1_n2 = 
__static_gcd<_R1::num, _R2::num>::value; static const intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value; + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); + public: typedef typename ratio_multiply< ratio<__gcd_n1_n2, _R1::den / __gcd_d1_d2>, @@ -384,10 +396,16 @@ struct _LIBCPP_TEMPLATE_VIS ratio_subtract : public __ratio_subtract<_R1, _R2>:: // ratio_equal template -struct _LIBCPP_TEMPLATE_VIS ratio_equal : _BoolConstant<(_R1::num == _R2::num && _R1::den == _R2::den)> {}; +struct _LIBCPP_TEMPLATE_VIS ratio_equal : _BoolConstant<(_R1::num == _R2::num && _R1::den == _R2::den)> { + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); +}; template -struct _LIBCPP_TEMPLATE_VIS ratio_not_equal : _BoolConstant::value> {}; +struct _LIBCPP_TEMPLATE_VIS ratio_not_equal : _BoolConstant::value> { + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); +}; // ratio_less @@ -441,16 +459,28 @@ struct __ratio_less<_R1, _R2, -1LL, -1LL> { }; template -struct _LIBCPP_TEMPLATE_VIS ratio_less : _BoolConstant<__ratio_less<_R1, _R2>::value> {}; +struct _LIBCPP_TEMPLATE_VIS ratio_less : _BoolConstant<__ratio_less<_R1, _R2>::value> { + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); +}; template -struct _LIBCPP_TEMPLATE_VIS ratio_less_equal : _BoolConstant::value> {}; +struct _LIBCPP_TEMPLATE_VIS ratio_less_equal : _BoolConstant::value> { + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); +}; template -struct _LIBCPP_TEMPLATE_VIS ratio_greater : _BoolConstant::value> {}; +struct _LIBCPP_TEMPLATE_VIS ratio_greater : _BoolConstant::value> { + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); +}; template -struct _LIBCPP_TEMPLATE_VIS ratio_greater_equal : _BoolConstant::value> {}; +struct _LIBCPP_TEMPLATE_VIS ratio_greater_equal : _BoolConstant::value> { + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); +}; template struct __ratio_gcd { diff --git a/libcxx/test/std/utilities/ratio/ratio.arithmetic/R1_R2_requirement.verify.cpp b/libcxx/test/std/utilities/ratio/ratio.arithmetic/R1_R2_requirement.verify.cpp new file mode 100644 index 0000000..9fc91e1 --- /dev/null +++ b/libcxx/test/std/utilities/ratio/ratio.arithmetic/R1_R2_requirement.verify.cpp @@ -0,0 +1,56 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// +// +// [ratio.general]/2 +// Throughout subclause [ratio], the names of template parameters are +// used to express type requirements. If a template parameter is named +// R1 or R2, and the template argument is not a specialization of the +// ratio template, the program is ill-formed. + +#include + +struct invalid { + static const int num = 1; + static const int den = 1; +}; + +using valid = std::ratio<1, 1>; + +namespace add { +using valid_valid = std::ratio_add::type; +using invalid_valid = + std::ratio_add::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_add::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace add + +namespace subtract { +using valid_valid = std::ratio_subtract::type; +using invalid_valid = + std::ratio_subtract::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_subtract::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace subtract + +namespace multiply { +using valid_valid = std::ratio_multiply::type; +using invalid_valid = + std::ratio_multiply::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_multiply::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace multiply + +namespace divide { +using valid_valid = std::ratio_divide::type; +using invalid_valid = + std::ratio_divide::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_divide::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace divide diff --git a/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement.verify.cpp b/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement.verify.cpp new file mode 100644 index 0000000..03bb266 --- /dev/null +++ b/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement.verify.cpp @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// +// +// [ratio.general]/2 +// Throughout subclause [ratio], the names of template parameters are +// used to express type requirements. If a template parameter is named +// R1 or R2, and the template argument is not a specialization of the +// ratio template, the program is ill-formed. +// +// Since all std::ratio_xxx_v variables use the same instantiation, only one +// error will be generated. These values are tested in a separate test. 
+ +#include + +struct invalid { + static const int num = 1; + static const int den = 1; +}; + +using valid = std::ratio<1, 1>; + +namespace equal { +using valid_valid = std::ratio_equal::type; +using invalid_valid = + std::ratio_equal::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_equal::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace equal + +namespace not_equal { +using valid_valid = std::ratio_not_equal::type; +using invalid_valid = + std::ratio_not_equal::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_not_equal::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace not_equal + +namespace less { +using valid_valid = std::ratio_less::type; +using invalid_valid = + std::ratio_less::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_less::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace less + +namespace less_equal { +using valid_valid = std::ratio_less_equal::type; +using invalid_valid = + std::ratio_less_equal::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_less_equal::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace less_equal + +namespace greater { +using valid_valid = std::ratio_greater::type; +using invalid_valid = + std::ratio_greater::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_greater::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace greater + +namespace greater_equal { +using valid_valid = std::ratio_greater_equal::type; +using invalid_valid = + std::ratio_greater_equal::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_greater_equal::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace greater_equal diff --git a/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement_v.verify.cpp b/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement_v.verify.cpp new file mode 100644 index 0000000..fbcf358 --- /dev/null +++ b/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement_v.verify.cpp @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// +// +// [ratio.general]/2 +// Throughout subclause [ratio], the names of template parameters are +// used to express type requirements. If a template parameter is named +// R1 or R2, and the template argument is not a specialization of the +// ratio template, the program is ill-formed. +// +// Since all std::ratio_xxx_v variables use the same instantiation, only one +// error will be generated. These values are tested in a separate test. 
+ +#include + +struct invalid { + constexpr static int num = 1; + constexpr static int den = 1; +}; + +using valid = std::ratio<1, 1>; + +void test() { + // equal + (void)std::ratio_equal_v; + (void)std::ratio_equal_v; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} + (void)std::ratio_equal_v; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} + + // not_equal + (void)std::ratio_not_equal_v; + (void)std::ratio_not_equal_v; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} + (void)std::ratio_not_equal_v; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} + + // less + (void)std::ratio_less_v; + (void)std::ratio_less_v; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} + (void)std::ratio_less_v; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} + + // less_equal + (void)std::ratio_less_equal_v; + (void)std::ratio_less_equal_v; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} + (void)std::ratio_less_equal_v; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} + + // greater + (void)std::ratio_greater_v; + (void)std::ratio_greater_v; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} + (void)std::ratio_greater_v; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} + + // greater_equal + (void)std::ratio_greater_equal_v; + + (void)std::ratio_greater_equal_v; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} + + (void)std::ratio_greater_equal_v; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} -- cgit v1.1 From 1503db86d65ee2bcc8ec1c2a5a4d00dea02aae0d Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 16:29:17 +0300 Subject: [clang][NFC] Refactor bit-fields in `RawComment` Make them all of the same `unsigned` type, which brings `sizeof(RawComment)` down from 12 to 4 when compiling Clang for Microsoft ABI. --- clang/include/clang/AST/RawCommentList.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/AST/RawCommentList.h b/clang/include/clang/AST/RawCommentList.h index 53aae24..3e4567b 100644 --- a/clang/include/clang/AST/RawCommentList.h +++ b/clang/include/clang/AST/RawCommentList.h @@ -175,17 +175,22 @@ private: mutable StringRef RawText; mutable const char *BriefText = nullptr; - mutable bool RawTextValid : 1; ///< True if RawText is valid - mutable bool BriefTextValid : 1; ///< True if BriefText is valid + LLVM_PREFERRED_TYPE(bool) + mutable unsigned RawTextValid : 1; + LLVM_PREFERRED_TYPE(bool) + mutable unsigned BriefTextValid : 1; LLVM_PREFERRED_TYPE(CommentKind) unsigned Kind : 3; /// True if comment is attached to a declaration in ASTContext. - bool IsAttached : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned IsAttached : 1; - bool IsTrailingComment : 1; - bool IsAlmostTrailingComment : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned IsTrailingComment : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned IsAlmostTrailingComment : 1; /// Constructor for AST deserialization. 
RawComment(SourceRange SR, CommentKind K, bool IsTrailingComment, -- cgit v1.1 From 082439c33fa76ad4df267600472695d24ad53821 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 16:40:15 +0300 Subject: [clang][NFC] Refactor bit-fields in `ObjCAtTryStmt` Make all bit-fields of type `unsigned`, which reduces the amount of padding on Microsoft ABI, potentially resulting in a smaller object. --- clang/include/clang/AST/StmtObjC.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/AST/StmtObjC.h b/clang/include/clang/AST/StmtObjC.h index c46ff46..03bc61f 100644 --- a/clang/include/clang/AST/StmtObjC.h +++ b/clang/include/clang/AST/StmtObjC.h @@ -177,7 +177,8 @@ class ObjCAtTryStmt final unsigned NumCatchStmts : 16; // Whether this statement has a \@finally statement. - bool HasFinally : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned HasFinally : 1; /// Retrieve the statements that are stored after this \@try statement. /// -- cgit v1.1 From 15279e7569108cccb49ca1fcfdfae420124d3fac Mon Sep 17 00:00:00 2001 From: Carlos Galvez Date: Sun, 11 Feb 2024 15:04:03 +0100 Subject: [OpenMP] Remove -Wno-enum-constexpr-conversion (#81318) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This effectively reverts commit 9ff0cc7e0fa7e99163610d2fcb58e96f3315e343. For some reason "git revert" led to "no changes" after fixing conflicts, so a clean revert was not possible. The original issue (#57022) is no longer reproducible even with this patch, so we can remove the suppression. This is in line with our goal to make -Wenum-constexpr-conversion a non-downgradeable error, see #59036. Co-authored-by: Carlos Gálvez --- openmp/cmake/HandleOpenMPOptions.cmake | 1 - openmp/cmake/config-ix.cmake | 1 - 2 files changed, 2 deletions(-) diff --git a/openmp/cmake/HandleOpenMPOptions.cmake b/openmp/cmake/HandleOpenMPOptions.cmake index 9387d9b..4809520 100644 --- a/openmp/cmake/HandleOpenMPOptions.cmake +++ b/openmp/cmake/HandleOpenMPOptions.cmake @@ -41,7 +41,6 @@ append_if(OPENMP_HAVE_WSIGN_COMPARE_FLAG "-Wsign-compare" CMAKE_C_FLAGS CMAKE_CX # printed. Therefore, check for whether the compiler supports options in the # form -W, and if supported, add the corresponding -Wno- option. -append_if(OPENMP_HAVE_WENUM_CONSTEXPR_CONVERSION_FLAG "-Wno-enum-constexpr-conversion" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_WEXTRA_FLAG "-Wno-extra" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_WPEDANTIC_FLAG "-Wno-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_WMAYBE_UNINITIALIZED_FLAG "-Wno-maybe-uninitialized" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) diff --git a/openmp/cmake/config-ix.cmake b/openmp/cmake/config-ix.cmake index a1e1b61..cfc6833 100644 --- a/openmp/cmake/config-ix.cmake +++ b/openmp/cmake/config-ix.cmake @@ -33,7 +33,6 @@ check_cxx_compiler_flag(-Wsign-compare OPENMP_HAVE_WSIGN_COMPARE_FLAG) # printed. Therefore, check for whether the compiler supports options in the # form -W, and if supported, add the corresponding -Wno- option.
-check_cxx_compiler_flag(-Wenum-constexpr-conversion OPENMP_HAVE_WENUM_CONSTEXPR_CONVERSION_FLAG) check_cxx_compiler_flag(-Wextra OPENMP_HAVE_WEXTRA_FLAG) check_cxx_compiler_flag(-Wpedantic OPENMP_HAVE_WPEDANTIC_FLAG) check_cxx_compiler_flag(-Wmaybe-uninitialized OPENMP_HAVE_WMAYBE_UNINITIALIZED_FLAG) -- cgit v1.1 From e3f684d86b308bc2576d813aad1a230aa6b295ab Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 17:27:31 +0300 Subject: [clang][NFC] Refactor bit-fields in `DefaultedFunctionKind` This patch makes all bit-fields in `DefaultedFunctionKind` of type `unsigned`, which brings `sizeof(DefaultedFunctionKind)` down from 8 to 4 when compiling Clang for Microsoft ABI. --- clang/include/clang/Sema/Sema.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 3c26003..851560f 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -3501,29 +3501,29 @@ public: /// For a defaulted function, the kind of defaulted function that it is. class DefaultedFunctionKind { - CXXSpecialMember SpecialMember : 8; - DefaultedComparisonKind Comparison : 8; + unsigned SpecialMember : 8; + unsigned Comparison : 8; public: DefaultedFunctionKind() - : SpecialMember(CXXInvalid), Comparison(DefaultedComparisonKind::None) { + : SpecialMember(CXXInvalid), Comparison(llvm::to_underlying(DefaultedComparisonKind::None)) { } DefaultedFunctionKind(CXXSpecialMember CSM) - : SpecialMember(CSM), Comparison(DefaultedComparisonKind::None) {} + : SpecialMember(CSM), Comparison(llvm::to_underlying(DefaultedComparisonKind::None)) {} DefaultedFunctionKind(DefaultedComparisonKind Comp) - : SpecialMember(CXXInvalid), Comparison(Comp) {} + : SpecialMember(CXXInvalid), Comparison(llvm::to_underlying(Comp)) {} bool isSpecialMember() const { return SpecialMember != CXXInvalid; } bool isComparison() const { - return Comparison != DefaultedComparisonKind::None; + return static_cast(Comparison) != DefaultedComparisonKind::None; } explicit operator bool() const { return isSpecialMember() || isComparison(); } - CXXSpecialMember asSpecialMember() const { return SpecialMember; } - DefaultedComparisonKind asComparison() const { return Comparison; } + CXXSpecialMember asSpecialMember() const { return static_cast(SpecialMember); } + DefaultedComparisonKind asComparison() const { return static_cast(Comparison); } /// Get the index of this function kind for use in diagnostics. unsigned getDiagnosticIndex() const { @@ -3531,7 +3531,7 @@ public: "invalid should have highest index"); static_assert((unsigned)DefaultedComparisonKind::None == 0, "none should be equal to zero"); - return SpecialMember + (unsigned)Comparison; + return SpecialMember + Comparison; } }; -- cgit v1.1 From b45de48be24695b613f48ed21bb35f844454193b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 11 Feb 2024 15:02:27 +0000 Subject: [MVE] Expand64BitShift - handle all constant shift amounts less than 32 (#81261) Expand64BitShift was always dropping to generic shift legalization if the shift amount type was larger than i64, even if the constant shift amount was actually very small. I've adjusted the constant bounds checks to work with APInt types so we can always perform the comparison. This results in the MVE long shift instructions being used more often, and it looks like this is preventing some additional combines from happening. This could be addressed in the future. 
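A standalone sketch of the adjusted bounds check just described (using llvm::APInt directly; the helper name is invented for illustration, and the SelectionDAG plumbing around it is omitted):

    #include "llvm/ADT/APInt.h"

    // The MVE long-shift expansion applies only to constant shift
    // amounts in the range [1, 31]. Comparing the APInt itself keeps
    // the check well defined at any bit width, whereas calling
    // getZExtValue() on the constant would assert once the value has
    // more than 64 significant bits.
    static bool useMVELongShift(const llvm::APInt &ShAmt) {
      return ShAmt != 0 && ShAmt.ult(32);
    }

With this form an i128 shift amount of, say, 7 is still recognized as small, so the lowering keeps the MVE long shifts instead of bailing out to the generic expansion.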
This came about while I was trying to extend the DAGTypeLegalizer::ExpandShift* helpers and need to move to consistently using the legal shift amount types instead of reusing the shift amount type from the original wider shift. --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 4 +- llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll | 1703 +++++++++++---------- llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll | 1327 ++++++++-------- 3 files changed, 1545 insertions(+), 1489 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index b5c4a8a..b98006e 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -6702,8 +6702,8 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, // If the shift amount is greater than 32 or has a greater bitwidth than 64 // then do the default optimisation - if (ShAmt->getValueType(0).getSizeInBits() > 64 || - (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32))) + if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) || + (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32)))) return SDValue(); // Extract the lower 32 bits of the shift amount if it's not an i32 diff --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll index 3ca01cf..570834f 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll @@ -1821,44 +1821,42 @@ define arm_aapcs_vfpcc <4 x i32> @test_signed_v4f32_v4i32_duplicate(<4 x float> define arm_aapcs_vfpcc <4 x i50> @test_signed_v4f32_v4i50(<4 x float> %f) { ; CHECK-LABEL: test_signed_v4f32_v4i50: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: mov r9, r0 ; CHECK-NEXT: vmov r0, s19 ; CHECK-NEXT: vldr s20, .LCPI28_0 -; CHECK-NEXT: mov r7, r1 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: vmov r6, s16 +; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: itt lt -; CHECK-NEXT: movlt r7, #0 -; CHECK-NEXT: movtlt r7, #65534 +; CHECK-NEXT: movlt r5, #0 +; CHECK-NEXT: movtlt r5, #65534 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: vldr s22, .LCPI28_1 ; CHECK-NEXT: vcmp.f32 s19, s20 -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: vcmp.f32 s17, s22 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: vcmp.f32 s18, s22 ; CHECK-NEXT: itt lt -; CHECK-NEXT: movlt r5, #0 -; CHECK-NEXT: movtlt r5, #65534 +; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: movtlt r7, #65534 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r7, #65535 -; CHECK-NEXT: movtgt r7, #1 +; CHECK-NEXT: movwgt r5, #65535 +; CHECK-NEXT: movtgt r5, #1 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r10, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s22 ; CHECK-NEXT: it lt @@ -1866,109 
+1864,103 @@ define arm_aapcs_vfpcc <4 x i50> @test_signed_v4f32_v4i50(<4 x float> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s22 ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r5, #65535 -; CHECK-NEXT: movtgt r5, #1 +; CHECK-NEXT: movwgt r7, #65535 +; CHECK-NEXT: movtgt r7, #1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 +; CHECK-NEXT: str.w r0, [r8] +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s22 -; CHECK-NEXT: str.w r0, [r8] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r6, #0 +; CHECK-NEXT: movlt r4, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r6, #-1 +; CHECK-NEXT: movgt.w r4, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s17 +; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: itt vs -; CHECK-NEXT: movvs r6, #0 -; CHECK-NEXT: movvs r5, #0 -; CHECK-NEXT: lsls r0, r5, #22 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s20 -; CHECK-NEXT: orr.w r0, r0, r6, lsr #10 -; CHECK-NEXT: str.w r0, [r8, #20] -; CHECK-NEXT: it vs +; CHECK-NEXT: movvs r4, #0 ; CHECK-NEXT: movvs r7, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt.w r9, #0 -; CHECK-NEXT: vcmp.f32 s17, s22 +; CHECK-NEXT: vcmp.f32 s18, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r9, #-1 -; CHECK-NEXT: vcmp.f32 s17, s17 +; CHECK-NEXT: vcmp.f32 s18, s18 +; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it vs +; CHECK-NEXT: itt vs ; CHECK-NEXT: movvs.w r9, #0 -; CHECK-NEXT: lsr.w r0, r9, #14 -; CHECK-NEXT: orr.w r1, r0, r7, lsl #18 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: str.w r1, [r8, #8] +; CHECK-NEXT: movvs r5, #0 +; CHECK-NEXT: bfc r1, #18, #14 +; CHECK-NEXT: vcmp.f32 s16, s20 +; CHECK-NEXT: bfc r5, #18, #14 +; CHECK-NEXT: mov r6, r9 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: lsll r4, r1, #22 +; CHECK-NEXT: lsrl r6, r5, #28 +; CHECK-NEXT: itt lt +; CHECK-NEXT: movwlt r10, #0 +; CHECK-NEXT: movtlt r10, #65534 +; CHECK-NEXT: vcmp.f32 s16, s22 +; CHECK-NEXT: orrs r1, r5 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: itt gt +; CHECK-NEXT: movwgt r10, #65535 +; CHECK-NEXT: movtgt r10, #1 +; CHECK-NEXT: str.w r1, [r8, #20] ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: lsrs r2, r5, #10 +; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: orr.w r2, r6, r4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s22 ; CHECK-NEXT: itt lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: movtlt r1, #65534 +; CHECK-NEXT: vcmp.f32 s17, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: itt gt ; CHECK-NEXT: movwgt r1, #65535 ; CHECK-NEXT: movtgt r1, #1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s22 -; CHECK-NEXT: itt lt -; CHECK-NEXT: movlt r4, #0 -; CHECK-NEXT: movtlt r4, #65534 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r4, #65535 -; CHECK-NEXT: movtgt r4, #1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s22 +; CHECK-NEXT: str.w r2, [r8, #16] +; CHECK-NEXT: lsrs r2, r7, #10 +; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: strb.w r2, [r8, #24] +; 
CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: vcmp.f32 s17, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s18 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: ubfx r2, r7, #14, #4 -; CHECK-NEXT: vcmp.f32 s16, s16 -; CHECK-NEXT: it vs +; CHECK-NEXT: itt vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: orr.w r2, r2, r0, lsl #4 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r2, [r8, #12] -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r4, #0 -; CHECK-NEXT: vcmp.f32 s18, s18 -; CHECK-NEXT: bfc r4, #18, #14 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: orr.w r2, r4, r9, lsl #18 -; CHECK-NEXT: str.w r2, [r8, #4] -; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 -; CHECK-NEXT: lsrs r0, r0, #28 ; CHECK-NEXT: bfc r1, #18, #14 -; CHECK-NEXT: orr.w r0, r0, r1, lsl #4 -; CHECK-NEXT: orr.w r0, r0, r6, lsl #22 -; CHECK-NEXT: str.w r0, [r8, #16] +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: lsrl r2, r1, #14 +; CHECK-NEXT: vcmp.f32 s16, s16 +; CHECK-NEXT: orr.w r1, r1, r9, lsl #4 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: strd r2, r1, [r8, #8] +; CHECK-NEXT: it vs +; CHECK-NEXT: movvs.w r10, #0 +; CHECK-NEXT: bfc r10, #18, #14 +; CHECK-NEXT: orr.w r0, r10, r0, lsl #18 +; CHECK-NEXT: str.w r0, [r8, #4] ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI28_0: @@ -2120,21 +2112,22 @@ define arm_aapcs_vfpcc <4 x i64> @test_signed_v4f32_v4i64(<4 x float> %f) { define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { ; CHECK-LABEL: test_signed_v4f32_v4i100: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r9, r0 ; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vldr s20, .LCPI30_0 -; CHECK-NEXT: vmov r7, s19 -; CHECK-NEXT: vmov r5, s16 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vldr s22, .LCPI30_1 -; CHECK-NEXT: mov r6, r3 +; CHECK-NEXT: mov r10, r3 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vldr s22, .LCPI30_0 +; CHECK-NEXT: vmov r7, s17 +; CHECK-NEXT: vldr s20, .LCPI30_1 +; CHECK-NEXT: vmov r4, s19 ; CHECK-NEXT: vcmp.f32 s18, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s18, s20 @@ -2150,7 +2143,7 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: movvs r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: str.w r2, [r4, #33] +; CHECK-NEXT: str.w r2, [r9, #33] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -2162,7 +2155,7 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r1, [r4, #29] +; CHECK-NEXT: str.w r1, [r9, #29] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vcmp.f32 s18, s20 @@ -2173,11 +2166,11 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: vmrs 
APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: str.w r0, [r4, #25] -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: str.w r0, [r9, #25] +; CHECK-NEXT: mov r0, r3 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: vcmp.f32 s16, s22 -; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: mov r11, r3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt @@ -2192,7 +2185,7 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: movvs r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: str r2, [r4, #8] +; CHECK-NEXT: str.w r2, [r9, #8] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -2204,7 +2197,7 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str r1, [r4, #4] +; CHECK-NEXT: str.w r1, [r9, #4] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vcmp.f32 s16, s20 @@ -2215,165 +2208,165 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: str r0, [r4] -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: str.w r0, [r9] +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: vcmp.f32 s19, s22 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r6, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s22 +; CHECK-NEXT: vcmp.f32 s18, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: movvs r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: mvnlt r10, #7 +; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: movgt.w r10, #7 +; CHECK-NEXT: vcmp.f32 s18, s18 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r7, r1, #28 -; CHECK-NEXT: vcmp.f32 s19, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r2, #0 -; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 +; CHECK-NEXT: movvs.w r10, #0 +; CHECK-NEXT: and r0, r10, #15 +; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: orr.w r0, r0, r6, lsl #4 +; CHECK-NEXT: str.w r0, [r9, #37] +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: bl __fixsfti +; CHECK-NEXT: vcmp.f32 s17, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r7, [r4, #45] +; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcmp.f32 s19, s19 -; CHECK-NEXT: lsrs r2, r2, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s16, s22 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: lsrs r7, r0, #28 -; CHECK-NEXT: vcmp.f32 s19, s22 -; CHECK-NEXT: orr.w r7, r7, r1, lsl #4 -; CHECK-NEXT: vmov r1, s17 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 -; CHECK-NEXT: str.w r7, [r4, #41] +; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt -; 
CHECK-NEXT: mvnlt r3, #7 +; CHECK-NEXT: mvnlt r11, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 +; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #7 +; CHECK-NEXT: movgt.w r11, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s22 +; CHECK-NEXT: vcmp.f32 s19, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: orr.w r2, r2, r3, lsl #4 +; CHECK-NEXT: movvs.w r11, #0 +; CHECK-NEXT: and r7, r11, #15 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb.w r2, [r4, #49] +; CHECK-NEXT: vcmp.f32 s19, s20 +; CHECK-NEXT: orr.w r7, r7, r0, lsl #4 +; CHECK-NEXT: str.w r7, [r9, #12] ; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r6, #7 -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: movlt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r6, #7 -; CHECK-NEXT: vcmp.f32 s18, s18 +; CHECK-NEXT: movgt.w r5, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s19, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r6, #0 -; CHECK-NEXT: and r2, r6, #15 -; CHECK-NEXT: orr.w r0, r2, r0, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #37] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcmp.f32 s17, s22 +; CHECK-NEXT: movvs r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt r4, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s17 +; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r4, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s22 +; CHECK-NEXT: lsrl r6, r5, #28 +; CHECK-NEXT: vcmp.f32 s19, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: movvs r4, #0 +; CHECK-NEXT: orr.w r7, r5, r4, lsl #4 +; CHECK-NEXT: str.w r7, [r9, #45] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: str.w r6, [r9, #41] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: mvnlt r8, #7 +; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: movgt.w r8, #7 +; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r2, #0 -; CHECK-NEXT: lsrs r7, r1, #28 +; CHECK-NEXT: movvs.w r8, #0 +; CHECK-NEXT: and r5, r8, #15 +; CHECK-NEXT: vcmp.f32 s17, s22 +; CHECK-NEXT: lsrl r4, r5, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, s20 -; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 -; CHECK-NEXT: str r7, [r4, #20] +; CHECK-NEXT: strb.w r4, [r9, #49] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: mvnlt r3, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: movgt r3, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r0, #0 +; CHECK-NEXT: movvs r3, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s20 -; CHECK-NEXT: lsr.w r7, r0, #28 -; CHECK-NEXT: orr.w r1, r7, r1, lsl #4 -; CHECK-NEXT: str r1, [r4, #16] ; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r3, #7 +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt 
r3, #7 +; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r1, r2, #28 -; CHECK-NEXT: vcmp.f32 s16, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: orr.w r1, r1, r3, lsl #4 +; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vcmp.f32 s17, s22 +; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb r1, [r4, #24] ; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r5, #7 -; CHECK-NEXT: vcmp.f32 s16, s20 +; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: lsrl r0, r1, #28 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r5, #7 -; CHECK-NEXT: vcmp.f32 s16, s16 +; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r5, #0 -; CHECK-NEXT: and r1, r5, #15 -; CHECK-NEXT: orr.w r0, r1, r0, lsl #4 -; CHECK-NEXT: str r0, [r4, #12] +; CHECK-NEXT: movvs r2, #0 +; CHECK-NEXT: orr.w r1, r1, r2, lsl #4 +; CHECK-NEXT: strd r0, r1, [r9, #16] +; CHECK-NEXT: and r1, r3, #15 +; CHECK-NEXT: lsrl r2, r1, #28 +; CHECK-NEXT: strb.w r2, [r9, #24] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI30_0: -; CHECK-NEXT: .long 0x70ffffff @ float 6.33825262E+29 -; CHECK-NEXT: .LCPI30_1: ; CHECK-NEXT: .long 0xf1000000 @ float -6.338253E+29 +; CHECK-NEXT: .LCPI30_1: +; CHECK-NEXT: .long 0x70ffffff @ float 6.33825262E+29 %x = call <4 x i100> @llvm.fptosi.sat.v4f32.v4i100(<4 x float> %f) ret <4 x i100> %x } @@ -3694,151 +3687,155 @@ define arm_aapcs_vfpcc <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) { ; CHECK-NEXT: sub sp, #48 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI40_0 -; CHECK-NEXT: vmov r6, r5, d8 -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: vmov r9, r8, d0 -; CHECK-NEXT: str.w r8, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: vmov r5, r7, d8 +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: vmov r9, r3, d0 +; CHECK-NEXT: str r0, [sp, #40] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: vldr d0, .LCPI40_1 -; CHECK-NEXT: mov r10, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: vmov r7, r3, d0 -; CHECK-NEXT: str r3, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: vmov r8, r3, d0 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r11, r3 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: bl __fixdfti ; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: strd r1, r0, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: csel r4, r2, r4, ne +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: cmp.w r10, #0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it 
ne ; CHECK-NEXT: movne.w r4, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: str.w r11, [sp, #44] @ 4-byte Spill -; CHECK-NEXT: str.w r4, [r11, #8] -; CHECK-NEXT: str.w r9, [sp, #40] @ 4-byte Spill -; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: str.w r4, [r10, #8] +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: ldr r4, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r10, r7 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: str.w r9, [sp, #44] @ 4-byte Spill ; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: csel r7, r1, r0, ne -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: csel r6, r1, r0, ne +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: cmp.w r10, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r7, #-1 +; CHECK-NEXT: movne.w r6, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #0 -; CHECK-NEXT: str.w r7, [r11, #4] -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: ldr.w r11, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: movne r6, #0 +; CHECK-NEXT: ldr r0, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: str.w r10, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: str r6, [r0, #4] +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r10, r8 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: csel r7, r1, r0, ne -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: cmp.w r9, #0 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: csel r6, r1, r0, ne +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r7, #-1 -; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: str r5, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: movne.w r6, #-1 +; CHECK-NEXT: str r5, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r7, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: vmov r9, r8, d9 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #0 -; CHECK-NEXT: ldr r0, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: mov r5, r11 -; CHECK-NEXT: str r7, [r0] -; CHECK-NEXT: ldr r7, [sp, #40] @ 4-byte Reload -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: movne r6, #0 +; CHECK-NEXT: ldr r0, [sp, #40] @ 4-byte Reload 
+; CHECK-NEXT: str r6, [r0] +; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r4, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r4, r10 +; CHECK-NEXT: str.w r10, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: mov r5, r11 +; CHECK-NEXT: str.w r11, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r11, r0 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: bl __fixdfti +; CHECK-NEXT: mov r10, r3 +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: cmp.w r11, #0 -; CHECK-NEXT: strd r2, r0, [sp, #4] @ 8-byte Folded Spill -; CHECK-NEXT: csel r10, r1, r11, ne -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: it eq +; CHECK-NEXT: mvneq r10, #7 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #-1 +; CHECK-NEXT: movne.w r10, #7 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r10, #0 +; CHECK-NEXT: ldr r7, [sp, #44] @ 4-byte Reload ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r6, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: mov r11, r0 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 @@ -3852,21 +3849,21 @@ define arm_aapcs_vfpcc <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) { ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: ldr r1, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: lsr.w r0, r10, #28 -; CHECK-NEXT: orr.w r0, r0, r4, lsl #4 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r10 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: mov r7, r5 -; CHECK-NEXT: str r0, [r1, #20] +; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: mov r5, r6 +; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: ldr r7, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r2, r7 ; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload @@ -3876,73 +3873,75 @@ define arm_aapcs_vfpcc <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) { ; CHECK-NEXT: csel r11, r1, r0, ne ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w 
r11, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r11, #0 -; CHECK-NEXT: ldr r5, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: lsr.w r0, r11, #28 -; CHECK-NEXT: orr.w r0, r0, r10, lsl #4 -; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: mov r3, r7 -; CHECK-NEXT: str r0, [r5, #16] +; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: mov r4, r11 +; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload ; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: ldr r2, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: lsrl r4, r1, #28 +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r2, r7 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: mov r10, r6 +; CHECK-NEXT: mov r7, r6 ; CHECK-NEXT: bl __aeabi_dcmpge +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: it eq -; CHECK-NEXT: mvneq r0, #7 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r0, #7 -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: csel r6, r1, r0, ne +; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r6, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: lsr.w r0, r4, #28 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r6, #0 -; CHECK-NEXT: orr.w r0, r0, r6, lsl #4 -; CHECK-NEXT: strb r0, [r5, #24] -; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: and r1, r10, #15 ; CHECK-NEXT: ldr r2, [sp, #40] @ 4-byte Reload -; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: orr.w r0, r0, r6, lsl #4 +; CHECK-NEXT: lsrl r6, r1, #28 +; CHECK-NEXT: strd r4, r0, [r2, #16] +; CHECK-NEXT: mov r8, r2 +; CHECK-NEXT: strb r6, [r2, #24] +; CHECK-NEXT: ldr r5, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r6, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: mov r6, r4 +; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: it eq ; CHECK-NEXT: mvneq r0, #7 -; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #7 ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r2, r5 ; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 @@ -3950,7 +3949,7 @@ define arm_aapcs_vfpcc <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) { ; CHECK-NEXT: movne r4, #0 ; CHECK-NEXT: and r0, r4, #15 ; CHECK-NEXT: orr.w r0, r0, r11, lsl #4 -; CHECK-NEXT: str r0, [r5, 
#12] +; CHECK-NEXT: str.w r0, [r8, #12] ; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -4694,107 +4693,127 @@ define arm_aapcs_vfpcc <8 x i16> @test_signed_v8f16_v8i16(<8 x half> %f) { define arm_aapcs_vfpcc <8 x i19> @test_signed_v8f16_v8i19(<8 x half> %f) { ; CHECK-LABEL: test_signed_v8f16_v8i19: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: vldr s12, .LCPI46_0 -; CHECK-NEXT: vcvtt.f32.f16 s15, s3 -; CHECK-NEXT: vldr s14, .LCPI46_1 -; CHECK-NEXT: vcvtb.f32.f16 s7, s0 -; CHECK-NEXT: vmaxnm.f32 s16, s15, s12 -; CHECK-NEXT: vcvtb.f32.f16 s4, s1 -; CHECK-NEXT: vcvtt.f32.f16 s8, s1 -; CHECK-NEXT: vcvtb.f32.f16 s1, s2 -; CHECK-NEXT: vcvtt.f32.f16 s0, s0 -; CHECK-NEXT: vcvtt.f32.f16 s2, s2 -; CHECK-NEXT: vcvtb.f32.f16 s3, s3 -; CHECK-NEXT: vmaxnm.f32 s6, s4, s12 -; CHECK-NEXT: vmaxnm.f32 s10, s8, s12 -; CHECK-NEXT: vmaxnm.f32 s5, s1, s12 -; CHECK-NEXT: vmaxnm.f32 s9, s7, s12 -; CHECK-NEXT: vmaxnm.f32 s11, s0, s12 -; CHECK-NEXT: vmaxnm.f32 s13, s2, s12 -; CHECK-NEXT: vminnm.f32 s16, s16, s14 -; CHECK-NEXT: vmaxnm.f32 s12, s3, s12 -; CHECK-NEXT: vcvt.s32.f32 s16, s16 -; CHECK-NEXT: vminnm.f32 s12, s12, s14 -; CHECK-NEXT: vminnm.f32 s13, s13, s14 -; CHECK-NEXT: vcvt.s32.f32 s12, s12 -; CHECK-NEXT: vminnm.f32 s9, s9, s14 -; CHECK-NEXT: vcvt.s32.f32 s13, s13 -; CHECK-NEXT: vminnm.f32 s11, s11, s14 -; CHECK-NEXT: vcvt.s32.f32 s11, s11 -; CHECK-NEXT: vminnm.f32 s5, s5, s14 -; CHECK-NEXT: vcvt.s32.f32 s9, s9 -; CHECK-NEXT: vminnm.f32 s10, s10, s14 -; CHECK-NEXT: vcmp.f32 s15, s15 -; CHECK-NEXT: vminnm.f32 s6, s6, s14 -; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r1, #0 -; CHECK-NEXT: lsrs r2, r1, #11 -; CHECK-NEXT: vcmp.f32 s3, s3 -; CHECK-NEXT: strb r2, [r0, #18] -; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: .save {r4, r5, r7, r9, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r7, r9, r11, lr} +; CHECK-NEXT: vldr s4, .LCPI46_0 +; CHECK-NEXT: vcvtb.f32.f16 s8, s1 +; CHECK-NEXT: vcvtt.f32.f16 s12, s1 +; CHECK-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-NEXT: vldr s6, .LCPI46_1 +; CHECK-NEXT: vmaxnm.f32 s5, s1, s4 +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vmaxnm.f32 s14, s12, s4 +; CHECK-NEXT: vminnm.f32 s5, s5, s6 +; CHECK-NEXT: vmaxnm.f32 s7, s0, s4 +; CHECK-NEXT: vminnm.f32 s7, s7, s6 +; CHECK-NEXT: vcvt.s32.f32 s5, s5 +; CHECK-NEXT: vcvt.s32.f32 s7, s7 +; CHECK-NEXT: vminnm.f32 s14, s14, s6 +; CHECK-NEXT: vcvt.s32.f32 s14, s14 +; CHECK-NEXT: vmaxnm.f32 s10, s8, s4 +; CHECK-NEXT: vminnm.f32 s10, s10, s6 +; CHECK-NEXT: vcmp.f32 s1, s1 +; CHECK-NEXT: vcvt.s32.f32 s10, s10 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s0, s0 +; CHECK-NEXT: mov.w r7, #0 +; CHECK-NEXT: vcvtb.f32.f16 s0, s2 +; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: ubfx r2, r3, #14, #5 -; CHECK-NEXT: vcvt.s32.f32 s5, s5 -; CHECK-NEXT: orr.w r1, r2, r1, lsl #5 -; CHECK-NEXT: vcmp.f32 s2, s2 -; CHECK-NEXT: strh r1, [r0, #16] -; CHECK-NEXT: vmov lr, s13 +; CHECK-NEXT: movvs r2, #0 +; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: bfc r2, #19, #13 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w lr, #0 -; CHECK-NEXT: ubfx r1, lr, #1, #18 -; CHECK-NEXT: vcmp.f32 s0, s0 -; CHECK-NEXT: orr.w r1, r1, r3, lsl #18 -; CHECK-NEXT: vcvt.s32.f32 s10, s10 +; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: vcmp.f32 s12, s12 ; CHECK-NEXT: vmrs 
APSR_nzcv, fpscr -; CHECK-NEXT: vmov r12, s11 -; CHECK-NEXT: str r1, [r0, #12] -; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vcmp.f32 s8, s8 +; CHECK-NEXT: lsll r2, r7, #19 +; CHECK-NEXT: bfc r1, #19, #13 +; CHECK-NEXT: vmov r12, s14 +; CHECK-NEXT: vmaxnm.f32 s8, s0, s4 +; CHECK-NEXT: orr.w r1, r1, r2 +; CHECK-NEXT: str r1, [r0] ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs.w r12, #0 -; CHECK-NEXT: vcmp.f32 s7, s7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s2 +; CHECK-NEXT: vmaxnm.f32 s2, s0, s4 +; CHECK-NEXT: vminnm.f32 s8, s8, s6 +; CHECK-NEXT: vminnm.f32 s2, s2, s6 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vcvt.s32.f32 s2, s2 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r3, #0 +; CHECK-NEXT: vcvt.s32.f32 s8, s8 ; CHECK-NEXT: bfc r3, #19, #13 -; CHECK-NEXT: vcvt.s32.f32 s6, s6 -; CHECK-NEXT: orr.w r3, r3, r12, lsl #19 -; CHECK-NEXT: str r3, [r0] -; CHECK-NEXT: vcmp.f32 s1, s1 -; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: bfc r2, #19, #13 +; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: lsrl r2, r1, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s0, s0 +; CHECK-NEXT: lsrl r4, r9, #26 +; CHECK-NEXT: vcvtt.f32.f16 s0, s3 +; CHECK-NEXT: mov lr, r1 +; CHECK-NEXT: orr.w r1, r4, r2 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmaxnm.f32 s2, s0, s4 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vminnm.f32 s2, s2, s6 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: vcmp.f32 s8, s8 -; CHECK-NEXT: bfc r3, #19, #13 -; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: movvs r2, #0 +; CHECK-NEXT: vcvt.s32.f32 s2, s2 +; CHECK-NEXT: bfc r2, #19, #13 +; CHECK-NEXT: lsll r2, r5, #12 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r1, #0 -; CHECK-NEXT: ubfx r2, r1, #7, #12 -; CHECK-NEXT: vcmp.f32 s4, s4 -; CHECK-NEXT: orr.w r2, r2, r3, lsl #12 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: orr.w r2, r2, lr, lsl #31 +; CHECK-NEXT: movvs r4, #0 +; CHECK-NEXT: orrs r2, r1 +; CHECK-NEXT: bfc r4, #19, #13 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: lsll r4, r1, #31 +; CHECK-NEXT: vcmp.f32 s0, s0 +; CHECK-NEXT: orrs r2, r4 ; CHECK-NEXT: str r2, [r0, #8] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: ubfx r3, r12, #13, #6 +; CHECK-NEXT: orr.w r2, r7, r3, lsl #6 +; CHECK-NEXT: vcvtb.f32.f16 s0, s3 +; CHECK-NEXT: orr.w r3, r2, r12, lsl #25 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmaxnm.f32 s2, s0, s4 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vminnm.f32 s2, s2, s6 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r2, #0 +; CHECK-NEXT: vcvt.s32.f32 s2, s2 ; CHECK-NEXT: bfc r2, #19, #13 -; CHECK-NEXT: orr.w r2, r3, r2, lsl #6 -; CHECK-NEXT: orr.w r1, r2, r1, lsl #25 -; CHECK-NEXT: str r1, [r0, #4] -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: vcmp.f32 s0, s0 +; CHECK-NEXT: lsll r2, r7, #5 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: vmov r7, s2 +; CHECK-NEXT: it vs +; CHECK-NEXT: movvs r7, #0 +; CHECK-NEXT: mov r4, r7 +; CHECK-NEXT: bfc r4, #19, #13 +; CHECK-NEXT: lsrl r4, r11, #14 +; CHECK-NEXT: orrs r2, r4 +; CHECK-NEXT: strh r2, [r0, #16] +; CHECK-NEXT: str r3, [r0, #4] +; CHECK-NEXT: lsrs r2, r2, #16 +; CHECK-NEXT: strb r2, [r0, #18] +; CHECK-NEXT: orr.w r2, r9, lr +; CHECK-NEXT: orrs r2, r5 +; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: orr.w r1, r1, r7, lsl #18 +; CHECK-NEXT: str r1, [r0, #12] +; CHECK-NEXT: pop.w {r4, r5, r7, r9, r11, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: 
.LCPI46_0: @@ -4844,42 +4863,40 @@ define arm_aapcs_vfpcc <8 x i50> @test_signed_v8f16_v8i50(<8 x half> %f) { ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: vcvtt.f32.f16 s28, s19 -; CHECK-NEXT: vmov r0, s28 +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: vcvtt.f32.f16 s30, s19 +; CHECK-NEXT: vmov r0, s30 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: vcvtb.f32.f16 s26, s18 -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: vldr s22, .LCPI48_1 -; CHECK-NEXT: vcvtb.f32.f16 s24, s16 -; CHECK-NEXT: vcvtt.f32.f16 s18, s18 -; CHECK-NEXT: vcmp.f32 s28, s22 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: vldr s24, .LCPI48_1 +; CHECK-NEXT: vcvtb.f32.f16 s20, s16 +; CHECK-NEXT: vcvtb.f32.f16 s28, s19 +; CHECK-NEXT: vcmp.f32 s30, s24 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vmov r6, s24 -; CHECK-NEXT: vldr s20, .LCPI48_0 -; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmov r7, s20 +; CHECK-NEXT: vldr s22, .LCPI48_0 +; CHECK-NEXT: vmov r6, s28 ; CHECK-NEXT: itt lt -; CHECK-NEXT: movlt r4, #0 -; CHECK-NEXT: movtlt r4, #65534 +; CHECK-NEXT: movlt r5, #0 +; CHECK-NEXT: movtlt r5, #65534 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcmp.f32 s26, s22 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: vcmp.f32 s26, s24 +; CHECK-NEXT: mov r10, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s20 +; CHECK-NEXT: vcmp.f32 s30, s22 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: vcmp.f32 s26, s22 ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r4, #65535 -; CHECK-NEXT: movtgt r4, #1 +; CHECK-NEXT: movwgt r5, #65535 +; CHECK-NEXT: movtgt r5, #1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 @@ -4887,263 +4904,244 @@ define arm_aapcs_vfpcc <8 x i50> @test_signed_v8f16_v8i50(<8 x half> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: str.w r0, [r11, #25] -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: str.w r0, [r9, #25] +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcmp.f32 s24, s22 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: vcmp.f32 s20, s24 +; CHECK-NEXT: mov r8, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s24, s20 +; CHECK-NEXT: vcmp.f32 s20, s22 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s24, s24 +; CHECK-NEXT: vcmp.f32 s20, s20 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s22 +; CHECK-NEXT: vcmp.f32 s30, s24 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 +; CHECK-NEXT: str.w r0, [r9] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r0, [r11] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #0 -; CHECK-NEXT: vcmp.f32 s28, s20 +; CHECK-NEXT: movlt r4, #0 +; CHECK-NEXT: vcmp.f32 s30, s22 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r7, #-1 -; CHECK-NEXT: vcmp.f32 s28, s28 +; CHECK-NEXT: movgt.w r4, #-1 +; 
CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r7, #0 -; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: it vs +; CHECK-NEXT: itt vs ; CHECK-NEXT: movvs r4, #0 -; CHECK-NEXT: lsls r0, r4, #22 -; CHECK-NEXT: orr.w r7, r0, r7, lsr #10 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: movvs r5, #0 +; CHECK-NEXT: mov r7, r5 +; CHECK-NEXT: bfc r7, #18, #14 +; CHECK-NEXT: lsll r4, r7, #22 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcmp.f32 s18, s22 -; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: vcmp.f32 s28, s24 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: itt lt +; CHECK-NEXT: vcmp.f32 s28, s22 +; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r6, #0 -; CHECK-NEXT: movtlt r6, #65534 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s18 -; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r6, #65535 -; CHECK-NEXT: movtgt r6, #1 +; CHECK-NEXT: vcmp.f32 s28, s28 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r6, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: vcmp.f32 s18, s22 -; CHECK-NEXT: str.w r7, [r11, #45] +; CHECK-NEXT: vcmp.f32 s28, s24 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #0 -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: itt lt +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movtlt r1, #65534 +; CHECK-NEXT: vcmp.f32 s28, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s18 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r5, #-1 +; CHECK-NEXT: vcmp.f32 s28, s28 +; CHECK-NEXT: itt gt +; CHECK-NEXT: movwgt r1, #65535 +; CHECK-NEXT: movtgt r1, #1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r5, #0 -; CHECK-NEXT: lsrs r0, r5, #14 -; CHECK-NEXT: orr.w r0, r0, r6, lsl #18 -; CHECK-NEXT: vcvtt.f32.f16 s18, s17 -; CHECK-NEXT: str.w r0, [r11, #33] -; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: bfc r1, #18, #14 +; CHECK-NEXT: vcvtt.f32.f16 s28, s18 +; CHECK-NEXT: lsrl r2, r1, #28 +; CHECK-NEXT: orr.w r0, r1, r7 +; CHECK-NEXT: str.w r0, [r9, #45] +; CHECK-NEXT: vmov r0, s28 +; CHECK-NEXT: orrs r4, r2 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcmp.f32 s18, s22 -; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: vcmp.f32 s28, s24 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: itt lt +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movtlt r1, #65534 +; CHECK-NEXT: vcmp.f32 s28, s22 +; CHECK-NEXT: vcvtb.f32.f16 s18, s17 +; CHECK-NEXT: lsrs r0, r5, #10 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: itt gt +; CHECK-NEXT: movwgt r1, #65535 +; CHECK-NEXT: movtgt r1, #1 +; CHECK-NEXT: str.w r4, [r9, #41] +; CHECK-NEXT: strb.w r0, [r9, #49] +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vcmp.f32 s28, s24 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: vcmp.f32 s28, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s18 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: movgt.w r7, #-1 +; CHECK-NEXT: vcmp.f32 s28, s28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s22 -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r0, #0 +; CHECK-NEXT: itt vs +; CHECK-NEXT: movvs r7, #0 +; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: bfc r1, #18, #14 +; CHECK-NEXT: mov r4, r7 +; CHECK-NEXT: lsrl r4, r1, #14 +; CHECK-NEXT: orr.w 
r6, r1, r6, lsl #4 +; CHECK-NEXT: bl __aeabi_f2lz +; CHECK-NEXT: vcvtt.f32.f16 s28, s17 +; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: vmov r0, s28 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: vcmp.f32 s18, s24 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: itt lt -; CHECK-NEXT: movwlt r9, #0 -; CHECK-NEXT: movtlt r9, #65534 -; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: movlt r5, #0 +; CHECK-NEXT: movtlt r5, #65534 +; CHECK-NEXT: vcmp.f32 s18, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r9, #65535 -; CHECK-NEXT: movtgt r9, #1 -; CHECK-NEXT: vcmp.f32 s18, s18 -; CHECK-NEXT: vcvtt.f32.f16 s16, s16 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w r9, #0 -; CHECK-NEXT: lsl.w r0, r9, #22 -; CHECK-NEXT: orr.w r0, r0, r1, lsr #10 -; CHECK-NEXT: str.w r0, [r11, #20] -; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: movwgt r5, #65535 +; CHECK-NEXT: movtgt r5, #1 +; CHECK-NEXT: str.w r6, [r9, #37] +; CHECK-NEXT: str.w r4, [r9, #33] ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcmp.f32 s16, s22 -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r8, #0 +; CHECK-NEXT: vcmp.f32 s28, s24 +; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s16 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r8, #-1 +; CHECK-NEXT: vcmp.f32 s28, s22 +; CHECK-NEXT: itt lt +; CHECK-NEXT: movlt r4, #0 +; CHECK-NEXT: movtlt r4, #65534 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r10, r1 -; CHECK-NEXT: vcmp.f32 s16, s22 -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w r8, #0 +; CHECK-NEXT: vcmp.f32 s26, s24 +; CHECK-NEXT: itt gt +; CHECK-NEXT: movwgt r4, #65535 +; CHECK-NEXT: movtgt r4, #1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: itt lt ; CHECK-NEXT: movwlt r10, #0 ; CHECK-NEXT: movtlt r10, #65534 -; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: lsr.w r0, r8, #14 +; CHECK-NEXT: vcmp.f32 s26, s22 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: itt gt ; CHECK-NEXT: movwgt r10, #65535 ; CHECK-NEXT: movtgt r10, #1 +; CHECK-NEXT: vcmp.f32 s26, s26 +; CHECK-NEXT: vcvtt.f32.f16 s16, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs.w r10, #0 -; CHECK-NEXT: orr.w r0, r0, r10, lsl #18 -; CHECK-NEXT: str.w r0, [r11, #8] -; CHECK-NEXT: lsrs r0, r4, #10 -; CHECK-NEXT: vcvtb.f32.f16 s16, s19 -; CHECK-NEXT: strb.w r0, [r11, #49] +; CHECK-NEXT: bfc r10, #18, #14 +; CHECK-NEXT: vcmp.f32 s28, s24 +; CHECK-NEXT: orr.w r0, r10, r7, lsl #18 +; CHECK-NEXT: str.w r0, [r9, #29] ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: vcmp.f32 s16, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s28, s22 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #0 -; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: ubfx r0, r6, #14, #4 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s28, s28 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r7, #-1 -; CHECK-NEXT: vcmp.f32 s16, s16 -; CHECK-NEXT: vcvtb.f32.f16 s18, s17 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r7, #0 -; CHECK-NEXT: orr.w r0, r0, r7, lsl #4 -; CHECK-NEXT: str.w r0, [r11, #37] -; CHECK-NEXT: vcmp.f32 s26, s22 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: itt lt -; CHECK-NEXT: 
movlt r0, #0 -; CHECK-NEXT: movtlt r0, #65534 -; CHECK-NEXT: vcmp.f32 s26, s20 -; CHECK-NEXT: mov r4, r1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r0, #65535 -; CHECK-NEXT: movtgt r0, #1 -; CHECK-NEXT: vcmp.f32 s26, s26 +; CHECK-NEXT: movgt.w r6, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: bfc r0, #18, #14 -; CHECK-NEXT: orr.w r0, r0, r5, lsl #18 -; CHECK-NEXT: str.w r0, [r11, #29] -; CHECK-NEXT: lsr.w r0, r9, #10 -; CHECK-NEXT: strb.w r0, [r11, #24] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcmp.f32 s18, s22 -; CHECK-NEXT: ubfx r2, r10, #14, #4 +; CHECK-NEXT: vcmp.f32 s18, s24 +; CHECK-NEXT: itt vs +; CHECK-NEXT: movvs r6, #0 +; CHECK-NEXT: movvs r4, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: movlt.w r11, #0 +; CHECK-NEXT: vcmp.f32 s18, s22 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: movgt.w r11, #-1 ; CHECK-NEXT: vcmp.f32 s18, s18 +; CHECK-NEXT: bfc r1, #18, #14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: orr.w r2, r2, r0, lsl #4 -; CHECK-NEXT: str.w r2, [r11, #12] -; CHECK-NEXT: vcmp.f32 s24, s22 -; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: itt vs +; CHECK-NEXT: movvs.w r11, #0 +; CHECK-NEXT: movvs r5, #0 +; CHECK-NEXT: vcmp.f32 s20, s24 +; CHECK-NEXT: bfc r5, #18, #14 +; CHECK-NEXT: mov r10, r11 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s24, s20 +; CHECK-NEXT: lsll r6, r1, #22 +; CHECK-NEXT: lsrl r10, r5, #28 ; CHECK-NEXT: itt lt -; CHECK-NEXT: movlt r2, #0 -; CHECK-NEXT: movtlt r2, #65534 +; CHECK-NEXT: movwlt r8, #0 +; CHECK-NEXT: movtlt r8, #65534 +; CHECK-NEXT: vcmp.f32 s20, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s24, s24 ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r2, #65535 -; CHECK-NEXT: movtgt r2, #1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s22 -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r2, #0 +; CHECK-NEXT: movwgt r8, #65535 +; CHECK-NEXT: movtgt r8, #1 +; CHECK-NEXT: orrs r1, r5 +; CHECK-NEXT: str.w r1, [r9, #20] +; CHECK-NEXT: bl __aeabi_f2lz +; CHECK-NEXT: vcmp.f32 s16, s24 +; CHECK-NEXT: orr.w r2, r10, r6 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: itt lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: movtlt r1, #65534 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s22 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: itt gt ; CHECK-NEXT: movwgt r1, #65535 ; CHECK-NEXT: movtgt r1, #1 +; CHECK-NEXT: str.w r2, [r9, #16] +; CHECK-NEXT: lsrs r2, r4, #10 +; CHECK-NEXT: vcmp.f32 s16, s24 +; CHECK-NEXT: strb.w r2, [r9, #24] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: itt lt -; CHECK-NEXT: movlt r4, #0 -; CHECK-NEXT: movtlt r4, #65534 -; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: bfc r2, #18, #14 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: vcmp.f32 s16, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r4, #65535 -; CHECK-NEXT: movtgt r4, #1 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r0, #-1 ; CHECK-NEXT: vcmp.f32 s16, s16 -; CHECK-NEXT: orr.w r2, r2, r8, lsl #18 -; CHECK-NEXT: str.w r2, [r11, #4] -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r4, #0 -; CHECK-NEXT: bfc r4, #18, #14 -; CHECK-NEXT: 
ldr r3, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: lsrs r2, r7, #28 -; CHECK-NEXT: vcmp.f32 s18, s18 -; CHECK-NEXT: orr.w r2, r2, r4, lsl #4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: orr.w r2, r2, r3, lsl #22 -; CHECK-NEXT: str.w r2, [r11, #41] -; CHECK-NEXT: it vs +; CHECK-NEXT: itt vs +; CHECK-NEXT: movvs r0, #0 ; CHECK-NEXT: movvs r1, #0 -; CHECK-NEXT: lsrs r0, r0, #28 ; CHECK-NEXT: bfc r1, #18, #14 -; CHECK-NEXT: orr.w r0, r0, r1, lsl #4 -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: orr.w r0, r0, r1, lsl #22 -; CHECK-NEXT: str.w r0, [r11, #16] -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: lsrl r2, r1, #14 +; CHECK-NEXT: vcmp.f32 s20, s20 +; CHECK-NEXT: orr.w r1, r1, r11, lsl #4 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: strd r2, r1, [r9, #8] +; CHECK-NEXT: it vs +; CHECK-NEXT: movvs.w r8, #0 +; CHECK-NEXT: bfc r8, #18, #14 +; CHECK-NEXT: orr.w r0, r8, r0, lsl #18 +; CHECK-NEXT: str.w r0, [r9, #4] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 2 @@ -5426,477 +5424,482 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) { define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-LABEL: test_signed_v8f16_v8i100: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vcvtb.f32.f16 s30, s19 -; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: vcvtb.f32.f16 s21, s19 +; CHECK-NEXT: vcvtt.f32.f16 s24, s19 +; CHECK-NEXT: vmov r0, s21 +; CHECK-NEXT: vcvtb.f32.f16 s26, s16 +; CHECK-NEXT: vcvtb.f32.f16 s28, s17 +; CHECK-NEXT: vcvtb.f32.f16 s30, s18 +; CHECK-NEXT: vldr s20, .LCPI50_2 +; CHECK-NEXT: vmov r8, s24 +; CHECK-NEXT: vmov r4, s26 +; CHECK-NEXT: vcvtt.f32.f16 s22, s18 +; CHECK-NEXT: vmov r6, s28 +; CHECK-NEXT: vmov r5, s30 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcvtb.f32.f16 s28, s18 -; CHECK-NEXT: mov r5, r3 -; CHECK-NEXT: vmov r3, s28 -; CHECK-NEXT: vldr s24, .LCPI50_2 -; CHECK-NEXT: vldr s20, .LCPI50_3 -; CHECK-NEXT: vcvtt.f32.f16 s19, s19 -; CHECK-NEXT: vcmp.f32 s30, s24 -; CHECK-NEXT: vcvtb.f32.f16 s22, s16 +; CHECK-NEXT: vldr s18, .LCPI50_3 +; CHECK-NEXT: mov r7, r3 +; CHECK-NEXT: vcmp.f32 s21, s18 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vcmp.f32 s21, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s30 +; CHECK-NEXT: vcmp.f32 s21, s21 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s24 +; CHECK-NEXT: vcmp.f32 s21, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 -; CHECK-NEXT: str.w r2, [r4, #83] +; CHECK-NEXT: vcmp.f32 s21, s20 +; CHECK-NEXT: str.w r2, [r9, #83] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s30 +; 
CHECK-NEXT: vcmp.f32 s21, s21 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s24 +; CHECK-NEXT: vcmp.f32 s21, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r1, [r4, #79] +; CHECK-NEXT: str.w r1, [r9, #79] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s30, s20 -; CHECK-NEXT: vcvtb.f32.f16 s26, s17 +; CHECK-NEXT: vcmp.f32 s21, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcmp.f32 s30, s30 +; CHECK-NEXT: vcmp.f32 s21, s21 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: str.w r0, [r4, #75] -; CHECK-NEXT: vmov r9, s19 -; CHECK-NEXT: vmov r8, s22 -; CHECK-NEXT: mov r0, r3 -; CHECK-NEXT: vmov r6, s26 +; CHECK-NEXT: str.w r0, [r9, #75] +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcmp.f32 s28, s24 -; CHECK-NEXT: mov r7, r3 +; CHECK-NEXT: vcmp.f32 s30, s18 +; CHECK-NEXT: mov r5, r3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s20 +; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s28 +; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s24 +; CHECK-NEXT: vcmp.f32 s30, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s20 -; CHECK-NEXT: str.w r2, [r4, #58] +; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: str.w r2, [r9, #58] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s28 +; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s24 +; CHECK-NEXT: vcmp.f32 s30, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r1, [r4, #54] +; CHECK-NEXT: str.w r1, [r9, #54] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s28, s20 +; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcmp.f32 s28, s28 +; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: str.w r0, [r4, #50] +; CHECK-NEXT: str.w r0, [r9, #50] ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcmp.f32 s26, s24 -; CHECK-NEXT: mov r10, r3 +; CHECK-NEXT: vcmp.f32 s28, s18 +; CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, s26 +; CHECK-NEXT: vcmp.f32 s28, s28 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, s24 +; CHECK-NEXT: vcmp.f32 s28, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, s20 -; CHECK-NEXT: str.w r2, [r4, #33] +; CHECK-NEXT: vcmp.f32 s28, s20 +; CHECK-NEXT: str.w r2, [r9, #33] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, s26 +; CHECK-NEXT: vcmp.f32 s28, s28 ; CHECK-NEXT: it gt ; 
CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, s24 +; CHECK-NEXT: vcmp.f32 s28, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r1, [r4, #29] +; CHECK-NEXT: str.w r1, [r9, #29] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcmp.f32 s26, s26 +; CHECK-NEXT: vcmp.f32 s28, s28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: str.w r0, [r4, #25] -; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: str.w r0, [r9, #25] +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcmp.f32 s22, s24 -; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: vcmp.f32 s26, s18 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, s22 +; CHECK-NEXT: vcmp.f32 s26, s26 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, s24 +; CHECK-NEXT: vcmp.f32 s26, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, s20 -; CHECK-NEXT: str r2, [r4, #8] +; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: str.w r2, [r9, #8] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, s22 +; CHECK-NEXT: vcmp.f32 s26, s26 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, s24 +; CHECK-NEXT: vcmp.f32 s26, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str r1, [r4, #4] +; CHECK-NEXT: str.w r1, [r9, #4] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcmp.f32 s22, s22 +; CHECK-NEXT: vcmp.f32 s26, s26 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: str r0, [r4] -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: str.w r0, [r9] +; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcmp.f32 s19, s24 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s24 -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: vcmp.f32 s24, s18 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 +; CHECK-NEXT: vcmp.f32 s24, s20 +; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 +; CHECK-NEXT: vcmp.f32 s24, s24 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: movgt.w r6, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s24 +; CHECK-NEXT: vcmp.f32 s21, s18 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r2, #0 -; CHECK-NEXT: lsrs r6, r1, #28 +; CHECK-NEXT: movvs r6, #0 ; 
CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 -; CHECK-NEXT: orr.w r6, r6, r2, lsl #4 -; CHECK-NEXT: str.w r6, [r4, #95] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: mvnlt r7, #7 +; CHECK-NEXT: vcmp.f32 s21, s20 +; CHECK-NEXT: mov r11, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: movgt r7, #7 +; CHECK-NEXT: vcmp.f32 s21, s21 +; CHECK-NEXT: mov r10, r2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: lsrs r6, r0, #28 -; CHECK-NEXT: orr.w r1, r6, r1, lsl #4 -; CHECK-NEXT: str.w r1, [r4, #91] +; CHECK-NEXT: movvs r7, #0 +; CHECK-NEXT: and r0, r7, #15 +; CHECK-NEXT: orr.w r1, r0, r6, lsl #4 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: str.w r1, [r9, #87] +; CHECK-NEXT: bl __fixsfti +; CHECK-NEXT: vcmp.f32 s22, s18 +; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: str r2, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r3, #7 -; CHECK-NEXT: vcmp.f32 s19, s20 +; CHECK-NEXT: movlt.w r8, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 +; CHECK-NEXT: vcmp.f32 s22, s22 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #7 -; CHECK-NEXT: lsrs r1, r2, #28 -; CHECK-NEXT: vcvtt.f32.f16 s19, s18 +; CHECK-NEXT: movgt.w r8, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s30, s18 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: orr.w r2, r1, r3, lsl #4 -; CHECK-NEXT: vmov r1, s19 -; CHECK-NEXT: strb.w r2, [r4, #99] -; CHECK-NEXT: vcmp.f32 s30, s24 +; CHECK-NEXT: movvs.w r8, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: mvnlt r5, #7 ; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt r5, #7 -; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r5, #0 -; CHECK-NEXT: and r2, r5, #15 -; CHECK-NEXT: orr.w r0, r2, r0, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #87] -; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: and r0, r5, #15 +; CHECK-NEXT: orr.w r0, r0, r8, lsl #4 +; CHECK-NEXT: vcvtt.f32.f16 s30, s17 +; CHECK-NEXT: str.w r0, [r9, #62] +; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: mov r7, r3 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcmp.f32 s19, s24 -; CHECK-NEXT: vcvtt.f32.f16 s18, s17 +; CHECK-NEXT: vcmp.f32 s30, s18 +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 +; CHECK-NEXT: str r2, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vcvtt.f32.f16 s16, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: movvs r0, #0 +; CHECK-NEXT: vcmp.f32 s28, s18 +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; 
CHECK-NEXT: mvnlt r0, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: movgt r0, #7 +; CHECK-NEXT: vcmp.f32 s28, s28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r6, r1, #28 -; CHECK-NEXT: vcmp.f32 s19, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r2, #0 -; CHECK-NEXT: orr.w r6, r6, r2, lsl #4 +; CHECK-NEXT: movvs r0, #0 +; CHECK-NEXT: and r0, r0, #15 +; CHECK-NEXT: orr.w r0, r0, r1, lsl #4 +; CHECK-NEXT: str.w r0, [r9, #37] +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: bl __fixsfti +; CHECK-NEXT: vcmp.f32 s16, s18 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r6, [r4, #70] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s19, s20 +; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcmp.f32 s19, s19 -; CHECK-NEXT: lsrs r2, r2, #28 +; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: lsrs r6, r0, #28 -; CHECK-NEXT: orr.w r1, r6, r1, lsl #4 -; CHECK-NEXT: str.w r1, [r4, #66] -; CHECK-NEXT: vmov r1, s18 -; CHECK-NEXT: vcmp.f32 s19, s24 -; CHECK-NEXT: vcvtt.f32.f16 s16, s16 +; CHECK-NEXT: vcmp.f32 s26, s18 +; CHECK-NEXT: ldr r5, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r3, #7 +; CHECK-NEXT: mvnlt r5, #7 +; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #7 +; CHECK-NEXT: movgt r5, #7 +; CHECK-NEXT: vcmp.f32 s26, s26 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: orr.w r2, r2, r3, lsl #4 +; CHECK-NEXT: movvs r5, #0 +; CHECK-NEXT: vcmp.f32 s24, s18 +; CHECK-NEXT: and r5, r5, #15 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s24, s20 +; CHECK-NEXT: orr.w r5, r5, r0, lsl #4 +; CHECK-NEXT: str.w r5, [r9, #12] +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt.w r11, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: b.w .LBB50_3 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI50_2: -; CHECK-NEXT: .long 0xf1000000 @ float -6.338253E+29 +; CHECK-NEXT: .long 0x70ffffff @ float 6.33825262E+29 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: .LCPI50_3: -; CHECK-NEXT: .long 0x70ffffff @ float 6.33825262E+29 +; CHECK-NEXT: .long 0xf1000000 @ float -6.338253E+29 ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: .LBB50_3: -; CHECK-NEXT: strb.w r2, [r4, #74] -; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r7, #7 -; CHECK-NEXT: vcmp.f32 s28, s20 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s24, s24 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r7, #7 -; CHECK-NEXT: vcmp.f32 s28, s28 +; CHECK-NEXT: movgt.w r11, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s24, s18 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r7, #0 -; CHECK-NEXT: and r2, r7, #15 -; CHECK-NEXT: orr.w r0, r2, r0, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #62] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcmp.f32 s18, s24 +; CHECK-NEXT: movvs.w r11, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt.w r10, #0 +; CHECK-NEXT: vcmp.f32 s24, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s18 +; CHECK-NEXT: lsrl r6, 
r11, #28 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r10, #-1 +; CHECK-NEXT: vcmp.f32 s24, s24 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: movvs.w r10, #0 +; CHECK-NEXT: orr.w r5, r11, r10, lsl #4 +; CHECK-NEXT: str.w r5, [r9, #95] +; CHECK-NEXT: str.w r6, [r9, #91] +; CHECK-NEXT: vcmp.f32 s24, s18 +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: mvnlt r6, #7 +; CHECK-NEXT: vcmp.f32 s24, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s18 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: movgt r6, #7 +; CHECK-NEXT: vcmp.f32 s24, s24 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r7, r1, #28 -; CHECK-NEXT: vcmp.f32 s18, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r2, #0 -; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 +; CHECK-NEXT: movvs r6, #0 +; CHECK-NEXT: and r5, r6, #15 +; CHECK-NEXT: vcmp.f32 s22, s18 +; CHECK-NEXT: lsrl r10, r5, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r7, [r4, #45] +; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: strb.w r10, [r9, #99] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: mvnlt r7, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s22, s22 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcmp.f32 s18, s18 -; CHECK-NEXT: lsrs r2, r2, #28 +; CHECK-NEXT: movgt r7, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s22, s18 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: lsrs r7, r0, #28 -; CHECK-NEXT: vcmp.f32 s18, s24 -; CHECK-NEXT: orr.w r7, r7, r1, lsl #4 -; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: movvs r7, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: str.w r7, [r4, #41] ; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r3, #7 +; CHECK-NEXT: movlt r4, #0 +; CHECK-NEXT: vcmp.f32 s22, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s18 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #7 +; CHECK-NEXT: movgt.w r4, #-1 +; CHECK-NEXT: vcmp.f32 s22, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: orr.w r2, r2, r3, lsl #4 +; CHECK-NEXT: movvs r4, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r7 +; CHECK-NEXT: vcmp.f32 s22, s18 +; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: vmov r5, s1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb.w r2, [r4, #49] ; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r10, #7 -; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: movlt r4, #0 +; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: lsrl r8, r5, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r10, #7 -; CHECK-NEXT: vcmp.f32 s26, s26 +; CHECK-NEXT: movgt.w r4, #-1 +; CHECK-NEXT: vcmp.f32 s22, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w r10, #0 -; CHECK-NEXT: and r2, r10, #15 -; CHECK-NEXT: orr.w r0, r2, r0, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #37] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcmp.f32 s16, s24 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 +; CHECK-NEXT: movvs r4, #0 +; CHECK-NEXT: orr.w r6, r5, r4, lsl #4 +; CHECK-NEXT: and r5, r7, #15 +; CHECK-NEXT: lsrl r4, r5, #28 +; CHECK-NEXT: str.w r6, [r9, 
#70] +; CHECK-NEXT: str.w r8, [r9, #66] +; CHECK-NEXT: vcmp.f32 s30, s18 +; CHECK-NEXT: strb.w r4, [r9, #74] +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: mvnlt r4, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt r4, #7 +; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: movvs r4, #0 +; CHECK-NEXT: vcmp.f32 s30, s18 +; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: movgt.w r7, #-1 +; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r2, #0 -; CHECK-NEXT: lsrs r7, r1, #28 +; CHECK-NEXT: movvs r7, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r7, r4 +; CHECK-NEXT: vcmp.f32 s30, s18 +; CHECK-NEXT: ldr.w r12, [sp] @ 4-byte Reload +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 -; CHECK-NEXT: str r7, [r4, #20] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: movlt r6, #0 +; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s16 +; CHECK-NEXT: lsrl r12, r5, #28 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: movgt.w r6, #-1 +; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r0, #0 +; CHECK-NEXT: movvs r6, #0 +; CHECK-NEXT: orr.w r7, r5, r6, lsl #4 +; CHECK-NEXT: and r5, r4, #15 +; CHECK-NEXT: vcmp.f32 s16, s18 +; CHECK-NEXT: lsrl r6, r5, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: lsr.w r7, r0, #28 -; CHECK-NEXT: orr.w r1, r7, r1, lsl #4 -; CHECK-NEXT: str r1, [r4, #16] +; CHECK-NEXT: str.w r7, [r9, #45] +; CHECK-NEXT: str.w r12, [r9, #41] +; CHECK-NEXT: strb.w r6, [r9, #49] ; CHECK-NEXT: it lt ; CHECK-NEXT: mvnlt r3, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -5904,28 +5907,44 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt r3, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r1, r2, #28 -; CHECK-NEXT: vcmp.f32 s22, s24 +; CHECK-NEXT: vcmp.f32 s16, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: orr.w r1, r1, r3, lsl #4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb r1, [r4, #24] ; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r8, #7 -; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r8, #7 -; CHECK-NEXT: vcmp.f32 s22, s22 +; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w r8, #0 -; CHECK-NEXT: and r1, r8, #15 -; CHECK-NEXT: orr.w r0, r1, r0, lsl #4 -; CHECK-NEXT: str r0, [r4, #12] +; CHECK-NEXT: movvs r1, #0 +; 
CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vcmp.f32 s16, s18 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: vcmp.f32 s16, s20 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: lsrl r0, r1, #28 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: vcmp.f32 s16, s16 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: it vs +; CHECK-NEXT: movvs r2, #0 +; CHECK-NEXT: orr.w r1, r1, r2, lsl #4 +; CHECK-NEXT: strd r0, r1, [r9, #16] +; CHECK-NEXT: and r1, r3, #15 +; CHECK-NEXT: lsrl r2, r1, #28 +; CHECK-NEXT: strb.w r2, [r9, #24] +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: @ %bb.4: %x = call <8 x i100> @llvm.fptosi.sat.v8f16.v8i100(<8 x half> %f) ret <8 x i100> %x diff --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll index 8ea12bd..2b6d0da 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll @@ -1506,110 +1506,110 @@ define arm_aapcs_vfpcc <4 x i50> @test_unsigned_v4f32_v4i50(<4 x float> %f) { ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vldr s20, .LCPI28_0 -; CHECK-NEXT: vmov r4, s17 -; CHECK-NEXT: vmov r6, s19 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vcmp.f32 s16, #0 -; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r9, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: vmov r4, s19 +; CHECK-NEXT: vldr s20, .LCPI28_0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: vcmp.f32 s17, #0 -; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r10, #0 -; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: movlt r5, #0 +; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r10, #65535 -; CHECK-NEXT: movtgt r10, #3 +; CHECK-NEXT: movwgt r5, #65535 +; CHECK-NEXT: movtgt r5, #3 ; CHECK-NEXT: bl __aeabi_f2ulz +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: vmov r0, s17 ; CHECK-NEXT: vcmp.f32 s19, #0 -; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: mov r7, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r6, #0 +; CHECK-NEXT: movlt r7, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r6, #65535 -; CHECK-NEXT: movtgt r6, #3 +; CHECK-NEXT: movwgt r7, #65535 +; CHECK-NEXT: movtgt r7, #3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r7, #-1 -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: movgt.w r6, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s20 -; CHECK-NEXT: str.w r7, [r8] +; CHECK-NEXT: str.w r6, [r8] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r4, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsl.w r0, r6, #22 -; CHECK-NEXT: vcmp.f32 s17, #0 +; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: mov r1, r7 ; 
CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r4, #-1 -; CHECK-NEXT: orr.w r0, r0, r4, lsr #10 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r0, [r8, #20] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r9, #0 -; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: movlt.w r10, #0 +; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: bfc r1, #18, #14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r9, #-1 -; CHECK-NEXT: lsr.w r0, r9, #14 -; CHECK-NEXT: orr.w r1, r0, r10, lsl #18 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: str.w r1, [r8, #8] -; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: lsrs r2, r6, #10 +; CHECK-NEXT: movgt.w r10, #-1 +; CHECK-NEXT: vcmp.f32 s16, #0 +; CHECK-NEXT: bfc r5, #18, #14 +; CHECK-NEXT: mov r6, r10 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: lsll r4, r1, #22 +; CHECK-NEXT: lsrl r6, r5, #28 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt.w r9, #0 +; CHECK-NEXT: vcmp.f32 s16, s20 +; CHECK-NEXT: orrs r1, r5 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r1, #65535 -; CHECK-NEXT: movtgt r1, #3 +; CHECK-NEXT: movwgt r9, #65535 +; CHECK-NEXT: movtgt r9, #3 +; CHECK-NEXT: str.w r1, [r8, #20] +; CHECK-NEXT: bl __aeabi_f2ulz +; CHECK-NEXT: vcmp.f32 s17, #0 +; CHECK-NEXT: orr.w r2, r6, r4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #0 +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: bfc r9, #18, #14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, #0 ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r5, #65535 -; CHECK-NEXT: movtgt r5, #3 +; CHECK-NEXT: movwgt r1, #65535 +; CHECK-NEXT: movtgt r1, #3 +; CHECK-NEXT: str.w r2, [r8, #16] +; CHECK-NEXT: lsrs r2, r7, #10 +; CHECK-NEXT: vcmp.f32 s17, #0 ; CHECK-NEXT: strb.w r2, [r8, #24] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: bfc r1, #18, #14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: ubfx r2, r10, #14, #4 -; CHECK-NEXT: bfc r1, #18, #14 -; CHECK-NEXT: orr.w r2, r2, r0, lsl #4 -; CHECK-NEXT: lsrs r0, r0, #28 -; CHECK-NEXT: orr.w r0, r0, r1, lsl #4 -; CHECK-NEXT: bfc r5, #18, #14 -; CHECK-NEXT: str.w r2, [r8, #12] -; CHECK-NEXT: orr.w r2, r5, r9, lsl #18 -; CHECK-NEXT: str.w r2, [r8, #4] -; CHECK-NEXT: orr.w r0, r0, r4, lsl #22 -; CHECK-NEXT: str.w r0, [r8, #16] +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: orr.w r0, r9, r0, lsl #18 +; CHECK-NEXT: lsrl r2, r1, #14 +; CHECK-NEXT: orr.w r1, r1, r10, lsl #4 +; CHECK-NEXT: strd r2, r1, [r8, #8] +; CHECK-NEXT: str.w r0, [r8, #4] ; CHECK-NEXT: vpop {d8, d9, d10} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} ; CHECK-NEXT: .p2align 2 @@ -1729,32 +1729,31 @@ define arm_aapcs_vfpcc <4 x i64> @test_unsigned_v4f32_v4i64(<4 x float> %f) { define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) { ; CHECK-LABEL: test_unsigned_v4f32_v4i100: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10} ; CHECK-NEXT: vpush {d8, d9, d10} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: mov r4, 
r0 +; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vldr s20, .LCPI30_0 -; CHECK-NEXT: vmov r5, s16 -; CHECK-NEXT: vmov r7, s19 ; CHECK-NEXT: bl __fixunssfti +; CHECK-NEXT: mov r9, r3 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vldr s20, .LCPI30_0 ; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: mov r6, r3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s18, #0 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: str.w r2, [r4, #33] +; CHECK-NEXT: str.w r2, [r8, #33] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -1762,18 +1761,20 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r1, [r4, #29] +; CHECK-NEXT: str.w r1, [r8, #29] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: str.w r0, [r4, #25] -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: str.w r0, [r8, #25] +; CHECK-NEXT: vmov r7, s17 +; CHECK-NEXT: vmov r4, s19 +; CHECK-NEXT: mov r0, r3 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: vcmp.f32 s16, #0 -; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: mov r10, r3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt @@ -1784,7 +1785,7 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: str r2, [r4, #8] +; CHECK-NEXT: str.w r2, [r8, #8] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -1792,126 +1793,126 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str r1, [r4, #4] +; CHECK-NEXT: str.w r1, [r8, #4] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: str r0, [r4] -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: str.w r0, [r8] +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: vcmp.f32 s19, #0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt r4, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, #0 +; CHECK-NEXT: vcmp.f32 s18, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r4, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: movlt.w r9, #0 +; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r7, r1, #28 -; CHECK-NEXT: vcmp.f32 s19, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 -; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 +; CHECK-NEXT: movgt.w r9, #15 +; CHECK-NEXT: and r0, r9, #15 +; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: orr.w r0, r0, r4, lsl #4 +; CHECK-NEXT: str.w r0, [r8, #37] +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r11, r3 +; CHECK-NEXT: bl __fixunssfti 
+; CHECK-NEXT: vcmp.f32 s17, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r7, [r4, #45] +; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: lsrs r7, r0, #28 -; CHECK-NEXT: vcmp.f32 s19, #0 -; CHECK-NEXT: orr.w r7, r7, r1, lsl #4 -; CHECK-NEXT: vmov r1, s17 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 -; CHECK-NEXT: str.w r7, [r4, #41] +; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #0 +; CHECK-NEXT: movlt.w r10, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r2, r2, #28 -; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: vcmp.f32 s19, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #15 -; CHECK-NEXT: orr.w r2, r2, r3, lsl #4 +; CHECK-NEXT: movgt.w r10, #15 +; CHECK-NEXT: and r7, r10, #15 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb.w r2, [r4, #49] +; CHECK-NEXT: vcmp.f32 s19, s20 +; CHECK-NEXT: orr.w r7, r7, r0, lsl #4 +; CHECK-NEXT: str.w r7, [r8, #12] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r6, #0 -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: movlt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s19, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r6, #15 -; CHECK-NEXT: and r2, r6, #15 -; CHECK-NEXT: orr.w r0, r2, r0, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #37] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: vcmp.f32 s17, #0 +; CHECK-NEXT: movgt.w r5, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, #0 +; CHECK-NEXT: lsrl r4, r5, #28 +; CHECK-NEXT: vcmp.f32 s19, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r6, #-1 +; CHECK-NEXT: orr.w r7, r5, r6, lsl #4 +; CHECK-NEXT: str.w r7, [r8, #45] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: str.w r4, [r8, #41] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: movlt.w r11, #0 +; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 -; CHECK-NEXT: lsrs r7, r1, #28 +; CHECK-NEXT: movgt.w r11, #15 +; CHECK-NEXT: and r5, r11, #15 +; CHECK-NEXT: vcmp.f32 s17, #0 +; CHECK-NEXT: lsrl r6, r5, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, s20 -; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 -; CHECK-NEXT: str r7, [r4, #20] +; CHECK-NEXT: strb.w r6, [r8, #49] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: movlt r3, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: lsrs r7, r0, #28 -; CHECK-NEXT: orr.w r1, r7, r1, lsl #4 +; CHECK-NEXT: movgt r3, #15 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s20 -; CHECK-NEXT: str r1, [r4, #16] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #0 +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r1, r2, #28 -; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #15 -; CHECK-NEXT: orr.w r1, r1, r3, lsl #4 +; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vcmp.f32 s17, #0 +; CHECK-NEXT: 
vmov r1, s1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb r1, [r4, #24] +; CHECK-NEXT: lsrl r0, r1, #28 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #0 -; CHECK-NEXT: vcmp.f32 s16, s20 +; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r5, #15 -; CHECK-NEXT: and r1, r5, #15 -; CHECK-NEXT: orr.w r0, r1, r0, lsl #4 -; CHECK-NEXT: str r0, [r4, #12] +; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: orr.w r1, r1, r2, lsl #4 +; CHECK-NEXT: strd r0, r1, [r8, #16] +; CHECK-NEXT: and r1, r3, #15 +; CHECK-NEXT: lsrl r2, r1, #28 +; CHECK-NEXT: strb.w r2, [r8, #24] ; CHECK-NEXT: vpop {d8, d9, d10} ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI30_0: @@ -2923,195 +2924,196 @@ define arm_aapcs_vfpcc <2 x i100> @test_unsigned_v2f64_v2i100(<2 x double> %f) { ; CHECK-NEXT: sub sp, #48 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI40_0 -; CHECK-NEXT: vmov r9, r5, d8 -; CHECK-NEXT: str r0, [sp, #44] @ 4-byte Spill -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: vmov r6, r5, d8 +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: vmov r2, r9, d0 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r7, r2 -; CHECK-NEXT: mov r6, r3 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: mov r10, r2 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: vldr d0, .LCPI40_1 -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: str r2, [sp, #40] @ 4-byte Spill -; CHECK-NEXT: mov r10, r3 +; CHECK-NEXT: vmov r11, r3, d0 +; CHECK-NEXT: str r3, [sp, #44] @ 4-byte Spill +; CHECK-NEXT: mov r2, r11 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: str r5, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: bl __fixunsdfti -; CHECK-NEXT: cmp.w r8, #0 -; CHECK-NEXT: strd r1, r0, [sp, #8] @ 8-byte Folded Spill -; CHECK-NEXT: csel r0, r2, r8, ne -; CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: cmp.w r11, #0 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: strd r1, r0, [sp, #20] @ 8-byte Folded Spill +; CHECK-NEXT: csel r0, r2, r4, ne +; CHECK-NEXT: str r3, [sp, #40] @ 4-byte Spill +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: ldr r4, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: str.w r0, [r8, #8] +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: mov r11, r7 -; CHECK-NEXT: str r0, [r4, #8] -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: str r5, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: str r7, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: str r6, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r7, [sp, #40] @ 4-byte Reload -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: ldr r3, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: str.w r9, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r7, r6 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte 
Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: csel r0, r1, r0, ne -; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: str r0, [r4, #4] -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: str.w r0, [r8, #4] +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r6, r8 +; CHECK-NEXT: strd r8, r7, [sp, #28] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: ldr r7, [sp, #44] @ 4-byte Reload ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: mov r9, r7 -; CHECK-NEXT: str.w r10, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r5, r11 +; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: vmov r8, r11, d9 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: csel r0, r1, r0, ne -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: str r0, [r4] -; CHECK-NEXT: ldr r5, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: str r0, [r6] +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: mov r6, r5 +; CHECK-NEXT: str r5, [sp] @ 4-byte Spill +; CHECK-NEXT: mov r5, r7 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: bl __fixunsdfti -; CHECK-NEXT: cmp.w r10, #0 -; CHECK-NEXT: strd r2, r0, [sp, #4] @ 8-byte Folded Spill -; CHECK-NEXT: csel r7, r1, r10, ne -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: strd r0, r2, [sp, #20] @ 8-byte Folded Spill +; CHECK-NEXT: csel r0, r3, r7, ne +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r7, #-1 -; CHECK-NEXT: mov r4, r6 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr.w r10, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: movne r0, #15 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: csel r9, r1, r0, ne -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: csel r0, r1, r0, ne +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r9, #-1 -; CHECK-NEXT: ldr r6, [sp, #44] @ 4-byte Reload -; 
CHECK-NEXT: lsrs r0, r7, #28 +; CHECK-NEXT: movne.w r0, #-1 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r7 +; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: orr.w r0, r0, r9, lsl #4 -; CHECK-NEXT: str r0, [r6, #20] +; CHECK-NEXT: mov r6, r10 +; CHECK-NEXT: str.w r10, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str.w r9, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #40] @ 4-byte Reload -; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: mov r5, r10 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r2, r7 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: csel r4, r1, r0, ne -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cmp.w r10, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r4, #-1 -; CHECK-NEXT: lsrs r0, r4, #28 -; CHECK-NEXT: orr.w r0, r0, r7, lsl #4 -; CHECK-NEXT: str r0, [r6, #16] -; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r10, r4 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: ldr.w r10, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: lsrl r10, r5, #28 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: ldr.w r11, [sp, #40] @ 4-byte Reload -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: ldr.w r11, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r9, r7 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: csel r0, r1, r0, ne -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r0, #15 -; CHECK-NEXT: lsr.w r1, r9, #28 -; CHECK-NEXT: ldr.w r9, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: orr.w r0, r1, r0, lsl #4 -; CHECK-NEXT: strb.w r0, [r9, #24] -; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r5, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: movne.w r0, #-1 +; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: orr.w r1, r5, r0, lsl #4 +; CHECK-NEXT: strd r10, r1, [r2, #16] +; CHECK-NEXT: mov r8, r2 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: and r1, r1, #15 +; CHECK-NEXT: lsrl r0, r1, #28 +; CHECK-NEXT: strb r0, [r2, #24] +; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: ldrd r3, r2, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r3, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: bl 
__aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #40] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csel r0, r1, r0, ne -; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: cmp.w r10, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #15 ; CHECK-NEXT: and r0, r0, #15 ; CHECK-NEXT: orr.w r0, r0, r4, lsl #4 -; CHECK-NEXT: str.w r0, [r9, #12] +; CHECK-NEXT: str.w r0, [r8, #12] ; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -3639,73 +3641,90 @@ define arm_aapcs_vfpcc <8 x i16> @test_unsigned_v8f16_v8i16(<8 x half> %f) { define arm_aapcs_vfpcc <8 x i19> @test_unsigned_v8f16_v8i19(<8 x half> %f) { ; CHECK-LABEL: test_unsigned_v8f16_v8i19: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldr s6, .LCPI46_1 -; CHECK-NEXT: vcvtb.f32.f16 s8, s0 -; CHECK-NEXT: vcvtb.f32.f16 s12, s2 -; CHECK-NEXT: vcvtb.f32.f16 s10, s1 -; CHECK-NEXT: vcvtt.f32.f16 s14, s1 -; CHECK-NEXT: vcvtb.f32.f16 s1, s3 -; CHECK-NEXT: vcvtt.f32.f16 s0, s0 -; CHECK-NEXT: vcvtt.f32.f16 s2, s2 +; CHECK-NEXT: .save {r4, r5, r6, r7, r9, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r9, r11, lr} ; CHECK-NEXT: vldr s4, .LCPI46_0 -; CHECK-NEXT: vcvtt.f32.f16 s3, s3 -; CHECK-NEXT: vmaxnm.f32 s8, s8, s6 -; CHECK-NEXT: vmaxnm.f32 s10, s10, s6 -; CHECK-NEXT: vmaxnm.f32 s0, s0, s6 -; CHECK-NEXT: vmaxnm.f32 s12, s12, s6 -; CHECK-NEXT: vmaxnm.f32 s14, s14, s6 -; CHECK-NEXT: vmaxnm.f32 s2, s2, s6 -; CHECK-NEXT: vmaxnm.f32 s1, s1, s6 -; CHECK-NEXT: vmaxnm.f32 s6, s3, s6 -; CHECK-NEXT: vminnm.f32 s8, s8, s4 -; CHECK-NEXT: vminnm.f32 s10, s10, s4 -; CHECK-NEXT: vminnm.f32 s0, s0, s4 -; CHECK-NEXT: vminnm.f32 s12, s12, s4 -; CHECK-NEXT: vminnm.f32 s14, s14, s4 -; CHECK-NEXT: vminnm.f32 s2, s2, s4 -; CHECK-NEXT: vminnm.f32 s1, s1, s4 -; CHECK-NEXT: vminnm.f32 s4, s6, s4 -; CHECK-NEXT: vcvt.u32.f32 s1, s1 -; CHECK-NEXT: vcvt.u32.f32 s4, s4 -; CHECK-NEXT: vcvt.u32.f32 s2, s2 +; CHECK-NEXT: vcvtb.f32.f16 s14, s1 +; CHECK-NEXT: vldr s6, .LCPI46_1 +; CHECK-NEXT: vcvtt.f32.f16 s12, s1 +; CHECK-NEXT: vmaxnm.f32 s14, s14, s4 +; CHECK-NEXT: vmaxnm.f32 s12, s12, s4 +; CHECK-NEXT: vminnm.f32 s14, s14, s6 +; CHECK-NEXT: vminnm.f32 s12, s12, s6 ; CHECK-NEXT: vcvt.u32.f32 s14, s14 +; CHECK-NEXT: vcvtb.f32.f16 s10, s0 ; CHECK-NEXT: vcvt.u32.f32 s12, s12 +; CHECK-NEXT: vcvtt.f32.f16 s0, s0 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NEXT: vmaxnm.f32 s10, s10, s4 +; CHECK-NEXT: vminnm.f32 s0, s0, s6 +; CHECK-NEXT: vminnm.f32 s10, s10, s6 ; CHECK-NEXT: vcvt.u32.f32 s0, s0 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vcvt.u32.f32 s10, s10 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vcvt.u32.f32 s8, s8 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: lsrs r2, r1, #14 -; CHECK-NEXT: orr.w r12, r2, r3, lsl #5 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: strh.w r12, [r0, #16] -; CHECK-NEXT: lsrs r2, r3, #1 -; CHECK-NEXT: orr.w lr, r2, r1, lsl #18 +; CHECK-NEXT: vcvtt.f32.f16 s8, s2 +; CHECK-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-NEXT: vmaxnm.f32 s8, s8, s4 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: lsrs r1, r2, #7 -; CHECK-NEXT: orr.w r1, r1, r4, lsl #12 -; CHECK-NEXT: orr.w r1, r1, r3, lsl #31 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: lsrs r4, r3, #13 -; CHECK-NEXT: orr.w r4, r4, r5, lsl #6 -; CHECK-NEXT: orr.w r2, r4, r2, lsl #25 +; CHECK-NEXT: vmaxnm.f32 s2, s2, s4 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vminnm.f32 s2, s2, s6 +; CHECK-NEXT: vcvt.u32.f32 s2, s2 +; CHECK-NEXT: vminnm.f32 s8, s8, s6 
+; CHECK-NEXT: vcvt.u32.f32 s8, s8 +; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s3 +; CHECK-NEXT: lsll r12, r1, #19 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NEXT: vminnm.f32 s0, s0, s6 +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: vcvt.u32.f32 s0, s0 +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: orr.w r1, r1, r2, lsl #6 +; CHECK-NEXT: lsrl r2, r5, #26 +; CHECK-NEXT: orr.w r1, r1, r4, lsl #25 +; CHECK-NEXT: str r1, [r0, #4] +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: lsrl r4, r11, #7 +; CHECK-NEXT: orr.w r1, r1, r12 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: orr.w r1, r2, r4 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: lsll r2, r7, #12 ; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: orr.w r3, r4, r3, lsl #19 -; CHECK-NEXT: strd r3, r2, [r0] -; CHECK-NEXT: strd r1, lr, [r0, #8] -; CHECK-NEXT: lsr.w r1, r12, #16 -; CHECK-NEXT: strb r1, [r0, #18] -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: orrs r2, r1 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: lsll r4, r1, #31 +; CHECK-NEXT: orr.w r12, r2, r4 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vcvtb.f32.f16 s0, s3 +; CHECK-NEXT: lsll r4, r3, #5 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NEXT: vminnm.f32 s0, s0, s6 +; CHECK-NEXT: vcvt.u32.f32 s0, s0 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: lsrl r6, r9, #14 +; CHECK-NEXT: orr.w r3, r6, r4 +; CHECK-NEXT: strh r3, [r0, #16] +; CHECK-NEXT: str.w r12, [r0, #8] +; CHECK-NEXT: lsrs r3, r3, #16 +; CHECK-NEXT: strb r3, [r0, #18] +; CHECK-NEXT: orr.w r3, r5, r11 +; CHECK-NEXT: orrs r3, r7 +; CHECK-NEXT: orrs r1, r3 +; CHECK-NEXT: orr.w r1, r1, r2, lsl #18 +; CHECK-NEXT: str r1, [r0, #12] +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r9, r11, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI46_0: -; CHECK-NEXT: .long 0x48ffffe0 @ float 524287 -; CHECK-NEXT: .LCPI46_1: ; CHECK-NEXT: .long 0x00000000 @ float 0 +; CHECK-NEXT: .LCPI46_1: +; CHECK-NEXT: .long 0x48ffffe0 @ float 524287 %x = call <8 x i19> @llvm.fptoui.sat.v8f16.v8i19(<8 x half> %f) ret <8 x i19> %x } @@ -3749,46 +3768,46 @@ define arm_aapcs_vfpcc <8 x i50> @test_unsigned_v8f16_v8i50(<8 x half> %f) { ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: vcvtb.f32.f16 s24, s18 ; CHECK-NEXT: vmov r0, s24 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: vcvtt.f32.f16 s26, s19 +; CHECK-NEXT: vcvtt.f32.f16 s28, s19 ; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmov r0, s28 ; CHECK-NEXT: vcvtb.f32.f16 s22, s16 -; CHECK-NEXT: vcvtt.f32.f16 s18, s18 +; CHECK-NEXT: vcvtb.f32.f16 s26, s19 ; CHECK-NEXT: vcmp.f32 s24, #0 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: mov r9, r1 ; CHECK-NEXT: vmov r5, s22 ; CHECK-NEXT: vldr s20, .LCPI48_0 -; CHECK-NEXT: vmov r8, s18 +; CHECK-NEXT: vmov r11, s26 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r7, #0 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: vcmp.f32 s26, #0 -; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: vcmp.f32 s28, #0 +; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: vmrs APSR_nzcv, 
fpscr -; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r10, #0 +; CHECK-NEXT: movlt r4, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: vcmp.f32 s24, s20 ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r10, #65535 -; CHECK-NEXT: movtgt r10, #3 +; CHECK-NEXT: movwgt r4, #65535 +; CHECK-NEXT: movtgt r4, #3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r7, #-1 -; CHECK-NEXT: str.w r7, [r4, #25] +; CHECK-NEXT: str.w r7, [r10, #25] ; CHECK-NEXT: bl __aeabi_f2ulz ; CHECK-NEXT: vcmp.f32 s22, #0 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill @@ -3797,188 +3816,184 @@ define arm_aapcs_vfpcc <8 x i50> @test_unsigned_v8f16_v8i50(<8 x half> %f) { ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, #0 +; CHECK-NEXT: vcmp.f32 s28, #0 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: str r0, [r4] +; CHECK-NEXT: mov r7, r4 +; CHECK-NEXT: str.w r0, [r10] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r6, #0 -; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: vcmp.f32 s28, s20 +; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r6, #-1 -; CHECK-NEXT: lsl.w r0, r10, #22 -; CHECK-NEXT: str r6, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: orr.w r6, r0, r6, lsr #10 -; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: bfc r7, #18, #14 +; CHECK-NEXT: lsll r6, r7, #22 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: vcmp.f32 s26, #0 +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s26, #0 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r5, #-1 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: itt gt +; CHECK-NEXT: movwgt r1, #65535 +; CHECK-NEXT: movtgt r1, #3 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: bfc r1, #18, #14 +; CHECK-NEXT: vcvtt.f32.f16 s26, s18 +; CHECK-NEXT: lsrl r2, r1, #28 +; CHECK-NEXT: orr.w r0, r1, r7 +; CHECK-NEXT: str.w r0, [r10, #45] +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: orrs r6, r2 +; CHECK-NEXT: bl __aeabi_f2ulz +; CHECK-NEXT: vcmp.f32 s26, #0 ; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: vcvtb.f32.f16 s18, s17 +; CHECK-NEXT: lsrs r0, r4, #10 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r5, #65535 -; CHECK-NEXT: movtgt r5, #3 +; CHECK-NEXT: movwgt r1, #65535 +; CHECK-NEXT: movtgt r1, #3 +; CHECK-NEXT: str.w r6, [r10, #41] +; CHECK-NEXT: strb.w r0, [r10, #49] +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vcmp.f32 s26, #0 +; CHECK-NEXT: bfc r1, #18, #14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r6, [r4, #45] -; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r7, #-1 -; CHECK-NEXT: lsrs r0, r7, #14 -; CHECK-NEXT: orr.w r0, r0, r5, lsl #18 -; CHECK-NEXT: vcvtt.f32.f16 s18, s17 -; CHECK-NEXT: str.w r0, [r4, #33] -; CHECK-NEXT: vmov r0, s18 +; 
CHECK-NEXT: mov r4, r7 +; CHECK-NEXT: lsrl r4, r1, #14 +; CHECK-NEXT: orr.w r6, r1, r5, lsl #4 ; CHECK-NEXT: bl __aeabi_f2ulz +; CHECK-NEXT: vcvtt.f32.f16 s26, s17 +; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: mov r9, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r5, #0 ; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: itt gt +; CHECK-NEXT: movwgt r5, #65535 +; CHECK-NEXT: movtgt r5, #3 +; CHECK-NEXT: str.w r6, [r10, #37] +; CHECK-NEXT: str.w r4, [r10, #33] +; CHECK-NEXT: bl __aeabi_f2ulz +; CHECK-NEXT: vcmp.f32 s26, #0 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: vcmp.f32 s24, #0 +; CHECK-NEXT: itt gt +; CHECK-NEXT: movwgt r6, #65535 +; CHECK-NEXT: movtgt r6, #3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt.w r9, #0 -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: vcmp.f32 s24, s20 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r1, r0 ; CHECK-NEXT: itt gt ; CHECK-NEXT: movwgt r9, #65535 ; CHECK-NEXT: movtgt r9, #3 -; CHECK-NEXT: lsl.w r0, r9, #22 -; CHECK-NEXT: orr.w r0, r0, r1, lsr #10 +; CHECK-NEXT: bfc r9, #18, #14 ; CHECK-NEXT: vcvtt.f32.f16 s16, s16 -; CHECK-NEXT: str r0, [r4, #20] +; CHECK-NEXT: orr.w r0, r9, r7, lsl #18 +; CHECK-NEXT: str.w r0, [r10, #29] ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: vcmp.f32 s16, #0 -; CHECK-NEXT: mov r11, r1 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: vcmp.f32 s26, #0 +; CHECK-NEXT: bfc r1, #18, #14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 +; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r11, #0 +; CHECK-NEXT: movlt r4, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: vcmp.f32 s16, #0 -; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r11, #65535 -; CHECK-NEXT: movtgt r11, #3 +; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r4, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r8, #0 +; CHECK-NEXT: movlt.w r11, #0 +; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: bfc r5, #18, #14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r8, #-1 -; CHECK-NEXT: lsr.w r0, r8, #14 -; CHECK-NEXT: vcvtb.f32.f16 s16, s19 -; CHECK-NEXT: orr.w r0, r0, r11, lsl #18 -; CHECK-NEXT: str r0, [r4, #8] -; CHECK-NEXT: lsr.w r0, r10, #10 -; CHECK-NEXT: strb.w r0, [r4, #49] -; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: movgt.w r11, #-1 +; CHECK-NEXT: mov r8, r11 +; CHECK-NEXT: vcmp.f32 s22, #0 +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: lsll r4, r1, #22 +; CHECK-NEXT: lsrl r8, r5, #28 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: itt gt +; CHECK-NEXT: movwgt r7, #65535 +; CHECK-NEXT: movtgt r7, #3 +; CHECK-NEXT: orrs r1, r5 +; CHECK-NEXT: str.w r1, [r10, #20] ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vcmp.f32 s16, #0 +; CHECK-NEXT: orr.w r2, r8, r4 ; CHECK-NEXT: vmrs APSR_nzcv, 
fpscr ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r6, #0 +; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: ubfx r0, r5, #14, #4 +; CHECK-NEXT: bfc r7, #18, #14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r6, #-1 -; CHECK-NEXT: orr.w r0, r0, r6, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #37] -; CHECK-NEXT: vcmp.f32 s24, #0 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s24, s20 -; CHECK-NEXT: vcvtb.f32.f16 s18, s17 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r0, #65535 -; CHECK-NEXT: movtgt r0, #3 -; CHECK-NEXT: bfc r0, #18, #14 -; CHECK-NEXT: mov r10, r1 -; CHECK-NEXT: orr.w r0, r0, r7, lsl #18 -; CHECK-NEXT: str.w r0, [r4, #29] -; CHECK-NEXT: lsr.w r0, r9, #10 -; CHECK-NEXT: strb r0, [r4, #24] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: ubfx r2, r11, #14, #4 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: orr.w r2, r2, r0, lsl #4 -; CHECK-NEXT: str r2, [r4, #12] -; CHECK-NEXT: vcmp.f32 s22, #0 -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, s20 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r2, #65535 -; CHECK-NEXT: movtgt r2, #3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: itt gt ; CHECK-NEXT: movwgt r1, #65535 ; CHECK-NEXT: movtgt r1, #3 -; CHECK-NEXT: bfc r2, #18, #14 +; CHECK-NEXT: str.w r2, [r10, #16] +; CHECK-NEXT: lsrs r2, r6, #10 +; CHECK-NEXT: vcmp.f32 s16, #0 +; CHECK-NEXT: strb.w r2, [r10, #24] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r10, #0 +; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r10, #65535 -; CHECK-NEXT: movtgt r10, #3 -; CHECK-NEXT: orr.w r2, r2, r8, lsl #18 -; CHECK-NEXT: str r2, [r4, #4] -; CHECK-NEXT: bfc r10, #18, #14 -; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: lsrs r2, r6, #28 ; CHECK-NEXT: bfc r1, #18, #14 -; CHECK-NEXT: orr.w r2, r2, r10, lsl #4 -; CHECK-NEXT: lsrs r0, r0, #28 -; CHECK-NEXT: orr.w r2, r2, r3, lsl #22 -; CHECK-NEXT: str.w r2, [r4, #41] -; CHECK-NEXT: orr.w r0, r0, r1, lsl #4 -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: orr.w r0, r0, r1, lsl #22 -; CHECK-NEXT: str r0, [r4, #16] -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: orr.w r0, r7, r0, lsl #18 +; CHECK-NEXT: lsrl r2, r1, #14 +; CHECK-NEXT: orr.w r1, r1, r11, lsl #4 +; CHECK-NEXT: strd r2, r1, [r10, #8] +; CHECK-NEXT: str.w r0, [r10, #4] +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 2 @@ -4192,21 +4207,61 @@ define arm_aapcs_vfpcc <8 x i64> @test_unsigned_v8f16_v8i64(<8 
x half> %f) { define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-LABEL: test_unsigned_v8f16_v8i100: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vcvtb.f32.f16 s28, s19 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: vcvtb.f32.f16 s26, s18 -; CHECK-NEXT: mov r5, r3 -; CHECK-NEXT: vmov r3, s26 +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: vcvtb.f32.f16 s30, s19 +; CHECK-NEXT: vcvtb.f32.f16 s28, s18 +; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: vcvtt.f32.f16 s22, s19 +; CHECK-NEXT: vcvtb.f32.f16 s24, s16 +; CHECK-NEXT: vcvtb.f32.f16 s26, s17 ; CHECK-NEXT: vldr s20, .LCPI50_1 +; CHECK-NEXT: vmov r8, s22 +; CHECK-NEXT: vmov r5, s28 +; CHECK-NEXT: vcvtt.f32.f16 s18, s18 +; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: vmov r6, s26 +; CHECK-NEXT: bl __fixunssfti +; CHECK-NEXT: vcmp.f32 s30, #0 +; CHECK-NEXT: mov r7, r3 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s30, #0 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: str.w r2, [r9, #83] +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s30, #0 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: str.w r1, [r9, #79] +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: str.w r0, [r9, #75] +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: vcmp.f32 s28, #0 -; CHECK-NEXT: vcvtt.f32.f16 s30, s19 +; CHECK-NEXT: mov r5, r3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: it lt @@ -4217,7 +4272,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s28, s20 -; CHECK-NEXT: str.w r2, [r4, #83] +; CHECK-NEXT: str.w r2, [r9, #58] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -4225,23 +4280,18 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r1, [r4, #79] +; CHECK-NEXT: str.w r1, [r9, #54] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vcmp.f32 s28, s20 -; CHECK-NEXT: vcvtb.f32.f16 s22, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcvtb.f32.f16 s24, s17 -; CHECK-NEXT: str.w r0, [r4, #75] -; CHECK-NEXT: vmov r9, s30 -; CHECK-NEXT: vmov r8, s22 -; CHECK-NEXT: vmov r6, s24 -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: str.w r0, [r9, #50] +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: vcmp.f32 s26, #0 -; CHECK-NEXT: mov r7, r3 +; 
CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: it lt @@ -4252,7 +4302,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s26, s20 -; CHECK-NEXT: str.w r2, [r4, #58] +; CHECK-NEXT: str.w r2, [r9, #33] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -4260,18 +4310,18 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r1, [r4, #54] +; CHECK-NEXT: str.w r1, [r9, #29] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: str.w r0, [r4, #50] -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: str.w r0, [r9, #25] +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: vcmp.f32 s24, #0 -; CHECK-NEXT: mov r10, r3 +; CHECK-NEXT: mov r4, r3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s24, s20 ; CHECK-NEXT: it lt @@ -4282,7 +4332,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s24, s20 -; CHECK-NEXT: str.w r2, [r4, #33] +; CHECK-NEXT: str.w r2, [r9, #8] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -4290,227 +4340,200 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r1, [r4, #29] +; CHECK-NEXT: str.w r1, [r9, #4] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vcmp.f32 s24, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: str.w r0, [r4, #25] +; CHECK-NEXT: str.w r0, [r9] ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: vcmp.f32 s22, #0 -; CHECK-NEXT: mov r8, r3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, s20 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, #0 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s22, s20 -; CHECK-NEXT: str r2, [r4, #8] +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, #0 +; CHECK-NEXT: vcmp.f32 s30, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r6, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str r1, [r4, #4] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: str r0, [r4] -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: movgt r7, #15 +; CHECK-NEXT: and r0, r7, #15 +; CHECK-NEXT: mov r11, r1 +; CHECK-NEXT: orr.w r1, r0, r6, lsl #4 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: mov r10, r2 +; CHECK-NEXT: str.w r1, [r9, #87] ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: vcmp.f32 s30, #0 +; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: mov r8, r0 ; 
CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: mov r7, r3 +; CHECK-NEXT: str r2, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt.w r8, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, #0 +; CHECK-NEXT: vcmp.f32 s28, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r8, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: movlt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r6, r1, #28 -; CHECK-NEXT: vcmp.f32 s30, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 -; CHECK-NEXT: orr.w r6, r6, r2, lsl #4 +; CHECK-NEXT: movgt r5, #15 +; CHECK-NEXT: and r0, r5, #15 +; CHECK-NEXT: vcvtt.f32.f16 s28, s17 +; CHECK-NEXT: orr.w r0, r0, r8, lsl #4 +; CHECK-NEXT: str.w r0, [r9, #62] +; CHECK-NEXT: vmov r0, s28 +; CHECK-NEXT: bl __fixunssfti +; CHECK-NEXT: vcmp.f32 s28, #0 +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r6, [r4, #95] +; CHECK-NEXT: str r2, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vcmp.f32 s28, s20 +; CHECK-NEXT: vcvtt.f32.f16 s16, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: lsrs r6, r0, #28 -; CHECK-NEXT: orr.w r1, r6, r1, lsl #4 -; CHECK-NEXT: vcmp.f32 s30, #0 -; CHECK-NEXT: str.w r1, [r4, #91] -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #0 -; CHECK-NEXT: lsrs r1, r2, #28 -; CHECK-NEXT: vcvtt.f32.f16 s30, s18 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #15 -; CHECK-NEXT: orr.w r2, r1, r3, lsl #4 -; CHECK-NEXT: vmov r1, s30 -; CHECK-NEXT: strb.w r2, [r4, #99] -; CHECK-NEXT: vcmp.f32 s28, #0 +; CHECK-NEXT: vcmp.f32 s26, #0 +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #0 -; CHECK-NEXT: vcmp.f32 s28, s20 +; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r5, #15 -; CHECK-NEXT: and r2, r5, #15 -; CHECK-NEXT: orr.w r0, r2, r0, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #87] -; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: movgt r0, #15 +; CHECK-NEXT: and r0, r0, #15 +; CHECK-NEXT: orr.w r0, r0, r1, lsl #4 +; CHECK-NEXT: str.w r0, [r9, #37] +; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: vcmp.f32 s30, #0 -; CHECK-NEXT: vcvtt.f32.f16 s18, s17 +; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, #0 +; CHECK-NEXT: vcmp.f32 s24, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r0, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: movlt r4, #0 +; CHECK-NEXT: vcmp.f32 s24, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w 
r6, r1, #28 -; CHECK-NEXT: vcmp.f32 s30, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 -; CHECK-NEXT: orr.w r6, r6, r2, lsl #4 +; CHECK-NEXT: movgt r4, #15 +; CHECK-NEXT: and r5, r4, #15 +; CHECK-NEXT: vcmp.f32 s22, #0 +; CHECK-NEXT: orr.w r5, r5, r0, lsl #4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r6, [r4, #70] +; CHECK-NEXT: str.w r5, [r9, #12] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: movlt.w r11, #0 +; CHECK-NEXT: vcmp.f32 s22, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: lsrs r6, r0, #28 -; CHECK-NEXT: orr.w r1, r6, r1, lsl #4 -; CHECK-NEXT: str.w r1, [r4, #66] -; CHECK-NEXT: vmov r1, s18 -; CHECK-NEXT: vcmp.f32 s30, #0 -; CHECK-NEXT: lsrs r2, r2, #28 +; CHECK-NEXT: movgt.w r11, #-1 +; CHECK-NEXT: vcmp.f32 s22, #0 +; CHECK-NEXT: lsrl r6, r11, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #0 +; CHECK-NEXT: movlt.w r10, #0 +; CHECK-NEXT: vcmp.f32 s22, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #15 -; CHECK-NEXT: orr.w r2, r2, r3, lsl #4 +; CHECK-NEXT: movgt.w r10, #-1 +; CHECK-NEXT: orr.w r5, r11, r10, lsl #4 +; CHECK-NEXT: str.w r5, [r9, #95] +; CHECK-NEXT: str.w r6, [r9, #91] +; CHECK-NEXT: vcmp.f32 s22, #0 +; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb.w r2, [r4, #74] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #0 -; CHECK-NEXT: vcmp.f32 s26, s20 -; CHECK-NEXT: vcvtt.f32.f16 s16, s16 +; CHECK-NEXT: movlt r6, #0 +; CHECK-NEXT: vcmp.f32 s22, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r7, #15 -; CHECK-NEXT: and r2, r7, #15 -; CHECK-NEXT: orr.w r0, r2, r0, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #62] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __fixunssfti +; CHECK-NEXT: movgt r6, #15 +; CHECK-NEXT: and r5, r6, #15 ; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: lsrl r10, r5, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: strb.w r10, [r9, #99] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movlt r7, #0 ; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r7, r1, #28 -; CHECK-NEXT: vcmp.f32 s18, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 -; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 +; CHECK-NEXT: movgt r7, #15 +; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r7, [r4, #45] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: lsrs r7, r0, #28 +; CHECK-NEXT: movgt.w r6, #-1 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r7 ; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: orr.w r7, r7, r1, lsl #4 -; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: str.w r7, [r4, #41] +; CHECK-NEXT: lsrl r8, r5, #28 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #0 +; CHECK-NEXT: 
movlt r4, #0 +; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r2, r2, #28 -; CHECK-NEXT: vcmp.f32 s24, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #15 -; CHECK-NEXT: orr.w r2, r2, r3, lsl #4 +; CHECK-NEXT: movgt.w r4, #-1 +; CHECK-NEXT: orr.w r6, r5, r4, lsl #4 +; CHECK-NEXT: and r5, r7, #15 +; CHECK-NEXT: lsrl r4, r5, #28 +; CHECK-NEXT: str.w r6, [r9, #70] +; CHECK-NEXT: str.w r8, [r9, #66] +; CHECK-NEXT: vcmp.f32 s28, #0 +; CHECK-NEXT: strb.w r4, [r9, #74] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb.w r2, [r4, #49] +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r10, #0 -; CHECK-NEXT: vcmp.f32 s24, s20 +; CHECK-NEXT: movlt r7, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r10, #15 -; CHECK-NEXT: and r2, r10, #15 -; CHECK-NEXT: orr.w r0, r2, r0, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #37] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: vcmp.f32 s16, #0 +; CHECK-NEXT: movgt r7, #15 +; CHECK-NEXT: mov r12, r7 +; CHECK-NEXT: vcmp.f32 s28, #0 +; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: movgt.w r7, #-1 ; CHECK-NEXT: b.w .LBB50_2 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: @@ -4518,47 +4541,61 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: .long 0x717fffff @ float 1.26765052E+30 ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: .LBB50_2: +; CHECK-NEXT: vmov q0[3], q0[1], r7, r12 +; CHECK-NEXT: ldr r4, [sp] @ 4-byte Reload +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: vcmp.f32 s28, #0 +; CHECK-NEXT: lsrl r4, r5, #28 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r6, #0 +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 -; CHECK-NEXT: lsrs r7, r1, #28 +; CHECK-NEXT: movgt.w r6, #-1 +; CHECK-NEXT: orr.w r7, r5, r6, lsl #4 +; CHECK-NEXT: and r5, r12, #15 +; CHECK-NEXT: vcmp.f32 s16, #0 +; CHECK-NEXT: lsrl r6, r5, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 -; CHECK-NEXT: str r7, [r4, #20] +; CHECK-NEXT: str.w r7, [r9, #45] +; CHECK-NEXT: str.w r4, [r9, #41] +; CHECK-NEXT: strb.w r6, [r9, #49] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: movlt r3, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: lsrs r7, r0, #28 -; CHECK-NEXT: orr.w r1, r7, r1, lsl #4 +; CHECK-NEXT: movgt r3, #15 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: str r1, [r4, #16] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #0 +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r1, r2, #28 -; CHECK-NEXT: vcmp.f32 s22, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #15 -; CHECK-NEXT: orr.w r1, r1, r3, lsl #4 +; CHECK-NEXT: movgt.w r1, 
#-1 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vcmp.f32 s16, #0 +; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb r1, [r4, #24] +; CHECK-NEXT: lsrl r0, r1, #28 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r8, #0 -; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r8, #15 -; CHECK-NEXT: and r1, r8, #15 -; CHECK-NEXT: orr.w r0, r1, r0, lsl #4 -; CHECK-NEXT: str r0, [r4, #12] +; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: orr.w r1, r1, r2, lsl #4 +; CHECK-NEXT: strd r0, r1, [r9, #16] +; CHECK-NEXT: and r1, r3, #15 +; CHECK-NEXT: lsrl r2, r1, #28 +; CHECK-NEXT: strb.w r2, [r9, #24] +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: @ %bb.3: %x = call <8 x i100> @llvm.fptoui.sat.v8f16.v8i100(<8 x half> %f) ret <8 x i100> %x -- cgit v1.1 From 00e80fbfb9151a68e7383dcec7da69c867225e54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danny=20M=C3=B6sch?= Date: Sun, 11 Feb 2024 19:43:34 +0100 Subject: [NFC] Correct C++ standard names (#81421) --- .../clang-tidy/modernize/DeprecatedHeadersCheck.cpp | 2 +- .../docs/clang-tidy/checks/modernize/deprecated-headers.rst | 2 +- .../docs/clang-tidy/checks/modernize/use-override.rst | 2 +- .../docs/clang-tidy/checks/readability/container-contains.rst | 2 +- .../docs/clang-tidy/checks/readability/use-anyofallof.rst | 2 +- clang/include/clang/Basic/Module.h | 6 +++--- clang/lib/Basic/Module.cpp | 4 ++-- clang/lib/Headers/stdatomic.h | 2 +- clang/lib/Lex/DependencyDirectivesScanner.cpp | 2 +- clang/test/Analysis/bitwise-shift-common.c | 2 +- clang/unittests/Analysis/FlowSensitive/TransferTest.cpp | 2 +- clang/unittests/Lex/DependencyDirectivesScannerTest.cpp | 2 +- libcxx/docs/FeatureTestMacroTable.rst | 10 +++++----- libcxx/include/__locale_dir/locale_base_api/ibm.h | 2 +- .../ostream.inserters.arithmetic/pointer.volatile.pass.cpp | 2 +- libcxx/utils/generate_feature_test_macro_components.py | 2 +- llvm/docs/CMake.rst | 2 +- 17 files changed, 24 insertions(+), 24 deletions(-) diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp index 6d287eb..6a46791 100644 --- a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp @@ -158,7 +158,7 @@ IncludeModernizePPCallbacks::IncludeModernizePPCallbacks( {"wctype.h", "cwctype"}})) { CStyledHeaderToCxx.insert(KeyValue); } - // Add C++ 11 headers. + // Add C++11 headers. if (LangOpts.CPlusPlus11) { for (const auto &KeyValue : std::vector>( diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/deprecated-headers.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/deprecated-headers.rst index 974a56a..298243f 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/deprecated-headers.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/deprecated-headers.rst @@ -4,7 +4,7 @@ modernize-deprecated-headers ============================ Some headers from C library were deprecated in C++ and are no longer welcome in -C++ codebases. Some have no effect in C++. For more details refer to the C++ 14 +C++ codebases. Some have no effect in C++. 
For more details refer to the C++14 Standard [depr.c.headers] section. This check replaces C standard library headers with their C++ alternatives and diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-override.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-override.rst index 0440ab85..f8f3479 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-override.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-override.rst @@ -10,7 +10,7 @@ removes ``virtual`` from those functions as it is not required. user that a function was virtual. C++ compilers did not use the presence of this to signify an overridden function. -In C++ 11 ``override`` and ``final`` keywords were introduced to allow +In C++11 ``override`` and ``final`` keywords were introduced to allow overridden functions to be marked appropriately. Their presence allows compilers to verify that an overridden function correctly overrides a base class implementation. diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst index 07d1e35..b28daec 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst @@ -3,7 +3,7 @@ readability-container-contains ============================== -Finds usages of ``container.count()`` and ``container.find() == container.end()`` which should be replaced by a call to the ``container.contains()`` method introduced in C++ 20. +Finds usages of ``container.count()`` and ``container.find() == container.end()`` which should be replaced by a call to the ``container.contains()`` method introduced in C++20. Whether an element is contained inside a container should be checked with ``contains`` instead of ``count``/``find`` because ``contains`` conveys the intent more clearly. Furthermore, for containers which permit multiple entries per key (``multimap``, ``multiset``, ...), ``contains`` is more efficient than ``count`` because ``count`` has to do unnecessary additional work. diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/use-anyofallof.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/use-anyofallof.rst index f7bd9ff..6e58766 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/use-anyofallof.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/use-anyofallof.rst @@ -4,7 +4,7 @@ readability-use-anyofallof ========================== Finds range-based for loops that can be replaced by a call to ``std::any_of`` or -``std::all_of``. In C++ 20 mode, suggests ``std::ranges::any_of`` or +``std::all_of``. In C++20 mode, suggests ``std::ranges::any_of`` or ``std::ranges::all_of``. Example: diff --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h index 62786e3..30ec9c9 100644 --- a/clang/include/clang/Basic/Module.h +++ b/clang/include/clang/Basic/Module.h @@ -118,7 +118,7 @@ public: /// of header files. ModuleMapModule, - /// This is a C++ 20 header unit. + /// This is a C++20 header unit. ModuleHeaderUnit, /// This is a C++20 module interface unit. @@ -127,10 +127,10 @@ public: /// This is a C++20 module implementation unit. ModuleImplementationUnit, - /// This is a C++ 20 module partition interface. + /// This is a C++20 module partition interface. ModulePartitionInterface, - /// This is a C++ 20 module partition implementation. 
+ /// This is a C++20 module partition implementation. ModulePartitionImplementation, /// This is the explicit Global Module Fragment of a modular TU. diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp index 9252174..1c5043a 100644 --- a/clang/lib/Basic/Module.cpp +++ b/clang/lib/Basic/Module.cpp @@ -376,7 +376,7 @@ Module *Module::findOrInferSubmodule(StringRef Name) { Module *Module::getGlobalModuleFragment() const { assert(isNamedModuleUnit() && "We should only query the global module " - "fragment from the C++ 20 Named modules"); + "fragment from the C++20 Named modules"); for (auto *SubModule : SubModules) if (SubModule->isExplicitGlobalModule()) @@ -387,7 +387,7 @@ Module *Module::getGlobalModuleFragment() const { Module *Module::getPrivateModuleFragment() const { assert(isNamedModuleUnit() && "We should only query the private module " - "fragment from the C++ 20 Named modules"); + "fragment from the C++20 Named modules"); for (auto *SubModule : SubModules) if (SubModule->isPrivateModule()) diff --git a/clang/lib/Headers/stdatomic.h b/clang/lib/Headers/stdatomic.h index 521c473d..9c103d9 100644 --- a/clang/lib/Headers/stdatomic.h +++ b/clang/lib/Headers/stdatomic.h @@ -16,7 +16,7 @@ * Exclude the MSVC path as well as the MSVC header as of the 14.31.30818 * explicitly disallows `stdatomic.h` in the C mode via an `#error`. Fallback * to the clang resource header until that is fully supported. The - * `stdatomic.h` header requires C++ 23 or newer. + * `stdatomic.h` header requires C++23 or newer. */ #if __STDC_HOSTED__ && \ __has_include_next() && \ diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp index 980f865..0971daa 100644 --- a/clang/lib/Lex/DependencyDirectivesScanner.cpp +++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp @@ -369,7 +369,7 @@ static void skipBlockComment(const char *&First, const char *const End) { } } -/// \returns True if the current single quotation mark character is a C++ 14 +/// \returns True if the current single quotation mark character is a C++14 /// digit separator. static bool isQuoteCppDigitSeparator(const char *const Start, const char *const Cur, diff --git a/clang/test/Analysis/bitwise-shift-common.c b/clang/test/Analysis/bitwise-shift-common.c index 39108bc..5f37d99 100644 --- a/clang/test/Analysis/bitwise-shift-common.c +++ b/clang/test/Analysis/bitwise-shift-common.c @@ -154,7 +154,7 @@ int expression_tracked_back(void) { //===----------------------------------------------------------------------===// int allow_overflows_and_negative_operands(void) { - // These are all legal under C++ 20 and many compilers accept them under + // These are all legal under C++20 and many compilers accept them under // earlier standards as well. int int_min = 1 << 31; // no-warning int this_overflows = 1027 << 30; // no-warning diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index 8bbb040..55af702 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -2093,7 +2093,7 @@ TEST(TransferTest, TemporaryObject) { TEST(TransferTest, ElidableConstructor) { // This test is effectively the same as TransferTest.TemporaryObject, but - // the code is compiled as C++ 14. + // the code is compiled as C++14. 
std::string Code = R"( struct A { int Bar; diff --git a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp index bc4eee7..59fef9e 100644 --- a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp +++ b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp @@ -583,7 +583,7 @@ TEST(MinimizeSourceToDependencyDirectivesTest, UnderscorePragma) { R"(_Pragma(u"clang module import"))", Out)); EXPECT_STREQ("\n", Out.data()); - // FIXME: R"()" strings depend on using C++ 11 language mode + // FIXME: R"()" strings depend on using C++11 language mode ASSERT_FALSE(minimizeSourceToDependencyDirectives( R"(_Pragma(R"abc(clang module import)abc"))", Out)); EXPECT_STREQ("\n", Out.data()); diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index a5c6fa2..468226c 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -24,7 +24,7 @@ Status =================================================== ================= Macro Name Value =================================================== ================= - **C++ 14** + **C++14** --------------------------------------------------------------------- ``__cpp_lib_chrono_udls`` ``201304L`` --------------------------------------------------- ----------------- @@ -66,7 +66,7 @@ Status --------------------------------------------------- ----------------- ``__cpp_lib_tuples_by_type`` ``201304L`` --------------------------------------------------- ----------------- - **C++ 17** + **C++17** --------------------------------------------------------------------- ``__cpp_lib_addressof_constexpr`` ``201603L`` --------------------------------------------------- ----------------- @@ -166,7 +166,7 @@ Status --------------------------------------------------- ----------------- ``__cpp_lib_void_t`` ``201411L`` --------------------------------------------------- ----------------- - **C++ 20** + **C++20** --------------------------------------------------------------------- ``__cpp_lib_array_constexpr`` ``201811L`` --------------------------------------------------- ----------------- @@ -300,7 +300,7 @@ Status --------------------------------------------------- ----------------- ``__cpp_lib_unwrap_ref`` ``201811L`` --------------------------------------------------- ----------------- - **C++ 23** + **C++23** --------------------------------------------------------------------- ``__cpp_lib_adaptor_iterator_pair_constructor`` ``202106L`` --------------------------------------------------- ----------------- @@ -388,7 +388,7 @@ Status --------------------------------------------------- ----------------- ``__cpp_lib_unreachable`` ``202202L`` --------------------------------------------------- ----------------- - **C++ 26** + **C++26** --------------------------------------------------------------------- ``__cpp_lib_associative_heterogeneous_insertion`` *unimplemented* --------------------------------------------------- ----------------- diff --git a/libcxx/include/__locale_dir/locale_base_api/ibm.h b/libcxx/include/__locale_dir/locale_base_api/ibm.h index 498ea1e..c5d7f34 100644 --- a/libcxx/include/__locale_dir/locale_base_api/ibm.h +++ b/libcxx/include/__locale_dir/locale_base_api/ibm.h @@ -100,7 +100,7 @@ inline _LIBCPP_HIDE_FROM_ABI int vasprintf(char** strp, const char* fmt, va_list } va_list ap_copy; - // va_copy may not be provided by the C library in C++ 03 mode. + // va_copy may not be provided by the C library in C++03 mode. 
#if defined(_LIBCPP_CXX03_LANG) && __has_builtin(__builtin_va_copy) __builtin_va_copy(ap_copy, ap); #else diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp index b016bab..69d84f64 100644 --- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp +++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp @@ -17,7 +17,7 @@ // // If the library was built in c++23 mode, this test would succeed. // -// Older CMake passed -std:c++latest to set C++ 20 mode on clang-cl, which +// Older CMake passed -std:c++latest to set C++20 mode on clang-cl, which // hid this issue. With newer CMake versions, it passes -std:c++20 which // makes this fail. // diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index cc1fc50..9e7ea86 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -1769,7 +1769,7 @@ def pad_cell(s, length, left_align=True): def get_status_table(): table = [["Macro Name", "Value"]] for std in get_std_dialects(): - table += [["**" + std.replace("c++", "C++ ") + "**", ""]] + table += [["**" + std.replace("c++", "C++") + "**", ""]] for tc in feature_test_macros: if std not in tc["values"].keys(): continue diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst index 20f73c9..abef4f8 100644 --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -277,7 +277,7 @@ manual, or execute ``cmake --help-variable VARIABLE_NAME``. **CMAKE_CXX_STANDARD**:STRING Sets the C++ standard to conform to when building LLVM. Possible values are - 17 and 20. LLVM Requires C++ 17 or higher. This defaults to 17. + 17 and 20. LLVM Requires C++17 or higher. This defaults to 17. **CMAKE_INSTALL_BINDIR**:PATH The path to install executables, relative to the *CMAKE_INSTALL_PREFIX*. -- cgit v1.1 From ffab5a089b1e94b3305fbdfdf1547b751121c090 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Sun, 11 Feb 2024 10:50:59 -0800 Subject: Add a test for the A16/A17 parts of eb1b428750181ea742c547db0bc7136cd5b8f732 There are a couple of open questions on what we should do for A14, so I'll leave that off for now. 
https://github.com/llvm/llvm-project/pull/81325#issuecomment-1937489565 --- llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll index cddcd46..a75c303 100644 --- a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll +++ b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll @@ -13,6 +13,8 @@ ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v1 | FileCheck %s ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n2 | FileCheck %s ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a16 -mattr=-fuse-literals | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a17 -mattr=-fuse-literals | FileCheck %s @g = common local_unnamed_addr global ptr null, align 8 -- cgit v1.1 From 03f571995b4f0c260254955afd16ec44d0764794 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sun, 11 Feb 2024 12:59:05 -0800 Subject: [clang-format][NFC] Make LangOpts global in namespace Format (#81390) --- clang/include/clang/Format/Format.h | 6 ---- clang/lib/Format/Format.cpp | 30 ------------------- clang/lib/Format/FormatTokenLexer.cpp | 12 +++----- clang/lib/Format/FormatTokenLexer.h | 2 -- clang/lib/Format/IntegerLiteralSeparatorFixer.cpp | 2 +- clang/lib/Format/TokenAnalyzer.cpp | 36 ++++++++++++++++++++++- clang/lib/Format/TokenAnalyzer.h | 2 ++ clang/unittests/Format/TestLexer.h | 4 ++- 8 files changed, 45 insertions(+), 49 deletions(-) diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index d9c18e5..b7e8246 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -14,7 +14,6 @@ #ifndef LLVM_CLANG_FORMAT_FORMAT_H #define LLVM_CLANG_FORMAT_FORMAT_H -#include "clang/Basic/LangOptions.h" #include "clang/Tooling/Core/Replacement.h" #include "clang/Tooling/Inclusions/IncludeStyle.h" #include "llvm/ADT/ArrayRef.h" @@ -5179,11 +5178,6 @@ tooling::Replacements sortUsingDeclarations(const FormatStyle &Style, ArrayRef Ranges, StringRef FileName = ""); -/// Returns the ``LangOpts`` that the formatter expects you to set. -/// -/// \param Style determines specific settings for lexing mode. -LangOptions getFormattingLangOpts(const FormatStyle &Style = getLLVMStyle()); - /// Description to be used for help text for a ``llvm::cl`` option for /// specifying format style. The description is closely related to the operation /// of ``getStyle()``. 
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index d2cc466..8431d3c 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -3823,36 +3823,6 @@ tooling::Replacements sortUsingDeclarations(const FormatStyle &Style, return UsingDeclarationsSorter(*Env, Style).process().first; } -LangOptions getFormattingLangOpts(const FormatStyle &Style) { - LangOptions LangOpts; - - FormatStyle::LanguageStandard LexingStd = Style.Standard; - if (LexingStd == FormatStyle::LS_Auto) - LexingStd = FormatStyle::LS_Latest; - if (LexingStd == FormatStyle::LS_Latest) - LexingStd = FormatStyle::LS_Cpp20; - LangOpts.CPlusPlus = 1; - LangOpts.CPlusPlus11 = LexingStd >= FormatStyle::LS_Cpp11; - LangOpts.CPlusPlus14 = LexingStd >= FormatStyle::LS_Cpp14; - LangOpts.CPlusPlus17 = LexingStd >= FormatStyle::LS_Cpp17; - LangOpts.CPlusPlus20 = LexingStd >= FormatStyle::LS_Cpp20; - LangOpts.Char8 = LexingStd >= FormatStyle::LS_Cpp20; - // Turning on digraphs in standards before C++0x is error-prone, because e.g. - // the sequence "<::" will be unconditionally treated as "[:". - // Cf. Lexer::LexTokenInternal. - LangOpts.Digraphs = LexingStd >= FormatStyle::LS_Cpp11; - - LangOpts.LineComment = 1; - bool AlternativeOperators = Style.isCpp(); - LangOpts.CXXOperatorNames = AlternativeOperators ? 1 : 0; - LangOpts.Bool = 1; - LangOpts.ObjC = 1; - LangOpts.MicrosoftExt = 1; // To get kw___try, kw___finally. - LangOpts.DeclSpecKeyword = 1; // To get __declspec. - LangOpts.C99 = 1; // To get kw_restrict for non-underscore-prefixed restrict. - return LangOpts; -} - const char *StyleOptionHelpDescription = "Set coding style. can be:\n" "1. A preset: LLVM, GNU, Google, Chromium, Microsoft,\n" diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index a87d0ba..a57659f 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -13,11 +13,7 @@ //===----------------------------------------------------------------------===// #include "FormatTokenLexer.h" -#include "FormatToken.h" -#include "clang/Basic/SourceLocation.h" -#include "clang/Basic/SourceManager.h" -#include "clang/Format/Format.h" -#include "llvm/Support/Regex.h" +#include "TokenAnalyzer.h" namespace clang { namespace format { @@ -28,12 +24,12 @@ FormatTokenLexer::FormatTokenLexer( llvm::SpecificBumpPtrAllocator &Allocator, IdentifierTable &IdentTable) : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}), - Column(Column), TrailingWhitespace(0), - LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID), + Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID), Style(Style), IdentTable(IdentTable), Keywords(IdentTable), Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0), FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin), MacroBlockEndRegex(Style.MacroBlockEnd) { + assert(LangOpts.CPlusPlus); Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts)); Lex->SetKeepWhitespaceMode(true); @@ -1442,7 +1438,7 @@ void FormatTokenLexer::readRawToken(FormatToken &Tok) { void FormatTokenLexer::resetLexer(unsigned Offset) { StringRef Buffer = SourceMgr.getBufferData(ID); - LangOpts = getFormattingLangOpts(Style); + assert(LangOpts.CPlusPlus); Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts, Buffer.begin(), Buffer.begin() + Offset, Buffer.end())); Lex->SetKeepWhitespaceMode(true); diff --git a/clang/lib/Format/FormatTokenLexer.h 
b/clang/lib/Format/FormatTokenLexer.h index 65dd733..0d0f36f 100644 --- a/clang/lib/Format/FormatTokenLexer.h +++ b/clang/lib/Format/FormatTokenLexer.h @@ -17,7 +17,6 @@ #include "Encoding.h" #include "FormatToken.h" -#include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Format/Format.h" @@ -120,7 +119,6 @@ private: unsigned Column; unsigned TrailingWhitespace; std::unique_ptr Lex; - LangOptions LangOpts; const SourceManager &SourceMgr; FileID ID; const FormatStyle &Style; diff --git a/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp b/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp index 87823ae..3c2cedd 100644 --- a/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp +++ b/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp @@ -79,7 +79,7 @@ IntegerLiteralSeparatorFixer::process(const Environment &Env, AffectedRangeManager AffectedRangeMgr(SourceMgr, Env.getCharRanges()); const auto ID = Env.getFileID(); - const auto LangOpts = getFormattingLangOpts(Style); + assert(LangOpts.CPlusPlus); Lexer Lex(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts); Lex.SetCommentRetentionState(true); diff --git a/clang/lib/Format/TokenAnalyzer.cpp b/clang/lib/Format/TokenAnalyzer.cpp index bd648c4..4e77683 100644 --- a/clang/lib/Format/TokenAnalyzer.cpp +++ b/clang/lib/Format/TokenAnalyzer.cpp @@ -35,6 +35,38 @@ namespace clang { namespace format { +LangOptions LangOpts; + +/// Sets `LangOpts` for the formatter. +/// +/// \param `Style` determines specific settings for lexing mode. +static void setFormattingLangOpts(const FormatStyle &Style) { + FormatStyle::LanguageStandard LexingStd = Style.Standard; + if (LexingStd == FormatStyle::LS_Auto) + LexingStd = FormatStyle::LS_Latest; + if (LexingStd == FormatStyle::LS_Latest) + LexingStd = FormatStyle::LS_Cpp20; + LangOpts.CPlusPlus = 1; + LangOpts.CPlusPlus11 = LexingStd >= FormatStyle::LS_Cpp11; + LangOpts.CPlusPlus14 = LexingStd >= FormatStyle::LS_Cpp14; + LangOpts.CPlusPlus17 = LexingStd >= FormatStyle::LS_Cpp17; + LangOpts.CPlusPlus20 = LexingStd >= FormatStyle::LS_Cpp20; + LangOpts.Char8 = LexingStd >= FormatStyle::LS_Cpp20; + // Turning on digraphs in standards before C++0x is error-prone, because e.g. + // the sequence "<::" will be unconditionally treated as "[:". + // Cf. Lexer::LexTokenInternal. + LangOpts.Digraphs = LexingStd >= FormatStyle::LS_Cpp11; + + LangOpts.LineComment = 1; + bool AlternativeOperators = Style.isCpp(); + LangOpts.CXXOperatorNames = AlternativeOperators ? 1 : 0; + LangOpts.Bool = 1; + LangOpts.ObjC = 1; + LangOpts.MicrosoftExt = 1; // To get kw___try, kw___finally. + LangOpts.DeclSpecKeyword = 1; // To get __declspec. + LangOpts.C99 = 1; // To get kw_restrict for non-underscore-prefixed restrict. +} + // FIXME: Instead of printing the diagnostic we should store it and have a // better way to return errors through the format APIs. 
class FatalDiagnosticConsumer : public DiagnosticConsumer { @@ -99,9 +131,11 @@ TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style) std::pair TokenAnalyzer::process(bool SkipAnnotation) { + setFormattingLangOpts(Style); + tooling::Replacements Result; llvm::SpecificBumpPtrAllocator Allocator; - IdentifierTable IdentTable(getFormattingLangOpts(Style)); + IdentifierTable IdentTable(LangOpts); FormatTokenLexer Lex(Env.getSourceManager(), Env.getFileID(), Env.getFirstStartColumn(), Style, Encoding, Allocator, IdentTable); diff --git a/clang/lib/Format/TokenAnalyzer.h b/clang/lib/Format/TokenAnalyzer.h index 4086dab..18c1431 100644 --- a/clang/lib/Format/TokenAnalyzer.h +++ b/clang/lib/Format/TokenAnalyzer.h @@ -34,6 +34,8 @@ namespace clang { namespace format { +extern LangOptions LangOpts; + class Environment { public: // This sets up an virtual file system with file \p FileName containing the diff --git a/clang/unittests/Format/TestLexer.h b/clang/unittests/Format/TestLexer.h index 8b5949b..6a3d0bd 100644 --- a/clang/unittests/Format/TestLexer.h +++ b/clang/unittests/Format/TestLexer.h @@ -61,7 +61,9 @@ public: std::vector> &Buffers, FormatStyle Style = getLLVMStyle()) : Allocator(Allocator), Buffers(Buffers), Style(Style), - SourceMgr("test.cpp", ""), IdentTable(getFormattingLangOpts(Style)) {} + SourceMgr("test.cpp", ""), IdentTable(LangOpts) { + assert(LangOpts.CPlusPlus); + } TokenList lex(llvm::StringRef Code) { FormatTokenLexer Lex = getNewLexer(Code); -- cgit v1.1 From 3dc8ef677d7d05116a0bf6524eb38b02ca6ba042 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sun, 11 Feb 2024 13:08:28 -0800 Subject: Revert "[clang-format][NFC] Make LangOpts global in namespace Format (#81390)" This reverts commit 03f571995b4f0c260254955afd16ec44d0764794. We can't hide getFormattingLangOpts() as it's used by other tools. --- clang/include/clang/Format/Format.h | 6 ++++ clang/lib/Format/Format.cpp | 30 +++++++++++++++++++ clang/lib/Format/FormatTokenLexer.cpp | 12 +++++--- clang/lib/Format/FormatTokenLexer.h | 2 ++ clang/lib/Format/IntegerLiteralSeparatorFixer.cpp | 2 +- clang/lib/Format/TokenAnalyzer.cpp | 36 +---------------------- clang/lib/Format/TokenAnalyzer.h | 2 -- clang/unittests/Format/TestLexer.h | 4 +-- 8 files changed, 49 insertions(+), 45 deletions(-) diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index b7e8246..d9c18e5 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -14,6 +14,7 @@ #ifndef LLVM_CLANG_FORMAT_FORMAT_H #define LLVM_CLANG_FORMAT_FORMAT_H +#include "clang/Basic/LangOptions.h" #include "clang/Tooling/Core/Replacement.h" #include "clang/Tooling/Inclusions/IncludeStyle.h" #include "llvm/ADT/ArrayRef.h" @@ -5178,6 +5179,11 @@ tooling::Replacements sortUsingDeclarations(const FormatStyle &Style, ArrayRef Ranges, StringRef FileName = ""); +/// Returns the ``LangOpts`` that the formatter expects you to set. +/// +/// \param Style determines specific settings for lexing mode. +LangOptions getFormattingLangOpts(const FormatStyle &Style = getLLVMStyle()); + /// Description to be used for help text for a ``llvm::cl`` option for /// specifying format style. The description is closely related to the operation /// of ``getStyle()``. 
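To make the revert rationale concrete, here is a minimal sketch of the kind of out-of-tree caller the public API has to keep working; the tool, its asserts, and the chosen standard are hypothetical, but every name used is declared in the hunks of this patch:

#include "clang/Format/Format.h"
#include <cassert>

int main() {
  // Ask the formatter which LangOptions it would lex with for C++17 code.
  clang::format::FormatStyle Style = clang::format::getLLVMStyle();
  Style.Standard = clang::format::FormatStyle::LS_Cpp17;
  clang::LangOptions Opts = clang::format::getFormattingLangOpts(Style);
  // The formatter always lexes as C++ with line comments enabled.
  assert(Opts.CPlusPlus && Opts.LineComment);
  // Standard-gated flags track the configured LanguageStandard.
  assert(Opts.CPlusPlus17 && !Opts.CPlusPlus20);
  return 0;
}

A file-local global set inside TokenAnalyzer.cpp (the approach being reverted) cannot serve such a caller, since nothing outside the formatter's own pipeline would ever initialize it.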
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 8431d3c..d2cc466 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -3823,6 +3823,36 @@ tooling::Replacements sortUsingDeclarations(const FormatStyle &Style, return UsingDeclarationsSorter(*Env, Style).process().first; } +LangOptions getFormattingLangOpts(const FormatStyle &Style) { + LangOptions LangOpts; + + FormatStyle::LanguageStandard LexingStd = Style.Standard; + if (LexingStd == FormatStyle::LS_Auto) + LexingStd = FormatStyle::LS_Latest; + if (LexingStd == FormatStyle::LS_Latest) + LexingStd = FormatStyle::LS_Cpp20; + LangOpts.CPlusPlus = 1; + LangOpts.CPlusPlus11 = LexingStd >= FormatStyle::LS_Cpp11; + LangOpts.CPlusPlus14 = LexingStd >= FormatStyle::LS_Cpp14; + LangOpts.CPlusPlus17 = LexingStd >= FormatStyle::LS_Cpp17; + LangOpts.CPlusPlus20 = LexingStd >= FormatStyle::LS_Cpp20; + LangOpts.Char8 = LexingStd >= FormatStyle::LS_Cpp20; + // Turning on digraphs in standards before C++0x is error-prone, because e.g. + // the sequence "<::" will be unconditionally treated as "[:". + // Cf. Lexer::LexTokenInternal. + LangOpts.Digraphs = LexingStd >= FormatStyle::LS_Cpp11; + + LangOpts.LineComment = 1; + bool AlternativeOperators = Style.isCpp(); + LangOpts.CXXOperatorNames = AlternativeOperators ? 1 : 0; + LangOpts.Bool = 1; + LangOpts.ObjC = 1; + LangOpts.MicrosoftExt = 1; // To get kw___try, kw___finally. + LangOpts.DeclSpecKeyword = 1; // To get __declspec. + LangOpts.C99 = 1; // To get kw_restrict for non-underscore-prefixed restrict. + return LangOpts; +} + const char *StyleOptionHelpDescription = "Set coding style. can be:\n" "1. A preset: LLVM, GNU, Google, Chromium, Microsoft,\n" diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index a57659f..a87d0ba 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -13,7 +13,11 @@ //===----------------------------------------------------------------------===// #include "FormatTokenLexer.h" -#include "TokenAnalyzer.h" +#include "FormatToken.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Format/Format.h" +#include "llvm/Support/Regex.h" namespace clang { namespace format { @@ -24,12 +28,12 @@ FormatTokenLexer::FormatTokenLexer( llvm::SpecificBumpPtrAllocator &Allocator, IdentifierTable &IdentTable) : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}), - Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID), + Column(Column), TrailingWhitespace(0), + LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID), Style(Style), IdentTable(IdentTable), Keywords(IdentTable), Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0), FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin), MacroBlockEndRegex(Style.MacroBlockEnd) { - assert(LangOpts.CPlusPlus); Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts)); Lex->SetKeepWhitespaceMode(true); @@ -1438,7 +1442,7 @@ void FormatTokenLexer::readRawToken(FormatToken &Tok) { void FormatTokenLexer::resetLexer(unsigned Offset) { StringRef Buffer = SourceMgr.getBufferData(ID); - assert(LangOpts.CPlusPlus); + LangOpts = getFormattingLangOpts(Style); Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts, Buffer.begin(), Buffer.begin() + Offset, Buffer.end())); Lex->SetKeepWhitespaceMode(true); diff --git a/clang/lib/Format/FormatTokenLexer.h 
b/clang/lib/Format/FormatTokenLexer.h index 0d0f36f..65dd733 100644 --- a/clang/lib/Format/FormatTokenLexer.h +++ b/clang/lib/Format/FormatTokenLexer.h @@ -17,6 +17,7 @@ #include "Encoding.h" #include "FormatToken.h" +#include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Format/Format.h" @@ -119,6 +120,7 @@ private: unsigned Column; unsigned TrailingWhitespace; std::unique_ptr Lex; + LangOptions LangOpts; const SourceManager &SourceMgr; FileID ID; const FormatStyle &Style; diff --git a/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp b/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp index 3c2cedd..87823ae 100644 --- a/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp +++ b/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp @@ -79,7 +79,7 @@ IntegerLiteralSeparatorFixer::process(const Environment &Env, AffectedRangeManager AffectedRangeMgr(SourceMgr, Env.getCharRanges()); const auto ID = Env.getFileID(); - assert(LangOpts.CPlusPlus); + const auto LangOpts = getFormattingLangOpts(Style); Lexer Lex(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts); Lex.SetCommentRetentionState(true); diff --git a/clang/lib/Format/TokenAnalyzer.cpp b/clang/lib/Format/TokenAnalyzer.cpp index 4e77683..bd648c4 100644 --- a/clang/lib/Format/TokenAnalyzer.cpp +++ b/clang/lib/Format/TokenAnalyzer.cpp @@ -35,38 +35,6 @@ namespace clang { namespace format { -LangOptions LangOpts; - -/// Sets `LangOpts` for the formatter. -/// -/// \param `Style` determines specific settings for lexing mode. -static void setFormattingLangOpts(const FormatStyle &Style) { - FormatStyle::LanguageStandard LexingStd = Style.Standard; - if (LexingStd == FormatStyle::LS_Auto) - LexingStd = FormatStyle::LS_Latest; - if (LexingStd == FormatStyle::LS_Latest) - LexingStd = FormatStyle::LS_Cpp20; - LangOpts.CPlusPlus = 1; - LangOpts.CPlusPlus11 = LexingStd >= FormatStyle::LS_Cpp11; - LangOpts.CPlusPlus14 = LexingStd >= FormatStyle::LS_Cpp14; - LangOpts.CPlusPlus17 = LexingStd >= FormatStyle::LS_Cpp17; - LangOpts.CPlusPlus20 = LexingStd >= FormatStyle::LS_Cpp20; - LangOpts.Char8 = LexingStd >= FormatStyle::LS_Cpp20; - // Turning on digraphs in standards before C++0x is error-prone, because e.g. - // the sequence "<::" will be unconditionally treated as "[:". - // Cf. Lexer::LexTokenInternal. - LangOpts.Digraphs = LexingStd >= FormatStyle::LS_Cpp11; - - LangOpts.LineComment = 1; - bool AlternativeOperators = Style.isCpp(); - LangOpts.CXXOperatorNames = AlternativeOperators ? 1 : 0; - LangOpts.Bool = 1; - LangOpts.ObjC = 1; - LangOpts.MicrosoftExt = 1; // To get kw___try, kw___finally. - LangOpts.DeclSpecKeyword = 1; // To get __declspec. - LangOpts.C99 = 1; // To get kw_restrict for non-underscore-prefixed restrict. -} - // FIXME: Instead of printing the diagnostic we should store it and have a // better way to return errors through the format APIs. 
class FatalDiagnosticConsumer : public DiagnosticConsumer { @@ -131,11 +99,9 @@ TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style) std::pair TokenAnalyzer::process(bool SkipAnnotation) { - setFormattingLangOpts(Style); - tooling::Replacements Result; llvm::SpecificBumpPtrAllocator Allocator; - IdentifierTable IdentTable(LangOpts); + IdentifierTable IdentTable(getFormattingLangOpts(Style)); FormatTokenLexer Lex(Env.getSourceManager(), Env.getFileID(), Env.getFirstStartColumn(), Style, Encoding, Allocator, IdentTable); diff --git a/clang/lib/Format/TokenAnalyzer.h b/clang/lib/Format/TokenAnalyzer.h index 18c1431..4086dab 100644 --- a/clang/lib/Format/TokenAnalyzer.h +++ b/clang/lib/Format/TokenAnalyzer.h @@ -34,8 +34,6 @@ namespace clang { namespace format { -extern LangOptions LangOpts; - class Environment { public: // This sets up an virtual file system with file \p FileName containing the diff --git a/clang/unittests/Format/TestLexer.h b/clang/unittests/Format/TestLexer.h index 6a3d0bd..8b5949b 100644 --- a/clang/unittests/Format/TestLexer.h +++ b/clang/unittests/Format/TestLexer.h @@ -61,9 +61,7 @@ public: std::vector> &Buffers, FormatStyle Style = getLLVMStyle()) : Allocator(Allocator), Buffers(Buffers), Style(Style), - SourceMgr("test.cpp", ""), IdentTable(LangOpts) { - assert(LangOpts.CPlusPlus); - } + SourceMgr("test.cpp", ""), IdentTable(getFormattingLangOpts(Style)) {} TokenList lex(llvm::StringRef Code) { FormatTokenLexer Lex = getNewLexer(Code); -- cgit v1.1 From b1771475da91805a4ac1831810b62a7b3655ccca Mon Sep 17 00:00:00 2001 From: David Green Date: Sun, 11 Feb 2024 22:25:16 +0000 Subject: [AArch64][GlobalISel] Additional insert and extract GISel tests. NFC --- llvm/test/CodeGen/AArch64/insertextract.ll | 2256 ++++++++++++++++++++++++++++ 1 file changed, 2256 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/insertextract.ll diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll new file mode 100644 index 0000000..794abca --- /dev/null +++ b/llvm/test/CodeGen/AArch64/insertextract.ll @@ -0,0 +1,2256 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for insert_v2f64_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v3f64_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f64_0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f64_2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f64_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2f32_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v3f32_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f32_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8f32_0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8f32_2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8f32_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f16_c +; 
CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8f16_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16f16_0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16f16_2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16f16_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i8_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16i8_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v32i8_0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v32i8_2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v32i8_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i16_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i16_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16i16_0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16i16_2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16i16_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2i32_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v3i32_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i32_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i32_0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i32_2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i32_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2i64_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v3i64_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i64_0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i64_2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i64_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v32i8_0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v32i8_2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v32i8_c + +define <2 x double> @insert_v2f64_0(<2 x double> %a, double %b, i32 %c) { +; CHECK-LABEL: insert_v2f64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.d[0], v1.d[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x double> %a, double %b, i32 0 + ret <2 x double> %d +} + +define <2 x double> @insert_v2f64_1(<2 x double> %a, double %b, i32 %c) { +; CHECK-LABEL: insert_v2f64_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x double> %a, double %b, i32 1 + ret <2 x double> %d +} + +define <2 x double> @insert_v2f64_c(<2 x double> %a, double %b, i32 %c) { +; CHECK-LABEL: insert_v2f64_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfi x8, x0, #3, #1 +; CHECK-NEXT: str d1, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %d = 
insertelement <2 x double> %a, double %b, i32 %c + ret <2 x double> %d +} + +define <3 x double> @insert_v3f64_0(<3 x double> %a, double %b, i32 %c) { +; CHECK-SD-LABEL: insert_v3f64_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-SD-NEXT: mov v0.d[0], v3.d[0] +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v3f64_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: ret +entry: + %d = insertelement <3 x double> %a, double %b, i32 0 + ret <3 x double> %d +} + +define <3 x double> @insert_v3f64_2(<3 x double> %a, double %b, i32 %c) { +; CHECK-LABEL: insert_v3f64_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d2, d3 +; CHECK-NEXT: ret +entry: + %d = insertelement <3 x double> %a, double %b, i32 2 + ret <3 x double> %d +} + +define <3 x double> @insert_v3f64_c(<3 x double> %a, double %b, i32 %c) { +; CHECK-LABEL: insert_v3f64_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: stp q0, q2, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x0, #0x3 +; CHECK-NEXT: str d3, [x8, x9, lsl #3] +; CHECK-NEXT: ldr q0, [sp] +; CHECK-NEXT: ldr d2, [sp, #16] +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <3 x double> %a, double %b, i32 %c + ret <3 x double> %d +} + +define <4 x double> @insert_v4f64_0(<4 x double> %a, double %b, i32 %c) { +; CHECK-LABEL: insert_v4f64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: mov v0.d[0], v2.d[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x double> %a, double %b, i32 0 + ret <4 x double> %d +} + +define <4 x double> @insert_v4f64_2(<4 x double> %a, double %b, i32 %c) { +; CHECK-LABEL: insert_v4f64_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: mov v1.d[0], v2.d[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x double> %a, double %b, i32 2 + ret <4 x double> %d +} + +define <4 x double> @insert_v4f64_c(<4 x double> %a, double %b, i32 %c) { +; CHECK-LABEL: insert_v4f64_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: stp q0, q1, [sp, #-32]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: and x8, x0, #0x3 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str d2, [x9, x8, lsl #3] +; CHECK-NEXT: ldp q0, q1, [sp], #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x double> %a, double %b, i32 %c + ret <4 x double> %d +} + +define <2 x float> @insert_v2f32_0(<2 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v2f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x float> %a, float %b, i32 0 + ret <2 x float> %d +} + +define <2 x float> @insert_v2f32_1(<2 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v2f32_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: mov v0.s[1], v1.s[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x float> %a, float %b, i32 1 + ret <2 x float> %d +} + +define <2 x float> @insert_v2f32_c(<2 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v2f32_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: str d0, [sp, #8] +; CHECK-NEXT: bfi x8, x0, #2, #1 +; CHECK-NEXT: str s1, [x8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x float> %a, float %b, i32 %c + ret <2 x float> %d +} + +define <3 x float> @insert_v3f32_0(<3 x float> %a, float %b, i32 %c) { +; CHECK-SD-LABEL: insert_v3f32_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-SD-NEXT: mov v1.s[1], v0.s[1] +; CHECK-SD-NEXT: mov v1.s[2], v0.s[2] +; CHECK-SD-NEXT: mov v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v3f32_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-GI-NEXT: mov s0, v0.s[2] +; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[3], v0.s[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: ret +entry: + %d = insertelement <3 x float> %a, float %b, i32 0 + ret <3 x float> %d +} + +define <3 x float> @insert_v3f32_2(<3 x float> %a, float %b, i32 %c) { +; CHECK-SD-LABEL: insert_v3f32_2: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-SD-NEXT: mov v0.s[2], v1.s[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v3f32_2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[3], v0.s[0] +; CHECK-GI-NEXT: ret +entry: + %d = insertelement <3 x float> %a, float %b, i32 2 + ret <3 x float> %d +} + +define <3 x float> @insert_v3f32_c(<3 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v3f32_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfi x8, x0, #2, #2 +; CHECK-NEXT: str s1, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + 
%d = insertelement <3 x float> %a, float %b, i32 %c + ret <3 x float> %d +} + +define <4 x float> @insert_v4f32_0(<4 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v4f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x float> %a, float %b, i32 0 + ret <4 x float> %d +} + +define <4 x float> @insert_v4f32_2(<4 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v4f32_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: mov v0.s[2], v1.s[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x float> %a, float %b, i32 2 + ret <4 x float> %d +} + +define <4 x float> @insert_v4f32_c(<4 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v4f32_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfi x8, x0, #2, #2 +; CHECK-NEXT: str s1, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x float> %a, float %b, i32 %c + ret <4 x float> %d +} + +define <8 x float> @insert_v8f32_0(<8 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v8f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: mov v0.s[0], v2.s[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x float> %a, float %b, i32 0 + ret <8 x float> %d +} + +define <8 x float> @insert_v8f32_2(<8 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v8f32_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: mov v0.s[2], v2.s[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x float> %a, float %b, i32 2 + ret <8 x float> %d +} + +define <8 x float> @insert_v8f32_c(<8 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v8f32_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: stp q0, q1, [sp, #-32]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: and x8, x0, #0x7 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str s2, [x9, x8, lsl #2] +; CHECK-NEXT: ldp q0, q1, [sp], #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x float> %a, float %b, i32 %c + ret <8 x float> %d +} + +define <4 x half> @insert_v4f16_0(<4 x half> %a, half %b, i32 %c) { +; CHECK-LABEL: insert_v4f16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1 +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x half> %a, half %b, i32 0 + ret <4 x half> %d +} + +define <4 x half> @insert_v4f16_2(<4 x half> %a, half %b, i32 %c) { +; CHECK-LABEL: insert_v4f16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1 +; CHECK-NEXT: mov v0.h[2], v1.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x half> %a, half %b, i32 2 + ret <4 x half> %d +} + +define <4 x half> @insert_v4f16_c(<4 x half> %a, half %b, i32 %c) { +; CHECK-LABEL: insert_v4f16_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: str d0, [sp, #8] +; CHECK-NEXT: bfi x8, x0, #1, #2 +; CHECK-NEXT: str h1, [x8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x half> %a, half %b, i32 %c + ret <4 x half> %d +} + +define <8 x half> @insert_v8f16_0(<8 x half> %a, half %b, i32 %c) { +; CHECK-LABEL: insert_v8f16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1 +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x half> %a, half %b, i32 0 + ret <8 x half> %d +} + +define <8 x half> @insert_v8f16_2(<8 x half> %a, half %b, i32 %c) { +; CHECK-LABEL: insert_v8f16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1 +; CHECK-NEXT: mov v0.h[2], v1.h[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x half> %a, half %b, i32 2 + ret <8 x half> %d +} + +define <8 x half> @insert_v8f16_c(<8 x half> %a, half %b, i32 %c) { +; CHECK-LABEL: insert_v8f16_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfi x8, x0, #1, #3 +; CHECK-NEXT: str h1, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x half> %a, half %b, i32 %c + ret <8 x half> %d +} + +define <16 x half> @insert_v16f16_0(<16 x half> %a, half %b, i32 %c) { +; CHECK-LABEL: insert_v16f16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2 +; CHECK-NEXT: mov v0.h[0], v2.h[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x half> %a, half %b, i32 0 + ret <16 x half> %d +} + +define <16 x half> @insert_v16f16_2(<16 x half> %a, half %b, i32 %c) { +; CHECK-LABEL: insert_v16f16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2 +; CHECK-NEXT: mov v0.h[2], v2.h[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x half> %a, half %b, i32 2 + ret <16 x half> %d +} + +define <16 x half> @insert_v16f16_c(<16 x half> %a, half %b, i32 %c) { +; 
CHECK-LABEL: insert_v16f16_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: stp q0, q1, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: and x8, x0, #0xf +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str h2, [x9, x8, lsl #1] +; CHECK-NEXT: ldp q0, q1, [sp], #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x half> %a, half %b, i32 %c + ret <16 x half> %d +} + +define <8 x i8> @insert_v8i8_0(<8 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v8i8_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.b[0], w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i8> %a, i8 %b, i32 0 + ret <8 x i8> %d +} + +define <8 x i8> @insert_v8i8_2(<8 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v8i8_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.b[2], w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i8> %a, i8 %b, i32 2 + ret <8 x i8> %d +} + +define <8 x i8> @insert_v8i8_c(<8 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v8i8_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: str d0, [sp, #8] +; CHECK-NEXT: bfxil x8, x1, #0, #3 +; CHECK-NEXT: strb w0, [x8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i8> %a, i8 %b, i32 %c + ret <8 x i8> %d +} + +define <16 x i8> @insert_v16i8_0(<16 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v16i8_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.b[0], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x i8> %a, i8 %b, i32 0 + ret <16 x i8> %d +} + +define <16 x i8> @insert_v16i8_2(<16 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v16i8_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.b[2], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x i8> %a, i8 %b, i32 2 + ret <16 x i8> %d +} + +define <16 x i8> @insert_v16i8_c(<16 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v16i8_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfxil x8, x1, #0, #4 +; CHECK-NEXT: strb w0, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x i8> %a, i8 %b, i32 %c + ret <16 x i8> %d +} + +define <32 x i8> @insert_v32i8_0(<32 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v32i8_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.b[0], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <32 x i8> %a, i8 %b, i32 0 + ret <32 x i8> %d +} + +define <32 x i8> @insert_v32i8_2(<32 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v32i8_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.b[2], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <32 x i8> %a, i8 %b, i32 2 + ret <32 x i8> %d +} + +define <32 x i8> @insert_v32i8_c(<32 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v32i8_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: stp q0, q1, [sp, #-32]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: and x8, x1, #0x1f +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: strb w0, [x9, x8] +; CHECK-NEXT: ldp q0, q1, [sp], #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <32 x i8> %a, i8 %b, i32 %c + ret <32 x i8> %d +} + +define <4 x i16> @insert_v4i16_0(<4 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v4i16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[0], w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i16> %a, i16 %b, i32 0 + ret <4 x i16> %d +} + +define <4 x i16> @insert_v4i16_2(<4 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v4i16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[2], w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i16> %a, i16 %b, i32 2 + ret <4 x i16> %d +} + +define <4 x i16> @insert_v4i16_c(<4 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v4i16_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: str d0, [sp, #8] +; CHECK-NEXT: bfi x8, x1, #1, #2 +; CHECK-NEXT: strh w0, [x8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i16> %a, i16 %b, i32 %c + ret <4 x i16> %d +} + +define <8 x i16> @insert_v8i16_0(<8 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v8i16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.h[0], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i16> %a, i16 %b, i32 0 + ret <8 x i16> %d +} + +define <8 x i16> @insert_v8i16_2(<8 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v8i16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.h[2], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i16> %a, i16 %b, i32 2 + ret <8 x i16> %d +} + +define <8 x i16> @insert_v8i16_c(<8 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v8i16_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfi x8, x1, #1, #3 +; CHECK-NEXT: strh w0, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i16> %a, i16 %b, i32 %c + ret <8 x i16> %d +} + +define <16 x i16> @insert_v16i16_0(<16 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v16i16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.h[0], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x i16> %a, i16 %b, i32 0 + ret <16 x i16> %d +} + +define <16 x i16> @insert_v16i16_2(<16 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v16i16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.h[2], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x i16> %a, i16 %b, i32 2 + ret <16 x i16> %d +} + +define <16 x i16> @insert_v16i16_c(<16 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v16i16_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: stp q0, q1, [sp, #-32]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: and x8, x1, #0xf +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: strh w0, [x9, x8, lsl #1] +; CHECK-NEXT: ldp q0, q1, [sp], #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x i16> %a, i16 %b, i32 %c + ret <16 x i16> %d +} + +define <2 x i32> @insert_v2i32_0(<2 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v2i32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.s[0], w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x i32> %a, i32 %b, i32 0 + ret <2 x i32> %d +} + +define <2 x i32> @insert_v2i32_1(<2 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v2i32_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.s[1], w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x i32> %a, i32 %b, i32 1 + ret <2 x i32> %d +} + +define <2 x i32> @insert_v2i32_c(<2 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v2i32_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: str d0, [sp, #8] +; CHECK-NEXT: bfi x8, x1, #2, #1 +; CHECK-NEXT: str w0, [x8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x i32> %a, i32 %b, i32 %c + ret <2 x i32> %d +} + +define <3 x i32> @insert_v3i32_0(<3 x i32> %a, i32 %b, i32 %c) { +; CHECK-SD-LABEL: insert_v3i32_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s1, w0 +; CHECK-SD-NEXT: mov v1.s[1], v0.s[1] +; CHECK-SD-NEXT: mov v1.s[2], v0.s[2] +; CHECK-SD-NEXT: mov v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v3i32_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: mov s2, v0.s[2] +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov v0.s[3], w8 +; CHECK-GI-NEXT: ret +entry: + %d = insertelement <3 x i32> %a, i32 %b, i32 0 + ret <3 x i32> %d +} + +define <3 x i32> @insert_v3i32_2(<3 x i32> %a, i32 %b, i32 %c) { +; CHECK-SD-LABEL: insert_v3i32_2: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov v0.s[2], w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v3i32_2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: fmov s1, w0 +; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[3], v0.s[0] +; CHECK-GI-NEXT: ret +entry: + %d = insertelement <3 x i32> %a, i32 %b, i32 2 + ret <3 x i32> %d +} + +define <3 x i32> @insert_v3i32_c(<3 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v3i32_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfi x8, x1, #2, #2 +; CHECK-NEXT: str w0, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <3 x i32> %a, i32 %b, i32 %c + ret <3 x i32> %d +} + +define <4 x i32> @insert_v4i32_0(<4 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v4i32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.s[0], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i32> %a, i32 %b, i32 0 
+ ret <4 x i32> %d +} + +define <4 x i32> @insert_v4i32_2(<4 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v4i32_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.s[2], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i32> %a, i32 %b, i32 2 + ret <4 x i32> %d +} + +define <4 x i32> @insert_v4i32_c(<4 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v4i32_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfi x8, x1, #2, #2 +; CHECK-NEXT: str w0, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i32> %a, i32 %b, i32 %c + ret <4 x i32> %d +} + +define <8 x i32> @insert_v8i32_0(<8 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v8i32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.s[0], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i32> %a, i32 %b, i32 0 + ret <8 x i32> %d +} + +define <8 x i32> @insert_v8i32_2(<8 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v8i32_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.s[2], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i32> %a, i32 %b, i32 2 + ret <8 x i32> %d +} + +define <8 x i32> @insert_v8i32_c(<8 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v8i32_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: stp q0, q1, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: and x8, x1, #0x7 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str w0, [x9, x8, lsl #2] +; CHECK-NEXT: ldp q0, q1, [sp], #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i32> %a, i32 %b, i32 %c + ret <8 x i32> %d +} + +define <2 x i64> @insert_v2i64_0(<2 x i64> %a, i64 %b, i32 %c) { +; CHECK-LABEL: insert_v2i64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[0], x0 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x i64> %a, i64 %b, i32 0 + ret <2 x i64> %d +} + +define <2 x i64> @insert_v2i64_1(<2 x i64> %a, i64 %b, i32 %c) { +; CHECK-LABEL: insert_v2i64_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], x0 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x i64> %a, i64 %b, i32 1 + ret <2 x i64> %d +} + +define <2 x i64> @insert_v2i64_c(<2 x i64> %a, i64 %b, i32 %c) { +; CHECK-LABEL: insert_v2i64_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfi x8, x1, #3, #1 +; CHECK-NEXT: str x0, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x i64> %a, i64 %b, i32 %c + ret <2 x i64> %d +} + +define <3 x i64> @insert_v3i64_0(<3 x i64> %a, i64 %b, i32 %c) { +; CHECK-SD-LABEL: insert_v3i64_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: mov v0.d[0], x0 +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v3i64_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov d0, x0 +; CHECK-GI-NEXT: ret +entry: + %d = insertelement <3 x i64> %a, i64 %b, i32 0 + ret <3 x i64> %d +} + +define <3 x i64> @insert_v3i64_2(<3 x i64> %a, i64 %b, i32 %c) { +; CHECK-LABEL: insert_v3i64_2: 
+; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d2, x0 +; CHECK-NEXT: ret +entry: + %d = insertelement <3 x i64> %a, i64 %b, i32 2 + ret <3 x i64> %d +} + +define <3 x i64> @insert_v3i64_c(<3 x i64> %a, i64 %b, i32 %c) { +; CHECK-LABEL: insert_v3i64_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: stp q0, q2, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x1, #0x3 +; CHECK-NEXT: str x0, [x8, x9, lsl #3] +; CHECK-NEXT: ldr q0, [sp] +; CHECK-NEXT: ldr d2, [sp, #16] +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <3 x i64> %a, i64 %b, i32 %c + ret <3 x i64> %d +} + +define <4 x i64> @insert_v4i64_0(<4 x i64> %a, i64 %b, i32 %c) { +; CHECK-LABEL: insert_v4i64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[0], x0 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i64> %a, i64 %b, i32 0 + ret <4 x i64> %d +} + +define <4 x i64> @insert_v4i64_2(<4 x i64> %a, i64 %b, i32 %c) { +; CHECK-LABEL: insert_v4i64_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v1.d[0], x0 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i64> %a, i64 %b, i32 2 + ret <4 x i64> %d +} + +define <4 x i64> @insert_v4i64_c(<4 x i64> %a, i64 %b, i32 %c) { +; CHECK-LABEL: insert_v4i64_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: stp q0, q1, [sp, #-32]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: and x8, x1, #0x3 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str x0, [x9, x8, lsl #3] +; CHECK-NEXT: ldp q0, q1, [sp], #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i64> %a, i64 %b, i32 %c + ret <4 x i64> %d +} + +define double @extract_v2f64_0(<2 x double> %a, i32 %c) { +; CHECK-LABEL: extract_v2f64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = extractelement <2 x double> %a, i32 0 + ret double %d +} + +define double @extract_v2f64_1(<2 x double> %a, i32 %c) { +; CHECK-LABEL: extract_v2f64_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov d0, v0.d[1] +; CHECK-NEXT: ret +entry: + %d = extractelement <2 x double> %a, i32 1 + ret double %d +} + +define double @extract_v2f64_c(<2 x double> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v2f64_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfi x8, x0, #3, #1 +; CHECK-SD-NEXT: ldr d0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2f64_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0x1 +; CHECK-GI-NEXT: ldr d0, [x8, x9, lsl #3] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <2 x double> %a, i32 %c + ret double %d +} + +define double @extract_v3f64_0(<3 x double> %a, i32 %c) { +; CHECK-LABEL: extract_v3f64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ret +entry: + %d = extractelement <3 x double> %a, i32 0 + ret double %d +} + +define double @extract_v3f64_2(<3 x double> %a, i32 %c) { +; CHECK-LABEL: extract_v3f64_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: ret +entry: + %d = extractelement <3 x double> %a, i32 2 + ret double %d +} + +define double @extract_v3f64_c(<3 x double> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v3f64_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: and x8, x0, #0x3 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: stp q0, q2, [sp, #-32]! +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldr d0, [x9, x8, lsl #3] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v3f64_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: and x8, x8, #0x3 +; CHECK-GI-NEXT: stp q0, q2, [sp] +; CHECK-GI-NEXT: ldr d0, [x9, x8, lsl #3] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <3 x double> %a, i32 %c + ret double %d +} + +define double @extract_v4f64_0(<4 x double> %a, i32 %c) { +; CHECK-LABEL: extract_v4f64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x double> %a, i32 0 + ret double %d +} + +define double @extract_v4f64_2(<4 x double> %a, i32 %c) { +; CHECK-LABEL: extract_v4f64_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x double> %a, i32 2 + ret double %d +} + +define double @extract_v4f64_c(<4 x double> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v4f64_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]! +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: and x8, x0, #0x3 +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldr d0, [x9, x8, lsl #3] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4f64_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: and x8, x8, #0x3 +; CHECK-GI-NEXT: ldr d0, [x9, x8, lsl #3] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <4 x double> %a, i32 %c + ret double %d +} + +define float @extract_v2f32_0(<2 x float> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v2f32_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2f32_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $d0 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <2 x float> %a, i32 0 + ret float %d +} + +define float @extract_v2f32_1(<2 x float> %a, i32 %c) { +; CHECK-LABEL: extract_v2f32_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov s0, v0.s[1] +; CHECK-NEXT: ret +entry: + %d = extractelement <2 x float> %a, i32 1 + ret float %d +} + +define float @extract_v2f32_c(<2 x float> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v2f32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: add x8, sp, #8 +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str d0, [sp, #8] +; CHECK-SD-NEXT: bfi x8, x0, #2, #1 +; CHECK-SD-NEXT: ldr s0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2f32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: add x8, sp, #8 +; CHECK-GI-NEXT: str d0, [sp, #8] +; CHECK-GI-NEXT: and x9, x9, #0x1 +; CHECK-GI-NEXT: ldr s0, [x8, x9, lsl #2] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <2 x float> %a, i32 %c + ret float %d +} + +define float @extract_v3f32_0(<3 x float> %a, i32 %c) { +; CHECK-LABEL: extract_v3f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = extractelement <3 x float> %a, i32 0 + ret float %d +} + +define float @extract_v3f32_2(<3 x float> %a, i32 %c) { +; CHECK-LABEL: extract_v3f32_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov s0, v0.s[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <3 x float> %a, i32 2 + ret float %d +} + +define float @extract_v3f32_c(<3 x float> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v3f32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfi x8, x0, #2, #2 +; CHECK-SD-NEXT: ldr s0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v3f32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0x3 +; CHECK-GI-NEXT: ldr s0, [x8, x9, lsl #2] +; 
CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <3 x float> %a, i32 %c + ret float %d +} + +define float @extract_v4f32_0(<4 x float> %a, i32 %c) { +; CHECK-LABEL: extract_v4f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x float> %a, i32 0 + ret float %d +} + +define float @extract_v4f32_2(<4 x float> %a, i32 %c) { +; CHECK-LABEL: extract_v4f32_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov s0, v0.s[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x float> %a, i32 2 + ret float %d +} + +define float @extract_v4f32_c(<4 x float> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v4f32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfi x8, x0, #2, #2 +; CHECK-SD-NEXT: ldr s0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4f32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0x3 +; CHECK-GI-NEXT: ldr s0, [x8, x9, lsl #2] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <4 x float> %a, i32 %c + ret float %d +} + +define float @extract_v8f32_0(<8 x float> %a, i32 %c) { +; CHECK-LABEL: extract_v8f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x float> %a, i32 0 + ret float %d +} + +define float @extract_v8f32_2(<8 x float> %a, i32 %c) { +; CHECK-LABEL: extract_v8f32_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov s0, v0.s[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x float> %a, i32 2 + ret float %d +} + +define float @extract_v8f32_c(<8 x float> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v8f32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]! +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: and x8, x0, #0x7 +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldr s0, [x9, x8, lsl #2] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v8f32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: and x8, x8, #0x7 +; CHECK-GI-NEXT: ldr s0, [x9, x8, lsl #2] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <8 x float> %a, i32 %c + ret float %d +} + +define half @extract_v4f16_0(<4 x half> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v4f16_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4f16_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <4 x half> %a, i32 0 + ret half %d +} + +define half @extract_v4f16_2(<4 x half> %a, i32 %c) { +; CHECK-LABEL: extract_v4f16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov h0, v0.h[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x half> %a, i32 2 + ret half %d +} + +define half @extract_v4f16_c(<4 x half> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v4f16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: add x8, sp, #8 +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str d0, [sp, #8] +; CHECK-SD-NEXT: bfi x8, x0, #1, #2 +; CHECK-SD-NEXT: ldr h0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4f16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: add x8, sp, #8 +; CHECK-GI-NEXT: str d0, [sp, #8] +; CHECK-GI-NEXT: and x9, x9, #0x3 +; CHECK-GI-NEXT: ldr h0, [x8, x9, lsl #1] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <4 x half> %a, i32 %c + ret half %d +} + +define half @extract_v8f16_0(<8 x half> %a, i32 %c) { +; CHECK-LABEL: extract_v8f16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x half> %a, i32 0 + ret half %d +} + +define half @extract_v8f16_2(<8 x half> %a, i32 %c) { +; CHECK-LABEL: extract_v8f16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov h0, v0.h[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x half> %a, i32 2 + ret half %d +} + +define half @extract_v8f16_c(<8 x half> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v8f16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfi x8, x0, #1, #3 +; CHECK-SD-NEXT: ldr h0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v8f16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0x7 +; CHECK-GI-NEXT: ldr h0, [x8, x9, lsl #1] +; CHECK-GI-NEXT: add sp, 
sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <8 x half> %a, i32 %c + ret half %d +} + +define half @extract_v16f16_0(<16 x half> %a, i32 %c) { +; CHECK-LABEL: extract_v16f16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = extractelement <16 x half> %a, i32 0 + ret half %d +} + +define half @extract_v16f16_2(<16 x half> %a, i32 %c) { +; CHECK-LABEL: extract_v16f16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov h0, v0.h[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <16 x half> %a, i32 2 + ret half %d +} + +define half @extract_v16f16_c(<16 x half> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v16f16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]! +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: and x8, x0, #0xf +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldr h0, [x9, x8, lsl #1] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v16f16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: and x8, x8, #0xf +; CHECK-GI-NEXT: ldr h0, [x9, x8, lsl #1] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <16 x half> %a, i32 %c + ret half %d +} + +define i8 @extract_v8i8_0(<8 x i8> %a, i32 %c) { +; CHECK-LABEL: extract_v8i8_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x i8> %a, i32 0 + ret i8 %d +} + +define i8 @extract_v8i8_2(<8 x i8> %a, i32 %c) { +; CHECK-LABEL: extract_v8i8_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.b[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x i8> %a, i32 2 + ret i8 %d +} + +define i8 @extract_v8i8_c(<8 x i8> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v8i8_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: add x8, sp, #8 +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str d0, [sp, #8] +; CHECK-SD-NEXT: bfxil x8, x0, #0, #3 +; CHECK-SD-NEXT: ldrb w0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v8i8_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: add x8, sp, #8 +; CHECK-GI-NEXT: str d0, [sp, #8] +; CHECK-GI-NEXT: and x9, x9, #0x7 +; CHECK-GI-NEXT: lsl x10, x9, #1 +; CHECK-GI-NEXT: sub x9, x10, x9 +; CHECK-GI-NEXT: ldrb w0, [x8, x9] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <8 x i8> %a, i32 %c + ret i8 %d +} + +define i8 @extract_v16i8_0(<16 x i8> %a, i32 %c) { +; CHECK-LABEL: extract_v16i8_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret +entry: + %d = extractelement <16 x i8> %a, i32 0 + ret i8 %d +} + +define i8 @extract_v16i8_2(<16 x i8> %a, i32 
%c) { +; CHECK-LABEL: extract_v16i8_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w0, v0.b[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <16 x i8> %a, i32 2 + ret i8 %d +} + +define i8 @extract_v16i8_c(<16 x i8> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v16i8_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfxil x8, x0, #0, #4 +; CHECK-SD-NEXT: ldrb w0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v16i8_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0xf +; CHECK-GI-NEXT: lsl x10, x9, #1 +; CHECK-GI-NEXT: sub x9, x10, x9 +; CHECK-GI-NEXT: ldrb w0, [x8, x9] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <16 x i8> %a, i32 %c + ret i8 %d +} + +define i8 @extract_v32i8_0(<32 x i8> %a, i32 %c) { +; CHECK-LABEL: extract_v32i8_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret +entry: + %d = extractelement <32 x i8> %a, i32 0 + ret i8 %d +} + +define i8 @extract_v32i8_2(<32 x i8> %a, i32 %c) { +; CHECK-LABEL: extract_v32i8_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w0, v0.b[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <32 x i8> %a, i32 2 + ret i8 %d +} + +define i8 @extract_v32i8_c(<32 x i8> %a, i32 %c) { +; CHECK-LABEL: extract_v32i8_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: stp q0, q1, [sp, #-32]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: and x8, x0, #0x1f +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: ldrb w0, [x9, x8] +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +entry: + %d = extractelement <32 x i8> %a, i32 %c + ret i8 %d +} + +define i16 @extract_v4i16_0(<4 x i16> %a, i32 %c) { +; CHECK-LABEL: extract_v4i16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x i16> %a, i32 0 + ret i16 %d +} + +define i16 @extract_v4i16_2(<4 x i16> %a, i32 %c) { +; CHECK-LABEL: extract_v4i16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.h[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x i16> %a, i32 2 + ret i16 %d +} + +define i16 @extract_v4i16_c(<4 x i16> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v4i16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: add x8, sp, #8 +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str d0, [sp, #8] +; CHECK-SD-NEXT: bfi x8, x0, #1, #2 +; CHECK-SD-NEXT: ldrh w0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4i16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: add x8, sp, #8 +; CHECK-GI-NEXT: str d0, [sp, #8] +; CHECK-GI-NEXT: and x9, x9, #0x3 +; CHECK-GI-NEXT: ldrh w0, [x8, x9, lsl #1] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <4 x i16> %a, i32 %c + ret i16 %d +} + +define i16 @extract_v8i16_0(<8 x i16> %a, i32 %c) { +; CHECK-LABEL: extract_v8i16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x i16> %a, i32 0 + ret i16 %d +} + +define i16 @extract_v8i16_2(<8 x i16> %a, i32 %c) { +; CHECK-LABEL: extract_v8i16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w0, v0.h[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x i16> %a, i32 2 + ret i16 %d +} + +define i16 @extract_v8i16_c(<8 x i16> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v8i16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfi x8, x0, #1, #3 +; CHECK-SD-NEXT: ldrh w0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v8i16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0x7 +; CHECK-GI-NEXT: ldrh w0, [x8, x9, lsl #1] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <8 x i16> %a, i32 %c + ret i16 %d +} + +define i16 @extract_v16i16_0(<16 x i16> %a, i32 %c) { +; CHECK-LABEL: extract_v16i16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret +entry: + %d = extractelement <16 x i16> %a, i32 0 + ret i16 %d +} + +define i16 @extract_v16i16_2(<16 x i16> %a, i32 %c) { +; CHECK-LABEL: extract_v16i16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w0, v0.h[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <16 x i16> %a, i32 2 + ret i16 %d +} + +define i16 
@extract_v16i16_c(<16 x i16> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v16i16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]! +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: and x8, x0, #0xf +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldrh w0, [x9, x8, lsl #1] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v16i16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: and x8, x8, #0xf +; CHECK-GI-NEXT: ldrh w0, [x9, x8, lsl #1] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <16 x i16> %a, i32 %c + ret i16 %d +} + +define i32 @extract_v2i32_0(<2 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v2i32_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2i32_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <2 x i32> %a, i32 0 + ret i32 %d +} + +define i32 @extract_v2i32_1(<2 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v2i32_1: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: mov w0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2i32_1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov s0, v0.s[1] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <2 x i32> %a, i32 1 + ret i32 %d +} + +define i32 @extract_v2i32_c(<2 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v2i32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: add x8, sp, #8 +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str d0, [sp, #8] +; CHECK-SD-NEXT: bfi x8, x0, #2, #1 +; CHECK-SD-NEXT: ldr w0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2i32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: add x8, sp, #8 +; CHECK-GI-NEXT: str d0, [sp, #8] +; CHECK-GI-NEXT: and x9, x9, #0x1 +; CHECK-GI-NEXT: ldr w0, [x8, x9, lsl #2] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <2 x i32> %a, i32 %c + ret i32 %d +} + +define i32 @extract_v3i32_0(<3 x i32> %a, i32 %c) { +; CHECK-LABEL: extract_v3i32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %d = extractelement <3 x i32> %a, i32 0 + ret i32 %d +} + +define i32 @extract_v3i32_2(<3 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v3i32_2: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w0, v0.s[2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v3i32_2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s0, v0.s[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret +entry: + %d = 
extractelement <3 x i32> %a, i32 2 + ret i32 %d +} + +define i32 @extract_v3i32_c(<3 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v3i32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfi x8, x0, #2, #2 +; CHECK-SD-NEXT: ldr w0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v3i32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0x3 +; CHECK-GI-NEXT: ldr w0, [x8, x9, lsl #2] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <3 x i32> %a, i32 %c + ret i32 %d +} + +define i32 @extract_v4i32_0(<4 x i32> %a, i32 %c) { +; CHECK-LABEL: extract_v4i32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x i32> %a, i32 0 + ret i32 %d +} + +define i32 @extract_v4i32_2(<4 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v4i32_2: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w0, v0.s[2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4i32_2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s0, v0.s[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <4 x i32> %a, i32 2 + ret i32 %d +} + +define i32 @extract_v4i32_c(<4 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v4i32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfi x8, x0, #2, #2 +; CHECK-SD-NEXT: ldr w0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4i32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0x3 +; CHECK-GI-NEXT: ldr w0, [x8, x9, lsl #2] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <4 x i32> %a, i32 %c + ret i32 %d +} + +define i32 @extract_v8i32_0(<8 x i32> %a, i32 %c) { +; CHECK-LABEL: extract_v8i32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x i32> %a, i32 0 + ret i32 %d +} + +define i32 @extract_v8i32_2(<8 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v8i32_2: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w0, v0.s[2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v8i32_2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s0, v0.s[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <8 x i32> %a, i32 2 + ret i32 %d +} + +define i32 @extract_v8i32_c(<8 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v8i32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]! 
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: and x8, x0, #0x7 +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldr w0, [x9, x8, lsl #2] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v8i32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: and x8, x8, #0x7 +; CHECK-GI-NEXT: ldr w0, [x9, x8, lsl #2] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <8 x i32> %a, i32 %c + ret i32 %d +} + +define i64 @extract_v2i64_0(<2 x i64> %a, i32 %c) { +; CHECK-LABEL: extract_v2i64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %d = extractelement <2 x i64> %a, i32 0 + ret i64 %d +} + +define i64 @extract_v2i64_1(<2 x i64> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v2i64_1: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x0, v0.d[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2i64_1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <2 x i64> %a, i32 1 + ret i64 %d +} + +define i64 @extract_v2i64_c(<2 x i64> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v2i64_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfi x8, x0, #3, #1 +; CHECK-SD-NEXT: ldr x0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2i64_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0x1 +; CHECK-GI-NEXT: ldr x0, [x8, x9, lsl #3] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <2 x i64> %a, i32 %c + ret i64 %d +} + +define i64 @extract_v3i64_0(<3 x i64> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v3i64_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v3i64_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <3 x i64> %a, i32 0 + ret i64 %d +} + +define i64 @extract_v3i64_2(<3 x i64> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v3i64_2: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: fmov x0, d2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v3i64_2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov x0, d2 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <3 x i64> %a, i32 2 + ret i64 %d +} + +define i64 @extract_v3i64_c(<3 x i64> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v3i64_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $w0 killed 
$w0 def $x0 +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: and x8, x0, #0x3 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: stp q0, q2, [sp, #-32]! +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldr x0, [x9, x8, lsl #3] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v3i64_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: and x8, x8, #0x3 +; CHECK-GI-NEXT: stp q0, q2, [sp] +; CHECK-GI-NEXT: ldr x0, [x9, x8, lsl #3] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <3 x i64> %a, i32 %c + ret i64 %d +} + +define i64 @extract_v4i64_0(<4 x i64> %a, i32 %c) { +; CHECK-LABEL: extract_v4i64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x i64> %a, i32 0 + ret i64 %d +} + +define i64 @extract_v4i64_2(<4 x i64> %a, i32 %c) { +; CHECK-LABEL: extract_v4i64_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov x0, d1 +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x i64> %a, i32 2 + ret i64 %d +} + +define i64 @extract_v4i64_c(<4 x i64> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v4i64_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]! +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: and x8, x0, #0x3 +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldr x0, [x9, x8, lsl #3] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4i64_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: and x8, x8, #0x3 +; CHECK-GI-NEXT: ldr x0, [x9, x8, lsl #3] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <4 x i64> %a, i32 %c + ret i64 %d +} -- cgit v1.1 From 1a988869319bb4cfe04b3d2618818180b3cfb28c Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Sun, 11 Feb 2024 20:02:37 -0500 Subject: [libc] Remove extra ] in stdc.td. 
(#81438) --- libc/spec/stdc.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index afddc77..011abbf 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -401,7 +401,7 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"frexp", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<IntPtr>]>, FunctionSpec<"frexpf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<IntPtr>]>, FunctionSpec<"frexpl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<IntPtr>]>, - GuardedFunctionSpec<"frexpf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<IntPtr>]], "LIBC_COMPILER_HAS_FLOAT128">, + GuardedFunctionSpec<"frexpf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<IntPtr>], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"hypot", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>, FunctionSpec<"hypotf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>, -- cgit v1.1 From 5da801386c2b820a4596fc6d8da6b5f4a6da94b4 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 11 Feb 2024 20:21:29 -0800 Subject: [AArch64AsmParser] Allow branch target symbol to have a shift/extend modifier name (#80571) Shift and extend modifiers are parsed as separate operands. When a symbol operand of a branch instruction has such a "bad" name, AArch64AsmParser will report an error. ``` % cat a.c void lsl(); void lsr(); void asr(); void ror(); void uxtb(); void sxtx(); void foo() { lsl(); lsr(); asr(); ror(); uxtb(); sxtx(); } % clang --target=aarch64 -c -save-temps a.c a.s:15:8: error: expected #imm after shift specifier bl lsl ^ a.s:16:8: error: expected #imm after shift specifier bl lsr ^ a.s:17:8: error: expected #imm after shift specifier bl asr ^ a.s:18:8: error: expected #imm after shift specifier bl ror ^ a.s:19:5: error: expected label or encodable integer pc offset bl uxtb ^ a.s:20:5: error: expected label or encodable integer pc offset bl sxtx ^ ``` In contrast, gas correctly parses these instructions. Fix #79729 by parsing a shift/extend modifier only after an immediate value/register. --- .../Target/AArch64/AsmParser/AArch64AsmParser.cpp | 33 +++++++++++++++++----- llvm/test/MC/AArch64/arm64-adr.s | 10 +++++++ llvm/test/MC/AArch64/arm64-branch-encoding.s | 10 +++++++ llvm/test/MC/AArch64/basic-a64-diagnostics.s | 8 ++++++ 4 files changed, 54 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index e9d96f3..4e7c8f6 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -4809,20 +4809,30 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, return parseCondCode(Operands, invertCondCode); // If it's a register name, parse it. - if (!parseRegister(Operands)) + if (!parseRegister(Operands)) { + // Parse an optional shift/extend modifier. + AsmToken SavedTok = getTok(); + if (parseOptionalToken(AsmToken::Comma)) { + // The operand after the register may be a label (e.g. ADR/ADRP). Check + // such cases and don't report an error when

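A rough sketch of the post-patch behavior the commit message describes (illustrative input only; the file name and instruction mix here are assumptions, not taken from the patch's tests): a modifier-like name after a branch or ADR must parse as an ordinary label, while a comma-separated modifier following a register still parses as a shift/extend.

```
// t.s -- hypothetical input for a post-patch assembler
bl   lsl                  // "lsl" is an ordinary symbol here, not a shift
bl   sxtx                 // likewise for extend-modifier names
adr  x0, ror              // ADR/ADRP targets may also use such names
add  x0, x1, x2, lsl #2   // a real shift modifier: follows a register
ldr  w0, [x1, x2, sxtx]   // a real extend modifier in an address operand
```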