182 files changed, 6817 insertions, 1617 deletions
diff --git a/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp b/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp index d2ae13c..e825547 100644 --- a/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp +++ b/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp @@ -96,7 +96,7 @@ bool IncludeFixerActionFactory::runInvocation( // diagnostics here. Compiler.createDiagnostics(new clang::IgnoringDiagConsumer, /*ShouldOwnClient=*/true); - Compiler.createSourceManager(*Files); + Compiler.createSourceManager(); // We abort on fatal errors so don't let a large number of errors become // fatal. A missing #include can cause thousands of errors. diff --git a/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp b/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp index 3fb49796..cbf7bae 100644 --- a/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp +++ b/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp @@ -649,11 +649,12 @@ TEST_F(PragmaIncludeTest, ExportInUnnamedBuffer) { Clang->createVirtualFileSystem(VFS); Clang->createDiagnostics(); - auto *FM = Clang->createFileManager(); + Clang->createFileManager(); + FileManager &FM = Clang->getFileManager(); ASSERT_TRUE(Clang->ExecuteAction(*Inputs.MakeAction())); EXPECT_THAT( - PI.getExporters(llvm::cantFail(FM->getFileRef("foo.h")), *FM), - testing::ElementsAre(llvm::cantFail(FM->getFileRef("exporter.h")))); + PI.getExporters(llvm::cantFail(FM.getFileRef("foo.h")), FM), + testing::ElementsAre(llvm::cantFail(FM.getFileRef("exporter.h")))); } TEST_F(PragmaIncludeTest, OutlivesFMAndSM) { diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 74b0647..145a83a 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -246,8 +246,6 @@ Non-comprehensive list of changes in this release - ``__builtin_assume_dereferenceable`` now accepts non-constant size operands. -- Fixed a crash when the second argument to ``__builtin_assume_aligned`` was not constant (#GH161314) - New Compiler Flags ------------------ - New option ``-fno-sanitize-debug-trap-reasons`` added to disable emitting trap reasons into the debug info when compiling with trapping UBSan (e.g. ``-fsanitize-trap=undefined``). diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h index 5f06117..58ba8d91 100644 --- a/clang/include/clang/AST/OpenACCClause.h +++ b/clang/include/clang/AST/OpenACCClause.h @@ -840,14 +840,13 @@ public: // alloca at the level of the base, and the init at the element level. struct OpenACCPrivateRecipe { VarDecl *AllocaDecl; - Expr *InitExpr; - OpenACCPrivateRecipe(VarDecl *A, Expr *I) : AllocaDecl(A), InitExpr(I) {} + OpenACCPrivateRecipe(VarDecl *A) : AllocaDecl(A) {} bool isSet() const { return AllocaDecl; } static OpenACCPrivateRecipe Empty() { - return OpenACCPrivateRecipe(nullptr, nullptr); + return OpenACCPrivateRecipe(/*AllocaDecl=*/nullptr); } }; @@ -899,18 +898,17 @@ public: // InitFromTemporary is the 'temp' declaration we put in to be 'copied from'. 
struct OpenACCFirstPrivateRecipe { VarDecl *AllocaDecl; - Expr *InitExpr; VarDecl *InitFromTemporary; - OpenACCFirstPrivateRecipe(VarDecl *A, Expr *I, VarDecl *T) - : AllocaDecl(A), InitExpr(I), InitFromTemporary(T) { - assert(!AllocaDecl || AllocaDecl->getInit() == nullptr); + OpenACCFirstPrivateRecipe(VarDecl *A, VarDecl *T) + : AllocaDecl(A), InitFromTemporary(T) { assert(!InitFromTemporary || InitFromTemporary->getInit() == nullptr); } bool isSet() const { return AllocaDecl; } static OpenACCFirstPrivateRecipe Empty() { - return OpenACCFirstPrivateRecipe(nullptr, nullptr, nullptr); + return OpenACCFirstPrivateRecipe(/*AllocaDecl=*/nullptr, + /*InitFromTemporary=*/nullptr); } }; @@ -1282,16 +1280,13 @@ public: // 'main' declaration used for initializaiton, which is fixed. struct OpenACCReductionRecipe { VarDecl *AllocaDecl; - Expr *InitExpr; // TODO: OpenACC: this should eventually have the operations here too. - OpenACCReductionRecipe(VarDecl *A, Expr *I) : AllocaDecl(A), InitExpr(I) { - assert(!AllocaDecl || AllocaDecl->getInit() == nullptr); - } + OpenACCReductionRecipe(VarDecl *A) : AllocaDecl(A) {} bool isSet() const { return AllocaDecl; } static OpenACCReductionRecipe Empty() { - return OpenACCReductionRecipe(nullptr, nullptr); + return OpenACCReductionRecipe(/*AllocaDecl=*/nullptr); } }; diff --git a/clang/include/clang/AST/TypeBase.h b/clang/include/clang/AST/TypeBase.h index e0d00b8..6786b2f 100644 --- a/clang/include/clang/AST/TypeBase.h +++ b/clang/include/clang/AST/TypeBase.h @@ -6702,15 +6702,21 @@ public: LLVM_PREFERRED_TYPE(bool) uint8_t RawBuffer : 1; + LLVM_PREFERRED_TYPE(bool) + uint8_t IsCounter : 1; + Attributes(llvm::dxil::ResourceClass ResourceClass, bool IsROV = false, - bool RawBuffer = false) - : ResourceClass(ResourceClass), IsROV(IsROV), RawBuffer(RawBuffer) {} + bool RawBuffer = false, bool IsCounter = false) + : ResourceClass(ResourceClass), IsROV(IsROV), RawBuffer(RawBuffer), + IsCounter(IsCounter) {} - Attributes() : Attributes(llvm::dxil::ResourceClass::UAV, false, false) {} + Attributes() + : Attributes(llvm::dxil::ResourceClass::UAV, false, false, false) {} friend bool operator==(const Attributes &LHS, const Attributes &RHS) { - return std::tie(LHS.ResourceClass, LHS.IsROV, LHS.RawBuffer) == - std::tie(RHS.ResourceClass, RHS.IsROV, RHS.RawBuffer); + return std::tie(LHS.ResourceClass, LHS.IsROV, LHS.RawBuffer, + LHS.IsCounter) == std::tie(RHS.ResourceClass, RHS.IsROV, + RHS.RawBuffer, RHS.IsCounter); } friend bool operator!=(const Attributes &LHS, const Attributes &RHS) { return !(LHS == RHS); @@ -6751,6 +6757,7 @@ public: ID.AddInteger(static_cast<uint32_t>(Attrs.ResourceClass)); ID.AddBoolean(Attrs.IsROV); ID.AddBoolean(Attrs.RawBuffer); + ID.AddBoolean(Attrs.IsCounter); } static bool classof(const Type *T) { diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index b3932a6..9dc85fb 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -662,6 +662,9 @@ let Class = HLSLAttributedResourceType in { def : Property<"rawBuffer", Bool> { let Read = [{ node->getAttrs().RawBuffer }]; } + def : Property<"isCounter", Bool> { + let Read = [{ node->getAttrs().IsCounter }]; + } def : Property<"wrappedTy", QualType> { let Read = [{ node->getWrappedType() }]; } @@ -669,7 +672,7 @@ let Class = HLSLAttributedResourceType in { let Read = [{ node->getContainedType() }]; } def : Creator<[{ - HLSLAttributedResourceType::Attributes 
attrs(static_cast<llvm::dxil::ResourceClass>(resClass), isROV, rawBuffer); + HLSLAttributedResourceType::Attributes attrs(static_cast<llvm::dxil::ResourceClass>(resClass), isROV, rawBuffer, isCounter); return ctx.getHLSLAttributedResourceType(wrappedTy, containedTy, attrs); }]>; } diff --git a/clang/include/clang/Analysis/CFG.h b/clang/include/clang/Analysis/CFG.h index 1b1ff5e..6dd7d13 100644 --- a/clang/include/clang/Analysis/CFG.h +++ b/clang/include/clang/Analysis/CFG.h @@ -1251,6 +1251,7 @@ public: bool MarkElidedCXXConstructors = false; bool AddVirtualBaseBranches = false; bool OmitImplicitValueInitializers = false; + bool AssumeReachableDefaultInSwitchStatements = false; BuildOptions() = default; diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index fe3ca70..3c697ed 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -5074,6 +5074,12 @@ def HLSLRawBuffer : TypeAttr { let Documentation = [InternalOnly]; } +def HLSLIsCounter : TypeAttr { + let Spellings = [CXX11<"hlsl", "is_counter">]; + let LangOpts = [HLSL]; + let Documentation = [InternalOnly]; +} + def HLSLGroupSharedAddressSpace : TypeAttr { let Spellings = [CustomKeyword<"groupshared">]; let Subjects = SubjectList<[Var]>; diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 7a6c084..3dfcafc 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -133,7 +133,6 @@ struct MissingFeatures { // RecordType static bool skippedLayout() { return false; } static bool astRecordDeclAttr() { return false; } - static bool recordZeroInit() { return false; } static bool recordZeroInitPadding() { return false; } static bool zeroSizeRecordMembers() { return false; } @@ -192,6 +191,7 @@ struct MissingFeatures { static bool builtinCheckKind() { return false; } static bool cgCapturedStmtInfo() { return false; } static bool cgFPOptionsRAII() { return false; } + static bool checkBitfieldClipping() { return false; } static bool cirgenABIInfo() { return false; } static bool cleanupAfterErrorDiags() { return false; } static bool cleanupsToDeactivate() { return false; } diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h index a6b6993..44fff69 100644 --- a/clang/include/clang/Frontend/CompilerInstance.h +++ b/clang/include/clang/Frontend/CompilerInstance.h @@ -712,12 +712,10 @@ public: const CodeGenOptions *CodeGenOpts = nullptr); /// Create the file manager and replace any existing one with it. - /// - /// \return The new file manager on success, or null on failure. - FileManager *createFileManager(); + void createFileManager(); /// Create the source manager and replace any existing one with it. - void createSourceManager(FileManager &FileMgr); + void createSourceManager(); /// Create the preprocessor, using the invocation, file, and source managers, /// and replace any existing one with it. 
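With this change, CompilerInstance::createFileManager() no longer returns the new FileManager and createSourceManager() no longer takes one; callers fetch the manager explicitly afterwards, as the RecordTest.cpp hunk above does. A minimal sketch of the updated setup sequence, assuming the surrounding includes; the helper name and the IgnoringDiagConsumer choice are illustrative (mirroring the IncludeFixer.cpp context), not part of the patch:

#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/FileManager.h"
#include "clang/Frontend/CompilerInstance.h"

static void setUpManagers(clang::CompilerInstance &CI) {
  CI.createDiagnostics(new clang::IgnoringDiagConsumer,
                       /*ShouldOwnClient=*/true);
  CI.createFileManager();                       // was: FileManager *FM = CI.createFileManager();
  clang::FileManager &FM = CI.getFileManager(); // fetch the instance explicitly
  CI.createSourceManager();                     // was: CI.createSourceManager(FM);
  (void)FM;                                     // the manager is now used via this reference
}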
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 2173aed..844db79 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -4624,6 +4624,8 @@ void CXXNameMangler::mangleType(const HLSLAttributedResourceType *T) { Str += "_ROV"; if (Attrs.RawBuffer) Str += "_Raw"; + if (Attrs.IsCounter) + Str += "_Counter"; if (T->hasContainedType()) Str += "_CT"; mangleVendorQualifier(Str); diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 589a156..f3b5478 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -2655,8 +2655,6 @@ void OpenACCClauseProfiler::VisitPrivateClause( for (auto &Recipe : Clause.getInitRecipes()) { Profiler.VisitDecl(Recipe.AllocaDecl); - if (Recipe.InitExpr) - Profiler.VisitExpr(Recipe.InitExpr); } } @@ -2666,8 +2664,6 @@ void OpenACCClauseProfiler::VisitFirstPrivateClause( for (auto &Recipe : Clause.getInitRecipes()) { Profiler.VisitDecl(Recipe.AllocaDecl); - if (Recipe.InitExpr) - Profiler.VisitExpr(Recipe.InitExpr); Profiler.VisitDecl(Recipe.InitFromTemporary); } } @@ -2773,12 +2769,10 @@ void OpenACCClauseProfiler::VisitReductionClause( for (auto &Recipe : Clause.getRecipes()) { Profiler.VisitDecl(Recipe.AllocaDecl); - if (Recipe.InitExpr) - Profiler.VisitExpr(Recipe.InitExpr); // TODO: OpenACC: Make sure we remember to update this when we figure out // what we're adding for the operation recipe, in the meantime, a static // assert will make sure we don't add something. - static_assert(sizeof(OpenACCReductionRecipe) == 2 * sizeof(int *)); + static_assert(sizeof(OpenACCReductionRecipe) == sizeof(int *)); } } diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index f3448af..66a1b68 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -2062,6 +2062,7 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, case attr::HLSLROV: case attr::HLSLRawBuffer: case attr::HLSLContainedType: + case attr::HLSLIsCounter: llvm_unreachable("HLSL resource type attributes handled separately"); case attr::OpenCLPrivateAddressSpace: @@ -2210,6 +2211,8 @@ void TypePrinter::printHLSLAttributedResourceAfter( OS << " [[hlsl::is_rov]]"; if (Attrs.RawBuffer) OS << " [[hlsl::raw_buffer]]"; + if (Attrs.IsCounter) + OS << " [[hlsl::is_counter]]"; QualType ContainedTy = T->getContainedType(); if (!ContainedTy.isNull()) { diff --git a/clang/lib/Analysis/CFG.cpp b/clang/lib/Analysis/CFG.cpp index 60a2d11..cdde849 100644 --- a/clang/lib/Analysis/CFG.cpp +++ b/clang/lib/Analysis/CFG.cpp @@ -4516,10 +4516,13 @@ CFGBlock *CFGBuilder::VisitSwitchStmt(SwitchStmt *Terminator) { // // Note: We add a successor to a switch that is considered covered yet has no // case statements if the enumeration has no enumerators. + // We also consider this successor reachable if + // BuildOpts.SwitchReqDefaultCoveredEnum is true. 
bool SwitchAlwaysHasSuccessor = false; SwitchAlwaysHasSuccessor |= switchExclusivelyCovered; - SwitchAlwaysHasSuccessor |= Terminator->isAllEnumCasesCovered() && - Terminator->getSwitchCaseList(); + SwitchAlwaysHasSuccessor |= + !BuildOpts.AssumeReachableDefaultInSwitchStatements && + Terminator->isAllEnumCasesCovered() && Terminator->getSwitchCaseList(); addSuccessor(SwitchTerminatedBlock, DefaultCaseBlock, !SwitchAlwaysHasSuccessor); diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp index 10b8255..563a753 100644 --- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp @@ -35,8 +35,8 @@ CIRGenFunction::emitAutoVarAlloca(const VarDecl &d, getContext().getLangOpts().ElideConstructors && d.isNRVOVariable(); CIRGenFunction::AutoVarEmission emission(d); - emission.IsEscapingByRef = d.isEscapingByref(); - if (emission.IsEscapingByRef) + emission.isEscapingByRef = d.isEscapingByref(); + if (emission.isEscapingByRef) cgm.errorNYI(d.getSourceRange(), "emitAutoVarDecl: decl escaping by reference"); @@ -78,7 +78,7 @@ CIRGenFunction::emitAutoVarAlloca(const VarDecl &d, alignment); } - emission.Addr = address; + emission.addr = address; setAddrOfLocalVar(&d, address); return emission; @@ -101,13 +101,13 @@ bool CIRGenFunction::isTrivialInitializer(const Expr *init) { void CIRGenFunction::emitAutoVarInit( const CIRGenFunction::AutoVarEmission &emission) { - assert(emission.Variable && "emission was not valid!"); + assert(emission.variable && "emission was not valid!"); // If this was emitted as a global constant, we're done. if (emission.wasEmittedAsGlobal()) return; - const VarDecl &d = *emission.Variable; + const VarDecl &d = *emission.variable; QualType type = d.getType(); @@ -124,7 +124,7 @@ void CIRGenFunction::emitAutoVarInit( return; } - const Address addr = emission.Addr; + const Address addr = emission.addr; // Check whether this is a byref variable that's potentially // captured and moved by its own initializer. If so, we'll need to @@ -153,7 +153,7 @@ void CIRGenFunction::emitAutoVarInit( } mlir::Attribute constant; - if (emission.IsConstantAggregate || + if (emission.isConstantAggregate || d.mightBeUsableInConstantExpressions(getContext())) { // FIXME: Differently from LLVM we try not to emit / lower too much // here for CIR since we are interested in seeing the ctor in some @@ -196,7 +196,7 @@ void CIRGenFunction::emitAutoVarInit( // FIXME(cir): migrate most of this file to use mlir::TypedAttr directly. auto typedConstant = mlir::dyn_cast<mlir::TypedAttr>(constant); assert(typedConstant && "expected typed attribute"); - if (!emission.IsConstantAggregate) { + if (!emission.isConstantAggregate) { // For simple scalar/complex initialization, store the value directly. LValue lv = makeAddrLValue(addr, type); assert(init && "expected initializer"); @@ -209,7 +209,7 @@ void CIRGenFunction::emitAutoVarInit( void CIRGenFunction::emitAutoVarCleanups( const CIRGenFunction::AutoVarEmission &emission) { - const VarDecl &d = *emission.Variable; + const VarDecl &d = *emission.variable; // Check the type for a cleanup. if (QualType::DestructionKind dtorKind = d.needsDestruction(getContext())) @@ -821,7 +821,7 @@ void CIRGenFunction::emitAutoVarTypeCleanup( // original stack object, not the possibly forwarded object. 
Address addr = emission.getObjectAddress(*this); - const VarDecl *var = emission.Variable; + const VarDecl *var = emission.variable; QualType type = var->getType(); CleanupKind cleanupKind = NormalAndEHCleanup; @@ -834,7 +834,7 @@ void CIRGenFunction::emitAutoVarTypeCleanup( case QualType::DK_cxx_destructor: // If there's an NRVO flag on the emission, we need a different // cleanup. - if (emission.NRVOFlag) { + if (emission.nrvoFlag) { cgm.errorNYI(var->getSourceRange(), "emitAutoVarTypeCleanup: NRVO"); return; } diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index c0ed8b4..cb7cf98 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -479,55 +479,55 @@ public: ConstantEmission tryEmitAsConstant(const MemberExpr *me); struct AutoVarEmission { - const clang::VarDecl *Variable; + const clang::VarDecl *variable; /// The address of the alloca for languages with explicit address space /// (e.g. OpenCL) or alloca casted to generic pointer for address space /// agnostic languages (e.g. C++). Invalid if the variable was emitted /// as a global constant. - Address Addr; + Address addr; /// True if the variable is of aggregate type and has a constant /// initializer. - bool IsConstantAggregate = false; + bool isConstantAggregate = false; /// True if the variable is a __block variable that is captured by an /// escaping block. - bool IsEscapingByRef = false; + bool isEscapingByRef = false; /// True if the variable was emitted as an offload recipe, and thus doesn't /// have the same sort of alloca initialization. - bool EmittedAsOffload = false; + bool emittedAsOffload = false; - mlir::Value NRVOFlag{}; + mlir::Value nrvoFlag{}; struct Invalid {}; - AutoVarEmission(Invalid) : Variable(nullptr), Addr(Address::invalid()) {} + AutoVarEmission(Invalid) : variable(nullptr), addr(Address::invalid()) {} AutoVarEmission(const clang::VarDecl &variable) - : Variable(&variable), Addr(Address::invalid()) {} + : variable(&variable), addr(Address::invalid()) {} static AutoVarEmission invalid() { return AutoVarEmission(Invalid()); } - bool wasEmittedAsGlobal() const { return !Addr.isValid(); } + bool wasEmittedAsGlobal() const { return !addr.isValid(); } - bool wasEmittedAsOffloadClause() const { return EmittedAsOffload; } + bool wasEmittedAsOffloadClause() const { return emittedAsOffload; } /// Returns the raw, allocated address, which is not necessarily /// the address of the object itself. It is casted to default /// address space for address space agnostic languages. - Address getAllocatedAddress() const { return Addr; } + Address getAllocatedAddress() const { return addr; } // Changes the stored address for the emission. This function should only // be used in extreme cases, and isn't required to model normal AST // initialization/variables. - void setAllocatedAddress(Address A) { Addr = A; } + void setAllocatedAddress(Address a) { addr = a; } /// Returns the address of the object within this declaration. /// Note that this does not chase the forwarding pointer for /// __block decls. 
Address getObjectAddress(CIRGenFunction &cgf) const { - if (!IsEscapingByRef) - return Addr; + if (!isEscapingByRef) + return addr; assert(!cir::MissingFeatures::opAllocaEscapeByReference()); return Address::invalid(); diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp index f22f3e8..3d86f71 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp @@ -1001,7 +1001,7 @@ public: OpenACCRecipeBuilder<mlir::acc::PrivateRecipeOp>(cgf, builder) .getOrCreateRecipe( cgf.getContext(), recipeInsertLocation, varExpr, - varRecipe.AllocaDecl, varRecipe.InitExpr, + varRecipe.AllocaDecl, /*temporary=*/nullptr, OpenACCReductionOperator::Invalid, Decl::castToDeclContext(cgf.curFuncDecl), opInfo.origType, opInfo.bounds.size(), opInfo.boundTypes, opInfo.baseType, @@ -1036,20 +1036,13 @@ public: { mlir::OpBuilder::InsertionGuard guardCase(builder); - // TODO: OpenACC: At the moment this is a bit of a hacky way of doing - // this, and won't work when we get to bounds/etc. Do this for now to - // limit the scope of this refactor. - VarDecl *allocaDecl = varRecipe.AllocaDecl; - allocaDecl->setInit(varRecipe.InitExpr); - allocaDecl->setInitStyle(VarDecl::CallInit); auto recipe = OpenACCRecipeBuilder<mlir::acc::FirstprivateRecipeOp>(cgf, builder) .getOrCreateRecipe( cgf.getContext(), recipeInsertLocation, varExpr, - varRecipe.AllocaDecl, varRecipe.InitExpr, - varRecipe.InitFromTemporary, + varRecipe.AllocaDecl, varRecipe.InitFromTemporary, OpenACCReductionOperator::Invalid, Decl::castToDeclContext(cgf.curFuncDecl), opInfo.origType, opInfo.bounds.size(), opInfo.boundTypes, opInfo.baseType, @@ -1086,18 +1079,12 @@ public: { mlir::OpBuilder::InsertionGuard guardCase(builder); - // TODO: OpenACC: At the moment this is a bit of a hacky way of doing - // this, and won't work when we get to bounds/etc. Do this for now to - // limit the scope of this refactor. 
- VarDecl *allocaDecl = varRecipe.AllocaDecl; - allocaDecl->setInit(varRecipe.InitExpr); - allocaDecl->setInitStyle(VarDecl::CallInit); auto recipe = OpenACCRecipeBuilder<mlir::acc::ReductionRecipeOp>(cgf, builder) .getOrCreateRecipe( cgf.getContext(), recipeInsertLocation, varExpr, - varRecipe.AllocaDecl, varRecipe.InitExpr, + varRecipe.AllocaDecl, /*temporary=*/nullptr, clause.getReductionOp(), Decl::castToDeclContext(cgf.curFuncDecl), opInfo.origType, opInfo.bounds.size(), opInfo.boundTypes, opInfo.baseType, diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp index f8e511e..ea6ea2c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp @@ -408,7 +408,7 @@ void OpenACCRecipeBuilderBase::makeBoundsInit( CIRGenFunction::LexicalScope ls(cgf, loc, block); CIRGenFunction::AutoVarEmission tempDeclEmission{*allocaDecl}; - tempDeclEmission.EmittedAsOffload = true; + tempDeclEmission.emittedAsOffload = true; // The init section is the only one of the handful that only has a single // argument for the 'type', so we have to drop 1 for init, and future calls @@ -435,7 +435,7 @@ void OpenACCRecipeBuilderBase::createPrivateInitRecipe( mlir::Location loc, mlir::Location locEnd, SourceRange exprRange, mlir::Value mainOp, mlir::acc::PrivateRecipeOp recipe, size_t numBounds, llvm::ArrayRef<QualType> boundTypes, const VarDecl *allocaDecl, - QualType origType, const Expr *initExpr) { + QualType origType) { assert(allocaDecl && "Required recipe variable not set?"); CIRGenFunction::DeclMapRevertingRAII declMapRAII{cgf, allocaDecl}; @@ -473,9 +473,10 @@ void OpenACCRecipeBuilderBase::createPrivateInitRecipe( // If the initializer is trivial, there is nothing to do here, so save // ourselves some effort. - if (initExpr && (!cgf.isTrivialInitializer(initExpr) || - cgf.getContext().getLangOpts().getTrivialAutoVarInit() != - LangOptions::TrivialAutoVarInitKind::Uninitialized)) + if (allocaDecl->getInit() && + (!cgf.isTrivialInitializer(allocaDecl->getInit()) || + cgf.getContext().getLangOpts().getTrivialAutoVarInit() != + LangOptions::TrivialAutoVarInitKind::Uninitialized)) makeBoundsInit(alloca, loc, block, allocaDecl, origType, /*isInitSection=*/true); } @@ -504,7 +505,7 @@ void OpenACCRecipeBuilderBase::createFirstprivateRecipeCopy( // that instead of the variable in the other block. 
tempDeclEmission.setAllocatedAddress( Address{toArg, elementTy, cgf.getContext().getDeclAlign(varRecipe)}); - tempDeclEmission.EmittedAsOffload = true; + tempDeclEmission.emittedAsOffload = true; CIRGenFunction::DeclMapRevertingRAII declMapRAII{cgf, temporary}; cgf.setAddrOfLocalVar( diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h index 203eaff..a05b0bd 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h @@ -70,8 +70,7 @@ protected: mlir::acc::PrivateRecipeOp recipe, size_t numBounds, llvm::ArrayRef<QualType> boundTypes, - const VarDecl *allocaDecl, QualType origType, - const Expr *initExpr); + const VarDecl *allocaDecl, QualType origType); void createRecipeDestroySection(mlir::Location loc, mlir::Location locEnd, mlir::Value mainOp, CharUnits alignment, @@ -212,15 +211,12 @@ public: OpenACCRecipeBuilder(CIRGen::CIRGenFunction &cgf, CIRGen::CIRGenBuilderTy &builder) : OpenACCRecipeBuilderBase(cgf, builder) {} - RecipeTy getOrCreateRecipe(ASTContext &astCtx, - mlir::OpBuilder::InsertPoint &insertLocation, - const Expr *varRef, const VarDecl *varRecipe, - const Expr *initExpr, const VarDecl *temporary, - OpenACCReductionOperator reductionOp, - DeclContext *dc, QualType origType, - size_t numBounds, - llvm::ArrayRef<QualType> boundTypes, - QualType baseType, mlir::Value mainOp) { + RecipeTy getOrCreateRecipe( + ASTContext &astCtx, mlir::OpBuilder::InsertPoint &insertLocation, + const Expr *varRef, const VarDecl *varRecipe, const VarDecl *temporary, + OpenACCReductionOperator reductionOp, DeclContext *dc, QualType origType, + size_t numBounds, llvm::ArrayRef<QualType> boundTypes, QualType baseType, + mlir::Value mainOp) { assert(!varRecipe->getType()->isSpecificBuiltinType( BuiltinType::ArraySection) && "array section shouldn't make it to recipe creation"); @@ -266,7 +262,7 @@ public: if constexpr (std::is_same_v<RecipeTy, mlir::acc::PrivateRecipeOp>) { createPrivateInitRecipe(loc, locEnd, varRef->getSourceRange(), mainOp, recipe, numBounds, boundTypes, varRecipe, - origType, initExpr); + origType); } else { createRecipeInitCopy(loc, locEnd, varRef->getSourceRange(), mainOp, recipe, varRecipe, temporary); diff --git a/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp b/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp index 2baeb43..87f2340 100644 --- a/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp @@ -296,9 +296,8 @@ void CIRRecordLowering::lower(bool nonVirtualBaseType) { } llvm::stable_sort(members); - // TODO: implement clipTailPadding once bitfields are implemented - assert(!cir::MissingFeatures::bitfields()); - assert(!cir::MissingFeatures::recordZeroInit()); + // TODO: Verify bitfield clipping + assert(!cir::MissingFeatures::checkBitfieldClipping()); members.push_back(makeStorageInfo(size, getUIntNType(8))); determinePacked(nonVirtualBaseType); @@ -319,9 +318,11 @@ void CIRRecordLowering::fillOutputFields() { fieldIdxMap[member.fieldDecl->getCanonicalDecl()] = fieldTypes.size() - 1; // A field without storage must be a bitfield. 
- assert(!cir::MissingFeatures::bitfields()); - if (!member.data) + if (!member.data) { + assert(member.fieldDecl && + "member.data is a nullptr so member.fieldDecl should not be"); setBitFieldInfo(member.fieldDecl, member.offset, fieldTypes.back()); + } } else if (member.kind == MemberInfo::InfoKind::Base) { nonVirtualBases[member.cxxRecordDecl] = fieldTypes.size() - 1; } else if (member.kind == MemberInfo::InfoKind::VBase) { @@ -697,13 +698,9 @@ CIRGenTypes::computeRecordLayout(const RecordDecl *rd, cir::RecordType *ty) { ty ? *ty : cir::RecordType{}, baseTy ? baseTy : cir::RecordType{}, (bool)lowering.zeroInitializable, (bool)lowering.zeroInitializableAsBase); - assert(!cir::MissingFeatures::recordZeroInit()); - rl->nonVirtualBases.swap(lowering.nonVirtualBases); rl->completeObjectVirtualBases.swap(lowering.virtualBases); - assert(!cir::MissingFeatures::bitfields()); - // Add all the field numbers. rl->fieldIdxMap.swap(lowering.fieldIdxMap); diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp index 2e3fc53..4aa6314 100644 --- a/clang/lib/CodeGen/Targets/SPIR.cpp +++ b/clang/lib/CodeGen/Targets/SPIR.cpp @@ -486,6 +486,12 @@ llvm::Type *CommonSPIRTargetCodeGenInfo::getHLSLType( return getSPIRVImageTypeFromHLSLResource(ResAttrs, ContainedTy, CGM); } + if (ResAttrs.IsCounter) { + llvm::Type *ElemType = llvm::Type::getInt32Ty(Ctx); + uint32_t StorageClass = /* StorageBuffer storage class */ 12; + return llvm::TargetExtType::get(Ctx, "spirv.VulkanBuffer", {ElemType}, + {StorageClass, true}); + } llvm::Type *ElemType = CGM.getTypes().ConvertTypeForMem(ContainedTy); llvm::ArrayType *RuntimeArrayType = llvm::ArrayType::get(ElemType, 0); uint32_t StorageClass = /* StorageBuffer storage class */ 12; diff --git a/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp b/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp index 1087eb3..6966d40 100644 --- a/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp +++ b/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp @@ -444,8 +444,7 @@ bool ExtractAPIAction::PrepareToExecuteAction(CompilerInstance &CI) { return true; if (!CI.hasFileManager()) - if (!CI.createFileManager()) - return false; + CI.createFileManager(); auto Kind = Inputs[0].getKind(); diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 9413c13..cd4c1aa 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -368,7 +368,7 @@ bool ContinuationIndenter::canBreak(const LineState &State) { // If binary operators are moved to the next line (including commas for some // styles of constructor initializers), that's always ok. - if (!Current.isOneOf(TT_BinaryOperator, tok::comma) && + if (Current.isNoneOf(TT_BinaryOperator, tok::comma) && // Allow breaking opening brace of lambdas (when passed as function // arguments) to a new line when BeforeLambdaBody brace wrapping is // enabled. 
@@ -445,7 +445,7 @@ bool ContinuationIndenter::mustBreak(const LineState &State) { (!Style.BreakBeforeTernaryOperators && Previous.is(TT_ConditionalExpr))) && CurrentState.BreakBeforeParameter && !Current.isTrailingComment() && - !Current.isOneOf(tok::r_paren, tok::r_brace)) { + Current.isNoneOf(tok::r_paren, tok::r_brace)) { return true; } if (CurrentState.IsChainedConditional && @@ -523,9 +523,9 @@ bool ContinuationIndenter::mustBreak(const LineState &State) { if (Style.AlwaysBreakBeforeMultilineStrings && (NewLineColumn == State.FirstIndent + Style.ContinuationIndentWidth || Previous.is(tok::comma) || Current.NestingLevel < 2) && - !Previous.isOneOf(tok::kw_return, tok::lessless, tok::at, + Previous.isNoneOf(tok::kw_return, tok::lessless, tok::at, Keywords.kw_dollar) && - !Previous.isOneOf(TT_InlineASMColon, TT_ConditionalExpr) && + Previous.isNoneOf(TT_InlineASMColon, TT_ConditionalExpr) && nextIsMultilineString(State)) { return true; } @@ -648,7 +648,7 @@ bool ContinuationIndenter::mustBreak(const LineState &State) { // into the ColumnLimit, they are checked here in the ContinuationIndenter. if (Style.ColumnLimit != 0 && Previous.is(BK_Block) && Previous.is(tok::l_brace) && - !Current.isOneOf(tok::r_brace, tok::comment)) { + Current.isNoneOf(tok::r_brace, tok::comment)) { return true; } @@ -752,7 +752,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, return false; const auto *Next = Comma->getNextNonComment(); - return Next && !Next->isOneOf(TT_LambdaLSquare, tok::l_brace, tok::caret); + return Next && Next->isNoneOf(TT_LambdaLSquare, tok::l_brace, tok::caret); }; if (DisallowLineBreaks()) @@ -835,7 +835,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, return Tok.is(tok::l_brace) && Tok.isNot(BK_Block) && Style.Cpp11BracedListStyle; }; - if (!Tok.isOneOf(tok::l_paren, TT_TemplateOpener, tok::l_square) && + if (Tok.isNoneOf(tok::l_paren, TT_TemplateOpener, tok::l_square) && !IsStartOfBracedList()) { return false; } @@ -843,7 +843,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, return true; if (Tok.Previous->isIf()) return Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak; - return !Tok.Previous->isOneOf(TT_CastRParen, tok::kw_for, tok::kw_while, + return Tok.Previous->isNoneOf(TT_CastRParen, tok::kw_for, tok::kw_while, tok::kw_switch) && !(Style.isJavaScript() && Tok.Previous->is(Keywords.kw_await)); }; @@ -882,8 +882,8 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, Tok.isOneOf(tok::ellipsis, Keywords.kw_await))) { return true; } - const auto *Previous = Tok.Previous; - if (!Previous || (!Previous->isOneOf(TT_FunctionDeclarationLParen, + if (const auto *Previous = Tok.Previous; + !Previous || (Previous->isNoneOf(TT_FunctionDeclarationLParen, TT_LambdaDefinitionLParen) && !IsFunctionCallParen(*Previous))) { return true; @@ -920,9 +920,9 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, // align the commas with the opening paren. 
if (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign && !CurrentState.IsCSharpGenericTypeConstraint && Previous.opensScope() && - Previous.isNot(TT_ObjCMethodExpr) && Previous.isNot(TT_RequiresClause) && - Previous.isNot(TT_TableGenDAGArgOpener) && - Previous.isNot(TT_TableGenDAGArgOpenerToBreak) && + Previous.isNoneOf(TT_ObjCMethodExpr, TT_RequiresClause, + TT_TableGenDAGArgOpener, + TT_TableGenDAGArgOpenerToBreak) && !(Current.MacroParent && Previous.MacroParent) && (Current.isNot(TT_LineComment) || Previous.isOneOf(BK_BracedInit, TT_VerilogMultiLineListLParen)) && @@ -962,7 +962,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, if (Current.isNot(tok::comment) && P && (P->isOneOf(TT_BinaryOperator, tok::comma) || (P->is(TT_ConditionalExpr) && P->is(tok::colon))) && - !P->isOneOf(TT_OverloadedOperator, TT_CtorInitializerComma) && + P->isNoneOf(TT_OverloadedOperator, TT_CtorInitializerComma) && P->getPrecedence() != prec::Assignment && P->getPrecedence() != prec::Relational && P->getPrecedence() != prec::Spaceship) { @@ -992,7 +992,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, // parameter, i.e. let nested calls have a continuation indent. CurrentState.LastSpace = State.Column; CurrentState.NestedBlockIndent = State.Column; - } else if (!Current.isOneOf(tok::comment, tok::caret) && + } else if (Current.isNoneOf(tok::comment, tok::caret) && ((Previous.is(tok::comma) && Previous.isNot(TT_OverloadedOperator)) || (Previous.is(tok::colon) && Previous.is(TT_ObjCMethodExpr)))) { @@ -1099,7 +1099,7 @@ unsigned ContinuationIndenter::addTokenOnNewLine(LineState &State, if (Current.isNot(TT_LambdaArrow) && (!Style.isJavaScript() || Current.NestingLevel != 0 || !PreviousNonComment || PreviousNonComment->isNot(tok::equal) || - !Current.isOneOf(Keywords.kw_async, Keywords.kw_function))) { + Current.isNoneOf(Keywords.kw_async, Keywords.kw_function))) { CurrentState.NestedBlockIndent = State.Column; } @@ -1239,11 +1239,11 @@ unsigned ContinuationIndenter::addTokenOnNewLine(LineState &State, } if (PreviousNonComment && - !PreviousNonComment->isOneOf(tok::comma, tok::colon, tok::semi) && + PreviousNonComment->isNoneOf(tok::comma, tok::colon, tok::semi) && ((PreviousNonComment->isNot(TT_TemplateCloser) && !PreviousNonComment->ClosesRequiresClause) || Current.NestingLevel != 0) && - !PreviousNonComment->isOneOf( + PreviousNonComment->isNoneOf( TT_BinaryOperator, TT_FunctionAnnotationRParen, TT_JavaAnnotation, TT_LeadingJavaAnnotation) && Current.isNot(TT_BinaryOperator) && !PreviousNonComment->opensScope() && @@ -1281,8 +1281,8 @@ unsigned ContinuationIndenter::addTokenOnNewLine(LineState &State, bool AllowAllConstructorInitializersOnNextLine = Style.PackConstructorInitializers == FormatStyle::PCIS_NextLine || Style.PackConstructorInitializers == FormatStyle::PCIS_NextLineOnly; - if (!(Previous.isOneOf(tok::l_paren, tok::l_brace, TT_BinaryOperator) || - PreviousIsBreakingCtorInitializerColon) || + if ((Previous.isNoneOf(tok::l_paren, tok::l_brace, TT_BinaryOperator) && + !PreviousIsBreakingCtorInitializerColon) || (!Style.AllowAllParametersOfDeclarationOnNextLine && State.Line->MustBeDeclaration) || (!Style.AllowAllArgumentsOnNextLine && @@ -1576,7 +1576,7 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) { if (Previous.is(tok::r_paren) && Previous.isNot(TT_TableGenDAGArgOperatorToBreak) && !Current.isBinaryOperator() && - !Current.isOneOf(tok::colon, tok::comment)) { + Current.isNoneOf(tok::colon, 
tok::comment)) { return ContinuationIndent; } if (Current.is(TT_ProtoExtensionLSquare)) @@ -1591,7 +1591,7 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) { NextNonComment->SpacesRequiredBefore; } if (CurrentState.Indent == State.FirstIndent && PreviousNonComment && - !PreviousNonComment->isOneOf(tok::r_brace, TT_CtorInitializerComma)) { + PreviousNonComment->isNoneOf(tok::r_brace, TT_CtorInitializerComma)) { // Ensure that we fall back to the continuation indent width instead of // just flushing continuations left. return CurrentState.Indent + Style.ContinuationIndentWidth; @@ -1734,7 +1734,7 @@ unsigned ContinuationIndenter::moveStateToNextToken(LineState &State, } if (Previous && (Previous->isOneOf(TT_BinaryOperator, TT_ConditionalExpr) || (Previous->isOneOf(tok::l_paren, tok::comma, tok::colon) && - !Previous->isOneOf(TT_DictLiteral, TT_ObjCMethodExpr, + Previous->isNoneOf(TT_DictLiteral, TT_ObjCMethodExpr, TT_CtorInitializerColon)))) { CurrentState.NestedBlockInlined = !Newline && hasNestedBlockInlined(Previous, Current, Style); @@ -1758,7 +1758,7 @@ unsigned ContinuationIndenter::moveStateToNextToken(LineState &State, State.StartOfStringLiteral = State.Column + 1; } else if (Current.isStringLiteral() && State.StartOfStringLiteral == 0) { State.StartOfStringLiteral = State.Column; - } else if (!Current.isOneOf(tok::comment, tok::identifier, tok::hash) && + } else if (Current.isNoneOf(tok::comment, tok::identifier, tok::hash) && !Current.isStringLiteral()) { State.StartOfStringLiteral = 0; } @@ -2057,7 +2057,7 @@ void ContinuationIndenter::moveStatePastScopeOpener(LineState &State, // array literals as these follow different indentation rules. bool NoLineBreak = Current.Children.empty() && - !Current.isOneOf(TT_DictLiteral, TT_ArrayInitializerLSquare) && + Current.isNoneOf(TT_DictLiteral, TT_ArrayInitializerLSquare) && (CurrentState.NoLineBreak || CurrentState.NoLineBreakInOperand || (Current.is(TT_TemplateOpener) && CurrentState.ContainsUnwrappedBuilder)); diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 835071d..2bf6244 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -2435,7 +2435,7 @@ private: const auto *NextLine = I + 1 == End ? nullptr : I[1]; for (const auto *Token = Line->First; Token && !Token->Finalized; Token = Token->Next) { - if (!Token->Optional || !Token->isOneOf(tok::l_brace, tok::r_brace)) + if (!Token->Optional || Token->isNoneOf(tok::l_brace, tok::r_brace)) continue; auto *Next = Token->Next; assert(Next || Token == Line->Last); diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp index c60ae8f..c2956a1 100644 --- a/clang/lib/Format/FormatToken.cpp +++ b/clang/lib/Format/FormatToken.cpp @@ -108,7 +108,7 @@ unsigned CommaSeparatedList::formatAfterToken(LineState &State, // Ensure that we start on the opening brace. const FormatToken *LBrace = State.NextToken->Previous->getPreviousNonComment(); - if (!LBrace || !LBrace->isOneOf(tok::l_brace, TT_ArrayInitializerLSquare) || + if (!LBrace || LBrace->isNoneOf(tok::l_brace, TT_ArrayInitializerLSquare) || LBrace->is(BK_Block) || LBrace->is(TT_DictLiteral) || LBrace->Next->is(TT_DesignatedInitializerPeriod)) { return 0; @@ -177,7 +177,7 @@ static unsigned CodePointsBetween(const FormatToken *Begin, void CommaSeparatedList::precomputeFormattingInfos(const FormatToken *Token) { // FIXME: At some point we might want to do this for other lists, too. 
if (!Token->MatchingParen || - !Token->isOneOf(tok::l_brace, TT_ArrayInitializerLSquare)) { + Token->isNoneOf(tok::l_brace, TT_ArrayInitializerLSquare)) { return; } diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index a28446a..e4ddd61 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -645,6 +645,9 @@ public: return is(K1) || isOneOf(K2, Ks...); } template <typename T> bool isNot(T Kind) const { return !is(Kind); } + template <typename... Ts> bool isNoneOf(Ts... Ks) const { + return !isOneOf(Ks...); + } bool isIf(bool AllowConstexprMacro = true) const { return is(tok::kw_if) || endsSequence(tok::kw_constexpr, tok::kw_if) || @@ -748,7 +751,7 @@ public: /// Returns \c true if this is a "." or "->" accessing a member. bool isMemberAccess() const { return isOneOf(tok::arrow, tok::period, tok::arrowstar) && - !isOneOf(TT_DesignatedInitializerPeriod, TT_TrailingReturnArrow, + isNoneOf(TT_DesignatedInitializerPeriod, TT_TrailingReturnArrow, TT_LambdaArrow, TT_LeadingJavaAnnotation); } diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index 3f4aa52..86a5185 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -733,7 +733,7 @@ void FormatTokenLexer::tryParseJavaTextBlock() { // its text if successful. void FormatTokenLexer::tryParseJSRegexLiteral() { FormatToken *RegexToken = Tokens.back(); - if (!RegexToken->isOneOf(tok::slash, tok::slashequal)) + if (RegexToken->isNoneOf(tok::slash, tok::slashequal)) return; FormatToken *Prev = nullptr; @@ -1041,7 +1041,7 @@ void FormatTokenLexer::handleTemplateStrings() { void FormatTokenLexer::tryParsePythonComment() { FormatToken *HashToken = Tokens.back(); - if (!HashToken->isOneOf(tok::hash, tok::hashhash)) + if (HashToken->isNoneOf(tok::hash, tok::hashhash)) return; // Turn the remainder of this line into a comment. const char *CommentBegin = diff --git a/clang/lib/Format/MacroExpander.cpp b/clang/lib/Format/MacroExpander.cpp index 85a53c9..445e173 100644 --- a/clang/lib/Format/MacroExpander.cpp +++ b/clang/lib/Format/MacroExpander.cpp @@ -86,7 +86,7 @@ private: } bool parseExpansion() { - if (!Current->isOneOf(tok::equal, tok::eof)) + if (Current->isNoneOf(tok::equal, tok::eof)) return false; if (Current->is(tok::equal)) nextToken(); diff --git a/clang/lib/Format/NamespaceEndCommentsFixer.cpp b/clang/lib/Format/NamespaceEndCommentsFixer.cpp index 08f8d68..95ccfac 100644 --- a/clang/lib/Format/NamespaceEndCommentsFixer.cpp +++ b/clang/lib/Format/NamespaceEndCommentsFixer.cpp @@ -70,7 +70,7 @@ std::string computeName(const FormatToken *NamespaceTok) { // and closing parenthesis or comma. assert(Tok && Tok->is(tok::l_paren) && "expected an opening parenthesis"); Tok = Tok->getNextNonComment(); - while (Tok && !Tok->isOneOf(tok::r_paren, tok::comma)) { + while (Tok && Tok->isNoneOf(tok::r_paren, tok::comma)) { name += Tok->TokenText; Tok = Tok->getNextNonComment(); } @@ -85,7 +85,7 @@ std::string computeName(const FormatToken *NamespaceTok) { // one token before that up until the '{'. A '(' might be a macro with // arguments. 
const FormatToken *FirstNSTok = nullptr; - while (Tok && !Tok->isOneOf(tok::l_brace, tok::coloncolon, tok::l_paren)) { + while (Tok && Tok->isNoneOf(tok::l_brace, tok::coloncolon, tok::l_paren)) { if (FirstNSTok) FirstNSName += FirstNSTok->TokenText; FirstNSTok = Tok; diff --git a/clang/lib/Format/ObjCPropertyAttributeOrderFixer.cpp b/clang/lib/Format/ObjCPropertyAttributeOrderFixer.cpp index b885942..b12b370 100644 --- a/clang/lib/Format/ObjCPropertyAttributeOrderFixer.cpp +++ b/clang/lib/Format/ObjCPropertyAttributeOrderFixer.cpp @@ -61,7 +61,7 @@ void ObjCPropertyAttributeOrderFixer::sortPropertyAttributes( } // Most attributes look like identifiers, but `class` is a keyword. - if (!Tok->isOneOf(tok::identifier, tok::kw_class)) { + if (Tok->isNoneOf(tok::identifier, tok::kw_class)) { // If we hit any other kind of token, just bail. return; } diff --git a/clang/lib/Format/QualifierAlignmentFixer.cpp b/clang/lib/Format/QualifierAlignmentFixer.cpp index 043d957..e3e30ca 100644 --- a/clang/lib/Format/QualifierAlignmentFixer.cpp +++ b/clang/lib/Format/QualifierAlignmentFixer.cpp @@ -508,7 +508,7 @@ const FormatToken *LeftRightQualifierAlignmentFixer::analyzeLeft( // Don't change declarations such as // `foo(struct Foo const a);` -> `foo(struct Foo const a);` - if (!Previous || !Previous->isOneOf(tok::kw_struct, tok::kw_class)) { + if (!Previous || Previous->isNoneOf(tok::kw_struct, tok::kw_class)) { insertQualifierBefore(SourceMgr, Fixes, TypeToken, Qualifier); removeToken(SourceMgr, Fixes, Tok); } diff --git a/clang/lib/Format/SortJavaScriptImports.cpp b/clang/lib/Format/SortJavaScriptImports.cpp index ace3dff..a403a4f 100644 --- a/clang/lib/Format/SortJavaScriptImports.cpp +++ b/clang/lib/Format/SortJavaScriptImports.cpp @@ -439,7 +439,7 @@ private: // for grammar EBNF (production ModuleItem). 
bool parseModuleReference(const AdditionalKeywords &Keywords, JsModuleReference &Reference) { - if (!Current || !Current->isOneOf(Keywords.kw_import, tok::kw_export)) + if (!Current || Current->isNoneOf(Keywords.kw_import, tok::kw_export)) return false; Reference.IsExport = Current->is(tok::kw_export); @@ -570,7 +570,7 @@ private: Symbol.Range.setEnd(Current->Tok.getLocation()); Reference.Symbols.push_back(Symbol); - if (!Current->isOneOf(tok::r_brace, tok::comma)) + if (Current->isNoneOf(tok::r_brace, tok::comma)) return false; } Reference.SymbolsEnd = Current->Tok.getLocation(); diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 0c9c88a..59f81b3 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -203,7 +203,7 @@ private: return false; } if (InExpr && SeenTernaryOperator && - (!Next || !Next->isOneOf(tok::l_paren, tok::l_brace))) { + (!Next || Next->isNoneOf(tok::l_paren, tok::l_brace))) { return false; } if (!MaybeAngles) @@ -577,7 +577,7 @@ private: if (IsIf && CurrentToken->is(tok::semi)) { for (auto *Tok = OpeningParen.Next; Tok != CurrentToken && - !Tok->isOneOf(tok::equal, tok::l_paren, tok::l_brace); + Tok->isNoneOf(tok::equal, tok::l_paren, tok::l_brace); Tok = Tok->Next) { if (Tok->isPointerOrReference()) Tok->setFinalizedType(TT_PointerOrReference); @@ -704,7 +704,7 @@ private: !IsCppStructuredBinding && !InsideInlineASM && !CppArrayTemplates && IsCpp && !IsCpp11AttributeSpecifier && !IsCSharpAttributeSpecifier && Contexts.back().CanBeExpression && Left->isNot(TT_LambdaLSquare) && - !CurrentToken->isOneOf(tok::l_brace, tok::r_square) && + CurrentToken->isNoneOf(tok::l_brace, tok::r_square) && (!Parent || Parent->isOneOf(tok::colon, tok::l_square, tok::l_paren, tok::kw_return, tok::kw_throw) || @@ -1334,7 +1334,7 @@ private: if (Style.isJavaScript()) { if (Contexts.back().ColonIsForRangeExpr || // colon in for loop (Contexts.size() == 1 && // switch/case labels - !Line.First->isOneOf(tok::kw_enum, tok::kw_case)) || + Line.First->isNoneOf(tok::kw_enum, tok::kw_case)) || Contexts.back().ContextKind == tok::l_paren || // function params Contexts.back().ContextKind == tok::l_square || // array type (!Contexts.back().IsExpression && @@ -1411,7 +1411,7 @@ private: } else if (Contexts.back().ColonIsForRangeExpr) { Tok->setType(TT_RangeBasedForLoopColon); for (auto *Token = Prev; - Token && !Token->isOneOf(tok::semi, tok::l_paren); + Token && Token->isNoneOf(tok::semi, tok::l_paren); Token = Token->Previous) { if (Token->isPointerOrReference()) Token->setFinalizedType(TT_PointerOrReference); @@ -1425,7 +1425,7 @@ private: Scopes.back() == ST_Class)) { Tok->setType(TT_BitFieldColon); } else if (Contexts.size() == 1 && - !Line.getFirstNonComment()->isOneOf(tok::kw_enum, tok::kw_case, + Line.getFirstNonComment()->isNoneOf(tok::kw_enum, tok::kw_case, tok::kw_default) && !Line.startsWith(tok::kw_typedef, tok::kw_enum)) { if (Prev->isOneOf(tok::r_paren, tok::kw_noexcept) || @@ -1562,10 +1562,10 @@ private: if (Line.MustBeDeclaration && Contexts.size() == 1 && !Contexts.back().IsExpression && !Line.startsWith(TT_ObjCProperty) && !Line.startsWith(tok::l_paren) && - !Tok->isOneOf(TT_TypeDeclarationParen, TT_RequiresExpressionLParen)) { + Tok->isNoneOf(TT_TypeDeclarationParen, TT_RequiresExpressionLParen)) { if (!Prev || (!Prev->isAttribute() && - !Prev->isOneOf(TT_RequiresClause, TT_LeadingJavaAnnotation, + Prev->isNoneOf(TT_RequiresClause, TT_LeadingJavaAnnotation, TT_BinaryOperator))) { Line.MightBeFunctionDecl 
= true; Tok->MightBeFunctionDeclParen = true; @@ -1664,7 +1664,7 @@ private: } } while (CurrentToken && - !CurrentToken->isOneOf(tok::l_paren, tok::semi, tok::r_paren)) { + CurrentToken->isNoneOf(tok::l_paren, tok::semi, tok::r_paren)) { if (CurrentToken->isOneOf(tok::star, tok::amp)) CurrentToken->setType(TT_PointerOrReference); auto Next = CurrentToken->getNextNonComment(); @@ -1728,8 +1728,8 @@ private: // cond ? id : "B"; // cond ? cond2 ? "A" : "B" : "C"; if (!Contexts.back().IsExpression && Line.MustBeDeclaration && - (!Next || !Next->isOneOf(tok::identifier, tok::string_literal) || - !Next->Next || !Next->Next->isOneOf(tok::colon, tok::question))) { + (!Next || Next->isNoneOf(tok::identifier, tok::string_literal) || + !Next->Next || Next->Next->isNoneOf(tok::colon, tok::question))) { Tok->setType(TT_CSharpNullable); break; } @@ -1796,7 +1796,7 @@ private: if (!parseTableGenValue()) return false; } else if (Tok->isOneOf(Keywords.kw_def, Keywords.kw_defm) && - (!Next || !Next->isOneOf(tok::colon, tok::l_brace))) { + (!Next || Next->isNoneOf(tok::colon, tok::l_brace))) { // The case NameValue appears. if (!parseTableGenValue(true)) return false; @@ -2094,7 +2094,7 @@ private: // Reset token type in case we have already looked at it and then // recovered from an error (e.g. failure to find the matching >). if (!CurrentToken->isTypeFinalized() && - !CurrentToken->isOneOf( + CurrentToken->isNoneOf( TT_LambdaLSquare, TT_LambdaLBrace, TT_AttributeMacro, TT_IfMacro, TT_ForEachMacro, TT_TypenameMacro, TT_FunctionLBrace, TT_ImplicitStringLiteral, TT_InlineASMBrace, TT_FatArrow, @@ -2230,7 +2230,7 @@ private: // type or non-type. if (Contexts.back().ContextKind == tok::less) { assert(Current.Previous->Previous); - return !Current.Previous->Previous->isOneOf(tok::kw_typename, + return Current.Previous->Previous->isNoneOf(tok::kw_typename, tok::kw_class); } @@ -2266,7 +2266,7 @@ private: if (!Line.startsWith(TT_UnaryOperator)) { for (FormatToken *Previous = Current.Previous; Previous && Previous->Previous && - !Previous->Previous->isOneOf(tok::comma, tok::semi); + Previous->Previous->isNoneOf(tok::comma, tok::semi); Previous = Previous->Previous) { if (Previous->isOneOf(tok::r_square, tok::r_paren, tok::greater)) { Previous = Previous->MatchingParen; @@ -2430,7 +2430,7 @@ private: Current.setType(TT_BinaryOperator); } else if (Current.is(tok::arrow) && AutoFound && Line.MightBeFunctionDecl && Current.NestingLevel == 0 && - !Current.Previous->isOneOf(tok::kw_operator, tok::identifier)) { + Current.Previous->isNoneOf(tok::kw_operator, tok::identifier)) { // not auto operator->() -> xxx; Current.setType(TT_TrailingReturnArrow); } else if (Current.is(tok::arrow) && Current.Previous && @@ -2511,7 +2511,7 @@ private: Current.setType(TT_CastRParen); if (Current.MatchingParen && Current.Next && !Current.Next->isBinaryOperator() && - !Current.Next->isOneOf( + Current.Next->isNoneOf( tok::semi, tok::colon, tok::l_brace, tok::l_paren, tok::comma, tok::period, tok::arrow, tok::coloncolon, tok::kw_noexcept)) { if (FormatToken *AfterParen = Current.MatchingParen->Next; @@ -2569,7 +2569,7 @@ private: } else if (Current.isOneOf(tok::identifier, tok::kw_const, tok::kw_noexcept, tok::kw_requires) && Current.Previous && - !Current.Previous->isOneOf(tok::equal, tok::at, + Current.Previous->isNoneOf(tok::equal, tok::at, TT_CtorInitializerComma, TT_CtorInitializerColon) && Line.MightBeFunctionDecl && Contexts.size() == 1) { @@ -2658,7 +2658,7 @@ private: if (PreviousNotConst->is(TT_TemplateCloser)) { return 
PreviousNotConst && PreviousNotConst->MatchingParen && PreviousNotConst->MatchingParen->Previous && - !PreviousNotConst->MatchingParen->Previous->isOneOf( + PreviousNotConst->MatchingParen->Previous->isNoneOf( tok::period, tok::kw_template); } @@ -2780,7 +2780,7 @@ private: // If there is an identifier (or with a few exceptions a keyword) right // before the parentheses, this is unlikely to be a cast. if (LeftOfParens->Tok.getIdentifierInfo() && - !LeftOfParens->isOneOf(Keywords.kw_in, tok::kw_return, tok::kw_case, + LeftOfParens->isNoneOf(Keywords.kw_in, tok::kw_return, tok::kw_case, tok::kw_delete, tok::kw_throw)) { return false; } @@ -2918,7 +2918,7 @@ private: const bool NextIsAmpOrStar = AfterRParen->isOneOf(tok::amp, tok::star); if (!(AfterRParen->isUnaryOperator() || NextIsAmpOrStar) || AfterRParen->is(tok::plus) || - !AfterRParen->Next->isOneOf(tok::identifier, tok::numeric_constant)) { + AfterRParen->Next->isNoneOf(tok::identifier, tok::numeric_constant)) { return false; } @@ -2948,7 +2948,7 @@ private: // Search for unexpected tokens. for (Prev = BeforeRParen; Prev != LParen; Prev = Prev->Previous) - if (!Prev->isOneOf(tok::kw_const, tok::identifier, tok::coloncolon)) + if (Prev->isNoneOf(tok::kw_const, tok::identifier, tok::coloncolon)) return false; return true; @@ -3740,7 +3740,7 @@ void TokenAnnotator::annotate(AnnotatedLine &Line) { const bool InRequiresExpression = Line.Type == LT_RequiresExpression; for (auto &Child : Line.Children) { if (InRequiresExpression && - !Child->First->isOneOf(tok::kw_typename, tok::kw_requires, + Child->First->isNoneOf(tok::kw_typename, tok::kw_requires, TT_CompoundRequirementLBrace)) { Child->Type = LT_SimpleRequirement; } @@ -3857,7 +3857,7 @@ static bool isFunctionDeclarationName(const LangOptions &LangOpts, // Find parentheses of parameter list. if (Current.is(tok::kw_operator)) { if (Previous.Tok.getIdentifierInfo() && - !Previous.isOneOf(tok::kw_return, tok::kw_co_return)) { + Previous.isNoneOf(tok::kw_return, tok::kw_co_return)) { return true; } if (Previous.is(tok::r_paren) && Previous.is(TT_TypeDeclarationParen)) { @@ -4328,7 +4328,7 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line, // Slightly prefer formatting local lambda definitions like functions. 
if (Right.is(TT_LambdaLSquare) && Left.is(tok::equal)) return 35; - if (!Right.isOneOf(TT_ObjCMethodExpr, TT_LambdaLSquare, + if (Right.isNoneOf(TT_ObjCMethodExpr, TT_LambdaLSquare, TT_ArrayInitializerLSquare, TT_DesignatedInitializerLSquare, TT_AttributeSquare)) { return 500; @@ -4519,7 +4519,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, const FormatToken &Left, const FormatToken &Right) const { if (Left.is(tok::kw_return) && - !Right.isOneOf(tok::semi, tok::r_paren, tok::hashhash)) { + Right.isNoneOf(tok::semi, tok::r_paren, tok::hashhash)) { return true; } if (Left.is(tok::kw_throw) && Right.is(tok::l_paren) && Right.MatchingParen && @@ -4579,7 +4579,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, } // co_await (x), co_yield (x), co_return (x) if (Left.isOneOf(tok::kw_co_await, tok::kw_co_yield, tok::kw_co_return) && - !Right.isOneOf(tok::semi, tok::r_paren)) { + Right.isNoneOf(tok::semi, tok::r_paren)) { return true; } @@ -4656,7 +4656,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, return getTokenPointerOrReferenceAlignment(Right) != FormatStyle::PAS_Left; } - return !Left.isOneOf(TT_PointerOrReference, tok::l_paren) && + return Left.isNoneOf(TT_PointerOrReference, tok::l_paren) && (getTokenPointerOrReferenceAlignment(Right) != FormatStyle::PAS_Left || (Line.IsMultiVariableDeclStmt && @@ -4729,7 +4729,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, const auto *LParen = Right.Next->MatchingParen; return !LParen || LParen->isNot(TT_FunctionTypeLParen); } - return !BeforeLeft->isOneOf(tok::l_paren, tok::l_square); + return BeforeLeft->isNoneOf(tok::l_paren, tok::l_square); } // Ensure right pointer alignment with ellipsis e.g. int *...P if (Left.is(tok::ellipsis) && BeforeLeft && @@ -4808,10 +4808,10 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, TT_LambdaLSquare))); } if (Right.is(tok::l_square) && - !Right.isOneOf(TT_ObjCMethodExpr, TT_LambdaLSquare, + Right.isNoneOf(TT_ObjCMethodExpr, TT_LambdaLSquare, TT_DesignatedInitializerLSquare, TT_StructuredBindingLSquare, TT_AttributeSquare) && - !Left.isOneOf(tok::numeric_constant, TT_DictLiteral) && + Left.isNoneOf(tok::numeric_constant, TT_DictLiteral) && !(Left.isNot(tok::r_square) && Style.SpaceBeforeSquareBrackets && Right.is(TT_ArraySubscriptLSquare))) { return false; @@ -4894,7 +4894,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, return Style.SpaceBeforeParensOptions.AfterFunctionDefinitionName || spaceRequiredBeforeParens(Right); } - if (!BeforeLeft || !BeforeLeft->isOneOf(tok::period, tok::arrow)) { + if (!BeforeLeft || BeforeLeft->isNoneOf(tok::period, tok::arrow)) { if (Left.isOneOf(tok::kw_try, Keywords.kw___except, tok::kw_catch)) { return Style.SpaceBeforeParensOptions.AfterControlStatements || spaceRequiredBeforeParens(Right); @@ -4917,7 +4917,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, if (Left.is(tok::at) && Right.isNot(tok::objc_not_keyword)) return false; if (Right.is(TT_UnaryOperator)) { - return !Left.isOneOf(tok::l_paren, tok::l_square, tok::at) && + return Left.isNoneOf(tok::l_paren, tok::l_square, tok::at) && (Left.isNot(tok::colon) || Left.isNot(TT_ObjCMethodExpr)); } // No space between the variable name and the initializer list. 
@@ -5260,7 +5260,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, if (Left.is(tok::ellipsis)) return false; if (Left.is(TT_TemplateCloser) && - !Right.isOneOf(tok::equal, tok::l_brace, tok::comma, tok::l_square, + Right.isNoneOf(tok::equal, tok::l_brace, tok::comma, tok::l_square, Keywords.kw_implements, Keywords.kw_extends)) { // Type assertions ('<type>expr') are not followed by whitespace. Other // locations that should have whitespace following are identified by the @@ -5299,7 +5299,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, // Add space between things in a primitive's state table unless in a // transition like `(0?)`. if ((Left.is(TT_VerilogTableItem) && - !Right.isOneOf(tok::r_paren, tok::semi)) || + Right.isNoneOf(tok::r_paren, tok::semi)) || (Right.is(TT_VerilogTableItem) && Left.isNot(tok::l_paren))) { const FormatToken *Next = Right.getNextNonComment(); return !(Next && Next->is(tok::r_paren)); @@ -5348,8 +5348,8 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, // previous rule. if ((Right.is(Keywords.kw_apostrophe) || (Right.is(BK_BracedInit) && Right.is(tok::l_brace))) && - !(Left.isOneOf(Keywords.kw_assign, Keywords.kw_unique) || - Keywords.isVerilogWordOperator(Left)) && + Left.isNoneOf(Keywords.kw_assign, Keywords.kw_unique) && + !Keywords.isVerilogWordOperator(Left) && (Left.isOneOf(tok::r_square, tok::r_paren, tok::r_brace, tok::numeric_constant) || Keywords.isWordLike(Left))) { @@ -5549,14 +5549,14 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, return Right.hasWhitespaceBefore(); } if (Right.is(tok::coloncolon) && - !Left.isOneOf(tok::l_brace, tok::comment, tok::l_paren)) { + Left.isNoneOf(tok::l_brace, tok::comment, tok::l_paren)) { // Put a space between < and :: in vector< ::std::string > return (Left.is(TT_TemplateOpener) && ((Style.Standard < FormatStyle::LS_Cpp11) || ShouldAddSpacesInAngles())) || - !(Left.isOneOf(tok::l_paren, tok::r_paren, tok::l_square, - tok::kw___super, TT_TemplateOpener, - TT_TemplateCloser)) || + Left.isNoneOf(tok::l_paren, tok::r_paren, tok::l_square, + tok::kw___super, TT_TemplateOpener, + TT_TemplateCloser) || (Left.is(tok::l_paren) && Style.SpacesInParensOptions.Other); } if ((Left.is(TT_TemplateOpener)) != (Right.is(TT_TemplateCloser))) @@ -5567,7 +5567,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, } // Space before TT_StructuredBindingLSquare. if (Right.is(TT_StructuredBindingLSquare)) { - return !Left.isOneOf(tok::amp, tok::ampamp) || + return Left.isNoneOf(tok::amp, tok::ampamp) || getTokenReferenceAlignment(Left) != FormatStyle::PAS_Right; } // Space before & or && following a TT_StructuredBindingLSquare. @@ -5599,7 +5599,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, // Returns 'true' if 'Tok' is a brace we'd want to break before in Allman style. static bool isAllmanBrace(const FormatToken &Tok) { return Tok.is(tok::l_brace) && Tok.is(BK_Block) && - !Tok.isOneOf(TT_ObjCBlockLBrace, TT_LambdaLBrace, TT_DictLiteral); + Tok.isNoneOf(TT_ObjCBlockLBrace, TT_LambdaLBrace, TT_DictLiteral); } // Returns 'true' if 'Tok' is a function argument. 
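Aside: the TokenAnnotator and UnwrappedLineParser hunks in this commit mechanically replace the `!Tok.isOneOf(...)` pattern with `Tok.isNoneOf(...)`. The sketch below is illustrative only and is not part of the diff; it shows the semantics the rewrite assumes, namely that `isNoneOf` is simply the negation of the existing variadic `isOneOf`, so the two spellings are interchangeable. The type and member names here are placeholders mirroring clang::format::FormatToken, not the actual header.

  // Hypothetical standalone sketch of the assumed helper.
  struct TokenLike {
    // Assumed to already exist: true if the token matches any of the kinds.
    template <typename A, typename B, typename... Ts>
    bool isOneOf(A K1, B K2, Ts... Ks) const;

    // The helper the refactor relies on: true if the token matches none of
    // the kinds, i.e. the plain negation of isOneOf.
    template <typename A, typename B, typename... Ts>
    bool isNoneOf(A K1, B K2, Ts... Ks) const {
      return !isOneOf(K1, K2, Ks...);
    }
  };

Under that definition, for example, `!Left.isOneOf(tok::kw_const, tok::identifier)` and `Left.isNoneOf(tok::kw_const, tok::identifier)` evaluate identically, so the hunks above and below do not change formatting behavior.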
@@ -5617,7 +5617,7 @@ isEmptyLambdaAllowed(const FormatToken &Tok, static bool isAllmanLambdaBrace(const FormatToken &Tok) { return Tok.is(tok::l_brace) && Tok.is(BK_Block) && - !Tok.isOneOf(TT_ObjCBlockLBrace, TT_DictLiteral); + Tok.isNoneOf(TT_ObjCBlockLBrace, TT_DictLiteral); } bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, @@ -5686,7 +5686,7 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, tok::kw_const) && // kw_var/kw_let are pseudo-tokens that are tok::identifier, so match // above. - !Line.First->isOneOf(Keywords.kw_var, Keywords.kw_let)) { + Line.First->isNoneOf(Keywords.kw_var, Keywords.kw_let)) { // Object literals on the top level of a file are treated as "enum-style". // Each key/value pair is put on a separate line, instead of bin-packing. return true; @@ -5831,7 +5831,7 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, } if (Right.is(tok::comment)) { - return !Left.isOneOf(BK_BracedInit, TT_CtorInitializerColon) && + return Left.isNoneOf(BK_BracedInit, TT_CtorInitializerColon) && Right.NewlinesBefore > 0 && Right.HasUnescapedNewline; } if (Left.isTrailingComment()) @@ -5873,7 +5873,7 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, case FormatStyle::RCPS_WithPreceding: return Right.isNot(tok::semi); case FormatStyle::RCPS_OwnLineWithBrace: - return !Right.isOneOf(tok::semi, tok::l_brace); + return Right.isNoneOf(tok::semi, tok::l_brace); default: break; } @@ -6000,7 +6000,7 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, // Put multiple Java annotation on a new line. if ((Style.isJava() || Style.isJavaScript()) && Left.is(TT_LeadingJavaAnnotation) && - !Right.isOneOf(TT_LeadingJavaAnnotation, tok::l_paren) && + Right.isNoneOf(TT_LeadingJavaAnnotation, tok::l_paren) && (Line.Last->is(tok::l_brace) || Style.BreakAfterJavaFieldAnnotations)) { return true; } @@ -6206,7 +6206,7 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, return false; // Avoid to break after '(' in the cases that is in bang operators. if (Right.is(tok::l_paren)) { - return !Left.isOneOf(TT_TableGenBangOperator, TT_TableGenCondOperator, + return Left.isNoneOf(TT_TableGenBangOperator, TT_TableGenCondOperator, TT_TemplateCloser); } // Avoid to break between the value and its suffix part. @@ -6294,7 +6294,7 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, } if (Right.is(tok::colon) && - !Right.isOneOf(TT_CtorInitializerColon, TT_InlineASMColon, + Right.isNoneOf(TT_CtorInitializerColon, TT_InlineASMColon, TT_BitFieldColon)) { return false; } @@ -6378,7 +6378,7 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, } if (Left.isOneOf(TT_TemplateCloser, TT_UnaryOperator, tok::kw_operator)) return false; - if (Left.is(tok::equal) && !Right.isOneOf(tok::kw_default, tok::kw_delete) && + if (Left.is(tok::equal) && Right.isNoneOf(tok::kw_default, tok::kw_delete) && Line.Type == LT_VirtualFunctionDecl && Left.NestingLevel == 0) { return false; } @@ -6405,7 +6405,7 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, // Allow breaking after a trailing annotation, e.g. after a method // declaration. 
if (Left.is(TT_TrailingAnnotation)) { - return !Right.isOneOf(tok::l_brace, tok::semi, tok::equal, tok::l_paren, + return Right.isNoneOf(tok::l_brace, tok::semi, tok::equal, tok::l_paren, tok::less, tok::coloncolon); } @@ -6448,7 +6448,7 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, if (Right.is(tok::kw_typename) && Left.isNot(tok::kw_const)) return true; if ((Left.isBinaryOperator() || Left.is(TT_BinaryOperator)) && - !Left.isOneOf(tok::arrowstar, tok::lessless) && + Left.isNoneOf(tok::arrowstar, tok::lessless) && Style.BreakBeforeBinaryOperators != FormatStyle::BOS_All && (Style.BreakBeforeBinaryOperators == FormatStyle::BOS_None || Left.getPrecedence() == prec::Assignment)) { diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index ac9d147..ac9c81d 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -506,7 +506,7 @@ private: (NextLine.First->is(tok::r_brace) && !Style.BraceWrapping.SplitEmptyRecord); } else if (TheLine->InPPDirective || - !TheLine->First->isOneOf(tok::kw_class, tok::kw_enum, + TheLine->First->isNoneOf(tok::kw_class, tok::kw_enum, tok::kw_struct)) { // Try to merge a block with left brace unwrapped that wasn't yet // covered. @@ -686,8 +686,8 @@ private: } Limit = limitConsideringMacros(I + 1, E, Limit); AnnotatedLine &Line = **I; - if (Line.First->isNot(tok::kw_do) && Line.First->isNot(tok::kw_else) && - Line.Last->isNot(tok::kw_else) && Line.Last->isNot(tok::r_paren)) { + if (Line.First->isNoneOf(tok::kw_do, tok::kw_else) && + Line.Last->isNoneOf(tok::kw_else, tok::r_paren)) { return 0; } // Only merge `do while` if `do` is the only statement on the line. diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 6948b3d..2879743 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -405,7 +405,7 @@ bool UnwrappedLineParser::parseLevel(const FormatToken *OpeningBrace, case tok::r_brace: if (OpeningBrace) { if (!Style.RemoveBracesLLVM || Line->InPPDirective || - !OpeningBrace->isOneOf(TT_ControlStatementLBrace, TT_ElseLBrace)) { + OpeningBrace->isNoneOf(TT_ControlStatementLBrace, TT_ElseLBrace)) { return false; } if (FormatTok->isNot(tok::r_brace) || StatementCount != 1 || HasLabel || @@ -427,7 +427,7 @@ bool UnwrappedLineParser::parseLevel(const FormatToken *OpeningBrace, unsigned StoredPosition = Tokens->getPosition(); auto *Next = Tokens->getNextNonComment(); FormatTok = Tokens->setPosition(StoredPosition); - if (!Next->isOneOf(tok::colon, tok::arrow)) { + if (Next->isNoneOf(tok::colon, tok::arrow)) { // default not followed by `:` or `->` is not a case label; treat it // like an identifier. parseStructuralElement(); @@ -584,7 +584,7 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { ProbablyBracedList = ProbablyBracedList || (NextTok->is(tok::identifier) && - !PrevTok->isOneOf(tok::semi, tok::r_brace, tok::l_brace)); + PrevTok->isNoneOf(tok::semi, tok::r_brace, tok::l_brace)); ProbablyBracedList = ProbablyBracedList || (NextTok->is(tok::semi) && @@ -607,7 +607,7 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { // A statement can end with only `;` (simple statement), a block // closing brace (compound statement), or `:` (label statement). // If PrevTok is a block opening brace, Tok ends an empty block. 
- !PrevTok->isOneOf(tok::semi, BK_Block, tok::colon)) { + PrevTok->isNoneOf(tok::semi, BK_Block, tok::colon)) { ProbablyBracedList = true; } } @@ -1157,7 +1157,7 @@ void UnwrappedLineParser::parsePPDefine() { IncludeGuard = IG_Defined; IncludeGuardToken = nullptr; for (auto &Line : Lines) { - if (!Line.Tokens.front().Tok->isOneOf(tok::comment, tok::hash)) { + if (Line.Tokens.front().Tok->isNoneOf(tok::comment, tok::hash)) { IncludeGuard = IG_Rejected; break; } @@ -1233,7 +1233,7 @@ void UnwrappedLineParser::parsePPUnknown() { static bool tokenCanStartNewLine(const FormatToken &Tok) { // Semicolon can be a null-statement, l_square can be a start of a macro or // a C++11 attribute, but this doesn't seem to be common. - return !Tok.isOneOf(tok::semi, tok::l_brace, + return Tok.isNoneOf(tok::semi, tok::l_brace, // Tokens that can only be used as binary operators and a // part of overloaded operator names. tok::period, tok::periodstar, tok::arrow, tok::arrowstar, @@ -1256,7 +1256,7 @@ static bool mustBeJSIdent(const AdditionalKeywords &Keywords, // FIXME: This returns true for C/C++ keywords like 'struct'. return FormatTok->is(tok::identifier) && (!FormatTok->Tok.getIdentifierInfo() || - !FormatTok->isOneOf( + FormatTok->isNoneOf( Keywords.kw_in, Keywords.kw_of, Keywords.kw_as, Keywords.kw_async, Keywords.kw_await, Keywords.kw_yield, Keywords.kw_finally, Keywords.kw_function, Keywords.kw_import, Keywords.kw_is, @@ -1322,7 +1322,7 @@ static bool isC78ParameterDecl(const FormatToken *Tok, const FormatToken *Next, return false; if (!isC78Type(*Tok) && - !Tok->isOneOf(tok::kw_register, tok::kw_struct, tok::kw_union)) { + Tok->isNoneOf(tok::kw_register, tok::kw_struct, tok::kw_union)) { return false; } @@ -1345,7 +1345,7 @@ bool UnwrappedLineParser::parseModuleImport() { if (auto Token = Tokens->peekNextToken(/*SkipComment=*/true); !Token->Tok.getIdentifierInfo() && - !Token->isOneOf(tok::colon, tok::less, tok::string_literal)) { + Token->isNoneOf(tok::colon, tok::less, tok::string_literal)) { return false; } @@ -1357,7 +1357,7 @@ bool UnwrappedLineParser::parseModuleImport() { // Handle import <foo/bar.h> as we would an include statement. else if (FormatTok->is(tok::less)) { nextToken(); - while (!FormatTok->isOneOf(tok::semi, tok::greater) && !eof()) { + while (FormatTok->isNoneOf(tok::semi, tok::greater) && !eof()) { // Mark tokens up to the trailing line comments as implicit string // literals. if (FormatTok->isNot(tok::comment) && @@ -2394,13 +2394,13 @@ bool UnwrappedLineParser::tryToParseLambdaIntroducer() { const auto *BeforeRParen = Previous->getPreviousNonComment(); // Lambdas can be cast to function types only, e.g. `std::function<int()>` // and `int (*)()`. 
- if (!BeforeRParen || !BeforeRParen->isOneOf(tok::greater, tok::r_paren)) + if (!BeforeRParen || BeforeRParen->isNoneOf(tok::greater, tok::r_paren)) return false; } else if (Previous->is(tok::star)) { Previous = Previous->getPreviousNonComment(); } if (Previous && Previous->Tok.getIdentifierInfo() && - !Previous->isOneOf(tok::kw_return, tok::kw_co_await, tok::kw_co_yield, + Previous->isNoneOf(tok::kw_return, tok::kw_co_await, tok::kw_co_yield, tok::kw_co_return)) { return false; } @@ -2450,7 +2450,7 @@ void UnwrappedLineParser::tryToParseJSFunction() { if (FormatTok->is(tok::l_brace)) tryToParseBracedList(); else - while (!FormatTok->isOneOf(tok::l_brace, tok::semi) && !eof()) + while (FormatTok->isNoneOf(tok::l_brace, tok::semi) && !eof()) nextToken(); } @@ -3108,11 +3108,11 @@ void UnwrappedLineParser::parseTryCatch() { for (bool SeenCatch = false;;) { if (FormatTok->is(tok::at)) nextToken(); - if (!(FormatTok->isOneOf(tok::kw_catch, Keywords.kw___except, - tok::kw___finally, tok::objc_catch, - tok::objc_finally) || - ((Style.isJava() || Style.isJavaScript()) && - FormatTok->is(Keywords.kw_finally)))) { + if (FormatTok->isNoneOf(tok::kw_catch, Keywords.kw___except, + tok::kw___finally, tok::objc_catch, + tok::objc_finally) && + !((Style.isJava() || Style.isJavaScript()) && + FormatTok->is(Keywords.kw_finally))) { break; } if (FormatTok->is(tok::kw_catch)) @@ -3290,7 +3290,7 @@ void UnwrappedLineParser::parseForOrWhileLoop(bool HasParens) { Keywords.kw_repeat))) && "'for', 'while' or foreach macro expected"); const bool KeepBraces = !Style.RemoveBracesLLVM || - !FormatTok->isOneOf(tok::kw_for, tok::kw_while); + FormatTok->isNoneOf(tok::kw_for, tok::kw_while); nextToken(); // JS' for await ( ... @@ -4339,7 +4339,7 @@ void UnwrappedLineParser::parseJavaScriptEs6ImportExport() { // to the terminating `;`. For everything else, just return and continue // parsing the structural element, i.e. the declaration or expression for // `export default`. 
- if (!IsImport && !FormatTok->isOneOf(tok::l_brace, tok::star) && + if (!IsImport && FormatTok->isNoneOf(tok::l_brace, tok::star) && !FormatTok->isStringLiteral() && !(FormatTok->is(Keywords.kw_type) && Tokens->peekNextToken()->isOneOf(tok::l_brace, tok::star))) { @@ -4886,7 +4886,7 @@ void UnwrappedLineParser::readToken(int LevelDifference) { const auto *Next = Tokens->peekNextToken(); if ((Style.isVerilog() && !Keywords.isVerilogPPDirective(*Next)) || (Style.isTableGen() && - !Next->isOneOf(tok::kw_else, tok::pp_define, tok::pp_ifdef, + Next->isNoneOf(tok::kw_else, tok::pp_define, tok::pp_ifdef, tok::pp_ifndef, tok::pp_endif))) { break; } diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index 30c06bb..54f366f 100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -462,7 +462,7 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End, if ((Style.PointerAlignment == FormatStyle::PAS_Right || Style.ReferenceAlignment == FormatStyle::RAS_Right) && CurrentChange.Spaces != 0 && - !CurrentChange.Tok->isOneOf(tok::equal, tok::r_paren, + CurrentChange.Tok->isNoneOf(tok::equal, tok::r_paren, TT_TemplateCloser)) { const bool ReferenceNotRightAligned = Style.ReferenceAlignment != FormatStyle::RAS_Right && diff --git a/clang/lib/Frontend/ChainedIncludesSource.cpp b/clang/lib/Frontend/ChainedIncludesSource.cpp index 82249f8..049277c 100644 --- a/clang/lib/Frontend/ChainedIncludesSource.cpp +++ b/clang/lib/Frontend/ChainedIncludesSource.cpp @@ -129,7 +129,7 @@ clang::createChainedIncludesSource(CompilerInstance &CI, Clang->setTarget(TargetInfo::CreateTargetInfo( Clang->getDiagnostics(), Clang->getInvocation().getTargetOpts())); Clang->createFileManager(); - Clang->createSourceManager(Clang->getFileManager()); + Clang->createSourceManager(); Clang->createPreprocessor(TU_Prefix); Clang->getDiagnosticClient().BeginSourceFile(Clang->getLangOpts(), &Clang->getPreprocessor()); diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index b1fb905..5844366 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -382,17 +382,18 @@ IntrusiveRefCntPtr<DiagnosticsEngine> CompilerInstance::createDiagnostics( // File Manager -FileManager *CompilerInstance::createFileManager() { +void CompilerInstance::createFileManager() { assert(VFS && "CompilerInstance needs a VFS for creating FileManager"); FileMgr = llvm::makeIntrusiveRefCnt<FileManager>(getFileSystemOpts(), VFS); - return FileMgr.get(); } // Source Manager -void CompilerInstance::createSourceManager(FileManager &FileMgr) { - SourceMgr = - llvm::makeIntrusiveRefCnt<SourceManager>(getDiagnostics(), FileMgr); +void CompilerInstance::createSourceManager() { + assert(Diagnostics && "DiagnosticsEngine needed for creating SourceManager"); + assert(FileMgr && "FileManager needed for creating SourceManager"); + SourceMgr = llvm::makeIntrusiveRefCnt<SourceManager>(getDiagnostics(), + getFileManager()); } // Initialize the remapping of files to alternative contents, e.g., @@ -1186,7 +1187,7 @@ std::unique_ptr<CompilerInstance> CompilerInstance::cloneForModuleCompileImpl( if (llvm::is_contained(DiagOpts.SystemHeaderWarningsModules, ModuleName)) Instance.getDiagnostics().setSuppressSystemWarnings(false); - Instance.createSourceManager(Instance.getFileManager()); + Instance.createSourceManager(); SourceManager &SourceMgr = Instance.getSourceManager(); if (ThreadSafeConfig) { diff 
--git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index 6cc3b65..1b63c40 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -879,7 +879,7 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI, // file, otherwise the CompilerInstance will happily destroy them. CI.setVirtualFileSystem(AST->getFileManager().getVirtualFileSystemPtr()); CI.setFileManager(AST->getFileManagerPtr()); - CI.createSourceManager(CI.getFileManager()); + CI.createSourceManager(); CI.getSourceManager().initializeForReplay(AST->getSourceManager()); // Preload all the module files loaded transitively by the AST unit. Also @@ -971,13 +971,10 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI, // Set up the file system, file and source managers, if needed. if (!CI.hasVirtualFileSystem()) CI.createVirtualFileSystem(); - if (!CI.hasFileManager()) { - if (!CI.createFileManager()) { - return false; - } - } + if (!CI.hasFileManager()) + CI.createFileManager(); if (!CI.hasSourceManager()) { - CI.createSourceManager(CI.getFileManager()); + CI.createSourceManager(); if (CI.getDiagnosticOpts().getFormat() == DiagnosticOptions::SARIF) { static_cast<SARIFDiagnosticPrinter *>(&CI.getDiagnosticClient()) ->setSarifWriter( diff --git a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp index 97a6a7f..3c20ccd 100644 --- a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp +++ b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp @@ -138,7 +138,16 @@ private: // LastStmt - refers to the last statement in the method body; referencing // LastStmt will remove the statement from the method body since // it will be linked from the new expression being constructed. - enum class PlaceHolder { _0, _1, _2, _3, _4, Handle = 128, LastStmt }; + enum class PlaceHolder { + _0, + _1, + _2, + _3, + _4, + Handle = 128, + CounterHandle, + LastStmt + }; Expr *convertPlaceholder(PlaceHolder PH); Expr *convertPlaceholder(LocalVar &Var); @@ -178,10 +187,14 @@ public: template <typename ResourceT, typename ValueT> BuiltinTypeMethodBuilder &setHandleFieldOnResource(ResourceT ResourceRecord, ValueT HandleValue); + template <typename T> + BuiltinTypeMethodBuilder & + accessCounterHandleFieldOnResource(T ResourceRecord); template <typename T> BuiltinTypeMethodBuilder &returnValue(T ReturnValue); BuiltinTypeMethodBuilder &returnThis(); BuiltinTypeDeclBuilder &finalize(); Expr *getResourceHandleExpr(); + Expr *getResourceCounterHandleExpr(); private: void createDecl(); @@ -346,6 +359,8 @@ TemplateParameterListBuilder::finalizeTemplateArgs(ConceptDecl *CD) { Expr *BuiltinTypeMethodBuilder::convertPlaceholder(PlaceHolder PH) { if (PH == PlaceHolder::Handle) return getResourceHandleExpr(); + if (PH == PlaceHolder::CounterHandle) + return getResourceCounterHandleExpr(); if (PH == PlaceHolder::LastStmt) { assert(!StmtsList.empty() && "no statements in the list"); @@ -467,6 +482,18 @@ Expr *BuiltinTypeMethodBuilder::getResourceHandleExpr() { OK_Ordinary); } +Expr *BuiltinTypeMethodBuilder::getResourceCounterHandleExpr() { + ensureCompleteDecl(); + + ASTContext &AST = DeclBuilder.SemaRef.getASTContext(); + CXXThisExpr *This = CXXThisExpr::Create( + AST, SourceLocation(), Method->getFunctionObjectParameterType(), true); + FieldDecl *HandleField = DeclBuilder.getResourceCounterHandleField(); + return MemberExpr::CreateImplicit(AST, This, false, HandleField, + HandleField->getType(), VK_LValue, + OK_Ordinary); +} + BuiltinTypeMethodBuilder & 
BuiltinTypeMethodBuilder::declareLocalVar(LocalVar &Var) { ensureCompleteDecl(); @@ -584,6 +611,22 @@ BuiltinTypeMethodBuilder::setHandleFieldOnResource(ResourceT ResourceRecord, } template <typename T> +BuiltinTypeMethodBuilder & +BuiltinTypeMethodBuilder::accessCounterHandleFieldOnResource(T ResourceRecord) { + ensureCompleteDecl(); + + Expr *ResourceExpr = convertPlaceholder(ResourceRecord); + + ASTContext &AST = DeclBuilder.SemaRef.getASTContext(); + FieldDecl *HandleField = DeclBuilder.getResourceCounterHandleField(); + MemberExpr *HandleExpr = MemberExpr::CreateImplicit( + AST, ResourceExpr, false, HandleField, HandleField->getType(), VK_LValue, + OK_Ordinary); + StmtsList.push_back(HandleExpr); + return *this; +} + +template <typename T> BuiltinTypeMethodBuilder &BuiltinTypeMethodBuilder::returnValue(T ReturnValue) { ensureCompleteDecl(); @@ -722,8 +765,31 @@ BuiltinTypeDeclBuilder::addMemberVariable(StringRef Name, QualType Type, return *this; } +BuiltinTypeDeclBuilder & +BuiltinTypeDeclBuilder::addBufferHandles(ResourceClass RC, bool IsROV, + bool RawBuffer, bool HasCounter, + AccessSpecifier Access) { + addHandleMember(RC, IsROV, RawBuffer, Access); + if (HasCounter) + addCounterHandleMember(RC, IsROV, RawBuffer, Access); + return *this; +} + BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addHandleMember( ResourceClass RC, bool IsROV, bool RawBuffer, AccessSpecifier Access) { + return addResourceMember("__handle", RC, IsROV, RawBuffer, + /*IsCounter=*/false, Access); +} + +BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addCounterHandleMember( + ResourceClass RC, bool IsROV, bool RawBuffer, AccessSpecifier Access) { + return addResourceMember("__counter_handle", RC, IsROV, RawBuffer, + /*IsCounter=*/true, Access); +} + +BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addResourceMember( + StringRef MemberName, ResourceClass RC, bool IsROV, bool RawBuffer, + bool IsCounter, AccessSpecifier Access) { assert(!Record->isCompleteDefinition() && "record is already complete"); ASTContext &Ctx = SemaRef.getASTContext(); @@ -739,9 +805,12 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addHandleMember( ElementTypeInfo ? 
HLSLContainedTypeAttr::CreateImplicit(Ctx, ElementTypeInfo) : nullptr}; + if (IsCounter) + Attrs.push_back(HLSLIsCounterAttr::CreateImplicit(Ctx)); + if (CreateHLSLAttributedResourceType(SemaRef, Ctx.HLSLResourceTy, Attrs, AttributedResTy)) - addMemberVariable("__handle", AttributedResTy, {}, Access); + addMemberVariable(MemberName, AttributedResTy, {}, Access); return *this; } @@ -844,12 +913,17 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addCopyConstructor() { using PH = BuiltinTypeMethodBuilder::PlaceHolder; - return BuiltinTypeMethodBuilder(*this, /*Name=*/"", AST.VoidTy, - /*IsConst=*/false, /*IsCtor=*/true) - .addParam("other", ConstRecordRefType) + BuiltinTypeMethodBuilder MMB(*this, /*Name=*/"", AST.VoidTy, + /*IsConst=*/false, /*IsCtor=*/true); + MMB.addParam("other", ConstRecordRefType) .accessHandleFieldOnResource(PH::_0) - .assign(PH::Handle, PH::LastStmt) - .finalize(); + .assign(PH::Handle, PH::LastStmt); + + if (getResourceCounterHandleField()) + MMB.accessCounterHandleFieldOnResource(PH::_0).assign(PH::CounterHandle, + PH::LastStmt); + + return MMB.finalize(); } BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addCopyAssignmentOperator() { @@ -863,12 +937,16 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addCopyAssignmentOperator() { using PH = BuiltinTypeMethodBuilder::PlaceHolder; DeclarationName Name = AST.DeclarationNames.getCXXOperatorName(OO_Equal); - return BuiltinTypeMethodBuilder(*this, Name, RecordRefType) - .addParam("other", ConstRecordRefType) + BuiltinTypeMethodBuilder MMB(*this, Name, RecordRefType); + MMB.addParam("other", ConstRecordRefType) .accessHandleFieldOnResource(PH::_0) - .assign(PH::Handle, PH::LastStmt) - .returnThis() - .finalize(); + .assign(PH::Handle, PH::LastStmt); + + if (getResourceCounterHandleField()) + MMB.accessCounterHandleFieldOnResource(PH::_0).assign(PH::CounterHandle, + PH::LastStmt); + + return MMB.returnThis().finalize(); } BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addArraySubscriptOperators() { @@ -903,6 +981,14 @@ FieldDecl *BuiltinTypeDeclBuilder::getResourceHandleField() const { return I->second; } +FieldDecl *BuiltinTypeDeclBuilder::getResourceCounterHandleField() const { + auto I = Fields.find("__counter_handle"); + if (I == Fields.end() || + !I->second->getType()->isHLSLAttributedResourceType()) + return nullptr; + return I->second; +} + QualType BuiltinTypeDeclBuilder::getFirstTemplateTypeParam() { assert(Template && "record it not a template"); if (const auto *TTD = dyn_cast<TemplateTypeParmDecl>( diff --git a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h index 9448af1..a981602 100644 --- a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h +++ b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h @@ -72,8 +72,9 @@ public: AccessSpecifier Access = AccessSpecifier::AS_private); BuiltinTypeDeclBuilder & - addHandleMember(ResourceClass RC, bool IsROV, bool RawBuffer, - AccessSpecifier Access = AccessSpecifier::AS_private); + addBufferHandles(ResourceClass RC, bool IsROV, bool RawBuffer, + bool HasCounter, + AccessSpecifier Access = AccessSpecifier::AS_private); BuiltinTypeDeclBuilder &addArraySubscriptOperators(); // Builtin types constructors @@ -95,7 +96,18 @@ public: BuiltinTypeDeclBuilder &addConsumeMethod(); private: + BuiltinTypeDeclBuilder &addResourceMember(StringRef MemberName, + ResourceClass RC, bool IsROV, + bool RawBuffer, bool IsCounter, + AccessSpecifier Access); + BuiltinTypeDeclBuilder & + addHandleMember(ResourceClass RC, bool IsROV, bool RawBuffer, + 
AccessSpecifier Access = AccessSpecifier::AS_private); + BuiltinTypeDeclBuilder & + addCounterHandleMember(ResourceClass RC, bool IsROV, bool RawBuffer, + AccessSpecifier Access = AccessSpecifier::AS_private); FieldDecl *getResourceHandleField() const; + FieldDecl *getResourceCounterHandleField() const; QualType getFirstTemplateTypeParam(); QualType getHandleElementType(); Expr *getConstantIntExpr(int value); diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index 464922b..cc43e94 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -230,9 +230,9 @@ void HLSLExternalSemaSource::defineTrivialHLSLTypes() { /// Set up common members and attributes for buffer types static BuiltinTypeDeclBuilder setupBufferType(CXXRecordDecl *Decl, Sema &S, ResourceClass RC, bool IsROV, - bool RawBuffer) { + bool RawBuffer, bool HasCounter) { return BuiltinTypeDeclBuilder(S, Decl) - .addHandleMember(RC, IsROV, RawBuffer) + .addBufferHandles(RC, IsROV, RawBuffer, HasCounter) .addDefaultHandleConstructor() .addCopyConstructor() .addCopyAssignmentOperator() @@ -377,7 +377,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { onCompletion(Decl, [this](CXXRecordDecl *Decl) { setupBufferType(Decl, *SemaPtr, ResourceClass::SRV, /*IsROV=*/false, - /*RawBuffer=*/false) + /*RawBuffer=*/false, /*HasCounter=*/false) .addArraySubscriptOperators() .addLoadMethods() .completeDefinition(); @@ -389,7 +389,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { onCompletion(Decl, [this](CXXRecordDecl *Decl) { setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, /*IsROV=*/false, - /*RawBuffer=*/false) + /*RawBuffer=*/false, /*HasCounter=*/false) .addArraySubscriptOperators() .addLoadMethods() .completeDefinition(); @@ -401,7 +401,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { .finalizeForwardDeclaration(); onCompletion(Decl, [this](CXXRecordDecl *Decl) { setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, /*IsROV=*/true, - /*RawBuffer=*/false) + /*RawBuffer=*/false, /*HasCounter=*/false) .addArraySubscriptOperators() .addLoadMethods() .completeDefinition(); @@ -412,7 +412,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { .finalizeForwardDeclaration(); onCompletion(Decl, [this](CXXRecordDecl *Decl) { setupBufferType(Decl, *SemaPtr, ResourceClass::SRV, /*IsROV=*/false, - /*RawBuffer=*/true) + /*RawBuffer=*/true, /*HasCounter=*/false) .addArraySubscriptOperators() .addLoadMethods() .completeDefinition(); @@ -423,7 +423,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { .finalizeForwardDeclaration(); onCompletion(Decl, [this](CXXRecordDecl *Decl) { setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, /*IsROV=*/false, - /*RawBuffer=*/true) + /*RawBuffer=*/true, /*HasCounter=*/true) .addArraySubscriptOperators() .addLoadMethods() .addIncrementCounterMethod() @@ -437,7 +437,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { .finalizeForwardDeclaration(); onCompletion(Decl, [this](CXXRecordDecl *Decl) { setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, /*IsROV=*/false, - /*RawBuffer=*/true) + /*RawBuffer=*/true, /*HasCounter=*/true) .addAppendMethod() .completeDefinition(); }); @@ -448,7 +448,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { .finalizeForwardDeclaration(); onCompletion(Decl, [this](CXXRecordDecl *Decl) { setupBufferType(Decl, *SemaPtr, 
ResourceClass::UAV, /*IsROV=*/false, - /*RawBuffer=*/true) + /*RawBuffer=*/true, /*HasCounter=*/true) .addConsumeMethod() .completeDefinition(); }); @@ -459,7 +459,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { .finalizeForwardDeclaration(); onCompletion(Decl, [this](CXXRecordDecl *Decl) { setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, /*IsROV=*/true, - /*RawBuffer=*/true) + /*RawBuffer=*/true, /*HasCounter=*/true) .addArraySubscriptOperators() .addLoadMethods() .addIncrementCounterMethod() @@ -471,14 +471,14 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { .finalizeForwardDeclaration(); onCompletion(Decl, [this](CXXRecordDecl *Decl) { setupBufferType(Decl, *SemaPtr, ResourceClass::SRV, /*IsROV=*/false, - /*RawBuffer=*/true) + /*RawBuffer=*/true, /*HasCounter=*/false) .completeDefinition(); }); Decl = BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, "RWByteAddressBuffer") .finalizeForwardDeclaration(); onCompletion(Decl, [this](CXXRecordDecl *Decl) { setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, /*IsROV=*/false, - /*RawBuffer=*/true) + /*RawBuffer=*/true, /*HasCounter=*/false) .completeDefinition(); }); Decl = BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, @@ -486,7 +486,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { .finalizeForwardDeclaration(); onCompletion(Decl, [this](CXXRecordDecl *Decl) { setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, /*IsROV=*/true, - /*RawBuffer=*/true) + /*RawBuffer=*/true, /*HasCounter=*/false) .completeDefinition(); }); } diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 3cc61b1..7ce3513 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -5954,9 +5954,6 @@ bool Sema::BuiltinAssumeAligned(CallExpr *TheCall) { if (Result > Sema::MaximumAlignment) Diag(TheCall->getBeginLoc(), diag::warn_assume_aligned_too_great) << SecondArg->getSourceRange() << Sema::MaximumAlignment; - - TheCall->setArg(1, - ConstantExpr::Create(Context, SecondArg, APValue(Result))); } if (NumArgs > 2) { diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 129b03c..fa30c66b 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -1810,6 +1810,13 @@ bool clang::CreateHLSLAttributedResourceType( } ResAttrs.RawBuffer = true; break; + case attr::HLSLIsCounter: + if (ResAttrs.IsCounter) { + S.Diag(A->getLocation(), diag::warn_duplicate_attribute_exact) << A; + return false; + } + ResAttrs.IsCounter = true; + break; case attr::HLSLContainedType: { const HLSLContainedTypeAttr *CTAttr = cast<HLSLContainedTypeAttr>(A); QualType Ty = CTAttr->getType(); @@ -1902,6 +1909,10 @@ bool SemaHLSL::handleResourceTypeAttr(QualType T, const ParsedAttr &AL) { A = HLSLRawBufferAttr::Create(getASTContext(), ACI); break; + case ParsedAttr::AT_HLSLIsCounter: + A = HLSLIsCounterAttr::Create(getASTContext(), ACI); + break; + case ParsedAttr::AT_HLSLContainedType: { if (AL.getNumArgs() != 1 && !AL.hasParsedType()) { Diag(AL.getLoc(), diag::err_attribute_wrong_number_arguments) << AL << 1; diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index a64f207..9aaf7f4 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -2789,7 +2789,7 @@ OpenACCPrivateRecipe SemaOpenACC::CreatePrivateInitRecipe(const Expr *VarExpr) { AllocaDecl->setInitStyle(VarDecl::CallInit); } - return OpenACCPrivateRecipe(AllocaDecl, Init.get()); + return OpenACCPrivateRecipe(AllocaDecl); 
} OpenACCFirstPrivateRecipe @@ -2828,7 +2828,14 @@ SemaOpenACC::CreateFirstPrivateInitRecipe(const Expr *VarExpr) { if (!ArrTy) { ExprResult Init = FinishValueInit( SemaRef.SemaRef, Entity, VarExpr->getBeginLoc(), VarTy, TemporaryDRE); - return OpenACCFirstPrivateRecipe(AllocaDecl, Init.get(), Temporary); + + // For 'no bounds' version, we can use this as a shortcut, so set the init + // anyway. + if (Init.isUsable()) { + AllocaDecl->setInit(Init.get()); + AllocaDecl->setInitStyle(VarDecl::CallInit); + } + return OpenACCFirstPrivateRecipe(AllocaDecl, Temporary); } // Arrays need to have each individual element initialized as there @@ -2875,8 +2882,16 @@ SemaOpenACC::CreateFirstPrivateInitRecipe(const Expr *VarExpr) { ExprResult Init = FinishValueInit(SemaRef.SemaRef, Entity, VarExpr->getBeginLoc(), VarTy, InitExpr); - return OpenACCFirstPrivateRecipe(AllocaDecl, Init.get(), Temporary); + // For 'no bounds' version, we can use this as a shortcut, so set the init + // anyway. + if (Init.isUsable()) { + AllocaDecl->setInit(Init.get()); + AllocaDecl->setInitStyle(VarDecl::CallInit); + } + + return OpenACCFirstPrivateRecipe(AllocaDecl, Temporary); } + OpenACCReductionRecipe SemaOpenACC::CreateReductionInitRecipe( OpenACCReductionOperator ReductionOperator, const Expr *VarExpr) { // TODO: OpenACC: This shouldn't be necessary, see PrivateInitRecipe @@ -2932,5 +2947,12 @@ OpenACCReductionRecipe SemaOpenACC::CreateReductionInitRecipe( ExprResult Init = FinishValueInit(SemaRef.SemaRef, Entity, VarExpr->getBeginLoc(), VarTy, InitExpr); - return OpenACCReductionRecipe(AllocaDecl, Init.get()); + + // For 'no bounds' version, we can use this as a shortcut, so set the init + // anyway. + if (Init.isUsable()) { + AllocaDecl->setInit(Init.get()); + AllocaDecl->setInitStyle(VarDecl::CallInit); + } + return OpenACCReductionRecipe(AllocaDecl); } diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index c05e428..6acf79a 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -12860,10 +12860,9 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() { llvm::SmallVector<OpenACCPrivateRecipe> RecipeList; for (unsigned I = 0; I < VarList.size(); ++I) { - static_assert(sizeof(OpenACCPrivateRecipe) == 2 * sizeof(int *)); + static_assert(sizeof(OpenACCPrivateRecipe) == 1 * sizeof(int *)); VarDecl *Alloca = readDeclAs<VarDecl>(); - Expr *InitExpr = readSubExpr(); - RecipeList.push_back({Alloca, InitExpr}); + RecipeList.push_back({Alloca}); } return OpenACCPrivateClause::Create(getContext(), BeginLoc, LParenLoc, @@ -12886,11 +12885,10 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() { llvm::SmallVector<Expr *> VarList = readOpenACCVarList(); llvm::SmallVector<OpenACCFirstPrivateRecipe> RecipeList; for (unsigned I = 0; I < VarList.size(); ++I) { - static_assert(sizeof(OpenACCFirstPrivateRecipe) == 3 * sizeof(int *)); + static_assert(sizeof(OpenACCFirstPrivateRecipe) == 2 * sizeof(int *)); VarDecl *Recipe = readDeclAs<VarDecl>(); - Expr *InitExpr = readSubExpr(); VarDecl *RecipeTemp = readDeclAs<VarDecl>(); - RecipeList.push_back({Recipe, InitExpr, RecipeTemp}); + RecipeList.push_back({Recipe, RecipeTemp}); } return OpenACCFirstPrivateClause::Create(getContext(), BeginLoc, LParenLoc, @@ -13011,10 +13009,9 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() { llvm::SmallVector<OpenACCReductionRecipe> RecipeList; for (unsigned I = 0; I < VarList.size(); ++I) { - static_assert(sizeof(OpenACCReductionRecipe) == 2 * sizeof(int *)); + 
static_assert(sizeof(OpenACCReductionRecipe) == sizeof(int *)); VarDecl *Recipe = readDeclAs<VarDecl>(); - Expr *InitExpr = readSubExpr(); - RecipeList.push_back({Recipe, InitExpr}); + RecipeList.push_back({Recipe}); } return OpenACCReductionClause::Create(getContext(), BeginLoc, LParenLoc, Op, diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index cdf95ba..09b1e58 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -8779,9 +8779,8 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { writeOpenACCVarList(PC); for (const OpenACCPrivateRecipe &R : PC->getInitRecipes()) { - static_assert(sizeof(R) == 2 * sizeof(int *)); + static_assert(sizeof(R) == 1 * sizeof(int *)); AddDeclRef(R.AllocaDecl); - AddStmt(const_cast<Expr *>(R.InitExpr)); } return; } @@ -8803,9 +8802,8 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { writeOpenACCVarList(FPC); for (const OpenACCFirstPrivateRecipe &R : FPC->getInitRecipes()) { - static_assert(sizeof(R) == 3 * sizeof(int *)); + static_assert(sizeof(R) == 2 * sizeof(int *)); AddDeclRef(R.AllocaDecl); - AddStmt(const_cast<Expr *>(R.InitExpr)); AddDeclRef(R.InitFromTemporary); } return; @@ -8927,9 +8925,8 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { writeOpenACCVarList(RC); for (const OpenACCReductionRecipe &R : RC->getRecipes()) { - static_assert(sizeof(OpenACCReductionRecipe) == 2 * sizeof(int *)); + static_assert(sizeof(OpenACCReductionRecipe) == 1 * sizeof(int *)); AddDeclRef(R.AllocaDecl); - AddStmt(const_cast<Expr *>(R.InitExpr)); } return; } diff --git a/clang/lib/Testing/TestAST.cpp b/clang/lib/Testing/TestAST.cpp index 9ad0de9..d333895 100644 --- a/clang/lib/Testing/TestAST.cpp +++ b/clang/lib/Testing/TestAST.cpp @@ -61,7 +61,7 @@ void createMissingComponents(CompilerInstance &Clang) { if (!Clang.hasFileManager()) Clang.createFileManager(); if (!Clang.hasSourceManager()) - Clang.createSourceManager(Clang.getFileManager()); + Clang.createSourceManager(); if (!Clang.hasTarget()) Clang.createTarget(); if (!Clang.hasPreprocessor()) diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp index 66cf2688..010380d 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp @@ -415,7 +415,7 @@ bool DependencyScanningAction::runInvocation( any(Service.getOptimizeArgs() & ScanningOptimizations::VFS); // Create a new FileManager to match the invocation's FileSystemOptions. - auto *FileMgr = ScanInstance.createFileManager(); + ScanInstance.createFileManager(); // Use the dependency scanning optimized file system if requested to do so. 
if (DepFS) { @@ -423,16 +423,17 @@ bool DependencyScanningAction::runInvocation( if (!ScanInstance.getHeaderSearchOpts().ModuleCachePath.empty()) { SmallString<256> ModulesCachePath; normalizeModuleCachePath( - *FileMgr, ScanInstance.getHeaderSearchOpts().ModuleCachePath, - ModulesCachePath); + ScanInstance.getFileManager(), + ScanInstance.getHeaderSearchOpts().ModuleCachePath, ModulesCachePath); DepFS->setBypassedPathPrefix(ModulesCachePath); } ScanInstance.setDependencyDirectivesGetter( - std::make_unique<ScanningDependencyDirectivesGetter>(*FileMgr)); + std::make_unique<ScanningDependencyDirectivesGetter>( + ScanInstance.getFileManager())); } - ScanInstance.createSourceManager(*FileMgr); + ScanInstance.createSourceManager(); // Create a collection of stable directories derived from the ScanInstance // for determining whether module dependencies would fully resolve from diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp index 2d4790b..ea5a372 100644 --- a/clang/lib/Tooling/Tooling.cpp +++ b/clang/lib/Tooling/Tooling.cpp @@ -458,7 +458,7 @@ bool FrontendActionFactory::runInvocation( if (!Compiler.hasDiagnostics()) return false; - Compiler.createSourceManager(*Files); + Compiler.createSourceManager(); const bool Success = Compiler.ExecuteAction(*ScopedToolAction); diff --git a/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl index a490b22..6779abb 100644 --- a/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl +++ b/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl @@ -12,7 +12,7 @@ // // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ // RUN: -DRESOURCE=RWStructuredBuffer %s | FileCheck -DRESOURCE=RWStructuredBuffer \ -// RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-SUBSCRIPT,CHECK-SUBSCRIPT-UAV,CHECK-COUNTER,CHECK-LOAD %s +// RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-SUBSCRIPT,CHECK-SUBSCRIPT-UAV,CHECK-COUNTER,CHECK-LOAD,CHECK-COUNTER-HANDLE %s // // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ // RUN: -DRESOURCE=AppendStructuredBuffer %s | FileCheck -DRESOURCE=AppendStructuredBuffer \ @@ -20,7 +20,7 @@ // // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ // RUN: -DRESOURCE=AppendStructuredBuffer %s | FileCheck -DRESOURCE=AppendStructuredBuffer \ -// RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-NOSUBSCRIPT,CHECK-APPEND %s +// RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-NOSUBSCRIPT,CHECK-APPEND,CHECK-COUNTER-HANDLE %s // // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ // RUN: -DRESOURCE=ConsumeStructuredBuffer %s | FileCheck -DRESOURCE=ConsumeStructuredBuffer \ @@ -28,7 +28,7 @@ // // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ // RUN: -DRESOURCE=ConsumeStructuredBuffer %s | FileCheck -DRESOURCE=ConsumeStructuredBuffer \ -// RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-NOSUBSCRIPT,CHECK-CONSUME %s +// RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-NOSUBSCRIPT,CHECK-CONSUME,CHECK-COUNTER-HANDLE %s // // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ // RUN: -DRESOURCE=RasterizerOrderedStructuredBuffer %s | FileCheck -DRESOURCE=RasterizerOrderedStructuredBuffer \ @@ -36,7 +36,7 @@ // // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ // RUN: -DRESOURCE=RasterizerOrderedStructuredBuffer %s | FileCheck -DRESOURCE=RasterizerOrderedStructuredBuffer \ -// RUN: 
-check-prefixes=CHECK,CHECK-UAV,CHECK-ROV,CHECK-SUBSCRIPT,CHECK-SUBSCRIPT-UAV,CHECK-LOAD %s +// RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-ROV,CHECK-SUBSCRIPT,CHECK-SUBSCRIPT-UAV,CHECK-LOAD,CHECK-COUNTER-HANDLE %s // This test tests two different AST generations for each structured buffer. // The "EMPTY" test mode verifies the AST generated by forward declaration @@ -113,6 +113,11 @@ RESOURCE<float> Buffer; // CHECK-NEXT: CXXThisExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue implicit this // CHECK-NEXT: MemberExpr {{.*}} lvalue .__handle // CHECK-NEXT: DeclRefExpr {{.*}} 'const hlsl::[[RESOURCE]]<element_type>' ParmVar {{.*}} 'other' 'const hlsl::[[RESOURCE]]<element_type> &' +// CHECK-COUNTER-HANDLE-NEXT: BinaryOperator {{.*}} '=' +// CHECK-COUNTER-HANDLE-NEXT: MemberExpr {{.*}} lvalue .__counter_handle +// CHECK-COUNTER-HANDLE-NEXT: CXXThisExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue implicit this +// CHECK-COUNTER-HANDLE-NEXT: MemberExpr {{.*}} lvalue .__counter_handle +// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'const hlsl::[[RESOURCE]]<element_type>' ParmVar {{.*}} 'other' 'const hlsl::[[RESOURCE]]<element_type> &' // CHECK-NEXT: AlwaysInlineAttr // operator= @@ -125,6 +130,11 @@ RESOURCE<float> Buffer; // CHECK-NEXT: CXXThisExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue implicit this // CHECK-NEXT: MemberExpr {{.*}} lvalue .__handle // CHECK-NEXT: DeclRefExpr {{.*}} 'const hlsl::[[RESOURCE]]<element_type>' ParmVar {{.*}} 'other' 'const hlsl::[[RESOURCE]]<element_type> &' +// CHECK-COUNTER-HANDLE: BinaryOperator {{.*}} '=' +// CHECK-COUNTER-HANDLE: MemberExpr {{.*}} lvalue .__counter_handle +// CHECK-COUNTER-HANDLE: CXXThisExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue implicit this +// CHECK-COUNTER-HANDLE: MemberExpr {{.*}} lvalue .__counter_handle +// CHECK-COUNTER-HANDLE: DeclRefExpr {{.*}} 'const hlsl::[[RESOURCE]]<element_type>' ParmVar {{.*}} 'other' 'const hlsl::[[RESOURCE]]<element_type> &' // CHECK-NEXT: ReturnStmt // CHECK-NEXT: CXXThisExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue implicit this // CHECK-NEXT: AlwaysInlineAttr @@ -334,3 +344,8 @@ RESOURCE<float> Buffer; // CHECK-ROV-SAME{LITERAL}: [[hlsl::is_rov]] // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] +// CHECK-COUNTER-HANDLE: FieldDecl {{.*}} implicit referenced __counter_handle '__hlsl_resource_t +// CHECK-COUNTER-HANDLE-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-COUNTER-HANDLE-SAME{LITERAL}: [[hlsl::raw_buffer]] +// CHECK-COUNTER-HANDLE-SAME{LITERAL}: [[hlsl::is_counter]] +// CHECK-COUNTER-HANDLE-SAME{LITERAL}: [[hlsl::contained_type(float)]] diff --git a/clang/test/CodeGen/X86/avx512ifma-builtins.c b/clang/test/CodeGen/X86/avx512ifma-builtins.c index 7c7c492..eebefb0 100644 --- a/clang/test/CodeGen/X86/avx512ifma-builtins.c +++ b/clang/test/CodeGen/X86/avx512ifma-builtins.c @@ -3,6 +3,11 @@ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none 
-ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + #include <immintrin.h> diff --git a/clang/test/CodeGen/X86/avx512ifmavl-builtins.c b/clang/test/CodeGen/X86/avx512ifmavl-builtins.c index c115b60..89108fc 100644 --- a/clang/test/CodeGen/X86/avx512ifmavl-builtins.c +++ b/clang/test/CodeGen/X86/avx512ifmavl-builtins.c @@ -3,6 +3,12 @@ // RUN: %clang_cc1 -x c++ %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -x c++ %s -flax-vector-conversions=none -ffreestanding -triple=i386-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c %s -flax-vector-conversions=none -ffreestanding -triple=i386-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ %s -flax-vector-conversions=none -ffreestanding -triple=i386-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + + #include <immintrin.h> __m128i test_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) { diff --git a/clang/test/CodeGen/X86/avxifma-builtins.c b/clang/test/CodeGen/X86/avxifma-builtins.c index dd0f220..aa15159 100644 --- a/clang/test/CodeGen/X86/avxifma-builtins.c +++ b/clang/test/CodeGen/X86/avxifma-builtins.c @@ -3,6 +3,12 @@ // RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror 
-fexperimental-new-constant-interpreter | FileCheck %s + + #include <immintrin.h> __m128i test_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) { diff --git a/clang/test/CodeGenHLSL/resources/RWStructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/resources/RWStructuredBuffer-elementtype.hlsl index 472b9a8..9f0a5b7 100644 --- a/clang/test/CodeGenHLSL/resources/RWStructuredBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/resources/RWStructuredBuffer-elementtype.hlsl @@ -1,23 +1,36 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK // RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=SPV -// CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", i16, 1, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer.0" = type { target("dx.RawBuffer", i16, 1, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer.1" = type { target("dx.RawBuffer", i32, 1, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer.2" = type { target("dx.RawBuffer", i32, 1, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer.3" = type { target("dx.RawBuffer", i64, 1, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer.4" = type { target("dx.RawBuffer", i64, 1, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer.5" = type { target("dx.RawBuffer", half, 1, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer.6" = type { target("dx.RawBuffer", float, 1, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer.7" = type { target("dx.RawBuffer", double, 1, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer.8" = type { target("dx.RawBuffer", <4 x i16>, 1, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer.9" = type { target("dx.RawBuffer", <3 x i32>, 1, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer.10" = type { target("dx.RawBuffer", <2 x half>, 1, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer.11" = type { target("dx.RawBuffer", <3 x float>, 1, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer.12" = type { target("dx.RawBuffer", i32, 1, 0) } -// SPV: %"class.hlsl::RWStructuredBuffer.12" = type { target("spirv.VulkanBuffer", [0 x i32], 12, 1) -// CHECK: %"class.hlsl::RWStructuredBuffer.13" = type { target("dx.RawBuffer", <4 x i32>, 1, 0) } -// SPV: %"class.hlsl::RWStructuredBuffer.13" = type { target("spirv.VulkanBuffer", [0 x <4 x i32>], 12, 1) +// CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", i16, 1, 0), target("dx.RawBuffer", i16, 1, 0) } +// SPV: %"class.hlsl::RWStructuredBuffer" = type { target("spirv.VulkanBuffer", [0 x i16], 12, 1), target("spirv.VulkanBuffer", i32, 12, 1) } +// CHECK: %"class.hlsl::RWStructuredBuffer.0" = type { target("dx.RawBuffer", i16, 1, 0), target("dx.RawBuffer", i16, 1, 0) } +// SPV: %"class.hlsl::RWStructuredBuffer.0" = type { target("spirv.VulkanBuffer", [0 x i16], 12, 1), target("spirv.VulkanBuffer", i32, 12, 1) } +// CHECK: %"class.hlsl::RWStructuredBuffer.1" = type { target("dx.RawBuffer", i32, 1, 0), target("dx.RawBuffer", i32, 1, 0) } +// SPV: %"class.hlsl::RWStructuredBuffer.1" = type { target("spirv.VulkanBuffer", [0 x i32], 12, 1), target("spirv.VulkanBuffer", i32, 12, 1) } +// CHECK: %"class.hlsl::RWStructuredBuffer.2" = type { target("dx.RawBuffer", i32, 1, 0), target("dx.RawBuffer", i32, 1, 0) } +// SPV: %"class.hlsl::RWStructuredBuffer.2" = type { target("spirv.VulkanBuffer", [0 x i32], 12, 1), target("spirv.VulkanBuffer", i32, 12, 1) } +// CHECK: 
%"class.hlsl::RWStructuredBuffer.3" = type { target("dx.RawBuffer", i64, 1, 0), target("dx.RawBuffer", i64, 1, 0) } +// SPV: %"class.hlsl::RWStructuredBuffer.3" = type { target("spirv.VulkanBuffer", [0 x i64], 12, 1), target("spirv.VulkanBuffer", i32, 12, 1) } +// CHECK: %"class.hlsl::RWStructuredBuffer.4" = type { target("dx.RawBuffer", i64, 1, 0), target("dx.RawBuffer", i64, 1, 0) } +// SPV: %"class.hlsl::RWStructuredBuffer.4" = type { target("spirv.VulkanBuffer", [0 x i64], 12, 1), target("spirv.VulkanBuffer", i32, 12, 1) } +// CHECK: %"class.hlsl::RWStructuredBuffer.5" = type { target("dx.RawBuffer", half, 1, 0), target("dx.RawBuffer", half, 1, 0) } +// SPV: %"class.hlsl::RWStructuredBuffer.5" = type { target("spirv.VulkanBuffer", [0 x half], 12, 1), target("spirv.VulkanBuffer", i32, 12, 1) } +// CHECK: %"class.hlsl::RWStructuredBuffer.6" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) } +// SPV: %"class.hlsl::RWStructuredBuffer.6" = type { target("spirv.VulkanBuffer", [0 x float], 12, 1), target("spirv.VulkanBuffer", i32, 12, 1) } +// CHECK: %"class.hlsl::RWStructuredBuffer.7" = type { target("dx.RawBuffer", double, 1, 0), target("dx.RawBuffer", double, 1, 0) } +// SPV: %"class.hlsl::RWStructuredBuffer.7" = type { target("spirv.VulkanBuffer", [0 x double], 12, 1), target("spirv.VulkanBuffer", i32, 12, 1) } +// CHECK: %"class.hlsl::RWStructuredBuffer.8" = type { target("dx.RawBuffer", <4 x i16>, 1, 0), target("dx.RawBuffer", <4 x i16>, 1, 0) } +// SPV: %"class.hlsl::RWStructuredBuffer.8" = type { target("spirv.VulkanBuffer", [0 x <4 x i16>], 12, 1), target("spirv.VulkanBuffer", i32, 12, 1) } +// CHECK: %"class.hlsl::RWStructuredBuffer.9" = type { target("dx.RawBuffer", <3 x i32>, 1, 0), target("dx.RawBuffer", <3 x i32>, 1, 0) } +// SPV: %"class.hlsl::RWStructuredBuffer.9" = type { target("spirv.VulkanBuffer", [0 x <3 x i32>], 12, 1), target("spirv.VulkanBuffer", i32, 12, 1) } +// CHECK: %"class.hlsl::RWStructuredBuffer.10" = type { target("dx.RawBuffer", <2 x half>, 1, 0), target("dx.RawBuffer", <2 x half>, 1, 0) } +// SPV: %"class.hlsl::RWStructuredBuffer.10" = type { target("spirv.VulkanBuffer", [0 x <2 x half>], 12, 1), target("spirv.VulkanBuffer", i32, 12, 1) } +// CHECK: %"class.hlsl::RWStructuredBuffer.11" = type { target("dx.RawBuffer", <3 x float>, 1, 0), target("dx.RawBuffer", <3 x float>, 1, 0) } +// SPV: %"class.hlsl::RWStructuredBuffer.11" = type { target("spirv.VulkanBuffer", [0 x <3 x float>], 12, 1), target("spirv.VulkanBuffer", i32, 12, 1) } +// CHECK: %"class.hlsl::RWStructuredBuffer.12" = type { target("dx.RawBuffer", i32, 1, 0), target("dx.RawBuffer", i32, 1, 0) } +// SPV: %"class.hlsl::RWStructuredBuffer.12" = type { target("spirv.VulkanBuffer", [0 x i32], 12, 1), target("spirv.VulkanBuffer", i32, 12, 1) } +// CHECK: %"class.hlsl::RWStructuredBuffer.13" = type { target("dx.RawBuffer", <4 x i32>, 1, 0), target("dx.RawBuffer", <4 x i32>, 1, 0) } +// SPV: %"class.hlsl::RWStructuredBuffer.13" = type { target("spirv.VulkanBuffer", [0 x <4 x i32>], 12, 1), target("spirv.VulkanBuffer", i32, 12, 1) } RWStructuredBuffer<int16_t> BufI16; RWStructuredBuffer<uint16_t> BufU16; diff --git a/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl index 6c5a705..c97ad42 100644 --- a/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl +++ 
b/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl @@ -5,19 +5,19 @@ struct MyStruct { int2 b; }; -// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer" = type { target("dx.RawBuffer", i16, 1, 1) } -// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.0" = type { target("dx.RawBuffer", i16, 1, 1) } -// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.1" = type { target("dx.RawBuffer", i32, 1, 1) } -// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.2" = type { target("dx.RawBuffer", i32, 1, 1) } -// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.3" = type { target("dx.RawBuffer", i64, 1, 1) } -// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.4" = type { target("dx.RawBuffer", i64, 1, 1) } -// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.5" = type { target("dx.RawBuffer", half, 1, 1) } -// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.6" = type { target("dx.RawBuffer", float, 1, 1) } -// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.7" = type { target("dx.RawBuffer", double, 1, 1) } -// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.8" = type { target("dx.RawBuffer", <4 x i16>, 1, 1) } -// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.9" = type { target("dx.RawBuffer", <3 x i32>, 1, 1) } -// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.10" = type { target("dx.RawBuffer", <2 x half>, 1, 1) } -// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.11" = type { target("dx.RawBuffer", <3 x float>, 1, 1) } +// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer" = type { target("dx.RawBuffer", i16, 1, 1), target("dx.RawBuffer", i16, 1, 1) } +// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.0" = type { target("dx.RawBuffer", i16, 1, 1), target("dx.RawBuffer", i16, 1, 1) } +// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.1" = type { target("dx.RawBuffer", i32, 1, 1), target("dx.RawBuffer", i32, 1, 1) } +// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.2" = type { target("dx.RawBuffer", i32, 1, 1), target("dx.RawBuffer", i32, 1, 1) } +// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.3" = type { target("dx.RawBuffer", i64, 1, 1), target("dx.RawBuffer", i64, 1, 1) } +// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.4" = type { target("dx.RawBuffer", i64, 1, 1), target("dx.RawBuffer", i64, 1, 1) } +// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.5" = type { target("dx.RawBuffer", half, 1, 1), target("dx.RawBuffer", half, 1, 1) } +// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.6" = type { target("dx.RawBuffer", float, 1, 1), target("dx.RawBuffer", float, 1, 1) } +// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.7" = type { target("dx.RawBuffer", double, 1, 1), target("dx.RawBuffer", double, 1, 1) } +// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.8" = type { target("dx.RawBuffer", <4 x i16>, 1, 1), target("dx.RawBuffer", <4 x i16>, 1, 1) } +// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.9" = type { target("dx.RawBuffer", <3 x i32>, 1, 1), target("dx.RawBuffer", <3 x i32>, 1, 1) } +// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.10" = type { target("dx.RawBuffer", <2 x half>, 1, 1), target("dx.RawBuffer", <2 x half>, 1, 1) } +// DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer.11" = type { target("dx.RawBuffer", <3 x float>, 1, 1), target("dx.RawBuffer", <3 x float>, 1, 1) } // DXIL: %struct.MyStruct = type <{ <4 x float>, <2 x i32> }> RasterizerOrderedStructuredBuffer<int16_t> 
BufI16; diff --git a/clang/test/CodeGenHLSL/resources/StructuredBuffers-constructors.hlsl b/clang/test/CodeGenHLSL/resources/StructuredBuffers-constructors.hlsl index 4f005ea..89a66b0 100644 --- a/clang/test/CodeGenHLSL/resources/StructuredBuffers-constructors.hlsl +++ b/clang/test/CodeGenHLSL/resources/StructuredBuffers-constructors.hlsl @@ -21,8 +21,8 @@ export void foo() { } // CHECK-DXIL: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", float, 0, 0) } -// CHECK-DXIL: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) } -// CHECK-DXIL: %"class.hlsl::AppendStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) } +// CHECK-DXIL: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) } +// CHECK-DXIL: %"class.hlsl::AppendStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) } // CHECK: @Buf1 = internal global %"class.hlsl::StructuredBuffer" poison, align 4 // CHECK: @[[Buf1Str:.*]] = private unnamed_addr constant [5 x i8] c"Buf1\00", align 1 diff --git a/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-lib.hlsl b/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-lib.hlsl index 93aa218..43ddd2e 100644 --- a/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-lib.hlsl +++ b/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-lib.hlsl @@ -10,9 +10,9 @@ AppendStructuredBuffer<float> ASB : register(u2); ConsumeStructuredBuffer<float> CSB : register(u3); // CHECK: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", float, 0, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) } -// CHECK: %"class.hlsl::AppendStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) } -// CHECK: %"class.hlsl::ConsumeStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) } +// CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) } +// CHECK: %"class.hlsl::AppendStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) } +// CHECK: %"class.hlsl::ConsumeStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) } export int TestIncrementCounter() { return RWSB1.IncrementCounter(); diff --git a/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-ps.hlsl b/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-ps.hlsl index b513963..9e08a6d 100644 --- a/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-ps.hlsl +++ b/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-ps.hlsl @@ -6,7 +6,7 @@ RWStructuredBuffer<float> RWSB1, RWSB2; RasterizerOrderedStructuredBuffer<float> ROSB1, ROSB2; -// CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) } +// CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) } export void TestIncrementCounter() { // CHECK: define void @_Z20TestIncrementCounterv() diff --git a/clang/test/CodeGenHLSL/resources/resource-bindings.hlsl b/clang/test/CodeGenHLSL/resources/resource-bindings.hlsl index 4ffa7cf..1d85048 100644 --- a/clang/test/CodeGenHLSL/resources/resource-bindings.hlsl +++ b/clang/test/CodeGenHLSL/resources/resource-bindings.hlsl @@ -4,7 +4,7 @@ // CHECK: %"class.hlsl::RWBuffer" = type { target("dx.TypedBuffer", <4 x float>, 1, 0, 0) } // CHECK: 
%"class.hlsl::RWBuffer.0" = type { target("dx.TypedBuffer", float, 1, 0, 0) } // CHECK: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", i32, 0, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", %struct.S, 1, 0) } +// CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", %struct.S, 1, 0), target("dx.RawBuffer", %struct.S, 1, 0) } // CHECK: %"class.hlsl::RWBuffer.1" = type { target("dx.TypedBuffer", double, 1, 0, 0) } // CHECK: @_ZL4U0S0 = internal global %"class.hlsl::RWBuffer" poison, align 4 diff --git a/clang/test/SemaCXX/builtin-assume-aligned.cpp b/clang/test/SemaCXX/builtin-assume-aligned.cpp index afc11cc..48bd841 100644 --- a/clang/test/SemaCXX/builtin-assume-aligned.cpp +++ b/clang/test/SemaCXX/builtin-assume-aligned.cpp @@ -47,9 +47,3 @@ constexpr void *s1 = __builtin_assume_aligned(x, 32); constexpr void *s2 = __builtin_assume_aligned(x, 32, 5); constexpr void *s3 = __builtin_assume_aligned(x, 32, -1); - -constexpr int add(int a, int b) { - return a+b; -} -constexpr void *c1 = __builtin_assume_aligned(p, add(1,1)); -constexpr void *c2 = __builtin_assume_aligned(p, add(2,1)); // expected-error {{not a power of 2}} diff --git a/clang/tools/clang-import-test/clang-import-test.cpp b/clang/tools/clang-import-test/clang-import-test.cpp index 910e08c..977cec1 100644 --- a/clang/tools/clang-import-test/clang-import-test.cpp +++ b/clang/tools/clang-import-test/clang-import-test.cpp @@ -216,7 +216,7 @@ std::unique_ptr<CompilerInstance> BuildCompilerInstance() { Ins->getTarget().adjust(Ins->getDiagnostics(), Ins->getLangOpts(), /*AuxTarget=*/nullptr); Ins->createFileManager(); - Ins->createSourceManager(Ins->getFileManager()); + Ins->createSourceManager(); Ins->createPreprocessor(TU_Complete); return Ins; diff --git a/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp b/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp index 594c79a..de20e74 100644 --- a/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp +++ b/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp @@ -462,9 +462,10 @@ static Error runAOTCompile(StringRef InputFile, StringRef OutputFile, // TODO: Consider using LLVM-IR metadata to identify globals of interest bool isKernel(const Function &F) { - const CallingConv::ID CC = F.getCallingConv(); - return CC == CallingConv::SPIR_KERNEL || CC == CallingConv::AMDGPU_KERNEL || - CC == CallingConv::PTX_Kernel; + const llvm::CallingConv::ID CC = F.getCallingConv(); + return CC == llvm::CallingConv::SPIR_KERNEL || + CC == llvm::CallingConv::AMDGPU_KERNEL || + CC == llvm::CallingConv::PTX_Kernel; } /// Performs the following steps: diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 30e2be7..c39f337 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -2832,10 +2832,8 @@ void OpenACCClauseEnqueue::VisitTileClause(const OpenACCTileClause &C) { void OpenACCClauseEnqueue::VisitPrivateClause(const OpenACCPrivateClause &C) { VisitVarList(C); - for (const OpenACCPrivateRecipe &R : C.getInitRecipes()) { + for (const OpenACCPrivateRecipe &R : C.getInitRecipes()) Visitor.AddDecl(R.AllocaDecl); - Visitor.AddStmt(R.InitExpr); - } } void OpenACCClauseEnqueue::VisitHostClause(const OpenACCHostClause &C) { @@ -2851,7 +2849,6 @@ void OpenACCClauseEnqueue::VisitFirstPrivateClause( VisitVarList(C); for (const OpenACCFirstPrivateRecipe &R : C.getInitRecipes()) { Visitor.AddDecl(R.AllocaDecl); - Visitor.AddStmt(R.InitExpr); Visitor.AddDecl(R.InitFromTemporary); } } @@ -2927,10 
+2924,8 @@ void OpenACCClauseEnqueue::VisitDeviceTypeClause( void OpenACCClauseEnqueue::VisitReductionClause( const OpenACCReductionClause &C) { VisitVarList(C); - for (const OpenACCReductionRecipe &R : C.getRecipes()) { + for (const OpenACCReductionRecipe &R : C.getRecipes()) Visitor.AddDecl(R.AllocaDecl); - Visitor.AddStmt(R.InitExpr); - } } void OpenACCClauseEnqueue::VisitAutoClause(const OpenACCAutoClause &C) {} void OpenACCClauseEnqueue::VisitIndependentClause( diff --git a/clang/unittests/Analysis/CFGTest.cpp b/clang/unittests/Analysis/CFGTest.cpp index 46a6751..6aa09a8 100644 --- a/clang/unittests/Analysis/CFGTest.cpp +++ b/clang/unittests/Analysis/CFGTest.cpp @@ -93,6 +93,159 @@ TEST(CFG, DependantBaseAddImplicitDtors) { .getStatus()); } +TEST(CFG, SwitchCoveredEnumNoDefault) { + const char *Code = R"( + enum class E {E1, E2}; + int f(E e) { + switch(e) { + case E::E1: + return 1; + case E::E2: + return 2; + } + return 0; + } + )"; + CFG::BuildOptions Options; + Options.AssumeReachableDefaultInSwitchStatements = true; + BuildResult B = BuildCFG(Code, Options); + ASSERT_EQ(BuildResult::BuiltCFG, B.getStatus()); + + // [B5 (ENTRY)] + // Succs (1): B2 + // + // [B1] + // 1: 0 + // 2: return [B1.1]; + // Preds (1): B2 + // Succs (1): B0 + // + // [B2] + // 1: e (ImplicitCastExpr, LValueToRValue, E) + // T: switch [B2.1] + // Preds (1): B5 + // Succs (3): B3 B4 B1 + // + // [B3] + // case E::E2: + // 1: 2 + // 2: return [B3.1]; + // Preds (1): B2 + // Succs (1): B0 + // + // [B4] + // case E::E1: + // 1: 1 + // 2: return [B4.1]; + // Preds (1): B2 + // Succs (1): B0 + // + // [B0 (EXIT)] + // Preds (3): B1 B3 B4 + + auto *CFG = B.getCFG(); + const auto &Entry = CFG->getEntry(); + ASSERT_EQ(1u, Entry.succ_size()); + // First successor of Entry is the switch + CFGBlock *SwitchBlock = *Entry.succ_begin(); + ASSERT_EQ(3u, SwitchBlock->succ_size()); + // Last successor of the switch is after the switch + auto NoCaseSucc = SwitchBlock->succ_rbegin(); + EXPECT_TRUE(NoCaseSucc->isReachable()); + + // Checking that the same node is Unreachable without this setting + Options.AssumeReachableDefaultInSwitchStatements = false; + B = BuildCFG(Code, Options); + ASSERT_EQ(BuildResult::BuiltCFG, B.getStatus()); + + const auto &Entry2 = B.getCFG()->getEntry(); + ASSERT_EQ(1u, Entry2.succ_size()); + CFGBlock *SwitchBlock2 = *Entry2.succ_begin(); + ASSERT_EQ(3u, SwitchBlock2->succ_size()); + auto NoCaseSucc2 = SwitchBlock2->succ_rbegin(); + EXPECT_FALSE(NoCaseSucc2->isReachable()); +} + +TEST(CFG, SwitchCoveredEnumWithDefault) { + const char *Code = R"( + enum class E {E1, E2}; + int f(E e) { + switch(e) { + case E::E1: + return 1; + case E::E2: + return 2; + default: + return 0; + } + return -1; + } + )"; + CFG::BuildOptions Options; + Options.AssumeReachableDefaultInSwitchStatements = true; + BuildResult B = BuildCFG(Code, Options); + ASSERT_EQ(BuildResult::BuiltCFG, B.getStatus()); + + // [B6 (ENTRY)] + // Succs (1): B2 + // + // [B1] + // 1: -1 + // 2: return [B1.1]; + // Succs (1): B0 + // + // [B2] + // 1: e (ImplicitCastExpr, LValueToRValue, E) + // T: switch [B2.1] + // Preds (1): B6 + // Succs (3): B4 B5 B3 + // + // [B3] + // default: + // 1: 0 + // 2: return [B3.1]; + // Preds (1): B2 + // Succs (1): B0 + // + // [B4] + // case E::E2: + // 1: 2 + // 2: return [B4.1]; + // Preds (1): B2 + // Succs (1): B0 + // + // [B5] + // case E::E1: + // 1: 1 + // 2: return [B5.1]; + // Preds (1): B2 + // Succs (1): B0 + // + // [B0 (EXIT)] + // Preds (4): B1 B3 B4 B5 + + const auto &Entry = 
B.getCFG()->getEntry(); + ASSERT_EQ(1u, Entry.succ_size()); + // First successor of Entry is the switch + CFGBlock *SwitchBlock = *Entry.succ_begin(); + ASSERT_EQ(3u, SwitchBlock->succ_size()); + // Last successor of the switch is the default branch + auto defaultBlock = SwitchBlock->succ_rbegin(); + EXPECT_TRUE(defaultBlock->isReachable()); + + // Checking that the same node is Unreachable without this setting + Options.AssumeReachableDefaultInSwitchStatements = false; + B = BuildCFG(Code, Options); + ASSERT_EQ(BuildResult::BuiltCFG, B.getStatus()); + + const auto &Entry2 = B.getCFG()->getEntry(); + ASSERT_EQ(1u, Entry2.succ_size()); + CFGBlock *SwitchBlock2 = *Entry2.succ_begin(); + ASSERT_EQ(3u, SwitchBlock2->succ_size()); + auto defaultBlock2 = SwitchBlock2->succ_rbegin(); + EXPECT_FALSE(defaultBlock2->isReachable()); +} + TEST(CFG, IsLinear) { auto expectLinear = [](bool IsLinear, const char *Code) { BuildResult B = BuildCFG(Code); diff --git a/clang/unittests/CodeGen/TestCompiler.h b/clang/unittests/CodeGen/TestCompiler.h index 57b5b07..9bd9060 100644 --- a/clang/unittests/CodeGen/TestCompiler.h +++ b/clang/unittests/CodeGen/TestCompiler.h @@ -52,7 +52,7 @@ struct TestCompiler { PtrSize = TInfo.getPointerWidth(clang::LangAS::Default) / 8; compiler.createFileManager(); - compiler.createSourceManager(compiler.getFileManager()); + compiler.createSourceManager(); compiler.createPreprocessor(clang::TU_Prefix); compiler.createASTContext(); diff --git a/clang/unittests/Serialization/ForceCheckFileInputTest.cpp b/clang/unittests/Serialization/ForceCheckFileInputTest.cpp index 24e2fd6..edf33ae 100644 --- a/clang/unittests/Serialization/ForceCheckFileInputTest.cpp +++ b/clang/unittests/Serialization/ForceCheckFileInputTest.cpp @@ -122,8 +122,8 @@ export int aa = 43; Clang.setDiagnostics(Diags); Clang.createVirtualFileSystem(CIOpts.VFS); - FileManager *FM = Clang.createFileManager(); - Clang.createSourceManager(*FM); + Clang.createFileManager(); + Clang.createSourceManager(); EXPECT_TRUE(Clang.createTarget()); Clang.createPreprocessor(TU_Complete); diff --git a/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp b/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp index 80289ef..aa32bb3 100644 --- a/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp +++ b/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp @@ -65,7 +65,7 @@ public: if (!Compiler.hasDiagnostics()) return false; - Compiler.createSourceManager(*FileMgr); + Compiler.createSourceManager(); Compiler.addDependencyCollector(std::make_shared<TestFileCollector>( Compiler.getInvocation().getDependencyOutputOpts(), Deps)); diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index 57be863..e595e61 100644 --- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -41,7 +41,9 @@ #include "mlir/Pass/Pass.h" #include "mlir/Support/LLVM.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/StringSet.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Support/raw_ostream.h" #include <algorithm> #include <cstddef> #include <iterator> @@ -75,6 +77,112 @@ class MapInfoFinalizationPass /// | | std::map<mlir::Operation *, mlir::Value> localBoxAllocas; + /// Return true if the given path exists in a list of paths. 
+ static bool + containsPath(const llvm::SmallVectorImpl<llvm::SmallVector<int64_t>> &paths, + llvm::ArrayRef<int64_t> path) { + return llvm::any_of(paths, [&](const llvm::SmallVector<int64_t> &p) { + return p.size() == path.size() && + std::equal(p.begin(), p.end(), path.begin()); + }); + } + + /// Return true if the given path is already present in + /// op.getMembersIndexAttr(). + static bool mappedIndexPathExists(mlir::omp::MapInfoOp op, + llvm::ArrayRef<int64_t> indexPath) { + if (mlir::ArrayAttr attr = op.getMembersIndexAttr()) { + for (mlir::Attribute list : attr) { + auto listAttr = mlir::cast<mlir::ArrayAttr>(list); + if (listAttr.size() != indexPath.size()) + continue; + bool allEq = true; + for (auto [i, val] : llvm::enumerate(listAttr)) { + if (mlir::cast<mlir::IntegerAttr>(val).getInt() != indexPath[i]) { + allEq = false; + break; + } + } + if (allEq) + return true; + } + } + return false; + } + + /// Build a compact string key for an index path for set-based + /// deduplication. Format: "N:v0,v1,..." where N is the length. + static void buildPathKey(llvm::ArrayRef<int64_t> path, + llvm::SmallString<64> &outKey) { + outKey.clear(); + llvm::raw_svector_ostream os(outKey); + os << path.size() << ':'; + for (size_t i = 0; i < path.size(); ++i) { + if (i) + os << ','; + os << path[i]; + } + } + + /// Create the member map for coordRef and append it (and its index + /// path) to the provided new* vectors, if it is not already present. + void appendMemberMapIfNew( + mlir::omp::MapInfoOp op, fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value coordRef, llvm::ArrayRef<int64_t> indexPath, + llvm::StringRef memberName, + llvm::SmallVectorImpl<mlir::Value> &newMapOpsForFields, + llvm::SmallVectorImpl<llvm::SmallVector<int64_t>> &newMemberIndexPaths) { + // Local de-dup within this op invocation. + if (containsPath(newMemberIndexPaths, indexPath)) + return; + // Global de-dup against already present member indices. + if (mappedIndexPathExists(op, indexPath)) + return; + + if (op.getMapperId()) { + mlir::omp::DeclareMapperOp symbol = + mlir::SymbolTable::lookupNearestSymbolFrom< + mlir::omp::DeclareMapperOp>(op, op.getMapperIdAttr()); + assert(symbol && "missing symbol for declare mapper identifier"); + mlir::omp::DeclareMapperInfoOp mapperInfo = symbol.getDeclareMapperInfo(); + // TODO: Probably a way to cache these keys in someway so we don't + // constantly go through the process of rebuilding them on every check, to + // save some cycles, but it can wait for a subsequent patch. 
+ for (auto v : mapperInfo.getMapVars()) { + mlir::omp::MapInfoOp map = + mlir::cast<mlir::omp::MapInfoOp>(v.getDefiningOp()); + if (!map.getMembers().empty() && mappedIndexPathExists(map, indexPath)) + return; + } + } + + builder.setInsertionPoint(op); + fir::factory::AddrAndBoundsInfo info = fir::factory::getDataOperandBaseAddr( + builder, coordRef, /*isOptional=*/false, loc); + llvm::SmallVector<mlir::Value> bounds = fir::factory::genImplicitBoundsOps< + mlir::omp::MapBoundsOp, mlir::omp::MapBoundsType>( + builder, info, + hlfir::translateToExtendedValue(loc, builder, hlfir::Entity{coordRef}) + .first, + /*dataExvIsAssumedSize=*/false, loc); + + mlir::omp::MapInfoOp fieldMapOp = mlir::omp::MapInfoOp::create( + builder, loc, coordRef.getType(), coordRef, + mlir::TypeAttr::get(fir::unwrapRefType(coordRef.getType())), + op.getMapTypeAttr(), + builder.getAttr<mlir::omp::VariableCaptureKindAttr>( + mlir::omp::VariableCaptureKind::ByRef), + /*varPtrPtr=*/mlir::Value{}, /*members=*/mlir::ValueRange{}, + /*members_index=*/mlir::ArrayAttr{}, bounds, + /*mapperId=*/mlir::FlatSymbolRefAttr(), + builder.getStringAttr(op.getNameAttr().strref() + "." + memberName + + ".implicit_map"), + /*partial_map=*/builder.getBoolAttr(false)); + + newMapOpsForFields.emplace_back(fieldMapOp); + newMemberIndexPaths.emplace_back(indexPath.begin(), indexPath.end()); + } + /// getMemberUserList gathers all users of a particular MapInfoOp that are /// other MapInfoOp's and places them into the mapMemberUsers list, which /// records the map that the current argument MapInfoOp "op" is part of @@ -363,7 +471,7 @@ class MapInfoFinalizationPass mlir::ArrayAttr newMembersAttr; mlir::SmallVector<mlir::Value> newMembers; llvm::SmallVector<llvm::SmallVector<int64_t>> memberIndices; - bool IsHasDeviceAddr = isHasDeviceAddr(op, target); + bool isHasDeviceAddrFlag = isHasDeviceAddr(op, target); if (!mapMemberUsers.empty() || !op.getMembers().empty()) getMemberIndicesAsVectors( @@ -406,7 +514,7 @@ class MapInfoFinalizationPass mapUser.parent.getMembersMutable().assign(newMemberOps); mapUser.parent.setMembersIndexAttr( builder.create2DI64ArrayAttr(memberIndices)); - } else if (!IsHasDeviceAddr) { + } else if (!isHasDeviceAddrFlag) { auto baseAddr = genBaseAddrMap(descriptor, op.getBounds(), op.getMapType(), builder); newMembers.push_back(baseAddr); @@ -429,7 +537,7 @@ class MapInfoFinalizationPass // The contents of the descriptor (the base address in particular) will // remain unchanged though. uint64_t mapType = op.getMapType(); - if (IsHasDeviceAddr) { + if (isHasDeviceAddrFlag) { mapType |= llvm::to_underlying( llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS); } @@ -701,94 +809,134 @@ class MapInfoFinalizationPass auto recordType = mlir::cast<fir::RecordType>(underlyingType); llvm::SmallVector<mlir::Value> newMapOpsForFields; - llvm::SmallVector<int64_t> fieldIndicies; + llvm::SmallVector<llvm::SmallVector<int64_t>> newMemberIndexPaths; + // 1) Handle direct top-level allocatable fields. for (auto fieldMemTyPair : recordType.getTypeList()) { auto &field = fieldMemTyPair.first; auto memTy = fieldMemTyPair.second; - bool shouldMapField = - llvm::find_if(mapVarForwardSlice, [&](mlir::Operation *sliceOp) { - if (!fir::isAllocatableType(memTy)) - return false; - - auto designateOp = mlir::dyn_cast<hlfir::DesignateOp>(sliceOp); - if (!designateOp) - return false; - - return designateOp.getComponent() && - designateOp.getComponent()->strref() == field; - }) != mapVarForwardSlice.end(); - - // TODO Handle recursive record types. 
Adapting - // `createParentSymAndGenIntermediateMaps` to work direclty on MLIR - // entities might be helpful here. - - if (!shouldMapField) + if (!fir::isAllocatableType(memTy)) continue; - int32_t fieldIdx = recordType.getFieldIndex(field); - bool alreadyMapped = [&]() { - if (op.getMembersIndexAttr()) - for (auto indexList : op.getMembersIndexAttr()) { - auto indexListAttr = mlir::cast<mlir::ArrayAttr>(indexList); - if (indexListAttr.size() == 1 && - mlir::cast<mlir::IntegerAttr>(indexListAttr[0]).getInt() == - fieldIdx) - return true; - } - - return false; - }(); - - if (alreadyMapped) + bool referenced = llvm::any_of(mapVarForwardSlice, [&](auto *opv) { + auto designateOp = mlir::dyn_cast<hlfir::DesignateOp>(opv); + return designateOp && designateOp.getComponent() && + designateOp.getComponent()->strref() == field; + }); + if (!referenced) continue; + int32_t fieldIdx = recordType.getFieldIndex(field); builder.setInsertionPoint(op); fir::IntOrValue idxConst = mlir::IntegerAttr::get(builder.getI32Type(), fieldIdx); auto fieldCoord = fir::CoordinateOp::create( builder, op.getLoc(), builder.getRefType(memTy), op.getVarPtr(), llvm::SmallVector<fir::IntOrValue, 1>{idxConst}); - fir::factory::AddrAndBoundsInfo info = - fir::factory::getDataOperandBaseAddr( - builder, fieldCoord, /*isOptional=*/false, op.getLoc()); - llvm::SmallVector<mlir::Value> bounds = - fir::factory::genImplicitBoundsOps<mlir::omp::MapBoundsOp, - mlir::omp::MapBoundsType>( - builder, info, - hlfir::translateToExtendedValue(op.getLoc(), builder, - hlfir::Entity{fieldCoord}) - .first, - /*dataExvIsAssumedSize=*/false, op.getLoc()); - - mlir::omp::MapInfoOp fieldMapOp = mlir::omp::MapInfoOp::create( - builder, op.getLoc(), fieldCoord.getResult().getType(), - fieldCoord.getResult(), - mlir::TypeAttr::get( - fir::unwrapRefType(fieldCoord.getResult().getType())), - op.getMapTypeAttr(), - builder.getAttr<mlir::omp::VariableCaptureKindAttr>( - mlir::omp::VariableCaptureKind::ByRef), - /*varPtrPtr=*/mlir::Value{}, /*members=*/mlir::ValueRange{}, - /*members_index=*/mlir::ArrayAttr{}, bounds, - /*mapperId=*/mlir::FlatSymbolRefAttr(), - builder.getStringAttr(op.getNameAttr().strref() + "." + field + - ".implicit_map"), - /*partial_map=*/builder.getBoolAttr(false)); - newMapOpsForFields.emplace_back(fieldMapOp); - fieldIndicies.emplace_back(fieldIdx); + int64_t fieldIdx64 = static_cast<int64_t>(fieldIdx); + llvm::SmallVector<int64_t, 1> idxPath{fieldIdx64}; + appendMemberMapIfNew(op, builder, op.getLoc(), fieldCoord, idxPath, + field, newMapOpsForFields, newMemberIndexPaths); + } + + // Handle nested allocatable fields along any component chain + // referenced in the region via HLFIR designates. 
+ llvm::SmallVector<llvm::SmallVector<int64_t>> seenIndexPaths; + for (mlir::Operation *sliceOp : mapVarForwardSlice) { + auto designateOp = mlir::dyn_cast<hlfir::DesignateOp>(sliceOp); + if (!designateOp || !designateOp.getComponent()) + continue; + llvm::SmallVector<llvm::StringRef> compPathReversed; + compPathReversed.push_back(designateOp.getComponent()->strref()); + mlir::Value curBase = designateOp.getMemref(); + bool rootedAtMapArg = false; + while (true) { + if (auto parentDes = curBase.getDefiningOp<hlfir::DesignateOp>()) { + if (!parentDes.getComponent()) + break; + compPathReversed.push_back(parentDes.getComponent()->strref()); + curBase = parentDes.getMemref(); + continue; + } + if (auto decl = curBase.getDefiningOp<hlfir::DeclareOp>()) { + if (auto barg = + mlir::dyn_cast<mlir::BlockArgument>(decl.getMemref())) + rootedAtMapArg = (barg == opBlockArg); + } else if (auto blockArg = + mlir::dyn_cast_or_null<mlir::BlockArgument>( + curBase)) { + rootedAtMapArg = (blockArg == opBlockArg); + } + break; + } + // Only process nested paths (2+ components). Single-component paths + // for direct fields are handled above. + if (!rootedAtMapArg || compPathReversed.size() < 2) + continue; + builder.setInsertionPoint(op); + llvm::SmallVector<int64_t> indexPath; + mlir::Type curTy = underlyingType; + mlir::Value coordRef = op.getVarPtr(); + bool validPath = true; + for (llvm::StringRef compName : llvm::reverse(compPathReversed)) { + auto recTy = mlir::dyn_cast<fir::RecordType>(curTy); + if (!recTy) { + validPath = false; + break; + } + int32_t idx = recTy.getFieldIndex(compName); + if (idx < 0) { + validPath = false; + break; + } + indexPath.push_back(idx); + mlir::Type memTy = recTy.getType(idx); + fir::IntOrValue idxConst = + mlir::IntegerAttr::get(builder.getI32Type(), idx); + coordRef = fir::CoordinateOp::create( + builder, op.getLoc(), builder.getRefType(memTy), coordRef, + llvm::SmallVector<fir::IntOrValue, 1>{idxConst}); + curTy = memTy; + } + if (!validPath) + continue; + if (auto finalRefTy = + mlir::dyn_cast<fir::ReferenceType>(coordRef.getType())) { + mlir::Type eleTy = finalRefTy.getElementType(); + if (fir::isAllocatableType(eleTy)) { + if (!containsPath(seenIndexPaths, indexPath)) { + seenIndexPaths.emplace_back(indexPath.begin(), indexPath.end()); + appendMemberMapIfNew(op, builder, op.getLoc(), coordRef, + indexPath, compPathReversed.front(), + newMapOpsForFields, newMemberIndexPaths); + } + } + } } if (newMapOpsForFields.empty()) return mlir::WalkResult::advance(); - op.getMembersMutable().append(newMapOpsForFields); + // Deduplicate by index path to avoid emitting duplicate members for + // the same component. Use a set-based key to keep this near O(n). 
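// Illustrative note (not part of this patch): for an index path of {2, 0, 5}, the
// buildPathKey helper above writes "3:2,0,5" into the key; prefixing the element
// count keeps keys for index paths of different lengths from colliding in the
// StringSet used for the set-based deduplication described here.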
+ llvm::SmallVector<mlir::Value> dedupMapOps; + llvm::SmallVector<llvm::SmallVector<int64_t>> dedupIndexPaths; + llvm::StringSet<> seenKeys; + for (auto [i, mapOp] : llvm::enumerate(newMapOpsForFields)) { + const auto &path = newMemberIndexPaths[i]; + llvm::SmallString<64> key; + buildPathKey(path, key); + if (seenKeys.contains(key)) + continue; + seenKeys.insert(key); + dedupMapOps.push_back(mapOp); + dedupIndexPaths.emplace_back(path.begin(), path.end()); + } + op.getMembersMutable().append(dedupMapOps); llvm::SmallVector<llvm::SmallVector<int64_t>> newMemberIndices; - mlir::ArrayAttr oldMembersIdxAttr = op.getMembersIndexAttr(); - - if (oldMembersIdxAttr) - for (mlir::Attribute indexList : oldMembersIdxAttr) { + if (mlir::ArrayAttr oldAttr = op.getMembersIndexAttr()) + for (mlir::Attribute indexList : oldAttr) { llvm::SmallVector<int64_t> listVec; for (mlir::Attribute index : mlir::cast<mlir::ArrayAttr>(indexList)) @@ -796,10 +944,8 @@ class MapInfoFinalizationPass newMemberIndices.emplace_back(std::move(listVec)); } - - for (int64_t newFieldIdx : fieldIndicies) - newMemberIndices.emplace_back( - llvm::SmallVector<int64_t>(1, newFieldIdx)); + for (auto &path : dedupIndexPaths) + newMemberIndices.emplace_back(path); op.setMembersIndexAttr(builder.create2DI64ArrayAttr(newMemberIndices)); op.setPartialMap(true); diff --git a/flang/lib/Parser/parsing.cpp b/flang/lib/Parser/parsing.cpp index 8a8c6ef..2df6881 100644 --- a/flang/lib/Parser/parsing.cpp +++ b/flang/lib/Parser/parsing.cpp @@ -85,6 +85,7 @@ const SourceFile *Parsing::Prescan(const std::string &path, Options options) { if (options.features.IsEnabled(LanguageFeature::OpenACC) || (options.prescanAndReformat && noneOfTheAbove)) { prescanner.AddCompilerDirectiveSentinel("$acc"); + prescanner.AddCompilerDirectiveSentinel("@acc"); } if (options.features.IsEnabled(LanguageFeature::OpenMP) || (options.prescanAndReformat && noneOfTheAbove)) { diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index 865c149..66e5b2c 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -147,6 +147,11 @@ void Prescanner::Statement() { directiveSentinel_[4] == '\0') { // CUDA conditional compilation line. condOffset = 5; + } else if (directiveSentinel_[0] == '@' && directiveSentinel_[1] == 'a' && + directiveSentinel_[2] == 'c' && directiveSentinel_[3] == 'c' && + directiveSentinel_[4] == '\0') { + // OpenACC conditional compilation line. + condOffset = 5; } if (condOffset && !preprocessingOnly_) { at_ += *condOffset, column_ += *condOffset; diff --git a/flang/test/Lower/OpenMP/declare-mapper.f90 b/flang/test/Lower/OpenMP/declare-mapper.f90 index 8a98c68..3d4d0da 100644 --- a/flang/test/Lower/OpenMP/declare-mapper.f90 +++ b/flang/test/Lower/OpenMP/declare-mapper.f90 @@ -6,6 +6,7 @@ ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/omp-declare-mapper-3.f90 -o - | FileCheck %t/omp-declare-mapper-3.f90 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/omp-declare-mapper-4.f90 -o - | FileCheck %t/omp-declare-mapper-4.f90 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/omp-declare-mapper-5.f90 -o - | FileCheck %t/omp-declare-mapper-5.f90 +! 
RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %t/omp-declare-mapper-6.f90 -o - | FileCheck %t/omp-declare-mapper-6.f90 !--- omp-declare-mapper-1.f90 subroutine declare_mapper_1 @@ -262,3 +263,41 @@ contains !$omp end target end subroutine end program declare_mapper_5 + +!--- omp-declare-mapper-6.f90 +subroutine declare_mapper_nested_parent + type :: inner_t + real, allocatable :: deep_arr(:) + end type inner_t + + type, abstract :: base_t + real, allocatable :: base_arr(:) + type(inner_t) :: inner + end type base_t + + type, extends(base_t) :: real_t + real, allocatable :: real_arr(:) + end type real_t + + !$omp declare mapper (custommapper : real_t :: t) map(tofrom: t%base_arr, t%real_arr) + ! CHECK: omp.declare_mapper @{{.*custommapper}} + ! CHECK-DAG: omp.map.info {{.*}} {name = "t%base_t%base_arr"} + ! CHECK-DAG: omp.map.info {{.*}} {name = "t%real_arr"} + ! CHECK: omp.declare_mapper.info + + type(real_t) :: r + + allocate(r%base_arr(10)) + allocate(r%inner%deep_arr(10)) + allocate(r%real_arr(10)) + r%base_arr = 1.0 + r%inner%deep_arr = 4.0 + r%real_arr = 0.0 + + ! Check implicit maps for deep nested allocatable payloads not covered by mapper + ! CHECK-DAG: omp.map.info {{.*}} {name = "r.deep_arr.implicit_map"} + ! CHECK: omp.target + !$omp target map(mapper(custommapper), tofrom: r) + r%real_arr = r%base_arr(1) + r%inner%deep_arr(1) + !$omp end target +end subroutine declare_mapper_nested_parent diff --git a/flang/test/Semantics/OpenACC/acc-sentinel.f90 b/flang/test/Semantics/OpenACC/acc-sentinel.f90 new file mode 100644 index 0000000..d34d97e --- /dev/null +++ b/flang/test/Semantics/OpenACC/acc-sentinel.f90 @@ -0,0 +1,14 @@ +! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenacc + +subroutine test_sentinel() +! Test for error since we currently do not have an OpenACC module upstream. +!ERROR: Cannot parse module file for module 'openacc': Source file 'openacc.mod' was not found + !@acc use openacc + integer :: i + + !$acc parallel loop + do i = 1, 10 + end do + !$acc end parallel + +end subroutine diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index ae8deab..cfb7791 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -325,6 +325,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.dup2 libc.src.unistd.dup3 libc.src.unistd.execve + libc.src.unistd.faccessat libc.src.unistd.fchdir libc.src.unistd.fpathconf libc.src.unistd.fsync diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index bf2ad4a..87b78a33 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -331,6 +331,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.dup2 libc.src.unistd.dup3 libc.src.unistd.execve + libc.src.unistd.faccessat libc.src.unistd.fchdir libc.src.unistd.fpathconf libc.src.unistd.fsync diff --git a/libc/fuzzing/stdlib/strtointeger_differential_fuzz.cpp b/libc/fuzzing/stdlib/strtointeger_differential_fuzz.cpp index 097e619..2fabbba 100644 --- a/libc/fuzzing/stdlib/strtointeger_differential_fuzz.cpp +++ b/libc/fuzzing/stdlib/strtointeger_differential_fuzz.cpp @@ -44,6 +44,10 @@ // greater than 50% chance for each character to end the string, making the odds // of getting long numbers very low. extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 2) // Needs at least one byte for the base and one byte for the + // string. 
+ return 0; + uint8_t *container = new uint8_t[size + 1]; if (!container) __builtin_trap(); diff --git a/libc/include/llvm-libc-macros/linux/fcntl-macros.h b/libc/include/llvm-libc-macros/linux/fcntl-macros.h index aec8a0d..74d406f 100644 --- a/libc/include/llvm-libc-macros/linux/fcntl-macros.h +++ b/libc/include/llvm-libc-macros/linux/fcntl-macros.h @@ -61,6 +61,9 @@ // Allow empty relative pathname. #define AT_EMPTY_PATH 0x1000 +// Perform access checks using the effective user and group IDs. +#define AT_EACCESS 0x200 + // Values of SYS_fcntl commands. #define F_DUPFD 0 #define F_GETFD 1 diff --git a/libc/include/sys/syscall.h.def b/libc/include/sys/syscall.h.def index 6d74cc6..60e5024 100644 --- a/libc/include/sys/syscall.h.def +++ b/libc/include/sys/syscall.h.def @@ -309,6 +309,10 @@ #define SYS_faccessat __NR_faccessat #endif +#ifdef __NR_faccessat2 +#define SYS_faccessat2 __NR_faccessat2 +#endif + #ifdef __NR_fadvise64 #define SYS_fadvise64 __NR_fadvise64 #endif diff --git a/libc/include/unistd.yaml b/libc/include/unistd.yaml index 3ba3ec7..2ff86ea 100644 --- a/libc/include/unistd.yaml +++ b/libc/include/unistd.yaml @@ -96,6 +96,15 @@ functions: - type: const char * - type: __exec_argv_t - type: __exec_envp_t + - name: faccessat + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: const char * + - type: int + - type: int - name: fchdir standards: - POSIX diff --git a/libc/src/unistd/CMakeLists.txt b/libc/src/unistd/CMakeLists.txt index c66a3a4..78c3bf8 100644 --- a/libc/src/unistd/CMakeLists.txt +++ b/libc/src/unistd/CMakeLists.txt @@ -56,6 +56,13 @@ add_entrypoint_object( ) add_entrypoint_object( + faccessat + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.faccessat +) + +add_entrypoint_object( fchdir ALIAS DEPENDS diff --git a/libc/src/unistd/faccessat.h b/libc/src/unistd/faccessat.h new file mode 100644 index 0000000..0dc834d --- /dev/null +++ b/libc/src/unistd/faccessat.h @@ -0,0 +1,20 @@ +//===-- Implementation header for faccessat ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_UNISTD_FACCESSAT_H +#define LLVM_LIBC_SRC_UNISTD_FACCESSAT_H + +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int faccessat(int fd, const char *path, int amode, int flag); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_UNISTD_FACCESSAT_H diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt index 2d510f3..dff6ba2 100644 --- a/libc/src/unistd/linux/CMakeLists.txt +++ b/libc/src/unistd/linux/CMakeLists.txt @@ -81,6 +81,19 @@ add_entrypoint_object( ) add_entrypoint_object( + faccessat + SRCS + faccessat.cpp + HDRS + ../faccessat.h + DEPENDS + libc.hdr.fcntl_macros + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil + libc.src.errno.errno +) + +add_entrypoint_object( fchdir SRCS fchdir.cpp diff --git a/libc/src/unistd/linux/access.cpp b/libc/src/unistd/linux/access.cpp index 55cd6ad..f06eec5 100644 --- a/libc/src/unistd/linux/access.cpp +++ b/libc/src/unistd/linux/access.cpp @@ -23,7 +23,7 @@ LLVM_LIBC_FUNCTION(int, access, (const char *path, int mode)) { int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_access, path, mode); #elif defined(SYS_faccessat) int ret = - LIBC_NAMESPACE::syscall_impl<int>(SYS_faccessat, AT_FDCWD, path, mode, 0); + LIBC_NAMESPACE::syscall_impl<int>(SYS_faccessat, AT_FDCWD, path, mode); #else #error "access and faccessat syscalls not available." #endif diff --git a/libc/src/unistd/linux/faccessat.cpp b/libc/src/unistd/linux/faccessat.cpp new file mode 100644 index 0000000..7a2a29c --- /dev/null +++ b/libc/src/unistd/linux/faccessat.cpp @@ -0,0 +1,37 @@ +//===-- Linux implementation of faccessat ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/faccessat.h" + +#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/common.h" + +#include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include <sys/syscall.h> // For syscall numbers. + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, faccessat, + (int fd, const char *path, int amode, int flag)) { +#ifdef SYS_faccessat2 + int ret = + LIBC_NAMESPACE::syscall_impl<int>(SYS_faccessat2, fd, path, amode, flag); +#else +#error "faccessat2 syscall is not available." 
+#endif + + if (ret < 0) { + libc_errno = -ret; + return -1; + } + return 0; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt index 6630a7e..44f28ff 100644 --- a/libc/test/src/unistd/CMakeLists.txt +++ b/libc/test/src/unistd/CMakeLists.txt @@ -94,6 +94,23 @@ add_libc_unittest( ) add_libc_unittest( + faccessat_test + SUITE + libc_unistd_unittests + SRCS + faccessat_test.cpp + DEPENDS + libc.include.unistd + libc.src.errno.errno + libc.src.fcntl.open + libc.src.unistd.faccessat + libc.src.unistd.close + libc.src.unistd.unlink + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher +) + +add_libc_unittest( fchdir_test SUITE libc_unistd_unittests diff --git a/libc/test/src/unistd/faccessat_test.cpp b/libc/test/src/unistd/faccessat_test.cpp new file mode 100644 index 0000000..6280b14 --- /dev/null +++ b/libc/test/src/unistd/faccessat_test.cpp @@ -0,0 +1,115 @@ +//===-- Unittests for faccessat -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/fcntl/open.h" +#include "src/unistd/close.h" +#include "src/unistd/faccessat.h" +#include "src/unistd/unlink.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/Test.h" + +#include <fcntl.h> +#include <sys/stat.h> +#include <unistd.h> + +namespace { + +using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; +using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + +using LlvmLibcFaccessatTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcFaccessatTest, WithAtFdcwd) { + // Test access checks on a file with AT_FDCWD and no flags, equivalent to + // access(). 
+ constexpr const char *FILENAME = "faccessat_basic.test"; + auto TEST_FILE = libc_make_test_file_path(FILENAME); + + // Check permissions on a file with full permissions + int fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + + ASSERT_THAT(LIBC_NAMESPACE::faccessat(AT_FDCWD, TEST_FILE, F_OK, 0), + Succeeds(0)); + ASSERT_THAT( + LIBC_NAMESPACE::faccessat(AT_FDCWD, TEST_FILE, X_OK | W_OK | R_OK, 0), + Succeeds(0)); + ASSERT_THAT(LIBC_NAMESPACE::unlink(TEST_FILE), Succeeds(0)); + + // Check permissions on a file with execute-only permission + fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IXUSR); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + + ASSERT_THAT(LIBC_NAMESPACE::faccessat(AT_FDCWD, TEST_FILE, F_OK, 0), + Succeeds(0)); + ASSERT_THAT(LIBC_NAMESPACE::faccessat(AT_FDCWD, TEST_FILE, X_OK, 0), + Succeeds(0)); + ASSERT_THAT(LIBC_NAMESPACE::faccessat(AT_FDCWD, TEST_FILE, R_OK, 0), + Fails(EACCES)); + ASSERT_THAT(LIBC_NAMESPACE::faccessat(AT_FDCWD, TEST_FILE, W_OK, 0), + Fails(EACCES)); + ASSERT_THAT(LIBC_NAMESPACE::unlink(TEST_FILE), Succeeds(0)); +} + +TEST_F(LlvmLibcFaccessatTest, NonExistentFile) { + ASSERT_THAT(LIBC_NAMESPACE::faccessat(AT_FDCWD, "faccessat_nonexistent.test", + F_OK, 0), + Fails(ENOENT)); +} + +TEST_F(LlvmLibcFaccessatTest, AtEaccess) { + // With AT_EACCESS, faccessat checks permissions using the effective user ID, + // but the effective and real user ID will be the same here and changing that + // is not feasible in a test, so this is just a basic sanity check. + constexpr const char *FILENAME = "faccessat_eaccess.test"; + auto TEST_FILE = libc_make_test_file_path(FILENAME); + + int fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + + ASSERT_THAT(LIBC_NAMESPACE::faccessat(AT_FDCWD, TEST_FILE, X_OK | W_OK | R_OK, + AT_EACCESS), + Succeeds(0)); + + ASSERT_THAT(LIBC_NAMESPACE::unlink(TEST_FILE), Succeeds(0)); +} + +TEST_F(LlvmLibcFaccessatTest, AtEmptyPath) { + constexpr const char *FILENAME = "faccessat_atemptypath.test"; + auto TEST_FILE = libc_make_test_file_path(FILENAME); + + int fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + + // Check permissions on the file referred to by fd + ASSERT_THAT(LIBC_NAMESPACE::faccessat(fd, "", F_OK, AT_EMPTY_PATH), + Succeeds(0)); + ASSERT_THAT( + LIBC_NAMESPACE::faccessat(fd, "", X_OK | W_OK | R_OK, AT_EMPTY_PATH), + Succeeds(0)); + + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + ASSERT_THAT(LIBC_NAMESPACE::unlink(TEST_FILE), Succeeds(0)); + + // Check permissions on the current working directory + ASSERT_THAT(LIBC_NAMESPACE::faccessat(AT_FDCWD, "", F_OK, AT_EMPTY_PATH), + Succeeds(0)); + ASSERT_THAT(LIBC_NAMESPACE::faccessat(AT_FDCWD, "", X_OK | W_OK | R_OK, + AT_EMPTY_PATH), + Succeeds(0)); +} + +} // namespace diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index a59cc06..3676b88 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -2104,18 +2104,18 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { config->dtltoDistributor = args.getLastArgValue(OPT_thinlto_distributor); // Handle /thinlto-distributor-arg:<arg> - for (auto *arg : args.filtered(OPT_thinlto_distributor_arg)) - 
config->dtltoDistributorArgs.push_back(arg->getValue()); + config->dtltoDistributorArgs = + args::getStrings(args, OPT_thinlto_distributor_arg); // Handle /thinlto-remote-compiler:<path> - config->dtltoCompiler = args.getLastArgValue(OPT_thinlto_compiler); + config->dtltoCompiler = args.getLastArgValue(OPT_thinlto_remote_compiler); if (!config->dtltoDistributor.empty() && config->dtltoCompiler.empty()) Err(ctx) << "A value must be specified for /thinlto-remote-compiler if " "/thinlto-distributor is specified."; // Handle /thinlto-remote-compiler-arg:<arg> - for (auto *arg : args.filtered(OPT_thinlto_compiler_arg)) - config->dtltoCompilerArgs.push_back(arg->getValue()); + config->dtltoCompilerArgs = + args::getStrings(args, OPT_thinlto_remote_compiler_arg); // Handle /dwodir config->dwoDir = args.getLastArgValue(OPT_dwodir); diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td index 485db5a..f3d0eb3 100644 --- a/lld/COFF/Options.td +++ b/lld/COFF/Options.td @@ -289,10 +289,10 @@ def thinlto_distributor : P<"thinlto-distributor", "backend compilations will be distributed">; def thinlto_distributor_arg : P<"thinlto-distributor-arg", "Arguments to pass to the ThinLTO distributor">; -def thinlto_compiler : P<"thinlto-remote-compiler", +def thinlto_remote_compiler : P<"thinlto-remote-compiler", "Compiler for the ThinLTO distributor to invoke for ThinLTO backend " "compilations">; -def thinlto_compiler_arg : P<"thinlto-remote-compiler-arg", +def thinlto_remote_compiler_arg : P<"thinlto-remote-compiler-arg", "Compiler arguments for the ThinLTO distributor to pass for ThinLTO backend " "compilations">; def lto_obj_path : P< diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 1beab8d..62f7fff 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1399,8 +1399,9 @@ static void readConfigs(Ctx &ctx, opt::InputArgList &args) { ctx.arg.dtltoDistributor = args.getLastArgValue(OPT_thinlto_distributor_eq); ctx.arg.dtltoDistributorArgs = args::getStrings(args, OPT_thinlto_distributor_arg); - ctx.arg.dtltoCompiler = args.getLastArgValue(OPT_thinlto_compiler_eq); - ctx.arg.dtltoCompilerArgs = args::getStrings(args, OPT_thinlto_compiler_arg); + ctx.arg.dtltoCompiler = args.getLastArgValue(OPT_thinlto_remote_compiler_eq); + ctx.arg.dtltoCompilerArgs = + args::getStrings(args, OPT_thinlto_remote_compiler_arg); ctx.arg.dwoDir = args.getLastArgValue(OPT_plugin_opt_dwo_dir_eq); ctx.arg.dynamicLinker = getDynamicLinker(ctx, args); ctx.arg.ehFrameHdr = diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td index f052318..0d6dda4 100644 --- a/lld/ELF/Options.td +++ b/lld/ELF/Options.td @@ -722,11 +722,11 @@ def thinlto_distributor_eq: JJ<"thinlto-distributor=">, "ThinLTO backend compilations will be distributed">; defm thinlto_distributor_arg: EEq<"thinlto-distributor-arg", "Arguments to " "pass to the ThinLTO distributor">; -def thinlto_compiler_eq: JJ<"thinlto-remote-compiler=">, +def thinlto_remote_compiler_eq: JJ<"thinlto-remote-compiler=">, HelpText<"Compiler for the ThinLTO distributor to invoke for ThinLTO backend " "compilations">; -defm thinlto_compiler_arg: EEq<"thinlto-remote-compiler-arg", "Compiler " - "arguments for the ThinLTO distributor to pass for ThinLTO backend " +defm thinlto_remote_compiler_arg: EEq<"thinlto-remote-compiler-arg", + "Compiler arguments for the ThinLTO distributor to pass for ThinLTO backend " "compilations">; defm fat_lto_objects: BB<"fat-lto-objects", "Use the .llvm.lto section, which contains LLVM bitcode, in fat LTO object files to perform LTO.", diff --git 
a/lldb/source/API/SBTarget.cpp b/lldb/source/API/SBTarget.cpp index 90ffe78..0d03250 100644 --- a/lldb/source/API/SBTarget.cpp +++ b/lldb/source/API/SBTarget.cpp @@ -255,7 +255,7 @@ SBProcess SBTarget::LoadCore(const char *core_file, lldb::SBError &error) { ProcessSP process_sp(target_sp->CreateProcess( target_sp->GetDebugger().GetListener(), "", &filespec, false)); if (process_sp) { - ElapsedTime loadCoreTime(target_sp->GetStatistics().GetLoadCoreTime()); + ElapsedTime load_core_time(target_sp->GetStatistics().GetLoadCoreTime()); error.SetError(process_sp->LoadCore()); if (error.Success()) sb_process.SetSP(process_sp); diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index b5fc49d..c59d028 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -419,7 +419,7 @@ protected: // Seems weird that we Launch a core file, but that is what we // do! { - ElapsedTime loadCoreTime( + ElapsedTime load_core_time( target_sp->GetStatistics().GetLoadCoreTime()); error = process_sp->LoadCore(); } diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp index 924953c..3c49c91 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp @@ -792,7 +792,7 @@ ClangExpressionParser::ClangExpressionParser( // 6. Set up the source management objects inside the compiler m_compiler->createFileManager(); if (!m_compiler->hasSourceManager()) - m_compiler->createSourceManager(m_compiler->getFileManager()); + m_compiler->createSourceManager(); m_compiler->createPreprocessor(TU_Complete); switch (expr.Language().AsLanguageType()) { diff --git a/llvm/docs/TableGen/ProgRef.rst b/llvm/docs/TableGen/ProgRef.rst index 2b1af05..0ff4cc7 100644 --- a/llvm/docs/TableGen/ProgRef.rst +++ b/llvm/docs/TableGen/ProgRef.rst @@ -64,7 +64,7 @@ Classes and concrete records have a unique *name*, either chosen by the programmer or generated by TableGen. Associated with that name is a list of *fields* with values and an optional list of *parent classes* (sometimes called base or super classes). The fields are the primary data that -backends will process. Note that TableGen assigns no meanings to fields; the +backends will process. Note that TableGen assigns no meaning to fields; the meanings are entirely up to the backends and the programs that incorporate the output of those backends. @@ -243,7 +243,7 @@ Include files ------------- TableGen has an include mechanism. The content of the included file -lexically replaces the ``include`` directive and is then parsed as if it was +lexically replaces the ``include`` directive and is then parsed as if it were originally in the main file. .. productionlist:: @@ -670,17 +670,17 @@ name of a multiclass. The argument values can be specified in two forms: * Positional argument (``value``). The value is assigned to the argument in the - corresponding position. For ``Foo<a0, a1>``, ``a0`` will be assigned to first - argument and ``a1`` will be assigned to second argument. + corresponding position. For ``Foo<a0, a1>``, ``a0`` will be assigned to the first + argument and ``a1`` will be assigned to the second argument. * Named argument (``name=value``). The value is assigned to the argument with the specified name. 
For ``Foo<a=a0, b=a1>``, ``a0`` will be assigned to the argument with name ``a`` and ``a1`` will be assigned to the argument with name ``b``. -Required arguments can also be specified as named argument. +Required arguments can also be specified as a named argument. Note that the argument can only be specified once regardless of the way (named -or positional) to specify and positional arguments should be put before named +or positional) to specify and positional arguments should precede named arguments. .. productionlist:: @@ -817,7 +817,7 @@ type. It provides a single field, ``Value``, which holds a 3-bit number. Its template argument, ``val``, is used to set the ``Value`` field. Each of the eight records is defined with ``FPFormat`` as its parent class. The enumeration value is passed in angle brackets as the template argument. Each -record will inherent the ``Value`` field with the appropriate enumeration +record will inherit the ``Value`` field with the appropriate enumeration value. Here is a more complex example of classes with template arguments. First, we @@ -1308,7 +1308,7 @@ with ``F0``, ``F1``, ``F2``, and ``F3``. ------------------------------------- A ``dump`` statement prints the input string to standard error -output. It is intended for debugging purpose. +output. It is intended for debugging purposes. * At top level, the message is printed immediately. @@ -1727,7 +1727,7 @@ and non-0 as true. ``!div(``\ *a*\ ``,`` *b*\ ``)`` This operator performs signed division of *a* by *b*, and produces the quotient. - Division by 0 produces an error. Division of INT64_MIN by -1 produces an error. + Division by 0 produces an error. Division of ``INT64_MIN`` by -1 produces an error. ``!empty(``\ *a*\ ``)`` This operator produces 1 if the string, list, or DAG *a* is empty; 0 otherwise. @@ -1914,7 +1914,7 @@ and non-0 as true. ``!or(``\ *a*\ ``,`` *b*\ ``, ...)`` This operator does a bitwise OR on *a*, *b*, etc., and produces the result. A logical OR can be performed if all the arguments are either - 0 or 1. This operator is short-circuit to -1 (all ones) the left-most + 0 or 1. This operator is short-circuit to -1 (all ones) when the left-most operand is -1. ``!range([``\ *start*\ ``,]`` *end*\ ``[,``\ *step*\ ``])`` @@ -1937,7 +1937,7 @@ and non-0 as true. Equivalent to ``!range(0, !size(list))``. ``!repr(``\ *value*\ ``)`` - Represents *value* as a string. String format for the value is not + Represents *value* as a string. The string format for the value is not guaranteed to be stable. Intended for debugging purposes only. ``!setdagarg(``\ *dag*\ ``,``\ *key*\ ``,``\ *arg*\ ``)`` diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h index 83350e6..9e81a4b 100644 --- a/llvm/include/llvm/ADT/BitVector.h +++ b/llvm/include/llvm/ADT/BitVector.h @@ -570,10 +570,7 @@ public: template <class F, class... ArgTys> static BitVector &apply(F &&f, BitVector &Out, BitVector const &Arg, ArgTys const &...Args) { - assert(llvm::all_of( - std::initializer_list<unsigned>{Args.size()...}, - [&Arg](auto const &BV) { return Arg.size() == BV; }) && - "consistent sizes"); + assert(((Arg.size() == Args.size()) && ...) 
&& "consistent sizes"); Out.resize(Arg.size()); for (size_type I = 0, E = Arg.Bits.size(); I != E; ++I) Out.Bits[I] = f(Arg.Bits[I], Args.Bits[I]...); diff --git a/llvm/include/llvm/ADT/ConcurrentHashtable.h b/llvm/include/llvm/ADT/ConcurrentHashtable.h index 6de194d..6a943c5 100644 --- a/llvm/include/llvm/ADT/ConcurrentHashtable.h +++ b/llvm/include/llvm/ADT/ConcurrentHashtable.h @@ -253,9 +253,8 @@ public: OS << "\nOverall number of entries = " << OverallNumberOfEntries; OS << "\nOverall number of non empty buckets = " << NumberOfNonEmptyBuckets; - for (auto &BucketSize : BucketSizesMap) - OS << "\n Number of buckets with size " << BucketSize.first << ": " - << BucketSize.second; + for (auto [Size, Count] : BucketSizesMap) + OS << "\n Number of buckets with size " << Size << ": " << Count; std::stringstream stream; stream << std::fixed << std::setprecision(2) diff --git a/llvm/include/llvm/ADT/DirectedGraph.h b/llvm/include/llvm/ADT/DirectedGraph.h index 83c0bea..fb6b180 100644 --- a/llvm/include/llvm/ADT/DirectedGraph.h +++ b/llvm/include/llvm/ADT/DirectedGraph.h @@ -181,16 +181,6 @@ public: DirectedGraph() = default; explicit DirectedGraph(NodeType &N) : Nodes() { addNode(N); } - DirectedGraph(const DGraphType &G) : Nodes(G.Nodes) {} - DirectedGraph(DGraphType &&RHS) : Nodes(std::move(RHS.Nodes)) {} - DGraphType &operator=(const DGraphType &G) { - Nodes = G.Nodes; - return *this; - } - DGraphType &operator=(const DGraphType &&G) { - Nodes = std::move(G.Nodes); - return *this; - } const_iterator begin() const { return Nodes.begin(); } const_iterator end() const { return Nodes.end(); } diff --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h index be690a4..571caf9 100644 --- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h +++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h @@ -59,14 +59,6 @@ LLVM_ABI std::string getAllocTypeAttributeString(AllocationType Type); /// True if the AllocTypes bitmask contains just a single type. LLVM_ABI bool hasSingleAllocType(uint8_t AllocTypes); -/// Removes any existing "ambiguous" memprof attribute. Called before we apply a -/// specific allocation type such as "cold", "notcold", or "hot". -LLVM_ABI void removeAnyExistingAmbiguousAttribute(CallBase *CB); - -/// Adds an "ambiguous" memprof attribute to call with a matched allocation -/// profile but that we haven't yet been able to disambiguate. -LLVM_ABI void addAmbiguousAttribute(CallBase *CB); - /// Class to build a trie of call stack contexts for a particular profiled /// allocation call, along with their associated allocation types. 
/// The allocation will be at the root of the trie, which is then used to diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h index b37c677..50ce931 100644 --- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h +++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h @@ -501,8 +501,12 @@ public: StackID == TargetStackID::ScalablePredicateVector; } - bool isScalableStackID(int ObjectIdx) const { + bool hasScalableStackID(int ObjectIdx) const { uint8_t StackID = getStackID(ObjectIdx); + return isScalableStackID(StackID); + } + + bool isScalableStackID(uint8_t StackID) const { return StackID == TargetStackID::ScalableVector || StackID == TargetStackID::ScalablePredicateVector; } diff --git a/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h b/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h index 52af205..ffe0b50 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h +++ b/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h @@ -179,6 +179,7 @@ public: class DWARFDataExtractorSimple : public DWARFDataExtractorBase<DWARFDataExtractorSimple> { +public: using DWARFDataExtractorBase::DWARFDataExtractorBase; LLVM_ABI uint64_t getRelocatedValueImpl(uint32_t Size, uint64_t *Off, diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index eb0440f..0622bfa 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -810,6 +810,26 @@ public: /// Whether the intrinsic is signed or unsigned. bool isSigned() const { return isSigned(getIntrinsicID()); }; + /// Whether the intrinsic is a smin or umin. + static bool isMin(Intrinsic::ID ID) { + switch (ID) { + case Intrinsic::umin: + case Intrinsic::smin: + return true; + case Intrinsic::umax: + case Intrinsic::smax: + return false; + default: + llvm_unreachable("Invalid intrinsic"); + } + } + + /// Whether the intrinsic is a smin or a umin. + bool isMin() const { return isMin(getIntrinsicID()); } + + /// Whether the intrinsic is a smax or a umax. + bool isMax() const { return !isMin(getIntrinsicID()); } + /// Min/max intrinsics are monotonic, they operate on a fixed-bitwidth values, /// so there is a certain threshold value, upon reaching which, /// their value can no longer change. Return said threshold. diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index 11602d2..0c1f8db 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -125,24 +125,6 @@ bool llvm::memprof::hasSingleAllocType(uint8_t AllocTypes) { return NumAllocTypes == 1; } -void llvm::memprof::removeAnyExistingAmbiguousAttribute(CallBase *CB) { - if (!CB->hasFnAttr("memprof")) - return; - assert(CB->getFnAttr("memprof").getValueAsString() == "ambiguous"); - CB->removeFnAttr("memprof"); -} - -void llvm::memprof::addAmbiguousAttribute(CallBase *CB) { - // We may have an existing ambiguous attribute if we are reanalyzing - // after inlining. 
- if (CB->hasFnAttr("memprof")) { - assert(CB->getFnAttr("memprof").getValueAsString() == "ambiguous"); - } else { - auto A = llvm::Attribute::get(CB->getContext(), "memprof", "ambiguous"); - CB->addFnAttr(A); - } -} - void CallStackTrie::addCallStack( AllocationType AllocType, ArrayRef<uint64_t> StackIds, std::vector<ContextTotalSize> ContextSizeInfo) { @@ -488,9 +470,6 @@ void CallStackTrie::addSingleAllocTypeAttribute(CallBase *CI, AllocationType AT, StringRef Descriptor) { auto AllocTypeString = getAllocTypeAttributeString(AT); auto A = llvm::Attribute::get(CI->getContext(), "memprof", AllocTypeString); - // After inlining we may be able to convert an existing ambiguous allocation - // to an unambiguous one. - removeAnyExistingAmbiguousAttribute(CI); CI->addFnAttr(A); if (MemProfReportHintedSizes) { std::vector<ContextTotalSize> ContextSizeInfo; @@ -550,7 +529,6 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) { assert(MIBCallStack.size() == 1 && "Should only be left with Alloc's location in stack"); CI->setMetadata(LLVMContext::MD_memprof, MDNode::get(Ctx, MIBNodes)); - addAmbiguousAttribute(CI); return true; } // If there exists corner case that CallStackTrie has one chain to leaf diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 62fb5eb..3cfe7cc 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -1889,11 +1889,12 @@ DIE &DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) { bool IsBitfield = DT->isBitField(); // Handle the size. - if (auto *Var = dyn_cast_or_null<DIVariable>(DT->getRawSizeInBits())) { + if (DT->getRawSizeInBits() == nullptr) { + // No size, just ignore. + } else if (auto *Var = dyn_cast<DIVariable>(DT->getRawSizeInBits())) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(MemberDie, dwarf::DW_AT_bit_size, *VarDIE); - } else if (auto *Exp = - dyn_cast_or_null<DIExpression>(DT->getRawSizeInBits())) { + } else if (auto *Exp = dyn_cast<DIExpression>(DT->getRawSizeInBits())) { DIELoc *Loc = new (DIEValueAllocator) DIELoc; DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); DwarfExpr.setMemoryLocationKind(); diff --git a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp index ec75dc3..64e5cd5 100644 --- a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp +++ b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp @@ -72,7 +72,7 @@ struct StackFrameLayoutAnalysis { : Slot(Idx), Size(MFI.getObjectSize(Idx)), Align(MFI.getObjectAlign(Idx).value()), Offset(Offset), SlotTy(Invalid), Scalable(false) { - Scalable = MFI.isScalableStackID(Idx); + Scalable = MFI.hasScalableStackID(Idx); if (MFI.isSpillSlotObjectIndex(Idx)) SlotTy = SlotType::Spill; else if (MFI.isFixedObjectIndex(Idx)) diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index d14abb4..8623c06 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -5857,7 +5857,7 @@ DoubleAPFloat frexp(const DoubleAPFloat &Arg, int &Exp, // practice. 
if (Exp == APFloat::IEK_NaN) { DoubleAPFloat Quiet{Arg}; - Quiet.getFirst().makeQuiet(); + Quiet.getFirst() = Quiet.getFirst().makeQuiet(); return Quiet; } diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 79655e1..0f4bbfc3 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1610,7 +1610,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, int BaseOffset = -AFI->getTaggedBasePointerOffset(); Register FrameReg; StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference( - MF, BaseOffset, false /*isFixed*/, false /*isSVE*/, FrameReg, + MF, BaseOffset, false /*isFixed*/, TargetStackID::Default /*StackID*/, + FrameReg, /*PreferFP=*/false, /*ForSimm=*/true); Register SrcReg = FrameReg; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 20b0d69..8d6eb91 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -56,15 +56,20 @@ // | async context if needed | // | (a.k.a. "frame record") | // |-----------------------------------| <- fp(=x29) -// | <hazard padding> | -// |-----------------------------------| -// | | -// | callee-saved fp/simd/SVE regs | -// | | -// |-----------------------------------| -// | | -// | SVE stack objects | -// | | +// Default SVE stack layout Split SVE objects +// (aarch64-split-sve-objects=false) (aarch64-split-sve-objects=true) +// |-----------------------------------| |-----------------------------------| +// | <hazard padding> | | callee-saved PPR registers | +// |-----------------------------------| |-----------------------------------| +// | | | PPR stack objects | +// | callee-saved fp/simd/SVE regs | |-----------------------------------| +// | | | <hazard padding> | +// |-----------------------------------| |-----------------------------------| +// | | | callee-saved ZPR/FPR registers | +// | SVE stack objects | |-----------------------------------| +// | | | ZPR stack objects | +// |-----------------------------------| |-----------------------------------| +// ^ NB: FPR CSRs are promoted to ZPRs // |-----------------------------------| // |.empty.space.to.make.part.below....| // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at @@ -274,6 +279,11 @@ static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects", cl::desc("sort stack allocations"), cl::init(true), cl::Hidden); +static cl::opt<bool> + SplitSVEObjects("aarch64-split-sve-objects", + cl::desc("Split allocation of ZPR & PPR objects"), + cl::init(false), cl::Hidden); + cl::opt<bool> EnableHomogeneousPrologEpilog( "homogeneous-prolog-epilog", cl::Hidden, cl::desc("Emit homogeneous prologue and epilogue for the size " @@ -324,6 +334,40 @@ AArch64FrameLowering::getArgumentStackToRestore(MachineFunction &MF, static bool produceCompactUnwindFrame(const AArch64FrameLowering &, MachineFunction &MF); +enum class AssignObjectOffsets { No, Yes }; +/// Process all the SVE stack objects and the SVE stack size and offsets for +/// each object. If AssignOffsets is "Yes", the offsets get assigned (and SVE +/// stack sizes set). Returns the size of the SVE stack. 
+static SVEStackSizes determineSVEStackSizes(MachineFunction &MF, + AssignObjectOffsets AssignOffsets); + +static unsigned getStackHazardSize(const MachineFunction &MF) { + return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize(); +} + +/// Returns true if PPRs are spilled as ZPRs. +static bool arePPRsSpilledAsZPR(const MachineFunction &MF) { + return MF.getSubtarget().getRegisterInfo()->getSpillSize( + AArch64::PPRRegClass) == 16; +} + +StackOffset +AArch64FrameLowering::getZPRStackSize(const MachineFunction &MF) const { + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return StackOffset::getScalable(AFI->getStackSizeZPR()); +} + +StackOffset +AArch64FrameLowering::getPPRStackSize(const MachineFunction &MF) const { + // With split SVE objects, the hazard padding is added to the PPR region, + // which places it between the [GPR, PPR] area and the [ZPR, FPR] area. This + // avoids hazards between both GPRs and FPRs and ZPRs and PPRs. + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return StackOffset::get(AFI->hasSplitSVEObjects() ? getStackHazardSize(MF) + : 0, + AFI->getStackSizePPR()); +} + // Conservatively, returns true if the function is likely to have SVE vectors // on the stack. This function is safe to be called before callee-saves or // object offsets have been determined. @@ -338,7 +382,7 @@ static bool isLikelyToHaveSVEStack(const AArch64FrameLowering &AFL, const MachineFrameInfo &MFI = MF.getFrameInfo(); for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd(); FI++) { - if (MFI.isScalableStackID(FI)) + if (MFI.hasScalableStackID(FI)) return true; } @@ -482,13 +526,6 @@ AArch64FrameLowering::getFixedObjectSize(const MachineFunction &MF, } } -/// Returns the size of the entire SVE stackframe (calleesaves + spills). -StackOffset -AArch64FrameLowering::getSVEStackSize(const MachineFunction &MF) const { - const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE()); -} - bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { if (!EnableRedZone) return false; @@ -514,7 +551,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { !Subtarget.hasSVE(); return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize || - getSVEStackSize(MF) || LowerQRegCopyThroughMem); + AFI->hasSVEStackSize() || LowerQRegCopyThroughMem); } /// hasFPImpl - Return true if the specified function should have a dedicated @@ -557,7 +594,7 @@ bool AArch64FrameLowering::hasFPImpl(const MachineFunction &MF) const { // CFA in either of these cases. if (AFI.needsDwarfUnwindInfo(MF) && ((requiresSaveVG(MF) || AFI.getSMEFnAttrs().hasStreamingBody()) && - (!AFI.hasCalculatedStackSizeSVE() || AFI.getStackSizeSVE() > 0))) + (!AFI.hasCalculatedStackSizeSVE() || AFI.hasSVEStackSize()))) return true; // With large callframes around we may need to use FP to access the scavenging // emergency spillslot. 
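For illustration only, the size accounting described by the new getZPRStackSize/getPPRStackSize helpers above can be modelled with a small standalone C++ sketch: both regions contribute scalable bytes, and with split SVE objects the fixed-size hazard padding is attributed to the PPR region so that it sits between the [GPR, PPR] and [ZPR, FPR] areas. The struct and function names below are invented for the sketch and are not the in-tree StackOffset/AArch64FunctionInfo API.

#include <cstdint>
#include <cstdio>

// Minimal standalone model of the split-SVE size accounting (hypothetical
// names; not the in-tree API).
struct SVEFrameModel {
  uint64_t StackSizeZPR = 0;        // scalable bytes: ZPR callee saves + locals
  uint64_t StackSizePPR = 0;        // scalable bytes: PPR callee saves + locals
  bool SplitSVEObjects = false;     // -aarch64-split-sve-objects
  uint64_t StreamingHazardSize = 0; // fixed bytes of hazard padding
};

// Analogous to getPPRStackSize(): when objects are split, the hazard padding
// is counted as the fixed part of the PPR region.
uint64_t pprRegionFixedBytes(const SVEFrameModel &F) {
  return F.SplitSVEObjects ? F.StreamingHazardSize : 0;
}

// Analogous to hasSVEStackSize(): any scalable allocation at all.
bool hasSVEStack(const SVEFrameModel &F) {
  return F.StackSizeZPR > 0 || F.StackSizePPR > 0;
}

int main() {
  SVEFrameModel F{/*ZPR=*/128, /*PPR=*/16, /*Split=*/true, /*Hazard=*/1024};
  std::printf("PPR region: %llu scalable + %llu fixed bytes\n",
              static_cast<unsigned long long>(F.StackSizePPR),
              static_cast<unsigned long long>(pprRegionFixedBytes(F)));
  std::printf("ZPR region: %llu scalable bytes, hasSVEStack=%d\n",
              static_cast<unsigned long long>(F.StackSizeZPR), hasSVEStack(F));
  return 0;
}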
@@ -1126,10 +1163,6 @@ static bool isTargetWindows(const MachineFunction &MF) { return MF.getSubtarget<AArch64Subtarget>().isTargetWindows(); } -static unsigned getStackHazardSize(const MachineFunction &MF) { - return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize(); -} - void AArch64FrameLowering::emitPacRetPlusLeafHardening( MachineFunction &MF) const { const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); @@ -1212,7 +1245,9 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, const auto &MFI = MF.getFrameInfo(); int64_t ObjectOffset = MFI.getObjectOffset(FI); - StackOffset SVEStackSize = getSVEStackSize(MF); + StackOffset ZPRStackSize = getZPRStackSize(MF); + StackOffset PPRStackSize = getPPRStackSize(MF); + StackOffset SVEStackSize = ZPRStackSize + PPRStackSize; // For VLA-area objects, just emit an offset at the end of the stack frame. // Whilst not quite correct, these objects do live at the end of the frame and @@ -1228,11 +1263,21 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); bool FPAfterSVECalleeSaves = isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize(); - if (MFI.isScalableStackID(FI)) { + if (MFI.hasScalableStackID(FI)) { if (FPAfterSVECalleeSaves && - -ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) + -ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) { + assert(!AFI->hasSplitSVEObjects() && + "split-sve-objects not supported with FPAfterSVECalleeSaves"); return StackOffset::getScalable(ObjectOffset); - return StackOffset::get(-((int64_t)AFI->getCalleeSavedStackSize()), + } + StackOffset AccessOffset{}; + // The scalable vectors are below (lower address) the scalable predicates + // with split SVE objects, so we must subtract the size of the predicates. 
+ if (AFI->hasSplitSVEObjects() && + MFI.getStackID(FI) == TargetStackID::ScalableVector) + AccessOffset = -PPRStackSize; + return AccessOffset + + StackOffset::get(-((int64_t)AFI->getCalleeSavedStackSize()), ObjectOffset); } @@ -1294,14 +1339,15 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference( const auto &MFI = MF.getFrameInfo(); int64_t ObjectOffset = MFI.getObjectOffset(FI); bool isFixed = MFI.isFixedObjectIndex(FI); - bool isSVE = MFI.isScalableStackID(FI); - return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg, - PreferFP, ForSimm); + auto StackID = static_cast<TargetStackID::Value>(MFI.getStackID(FI)); + return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, StackID, + FrameReg, PreferFP, ForSimm); } StackOffset AArch64FrameLowering::resolveFrameOffsetReference( - const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE, - Register &FrameReg, bool PreferFP, bool ForSimm) const { + const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, + TargetStackID::Value StackID, Register &FrameReg, bool PreferFP, + bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); const auto *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); @@ -1312,8 +1358,11 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed(); bool isCSR = !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI)); + bool isSVE = MFI.isScalableStackID(StackID); - const StackOffset &SVEStackSize = getSVEStackSize(MF); + StackOffset ZPRStackSize = getZPRStackSize(MF); + StackOffset PPRStackSize = getPPRStackSize(MF); + StackOffset SVEStackSize = ZPRStackSize + PPRStackSize; // Use frame pointer to reference fixed objects. Use it for locals if // there are VLAs or a dynamically realigned SP (and thus the SP isn't @@ -1388,12 +1437,25 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize(); if (isSVE) { - StackOffset FPOffset = - StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset); + StackOffset FPOffset = StackOffset::get( + -AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset); StackOffset SPOffset = SVEStackSize + StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(), ObjectOffset); + + // With split SVE objects the ObjectOffset is relative to the split area + // (i.e. the PPR area or ZPR area respectively). + if (AFI->hasSplitSVEObjects() && StackID == TargetStackID::ScalableVector) { + // If we're accessing an SVE vector with split SVE objects... + // - From the FP we need to move down past the PPR area: + FPOffset -= PPRStackSize; + // - From the SP we only need to move up to the ZPR area: + SPOffset -= PPRStackSize; + // Note: `SPOffset = SVEStackSize + ...`, so `-= PPRStackSize` results in + // `SPOffset = ZPRStackSize + ...`. + } + if (FPAfterSVECalleeSaves) { FPOffset += StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()); if (-ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) { @@ -1401,6 +1463,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( SPOffset += StackOffset::getFixed(AFI->getCalleeSavedStackSize()); } } + // Always use the FP for SVE spills if available and beneficial. 
if (hasFP(MF) && (SPOffset.getFixed() || FPOffset.getScalable() < SPOffset.getScalable() || @@ -1408,13 +1471,13 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( FrameReg = RegInfo->getFrameRegister(MF); return FPOffset; } - FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() : (unsigned)AArch64::SP; + return SPOffset; } - StackOffset ScalableOffset = {}; + StackOffset SVEAreaOffset = {}; if (FPAfterSVECalleeSaves) { // In this stack layout, the FP is in between the callee saves and other // SVE allocations. @@ -1422,25 +1485,25 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()); if (UseFP) { if (isFixed) - ScalableOffset = SVECalleeSavedStack; + SVEAreaOffset = SVECalleeSavedStack; else if (!isCSR) - ScalableOffset = SVECalleeSavedStack - SVEStackSize; + SVEAreaOffset = SVECalleeSavedStack - SVEStackSize; } else { if (isFixed) - ScalableOffset = SVEStackSize; + SVEAreaOffset = SVEStackSize; else if (isCSR) - ScalableOffset = SVEStackSize - SVECalleeSavedStack; + SVEAreaOffset = SVEStackSize - SVECalleeSavedStack; } } else { if (UseFP && !(isFixed || isCSR)) - ScalableOffset = -SVEStackSize; + SVEAreaOffset = -SVEStackSize; if (!UseFP && (isFixed || isCSR)) - ScalableOffset = SVEStackSize; + SVEAreaOffset = SVEStackSize; } if (UseFP) { FrameReg = RegInfo->getFrameRegister(MF); - return StackOffset::getFixed(FPOffset) + ScalableOffset; + return StackOffset::getFixed(FPOffset) + SVEAreaOffset; } // Use the base pointer if we have one. @@ -1457,7 +1520,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( Offset -= AFI->getLocalStackSize(); } - return StackOffset::getFixed(Offset) + ScalableOffset; + return StackOffset::getFixed(Offset) + SVEAreaOffset; } static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { @@ -1614,11 +1677,25 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, RegInc = -1; FirstReg = Count - 1; } + bool FPAfterSVECalleeSaves = IsWindows && AFI->getSVECalleeSavedStackSize(); - int ScalableByteOffset = - FPAfterSVECalleeSaves ? 0 : AFI->getSVECalleeSavedStackSize(); + + int ZPRByteOffset = 0; + int PPRByteOffset = 0; + bool SplitPPRs = AFI->hasSplitSVEObjects(); + if (SplitPPRs) { + ZPRByteOffset = AFI->getZPRCalleeSavedStackSize(); + PPRByteOffset = AFI->getPPRCalleeSavedStackSize(); + } else if (!FPAfterSVECalleeSaves) { + ZPRByteOffset = + AFI->getZPRCalleeSavedStackSize() + AFI->getPPRCalleeSavedStackSize(); + // Unused: Everything goes in ZPR space. + PPRByteOffset = 0; + } + bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace(); Register LastReg = 0; + bool HasCSHazardPadding = AFI->hasStackHazardSlotIndex() && !SplitPPRs; // When iterating backwards, the loop condition relies on unsigned wraparound. for (unsigned i = FirstReg; i < Count; i += RegInc) { @@ -1647,8 +1724,12 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, llvm_unreachable("Unsupported register class."); } + int &ScalableByteOffset = RPI.Type == RegPairInfo::PPR && SplitPPRs + ? PPRByteOffset + : ZPRByteOffset; + // Add the stack hazard size as we transition from GPR->FPR CSRs. 
- if (AFI->hasStackHazardSlotIndex() && + if (HasCSHazardPadding && (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) && AArch64InstrInfo::isFpOrNEON(RPI.Reg1)) ByteOffset += StackFillDir * StackHazardSize; @@ -1656,7 +1737,7 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, int Scale = TRI->getSpillSize(*RPI.RC); // Add the next reg to the pair if it is in the same register class. - if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) { + if (unsigned(i + RegInc) < Count && !HasCSHazardPadding) { MCRegister NextReg = CSI[i + RegInc].getReg(); bool IsFirst = i == FirstReg; switch (RPI.Type) { @@ -2203,6 +2284,13 @@ static std::optional<int> getLdStFrameID(const MachineInstr &MI, return getMMOFrameID(*MI.memoperands_begin(), MFI); } +// Returns true if the LDST MachineInstr \p MI is a PPR access. +static bool isPPRAccess(const MachineInstr &MI) { + return MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO && + MI.getOpcode() != AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO && + AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()); +} + // Check if a Hazard slot is needed for the current function, and if so create // one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex, // which can be used to determine if any hazard padding is needed. @@ -2226,25 +2314,50 @@ void AArch64FrameLowering::determineStackHazardSlot( bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) { return AArch64::FPR64RegClass.contains(Reg) || AArch64::FPR128RegClass.contains(Reg) || - AArch64::ZPRRegClass.contains(Reg) || - AArch64::PPRRegClass.contains(Reg); + AArch64::ZPRRegClass.contains(Reg); + }); + bool HasPPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) { + return AArch64::PPRRegClass.contains(Reg); }); bool HasFPRStackObjects = false; - if (!HasFPRCSRs) { - std::vector<unsigned> FrameObjects(MFI.getObjectIndexEnd()); + bool HasPPRStackObjects = false; + if (!HasFPRCSRs || SplitSVEObjects) { + enum SlotType : uint8_t { + Unknown = 0, + ZPRorFPR = 1 << 0, + PPR = 1 << 1, + GPR = 1 << 2, + LLVM_MARK_AS_BITMASK_ENUM(GPR) + }; + + // Find stack slots solely used for one kind of register (ZPR, PPR, etc.), + // based on the kinds of accesses used in the function. + SmallVector<SlotType> SlotTypes(MFI.getObjectIndexEnd(), SlotType::Unknown); for (auto &MBB : MF) { for (auto &MI : MBB) { std::optional<int> FI = getLdStFrameID(MI, MFI); - if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) { - if (MFI.isScalableStackID(*FI) || AArch64InstrInfo::isFpOrNEON(MI)) - FrameObjects[*FI] |= 2; - else - FrameObjects[*FI] |= 1; + if (!FI || FI < 0 || FI > int(SlotTypes.size())) + continue; + if (MFI.hasScalableStackID(*FI)) { + SlotTypes[*FI] |= + isPPRAccess(MI) ? SlotType::PPR : SlotType::ZPRorFPR; + } else { + SlotTypes[*FI] |= AArch64InstrInfo::isFpOrNEON(MI) + ? SlotType::ZPRorFPR + : SlotType::GPR; } } } - HasFPRStackObjects = - any_of(FrameObjects, [](unsigned B) { return (B & 3) == 2; }); + + for (int FI = 0; FI < int(SlotTypes.size()); ++FI) { + HasFPRStackObjects |= SlotTypes[FI] == SlotType::ZPRorFPR; + // For SplitSVEObjects remember that this stack slot is a predicate, this + // will be needed later when determining the frame layout. 
+ if (SlotTypes[FI] == SlotType::PPR) { + MFI.setStackID(FI, TargetStackID::ScalablePredicateVector); + HasPPRStackObjects = true; + } + } } if (HasFPRCSRs || HasFPRStackObjects) { @@ -2253,6 +2366,78 @@ void AArch64FrameLowering::determineStackHazardSlot( << StackHazardSize << "\n"); AFI->setStackHazardSlotIndex(ID); } + + // Determine if we should use SplitSVEObjects. This should only be used if + // there's a possibility of a stack hazard between PPRs and ZPRs or FPRs. + if (SplitSVEObjects) { + if (!HasPPRCSRs && !HasPPRStackObjects) { + LLVM_DEBUG( + dbgs() << "Not using SplitSVEObjects as no PPRs are on the stack\n"); + return; + } + + if (!HasFPRCSRs && !HasFPRStackObjects) { + LLVM_DEBUG( + dbgs() + << "Not using SplitSVEObjects as no FPRs or ZPRs are on the stack\n"); + return; + } + + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + if (MFI.hasVarSizedObjects() || TRI->hasStackRealignment(MF)) { + LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with variable " + "sized objects or realignment\n"); + return; + } + + if (arePPRsSpilledAsZPR(MF)) { + LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with " + "-aarch64-enable-zpr-predicate-spills"); + return; + } + + // If another calling convention is explicitly set FPRs can't be promoted to + // ZPR callee-saves. + if (!is_contained({CallingConv::C, CallingConv::Fast, + CallingConv::AArch64_SVE_VectorCall}, + MF.getFunction().getCallingConv())) { + LLVM_DEBUG( + dbgs() << "Calling convention is not supported with SplitSVEObjects"); + return; + } + + [[maybe_unused]] const AArch64Subtarget &Subtarget = + MF.getSubtarget<AArch64Subtarget>(); + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Expected SVE to be available for PPRs"); + + // With SplitSVEObjects the CS hazard padding is placed between the + // PPRs and ZPRs. If there are any FPR CS there would be a hazard between + // them and the CS GRPs. Avoid this by promoting all FPR CS to ZPRs. + BitVector FPRZRegs(SavedRegs.size()); + for (size_t Reg = 0, E = SavedRegs.size(); HasFPRCSRs && Reg < E; ++Reg) { + BitVector::reference RegBit = SavedRegs[Reg]; + if (!RegBit) + continue; + unsigned SubRegIdx = 0; + if (AArch64::FPR64RegClass.contains(Reg)) + SubRegIdx = AArch64::dsub; + else if (AArch64::FPR128RegClass.contains(Reg)) + SubRegIdx = AArch64::zsub; + else + continue; + // Clear the bit for the FPR save. + RegBit = false; + // Mark that we should save the corresponding ZPR. 
+ Register ZReg = + TRI->getMatchingSuperReg(Reg, SubRegIdx, &AArch64::ZPRRegClass); + FPRZRegs.set(ZReg); + } + SavedRegs |= FPRZRegs; + + AFI->setSplitSVEObjects(true); + LLVM_DEBUG(dbgs() << "SplitSVEObjects enabled!\n"); + } } void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, @@ -2263,10 +2448,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); - const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); unsigned UnspilledCSGPR = AArch64::NoRegister; unsigned UnspilledCSGPRPaired = AArch64::NoRegister; @@ -2385,17 +2571,26 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(AArch64::X18); } + // Determine if a Hazard slot should be used and where it should go. + // If SplitSVEObjects is used, the hazard padding is placed between the PPRs + // and ZPRs. Otherwise, it goes in the callee save area. + determineStackHazardSlot(MF, SavedRegs); + // Calculates the callee saved stack size. unsigned CSStackSize = 0; - unsigned SVECSStackSize = 0; + unsigned ZPRCSStackSize = 0; + unsigned PPRCSStackSize = 0; const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); for (unsigned Reg : SavedRegs.set_bits()) { auto *RC = TRI->getMinimalPhysRegClass(Reg); assert(RC && "expected register class!"); auto SpillSize = TRI->getSpillSize(*RC); - if (AArch64::PPRRegClass.contains(Reg) || - AArch64::ZPRRegClass.contains(Reg)) - SVECSStackSize += SpillSize; + bool IsZPR = AArch64::ZPRRegClass.contains(Reg); + bool IsPPR = !IsZPR && AArch64::PPRRegClass.contains(Reg); + if (IsZPR || (IsPPR && arePPRsSpilledAsZPR(MF))) + ZPRCSStackSize += SpillSize; + else if (IsPPR) + PPRCSStackSize += SpillSize; else CSStackSize += SpillSize; } @@ -2405,17 +2600,15 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // only 64-bit GPRs can be added to SavedRegs. unsigned NumSavedRegs = SavedRegs.count(); + // If we have hazard padding in the CS area add that to the size. + if (AFI->isStackHazardIncludedInCalleeSaveArea()) + CSStackSize += getStackHazardSize(MF); + // Increase the callee-saved stack size if the function has streaming mode // changes, as we will need to spill the value of the VG register. if (requiresSaveVG(MF)) CSStackSize += 8; - // Determine if a Hazard slot should be used, and increase the CSStackSize by - // StackHazardSize if so. - determineStackHazardSlot(MF, SavedRegs); - if (AFI->hasStackHazardSlotIndex()) - CSStackSize += getStackHazardSize(MF); - // If we must call __arm_get_current_vg in the prologue preserve the LR. if (requiresSaveVG(MF) && !Subtarget.hasSVE()) SavedRegs.set(AArch64::LR); @@ -2436,8 +2629,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, }); // If any callee-saved registers are used, the frame cannot be eliminated. 
- int64_t SVEStackSize = - alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16); + auto [ZPRLocalStackSize, PPRLocalStackSize] = + determineSVEStackSizes(MF, AssignObjectOffsets::No); + uint64_t SVELocals = ZPRLocalStackSize + PPRLocalStackSize; + uint64_t SVEStackSize = + alignTo(ZPRCSStackSize + PPRCSStackSize + SVELocals, 16); bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize; // The CSR spill slots have not been allocated yet, so estimateStackSize @@ -2522,7 +2718,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // instructions. AFI->setCalleeSavedStackSize(AlignedCSStackSize); AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize); - AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16)); + AFI->setSVECalleeSavedStackSize(ZPRCSStackSize, alignTo(PPRCSStackSize, 16)); } bool AArch64FrameLowering::assignCalleeSavedSpillSlots( @@ -2575,7 +2771,7 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); // Create a hazard slot as we switch between GPR and FPR CSRs. - if (AFI->hasStackHazardSlotIndex() && + if (AFI->isStackHazardIncludedInCalleeSaveArea() && (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) && AArch64InstrInfo::isFpOrNEON(Reg)) { assert(HazardSlotIndex == std::numeric_limits<int>::max() && @@ -2614,7 +2810,7 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( } // Add hazard slot in the case where no FPR CSRs are present. - if (AFI->hasStackHazardSlotIndex() && + if (AFI->isStackHazardIncludedInCalleeSaveArea() && HazardSlotIndex == std::numeric_limits<int>::max()) { HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true); LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex @@ -2661,7 +2857,6 @@ static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI, assert((Max == std::numeric_limits<int>::min() || Max + 1 == CS.getFrameIdx()) && "SVE CalleeSaves are not consecutive"); - Min = std::min(Min, CS.getFrameIdx()); Max = std::max(Max, CS.getFrameIdx()); } @@ -2669,43 +2864,64 @@ static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI, return Min != std::numeric_limits<int>::max(); } -// Process all the SVE stack objects and determine offsets for each -// object. If AssignOffsets is true, the offsets get assigned. -// Fills in the first and last callee-saved frame indices into -// Min/MaxCSFrameIndex, respectively. -// Returns the size of the stack. -static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, - int &MinCSFrameIndex, - int &MaxCSFrameIndex, - bool AssignOffsets) { +static SVEStackSizes determineSVEStackSizes(MachineFunction &MF, + AssignObjectOffsets AssignOffsets) { + MachineFrameInfo &MFI = MF.getFrameInfo(); + auto *AFI = MF.getInfo<AArch64FunctionInfo>(); + + SVEStackSizes SVEStack{}; + + // With SplitSVEObjects we maintain separate stack offsets for predicates + // (PPRs) and SVE vectors (ZPRs). When SplitSVEObjects is disabled predicates + // are included in the SVE vector area. + uint64_t &ZPRStackTop = SVEStack.ZPRStackSize; + uint64_t &PPRStackTop = + AFI->hasSplitSVEObjects() ? SVEStack.PPRStackSize : SVEStack.ZPRStackSize; + #ifndef NDEBUG // First process all fixed stack objects. 
for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) - assert(!MFI.isScalableStackID(I) && + assert(!MFI.hasScalableStackID(I) && "SVE vectors should never be passed on the stack by value, only by " "reference."); #endif - auto Assign = [&MFI](int FI, int64_t Offset) { + auto AllocateObject = [&](int FI) { + uint64_t &StackTop = MFI.getStackID(FI) == TargetStackID::ScalableVector + ? ZPRStackTop + : PPRStackTop; + + // FIXME: Given that the length of SVE vectors is not necessarily a power of + // two, we'd need to align every object dynamically at runtime if the + // alignment is larger than 16. This is not yet supported. + Align Alignment = MFI.getObjectAlign(FI); + if (Alignment > Align(16)) + report_fatal_error( + "Alignment of scalable vectors > 16 bytes is not yet supported"); + + StackTop += MFI.getObjectSize(FI); + StackTop = alignTo(StackTop, Alignment); + + assert(StackTop < std::numeric_limits<int64_t>::max() && + "SVE StackTop far too large?!"); + + int64_t Offset = -int64_t(StackTop); + if (AssignOffsets == AssignObjectOffsets::Yes) + MFI.setObjectOffset(FI, Offset); + LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n"); - MFI.setObjectOffset(FI, Offset); }; - int64_t Offset = 0; - // Then process all callee saved slots. + int MinCSFrameIndex, MaxCSFrameIndex; if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) { - // Assign offsets to the callee save slots. - for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) { - Offset += MFI.getObjectSize(I); - Offset = alignTo(Offset, MFI.getObjectAlign(I)); - if (AssignOffsets) - Assign(I, -Offset); - } + for (int FI = MinCSFrameIndex; FI <= MaxCSFrameIndex; ++FI) + AllocateObject(FI); } - // Ensure that the Callee-save area is aligned to 16bytes. - Offset = alignTo(Offset, Align(16U)); + // Ensure the CS area is 16-byte aligned. + PPRStackTop = alignTo(PPRStackTop, Align(16U)); + ZPRStackTop = alignTo(ZPRStackTop, Align(16U)); // Create a buffer of SVE objects to allocate and sort it. SmallVector<int, 8> ObjectsToAllocate; @@ -2715,50 +2931,34 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, int StackProtectorFI = -1; if (MFI.hasStackProtectorIndex()) { StackProtectorFI = MFI.getStackProtectorIndex(); - if (MFI.isScalableStackID(StackProtectorFI)) + if (MFI.getStackID(StackProtectorFI) == TargetStackID::ScalableVector) ObjectsToAllocate.push_back(StackProtectorFI); } - for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { - if (!MFI.isScalableStackID(I)) - continue; - if (I == StackProtectorFI) + + for (int FI = 0, E = MFI.getObjectIndexEnd(); FI != E; ++FI) { + if (FI == StackProtectorFI || MFI.isDeadObjectIndex(FI)) continue; - if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex) + if (MaxCSFrameIndex >= FI && FI >= MinCSFrameIndex) continue; - if (MFI.isDeadObjectIndex(I)) + + if (MFI.getStackID(FI) != TargetStackID::ScalableVector && + MFI.getStackID(FI) != TargetStackID::ScalablePredicateVector) continue; - ObjectsToAllocate.push_back(I); + ObjectsToAllocate.push_back(FI); } // Allocate all SVE locals and spills - for (unsigned FI : ObjectsToAllocate) { - Align Alignment = MFI.getObjectAlign(FI); - // FIXME: Given that the length of SVE vectors is not necessarily a power of - // two, we'd need to align every object dynamically at runtime if the - // alignment is larger than 16. This is not yet supported. 
- if (Alignment > Align(16)) - report_fatal_error( - "Alignment of scalable vectors > 16 bytes is not yet supported"); - - Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment); - if (AssignOffsets) - Assign(FI, -Offset); - } + for (unsigned FI : ObjectsToAllocate) + AllocateObject(FI); - return Offset; -} + PPRStackTop = alignTo(PPRStackTop, Align(16U)); + ZPRStackTop = alignTo(ZPRStackTop, Align(16U)); -int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets( - MachineFrameInfo &MFI) const { - int MinCSFrameIndex, MaxCSFrameIndex; - return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false); -} + if (AssignOffsets == AssignObjectOffsets::Yes) + AFI->setStackSizeSVE(SVEStack.ZPRStackSize, SVEStack.PPRStackSize); -int64_t AArch64FrameLowering::assignSVEStackObjectOffsets( - MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const { - return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, - true); + return SVEStack; } /// Attempts to scavenge a register from \p ScavengeableRegs given the used @@ -3072,12 +3272,7 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized( assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown && "Upwards growing stack unsupported"); - int MinCSFrameIndex, MaxCSFrameIndex; - int64_t SVEStackSize = - assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex); - - AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U)); - AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex); + (void)determineSVEStackSizes(MF, AssignObjectOffsets::Yes); // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. @@ -3361,7 +3556,8 @@ void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI, Register Reg; FrameRegOffset = TFI->resolveFrameOffsetReference( - *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg, + *MF, FirstTagStore.Offset, false /*isFixed*/, + TargetStackID::Default /*StackID*/, Reg, /*PreferFP=*/false, /*ForSimm=*/true); FrameReg = Reg; FrameRegUpdate = std::nullopt; @@ -3599,7 +3795,7 @@ StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP( // Go to common code if we cannot provide sp + offset. 
if (MFI.hasVarSizedObjects() || - MF.getInfo<AArch64FunctionInfo>()->getStackSizeSVE() || + MF.getInfo<AArch64FunctionInfo>()->hasSVEStackSize() || MF.getSubtarget().getRegisterInfo()->hasStackRealignment(MF)) return getFrameIndexReference(MF, FI, FrameReg); @@ -3701,10 +3897,12 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) { void AArch64FrameLowering::orderFrameObjects( const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const { - if (!OrderFrameObjects || ObjectsToAllocate.empty()) + const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); + + if ((!OrderFrameObjects && !AFI.hasSplitSVEObjects()) || + ObjectsToAllocate.empty()) return; - const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); const MachineFrameInfo &MFI = MF.getFrameInfo(); std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd()); for (auto &Obj : ObjectsToAllocate) { @@ -3723,7 +3921,8 @@ void AArch64FrameLowering::orderFrameObjects( if (AFI.hasStackHazardSlotIndex()) { std::optional<int> FI = getLdStFrameID(MI, MFI); if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) { - if (MFI.isScalableStackID(*FI) || AArch64InstrInfo::isFpOrNEON(MI)) + if (MFI.getStackID(*FI) == TargetStackID::ScalableVector || + AArch64InstrInfo::isFpOrNEON(MI)) FrameObjects[*FI].Accesses |= FrameObject::AccessFPR; else FrameObjects[*FI].Accesses |= FrameObject::AccessGPR; @@ -4081,7 +4280,7 @@ void AArch64FrameLowering::emitRemarks( } unsigned RegTy = StackAccess::AccessType::GPR; - if (MFI.isScalableStackID(FrameIdx)) { + if (MFI.hasScalableStackID(FrameIdx)) { // SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO // spill/fill the predicate as a data vector (so are an FPR access). if (MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO && diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 20d1d6a..32a9bd8 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -24,6 +24,11 @@ class AArch64FunctionInfo; class AArch64PrologueEmitter; class AArch64EpilogueEmitter; +struct SVEStackSizes { + uint64_t ZPRStackSize{0}; + uint64_t PPRStackSize{0}; +}; + class AArch64FrameLowering : public TargetFrameLowering { public: explicit AArch64FrameLowering() @@ -64,8 +69,9 @@ public: bool ForSimm) const; StackOffset resolveFrameOffsetReference(const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, - bool isSVE, Register &FrameReg, - bool PreferFP, bool ForSimm) const; + TargetStackID::Value StackID, + Register &FrameReg, bool PreferFP, + bool ForSimm) const; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef<CalleeSavedInfo> CSI, @@ -147,7 +153,17 @@ public: bool requiresSaveVG(const MachineFunction &MF) const; - StackOffset getSVEStackSize(const MachineFunction &MF) const; + /// Returns the size of the entire ZPR stackframe (calleesaves + spills). + StackOffset getZPRStackSize(const MachineFunction &MF) const; + + /// Returns the size of the entire PPR stackframe (calleesaves + spills + + /// hazard padding). + StackOffset getPPRStackSize(const MachineFunction &MF) const; + + /// Returns the size of the entire SVE stackframe (PPRs + ZPRs). 
+ StackOffset getSVEStackSize(const MachineFunction &MF) const { + return getZPRStackSize(MF) + getPPRStackSize(MF); + } friend class AArch64PrologueEpilogueCommon; friend class AArch64PrologueEmitter; @@ -167,10 +183,6 @@ private: /// Returns true if CSRs should be paired. bool producePairRegisters(MachineFunction &MF) const; - int64_t estimateSVEStackObjectOffsets(MachineFrameInfo &MF) const; - int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF, - int &MinCSFrameIndex, - int &MaxCSFrameIndex) const; /// Make a determination whether a Hazard slot is used and create it if /// needed. void determineStackHazardSlot(MachineFunction &MF, diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 35bbb0c0..e7b2d20 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -7497,7 +7497,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, int FI = cast<FrameIndexSDNode>(N)->getIndex(); // We can only encode VL scaled offsets, so only fold in frame indexes // referencing SVE objects. - if (MFI.isScalableStackID(FI)) { + if (MFI.hasScalableStackID(FI)) { Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64); return true; @@ -7543,7 +7543,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, int FI = cast<FrameIndexSDNode>(Base)->getIndex(); // We can only encode VL scaled offsets, so only fold in frame indexes // referencing SVE objects. - if (MFI.isScalableStackID(FI)) + if (MFI.hasScalableStackID(FI)) Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c2a482a..70d5ad7d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9256,7 +9256,7 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, (MI.getOpcode() == AArch64::ADDXri || MI.getOpcode() == AArch64::SUBXri)) { const MachineOperand &MO = MI.getOperand(1); - if (MO.isFI() && MF.getFrameInfo().isScalableStackID(MO.getIndex())) + if (MO.isFI() && MF.getFrameInfo().hasScalableStackID(MO.getIndex())) MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false, /*IsImplicit=*/true)); } @@ -29608,7 +29608,7 @@ void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const { // than doing it here in finalizeLowering. 
if (MFI.hasStackProtectorIndex()) { for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) { - if (MFI.isScalableStackID(i) && + if (MFI.hasScalableStackID(i) && MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) { MFI.setStackID(MFI.getStackProtectorIndex(), TargetStackID::ScalableVector); diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp index a81f5b3..b3c9656 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -23,12 +23,21 @@ using namespace llvm; +static std::optional<uint64_t> +getSVEStackSize(const AArch64FunctionInfo &MFI, + uint64_t (AArch64FunctionInfo::*GetStackSize)() const) { + if (!MFI.hasCalculatedStackSizeSVE()) + return std::nullopt; + return (MFI.*GetStackSize)(); +} + yaml::AArch64FunctionInfo::AArch64FunctionInfo( const llvm::AArch64FunctionInfo &MFI) : HasRedZone(MFI.hasRedZone()), - StackSizeSVE(MFI.hasCalculatedStackSizeSVE() - ? std::optional<uint64_t>(MFI.getStackSizeSVE()) - : std::nullopt), + StackSizeZPR( + getSVEStackSize(MFI, &llvm::AArch64FunctionInfo::getStackSizeZPR)), + StackSizePPR( + getSVEStackSize(MFI, &llvm::AArch64FunctionInfo::getStackSizePPR)), HasStackFrame(MFI.hasStackFrame() ? std::optional<bool>(MFI.hasStackFrame()) : std::nullopt) {} @@ -41,8 +50,9 @@ void AArch64FunctionInfo::initializeBaseYamlFields( const yaml::AArch64FunctionInfo &YamlMFI) { if (YamlMFI.HasRedZone) HasRedZone = YamlMFI.HasRedZone; - if (YamlMFI.StackSizeSVE) - setStackSizeSVE(*YamlMFI.StackSizeSVE); + if (YamlMFI.StackSizeZPR || YamlMFI.StackSizePPR) + setStackSizeSVE(YamlMFI.StackSizeZPR.value_or(0), + YamlMFI.StackSizePPR.value_or(0)); if (YamlMFI.HasStackFrame) setHasStackFrame(*YamlMFI.HasStackFrame); } diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 897c7e8..91e64e6 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -74,13 +74,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// Amount of stack frame size, not including callee-saved registers. uint64_t LocalStackSize = 0; - /// The start and end frame indices for the SVE callee saves. - int MinSVECSFrameIndex = 0; - int MaxSVECSFrameIndex = 0; - /// Amount of stack frame size used for saving callee-saved registers. unsigned CalleeSavedStackSize = 0; - unsigned SVECalleeSavedStackSize = 0; + unsigned ZPRCalleeSavedStackSize = 0; + unsigned PPRCalleeSavedStackSize = 0; bool HasCalleeSavedStackSize = false; bool HasSVECalleeSavedStackSize = false; @@ -137,9 +134,14 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// SVE stack size (for predicates and data vectors) are maintained here /// rather than in FrameInfo, as the placement and Stack IDs are target /// specific. - uint64_t StackSizeSVE = 0; + uint64_t StackSizeZPR = 0; + uint64_t StackSizePPR = 0; + + /// Are SVE objects (vectors and predicates) split into separate regions on + /// the stack. + bool SplitSVEObjects = false; - /// HasCalculatedStackSizeSVE indicates whether StackSizeSVE is valid. + /// HasCalculatedStackSizeSVE indicates whether StackSizeZPR/PPR is valid. 
bool HasCalculatedStackSizeSVE = false; /// Has a value when it is known whether or not the function uses a @@ -312,16 +314,25 @@ public: TailCallReservedStack = bytes; } - bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } - - void setStackSizeSVE(uint64_t S) { + void setStackSizeSVE(uint64_t ZPR, uint64_t PPR) { + StackSizeZPR = ZPR; + StackSizePPR = PPR; HasCalculatedStackSizeSVE = true; - StackSizeSVE = S; } - uint64_t getStackSizeSVE() const { + uint64_t getStackSizeZPR() const { + assert(hasCalculatedStackSizeSVE()); + return StackSizeZPR; + } + uint64_t getStackSizePPR() const { assert(hasCalculatedStackSizeSVE()); - return StackSizeSVE; + return StackSizePPR; + } + + bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } + + bool hasSVEStackSize() const { + return getStackSizeZPR() > 0 || getStackSizePPR() > 0; } bool hasStackFrame() const { return HasStackFrame; } @@ -329,7 +340,6 @@ public: bool isStackRealigned() const { return StackRealigned; } void setStackRealigned(bool s) { StackRealigned = s; } - bool hasCalleeSaveStackFreeSpace() const { return CalleeSaveStackHasFreeSpace; } @@ -414,29 +424,37 @@ public: } // Saves the CalleeSavedStackSize for SVE vectors in 'scalable bytes' - void setSVECalleeSavedStackSize(unsigned Size) { - SVECalleeSavedStackSize = Size; + void setSVECalleeSavedStackSize(unsigned ZPR, unsigned PPR) { + ZPRCalleeSavedStackSize = ZPR; + PPRCalleeSavedStackSize = PPR; HasSVECalleeSavedStackSize = true; } - unsigned getSVECalleeSavedStackSize() const { + unsigned getZPRCalleeSavedStackSize() const { assert(HasSVECalleeSavedStackSize && - "SVECalleeSavedStackSize has not been calculated"); - return SVECalleeSavedStackSize; + "ZPRCalleeSavedStackSize has not been calculated"); + return ZPRCalleeSavedStackSize; } - - void setMinMaxSVECSFrameIndex(int Min, int Max) { - MinSVECSFrameIndex = Min; - MaxSVECSFrameIndex = Max; + unsigned getPPRCalleeSavedStackSize() const { + assert(HasSVECalleeSavedStackSize && + "PPRCalleeSavedStackSize has not been calculated"); + return PPRCalleeSavedStackSize; } - int getMinSVECSFrameIndex() const { return MinSVECSFrameIndex; } - int getMaxSVECSFrameIndex() const { return MaxSVECSFrameIndex; } + unsigned getSVECalleeSavedStackSize() const { + assert(!hasSplitSVEObjects() && + "ZPRs and PPRs are split. 
Use get[ZPR|PPR]CalleeSavedStackSize()"); + return getZPRCalleeSavedStackSize() + getPPRCalleeSavedStackSize(); + } void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamicTLSAccesses; } + bool isStackHazardIncludedInCalleeSaveArea() const { + return hasStackHazardSlotIndex() && !hasSplitSVEObjects(); + } + std::optional<bool> hasRedZone() const { return HasRedZone; } void setHasRedZone(bool s) { HasRedZone = s; } @@ -472,6 +490,15 @@ public: StackHazardCSRSlotIndex = Index; } + bool hasSplitSVEObjects() const { return SplitSVEObjects; } + void setSplitSVEObjects(bool s) { SplitSVEObjects = s; } + + bool hasSVE_AAPCS(const MachineFunction &MF) const { + return hasSplitSVEObjects() || isSVECC() || + MF.getFunction().getCallingConv() == + CallingConv::AArch64_SVE_VectorCall; + } + SMEAttrs getSMEFnAttrs() const { return SMEFnAttrs; } unsigned getSRetReturnReg() const { return SRetReturnReg; } @@ -611,7 +638,8 @@ private: namespace yaml { struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo { std::optional<bool> HasRedZone; - std::optional<uint64_t> StackSizeSVE; + std::optional<uint64_t> StackSizeZPR; + std::optional<uint64_t> StackSizePPR; std::optional<bool> HasStackFrame; AArch64FunctionInfo() = default; @@ -624,7 +652,8 @@ struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo { template <> struct MappingTraits<AArch64FunctionInfo> { static void mapping(IO &YamlIO, AArch64FunctionInfo &MFI) { YamlIO.mapOptional("hasRedZone", MFI.HasRedZone); - YamlIO.mapOptional("stackSizeSVE", MFI.StackSizeSVE); + YamlIO.mapOptional("stackSizeZPR", MFI.StackSizeZPR); + YamlIO.mapOptional("stackSizePPR", MFI.StackSizePPR); YamlIO.mapOptional("hasStackFrame", MFI.HasStackFrame); } }; diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 5da16b9..aed137c 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -48,21 +48,19 @@ bool AArch64PrologueEpilogueCommon::isVGInstruction( return Opc == TargetOpcode::COPY; } -// Convenience function to determine whether I is an SVE callee save. -static bool isSVECalleeSave(MachineBasicBlock::iterator I) { +// Convenience function to determine whether I is part of the ZPR callee saves. +static bool isPartOfZPRCalleeSaves(MachineBasicBlock::iterator I) { switch (I->getOpcode()) { default: return false; - case AArch64::PTRUE_C_B: case AArch64::LD1B_2Z_IMM: case AArch64::ST1B_2Z_IMM: case AArch64::STR_ZXI: - case AArch64::STR_PXI: case AArch64::LDR_ZXI: - case AArch64::LDR_PXI: - case AArch64::PTRUE_B: case AArch64::CPY_ZPzI_B: case AArch64::CMPNE_PPzZI_B: + case AArch64::PTRUE_C_B: + case AArch64::PTRUE_B: return I->getFlag(MachineInstr::FrameSetup) || I->getFlag(MachineInstr::FrameDestroy); case AArch64::SEH_SavePReg: @@ -71,6 +69,23 @@ static bool isSVECalleeSave(MachineBasicBlock::iterator I) { } } +// Convenience function to determine whether I is part of the PPR callee saves. +static bool isPartOfPPRCalleeSaves(MachineBasicBlock::iterator I) { + switch (I->getOpcode()) { + default: + return false; + case AArch64::STR_PXI: + case AArch64::LDR_PXI: + return I->getFlag(MachineInstr::FrameSetup) || + I->getFlag(MachineInstr::FrameDestroy); + } +} + +// Convenience function to determine whether I is part of the SVE callee saves. 
+static bool isPartOfSVECalleeSaves(MachineBasicBlock::iterator I) { + return isPartOfZPRCalleeSaves(I) || isPartOfPPRCalleeSaves(I); +} + AArch64PrologueEpilogueCommon::AArch64PrologueEpilogueCommon( MachineFunction &MF, MachineBasicBlock &MBB, const AArch64FrameLowering &AFL) @@ -316,7 +331,7 @@ bool AArch64PrologueEpilogueCommon::shouldCombineCSRLocalStackBump( // When there is an SVE area on the stack, always allocate the // callee-saves and spills/locals separately. - if (AFL.getSVEStackSize(MF)) + if (AFI->hasSVEStackSize()) return false; return true; @@ -639,7 +654,7 @@ void AArch64PrologueEmitter::emitPrologue() { // Now allocate space for the GPR callee saves. MachineBasicBlock::iterator MBBI = PrologueBeginI; - while (MBBI != EndI && isSVECalleeSave(MBBI)) + while (MBBI != EndI && isPartOfSVECalleeSaves(MBBI)) ++MBBI; FirstGPRSaveI = convertCalleeSaveRestoreToSPPrePostIncDec( MBBI, DL, -AFI->getCalleeSavedStackSize(), EmitAsyncCFI); @@ -669,7 +684,7 @@ void AArch64PrologueEmitter::emitPrologue() { MachineBasicBlock::iterator AfterGPRSavesI = FirstGPRSaveI; while (AfterGPRSavesI != EndI && AfterGPRSavesI->getFlag(MachineInstr::FrameSetup) && - !isSVECalleeSave(AfterGPRSavesI)) { + !isPartOfSVECalleeSaves(AfterGPRSavesI)) { if (CombineSPBump && // Only fix-up frame-setup load/store instructions. (!AFL.requiresSaveVG(MF) || !isVGInstruction(AfterGPRSavesI, TLI))) @@ -700,56 +715,105 @@ void AArch64PrologueEmitter::emitPrologue() { if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding); - StackOffset SVEStackSize = AFL.getSVEStackSize(MF); - StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize; - MachineBasicBlock::iterator CalleeSavesEnd = AfterGPRSavesI; + StackOffset PPRCalleeSavesSize = + StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize()); + StackOffset ZPRCalleeSavesSize = + StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize()); + StackOffset SVECalleeSavesSize = PPRCalleeSavesSize + ZPRCalleeSavesSize; + StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize; + StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize; + + std::optional<MachineBasicBlock::iterator> ZPRCalleeSavesBegin, + ZPRCalleeSavesEnd, PPRCalleeSavesBegin, PPRCalleeSavesEnd; StackOffset CFAOffset = StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); - - // Process the SVE callee-saves to determine what space needs to be - // allocated. MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI; - if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { - LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize - << "\n"); - SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize); - SVELocalsSize = SVEStackSize - SVECalleeSavesSize; - // Find callee save instructions in frame. - // Note: With FPAfterSVECalleeSaves the callee saves have already been - // allocated. - if (!FPAfterSVECalleeSaves) { - MachineBasicBlock::iterator CalleeSavesBegin = AfterGPRSavesI; - assert(isSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); - while (isSVECalleeSave(AfterSVESavesI) && + if (!FPAfterSVECalleeSaves) { + // Process the SVE callee-saves to find the starts/ends of the ZPR and PPR + // areas. 
+ PPRCalleeSavesBegin = AfterGPRSavesI; + if (PPRCalleeSavesSize) { + LLVM_DEBUG(dbgs() << "PPRCalleeSavedStackSize = " + << PPRCalleeSavesSize.getScalable() << "\n"); + + assert(isPartOfPPRCalleeSaves(*PPRCalleeSavesBegin) && + "Unexpected instruction"); + while (isPartOfPPRCalleeSaves(AfterSVESavesI) && + AfterSVESavesI != MBB.getFirstTerminator()) + ++AfterSVESavesI; + } + PPRCalleeSavesEnd = ZPRCalleeSavesBegin = AfterSVESavesI; + if (ZPRCalleeSavesSize) { + LLVM_DEBUG(dbgs() << "ZPRCalleeSavedStackSize = " + << ZPRCalleeSavesSize.getScalable() << "\n"); + assert(isPartOfZPRCalleeSaves(*ZPRCalleeSavesBegin) && + "Unexpected instruction"); + while (isPartOfZPRCalleeSaves(AfterSVESavesI) && AfterSVESavesI != MBB.getFirstTerminator()) ++AfterSVESavesI; - CalleeSavesEnd = AfterSVESavesI; - - StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes); - // Allocate space for the callee saves (if any). - allocateStackSpace(CalleeSavesBegin, 0, SVECalleeSavesSize, - EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || LocalsSize); } + ZPRCalleeSavesEnd = AfterSVESavesI; } - CFAOffset += SVECalleeSavesSize; if (EmitAsyncCFI) - emitCalleeSavedSVELocations(CalleeSavesEnd); - - // Allocate space for the rest of the frame including SVE locals. Align the - // stack as necessary. - assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) && - "Cannot use redzone with stack realignment"); - if (!AFL.canUseRedZone(MF)) { - // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have - // the correct value here, as NumBytes also includes padding bytes, - // which shouldn't be counted here. - allocateStackSpace(CalleeSavesEnd, RealignmentPadding, - SVELocalsSize + StackOffset::getFixed(NumBytes), + emitCalleeSavedSVELocations(AfterSVESavesI); + + if (AFI->hasSplitSVEObjects()) { + assert(!FPAfterSVECalleeSaves && + "Cannot use FPAfterSVECalleeSaves with aarch64-split-sve-objects"); + assert(!AFL.canUseRedZone(MF) && + "Cannot use redzone with aarch64-split-sve-objects"); + // TODO: Handle HasWinCFI/NeedsWinCFI? + assert(!NeedsWinCFI && + "WinCFI with aarch64-split-sve-objects is not supported"); + + // Split ZPR and PPR allocation. + // Allocate PPR callee saves + allocateStackSpace(*PPRCalleeSavesBegin, 0, PPRCalleeSavesSize, + EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects() || ZPRCalleeSavesSize || + ZPRLocalsSize || PPRLocalsSize); + CFAOffset += PPRCalleeSavesSize; + + // Allocate PPR locals + ZPR callee saves + assert(PPRCalleeSavesEnd == ZPRCalleeSavesBegin && + "Expected ZPR callee saves after PPR locals"); + allocateStackSpace(*PPRCalleeSavesEnd, RealignmentPadding, + PPRLocalsSize + ZPRCalleeSavesSize, + EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects() || ZPRLocalsSize); + CFAOffset += PPRLocalsSize + ZPRCalleeSavesSize; + + // Allocate ZPR locals + allocateStackSpace(*ZPRCalleeSavesEnd, RealignmentPadding, + ZPRLocalsSize + StackOffset::getFixed(NumBytes), EmitAsyncCFI && !HasFP, CFAOffset, MFI.hasVarSizedObjects()); + } else { + // Allocate space for the callee saves (if any). + StackOffset LocalsSize = + PPRLocalsSize + ZPRLocalsSize + StackOffset::getFixed(NumBytes); + if (!FPAfterSVECalleeSaves) + allocateStackSpace(AfterGPRSavesI, 0, SVECalleeSavesSize, + EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects() || LocalsSize); + CFAOffset += SVECalleeSavesSize; + + // Allocate space for the rest of the frame including SVE locals. Align the + // stack as necessary. 
+ assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) && + "Cannot use redzone with stack realignment"); + if (!AFL.canUseRedZone(MF)) { + // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have + // the correct value here, as NumBytes also includes padding bytes, + // which shouldn't be counted here. + StackOffset SVELocalsSize = PPRLocalsSize + ZPRLocalsSize; + allocateStackSpace(AfterSVESavesI, RealignmentPadding, + SVELocalsSize + StackOffset::getFixed(NumBytes), + EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects()); + } } // If we need a base pointer, set it up here. It's whatever the value of the @@ -796,7 +860,8 @@ void AArch64PrologueEmitter::emitPrologue() { emitDefineCFAWithFP(AfterSVESavesI, FixedObject); } else { StackOffset TotalSize = - SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize()); + AFL.getSVEStackSize(MF) + + StackOffset::getFixed((int64_t)MFI.getStackSize()); CFIInstBuilder CFIBuilder(MBB, AfterSVESavesI, MachineInstr::FrameSetup); CFIBuilder.insertCFIInst( createDefCFA(RegInfo, /*FrameReg=*/AArch64::SP, /*Reg=*/AArch64::SP, @@ -1165,7 +1230,7 @@ void AArch64PrologueEmitter::emitCalleeSavedGPRLocations( CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); for (const auto &Info : CSI) { unsigned FrameIdx = Info.getFrameIdx(); - if (MFI.isScalableStackID(FrameIdx)) + if (MFI.hasScalableStackID(FrameIdx)) continue; assert(!Info.isSpilledToReg() && "Spilling to registers not implemented"); @@ -1191,8 +1256,10 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations( AFL.getOffsetOfLocalArea(); } + StackOffset PPRStackSize = AFL.getPPRStackSize(MF); for (const auto &Info : CSI) { - if (!MFI.isScalableStackID(Info.getFrameIdx())) + int FI = Info.getFrameIdx(); + if (!MFI.hasScalableStackID(FI)) continue; // Not all unwinders may know about SVE registers, so assume the lowest @@ -1203,9 +1270,13 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations( continue; StackOffset Offset = - StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) - + StackOffset::getScalable(MFI.getObjectOffset(FI)) - StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI)); + if (AFI->hasSplitSVEObjects() && + MFI.getStackID(FI) == TargetStackID::ScalableVector) + Offset -= PPRStackSize; + CFIBuilder.insertCFIInst( createCFAOffset(RegInfo, Reg, Offset, IncomingVGOffsetFromDefCFA)); } @@ -1322,7 +1393,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { while (FirstGPRRestoreI != Begin) { --FirstGPRRestoreI; if (!FirstGPRRestoreI->getFlag(MachineInstr::FrameDestroy) || - (!FPAfterSVECalleeSaves && isSVECalleeSave(FirstGPRRestoreI))) { + (!FPAfterSVECalleeSaves && isPartOfSVECalleeSaves(FirstGPRRestoreI))) { ++FirstGPRRestoreI; break; } else if (CombineSPBump) @@ -1346,7 +1417,9 @@ void AArch64EpilogueEmitter::emitEpilogue() { if (HasFP && AFI->hasSwiftAsyncContext()) emitSwiftAsyncContextFramePointer(EpilogueEndI, DL); - const StackOffset &SVEStackSize = AFL.getSVEStackSize(MF); + StackOffset ZPRStackSize = AFL.getZPRStackSize(MF); + StackOffset PPRStackSize = AFL.getPPRStackSize(MF); + StackOffset SVEStackSize = ZPRStackSize + PPRStackSize; // If there is a single SP update, insert it before the ret and we're done. if (CombineSPBump) { @@ -1367,106 +1440,188 @@ void AArch64EpilogueEmitter::emitEpilogue() { NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); - // Process the SVE callee-saves to determine what space needs to be - // deallocated. 
- StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; - MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI, - RestoreEnd = FirstGPRRestoreI; - if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { - if (FPAfterSVECalleeSaves) - RestoreEnd = MBB.getFirstTerminator(); - - RestoreBegin = std::prev(RestoreEnd); - while (RestoreBegin != MBB.begin() && - isSVECalleeSave(std::prev(RestoreBegin))) - --RestoreBegin; - - assert(isSVECalleeSave(RestoreBegin) && - isSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction"); - - StackOffset CalleeSavedSizeAsOffset = - StackOffset::getScalable(CalleeSavedSize); - DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; - DeallocateAfter = CalleeSavedSizeAsOffset; - } - - // Deallocate the SVE area. - if (FPAfterSVECalleeSaves) { - // If the callee-save area is before FP, restoring the FP implicitly - // deallocates non-callee-save SVE allocations. Otherwise, deallocate - // them explicitly. - if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { - emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, - DeallocateBefore, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + if (!AFI->hasSplitSVEObjects()) { + // Process the SVE callee-saves to determine what space needs to be + // deallocated. + StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; + MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI, + RestoreEnd = FirstGPRRestoreI; + int64_t ZPRCalleeSavedSize = AFI->getZPRCalleeSavedStackSize(); + int64_t PPRCalleeSavedSize = AFI->getPPRCalleeSavedStackSize(); + int64_t SVECalleeSavedSize = ZPRCalleeSavedSize + PPRCalleeSavedSize; + + if (SVECalleeSavedSize) { + if (FPAfterSVECalleeSaves) + RestoreEnd = MBB.getFirstTerminator(); + + RestoreBegin = std::prev(RestoreEnd); + while (RestoreBegin != MBB.begin() && + isPartOfSVECalleeSaves(std::prev(RestoreBegin))) + --RestoreBegin; + + assert(isPartOfSVECalleeSaves(RestoreBegin) && + isPartOfSVECalleeSaves(std::prev(RestoreEnd)) && + "Unexpected instruction"); + + StackOffset CalleeSavedSizeAsOffset = + StackOffset::getScalable(SVECalleeSavedSize); + DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; + DeallocateAfter = CalleeSavedSizeAsOffset; } - // Deallocate callee-save non-SVE registers. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - - // Deallocate fixed objects. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(FixedObject), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - - // Deallocate callee-save SVE registers. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - DeallocateAfter, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); - } else if (SVEStackSize) { - int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize(); - // If we have stack realignment or variable-sized objects we must use the - // FP to restore SVE callee saves (as there is an unknown amount of - // data/padding between the SP and SVE CS area). - Register BaseForSVEDealloc = - (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? 
AArch64::FP - : AArch64::SP; - if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) { - Register CalleeSaveBase = AArch64::FP; - if (int64_t CalleeSaveBaseOffset = - AFI->getCalleeSaveBaseToFrameRecordOffset()) { - // If we have have an non-zero offset to the non-SVE CS base we need to - // compute the base address by subtracting the offest in a temporary - // register first (to avoid briefly deallocating the SVE CS). - CalleeSaveBase = - MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); - emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, - StackOffset::getFixed(-CalleeSaveBaseOffset), TII, - MachineInstr::FrameDestroy); - } - // The code below will deallocate the stack space space by moving the - // SP to the start of the SVE callee-save area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, - StackOffset::getScalable(-SVECalleeSavedSize), TII, - MachineInstr::FrameDestroy); - } else if (BaseForSVEDealloc == AArch64::SP) { - if (SVECalleeSavedSize) { - // Deallocate the non-SVE locals first before we can deallocate (and - // restore callee saves) from the SVE area. - emitFrameOffset( - MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy, - false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, - SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize)); - NumBytes = 0; + // Deallocate the SVE area. + if (FPAfterSVECalleeSaves) { + // If the callee-save area is before FP, restoring the FP implicitly + // deallocates non-callee-save SVE allocations. Otherwise, deallocate + // them explicitly. + if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { + emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, + DeallocateBefore, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI); } + // Deallocate callee-save non-SVE registers. emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - DeallocateBefore, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, - SVEStackSize + - StackOffset::getFixed(NumBytes + PrologueSaveSize)); + StackOffset::getFixed(AFI->getCalleeSavedStackSize()), + TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI); + + // Deallocate fixed objects. + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(FixedObject), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI); + // Deallocate callee-save SVE registers. emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, DeallocateAfter, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, - DeallocateAfter + - StackOffset::getFixed(NumBytes + PrologueSaveSize)); + NeedsWinCFI, &HasWinCFI); + } else if (SVEStackSize) { + int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize(); + // If we have stack realignment or variable-sized objects we must use the + // FP to restore SVE callee saves (as there is an unknown amount of + // data/padding between the SP and SVE CS area). + Register BaseForSVEDealloc = + (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? 
AArch64::FP + : AArch64::SP; + if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) { + Register CalleeSaveBase = AArch64::FP; + if (int64_t CalleeSaveBaseOffset = + AFI->getCalleeSaveBaseToFrameRecordOffset()) { + // If we have have an non-zero offset to the non-SVE CS base we need + // to compute the base address by subtracting the offest in a + // temporary register first (to avoid briefly deallocating the SVE + // CS). + CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( + &AArch64::GPR64RegClass); + emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, + StackOffset::getFixed(-CalleeSaveBaseOffset), TII, + MachineInstr::FrameDestroy); + } + // The code below will deallocate the stack space space by moving the + // SP to the start of the SVE callee-save area. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, + StackOffset::getScalable(-SVECalleeSavedSize), TII, + MachineInstr::FrameDestroy); + } else if (BaseForSVEDealloc == AArch64::SP) { + if (SVECalleeSavedSize) { + // Deallocate the non-SVE locals first before we can deallocate (and + // restore callee saves) from the SVE area. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(NumBytes), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI, EmitCFI && !HasFP, + SVEStackSize + StackOffset::getFixed( + NumBytes + PrologueSaveSize)); + NumBytes = 0; + } + + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + DeallocateBefore, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + SVEStackSize + + StackOffset::getFixed(NumBytes + PrologueSaveSize)); + + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + DeallocateAfter, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + DeallocateAfter + + StackOffset::getFixed(NumBytes + PrologueSaveSize)); + } + + if (EmitCFI) + emitCalleeSavedSVERestores(RestoreEnd); + } + } else if (AFI->hasSplitSVEObjects() && SVEStackSize) { + // TODO: Support stack realigment and variable-sized objects. + assert(!AFI->isStackRealigned() && !MFI.hasVarSizedObjects() && + "unexpected stack realignment or variable sized objects with split " + "SVE stack objects"); + // SplitSVEObjects. Determine the sizes and starts/ends of the ZPR and PPR + // areas. + auto ZPRCalleeSavedSize = + StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize()); + auto PPRCalleeSavedSize = + StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize()); + StackOffset PPRLocalsSize = PPRStackSize - PPRCalleeSavedSize; + StackOffset ZPRLocalsSize = ZPRStackSize - ZPRCalleeSavedSize; + + MachineBasicBlock::iterator PPRRestoreBegin = FirstGPRRestoreI, + PPRRestoreEnd = FirstGPRRestoreI; + if (PPRCalleeSavedSize) { + PPRRestoreBegin = std::prev(PPRRestoreEnd); + while (PPRRestoreBegin != MBB.begin() && + isPartOfPPRCalleeSaves(std::prev(PPRRestoreBegin))) + --PPRRestoreBegin; + } + + MachineBasicBlock::iterator ZPRRestoreBegin = PPRRestoreBegin, + ZPRRestoreEnd = PPRRestoreBegin; + if (ZPRCalleeSavedSize) { + ZPRRestoreBegin = std::prev(ZPRRestoreEnd); + while (ZPRRestoreBegin != MBB.begin() && + isPartOfZPRCalleeSaves(std::prev(ZPRRestoreBegin))) + --ZPRRestoreBegin; } + + auto CFAOffset = + SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize); + if (PPRCalleeSavedSize || ZPRCalleeSavedSize) { + // Deallocate the non-SVE locals first before we can deallocate (and + // restore callee saves) from the SVE area. 
+ auto NonSVELocals = StackOffset::getFixed(NumBytes); + emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP, + NonSVELocals, TII, MachineInstr::FrameDestroy, false, + false, nullptr, EmitCFI && !HasFP, CFAOffset); + NumBytes = 0; + CFAOffset -= NonSVELocals; + } + + if (ZPRLocalsSize) { + emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP, + ZPRLocalsSize, TII, MachineInstr::FrameDestroy, false, + false, nullptr, EmitCFI && !HasFP, CFAOffset); + CFAOffset -= ZPRLocalsSize; + } + + if (PPRLocalsSize || ZPRCalleeSavedSize) { + assert(PPRRestoreBegin == ZPRRestoreEnd && + "Expected PPR restores after ZPR"); + emitFrameOffset(MBB, PPRRestoreBegin, DL, AArch64::SP, AArch64::SP, + PPRLocalsSize + ZPRCalleeSavedSize, TII, + MachineInstr::FrameDestroy, false, false, nullptr, + EmitCFI && !HasFP, CFAOffset); + CFAOffset -= PPRLocalsSize + ZPRCalleeSavedSize; + } + if (PPRCalleeSavedSize) { + emitFrameOffset(MBB, PPRRestoreEnd, DL, AArch64::SP, AArch64::SP, + PPRCalleeSavedSize, TII, MachineInstr::FrameDestroy, + false, false, nullptr, EmitCFI && !HasFP, CFAOffset); + } + + // We only emit CFI information for ZPRs so emit CFI after the ZPR restores. if (EmitCFI) - emitCalleeSavedSVERestores(RestoreEnd); + emitCalleeSavedSVERestores(ZPRRestoreEnd); } if (!HasFP) { @@ -1624,7 +1779,7 @@ void AArch64EpilogueEmitter::emitCalleeSavedRestores( CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameDestroy); for (const auto &Info : CSI) { - if (SVE != MFI.isScalableStackID(Info.getFrameIdx())) + if (SVE != MFI.hasScalableStackID(Info.getFrameIdx())) continue; MCRegister Reg = Info.getReg(); diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 2b0c8ad..79975b0 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -71,6 +71,7 @@ bool AArch64RegisterInfo::regNeedsCFI(MCRegister Reg, const MCPhysReg * AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); + auto &AFI = *MF->getInfo<AArch64FunctionInfo>(); if (MF->getFunction().getCallingConv() == CallingConv::GHC) // GHC set of callee saved regs is empty as all those regs are @@ -101,10 +102,7 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_Win_AArch64_AAPCS_SwiftTail_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall) return CSR_Win_AArch64_AAVPCS_SaveList; - if (MF->getFunction().getCallingConv() == - CallingConv::AArch64_SVE_VectorCall) - return CSR_Win_AArch64_SVE_AAPCS_SaveList; - if (MF->getInfo<AArch64FunctionInfo>()->isSVECC()) + if (AFI.hasSVE_AAPCS(*MF)) return CSR_Win_AArch64_SVE_AAPCS_SaveList; return CSR_Win_AArch64_AAPCS_SaveList; } @@ -148,7 +146,7 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { // This is for OSes other than Windows; Windows is a separate case further // above. 
return CSR_AArch64_AAPCS_X18_SaveList; - if (MF->getInfo<AArch64FunctionInfo>()->isSVECC()) + if (AFI.hasSVE_AAPCS(*MF)) return CSR_AArch64_SVE_AAPCS_SaveList; return CSR_AArch64_AAPCS_SaveList; } @@ -158,6 +156,7 @@ AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); assert(MF->getSubtarget<AArch64Subtarget>().isTargetDarwin() && "Invalid subtarget for getDarwinCalleeSavedRegs"); + auto &AFI = *MF->getInfo<AArch64FunctionInfo>(); if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check) report_fatal_error( @@ -205,7 +204,7 @@ AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const { return CSR_Darwin_AArch64_RT_AllRegs_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::Win64) return CSR_Darwin_AArch64_AAPCS_Win64_SaveList; - if (MF->getInfo<AArch64FunctionInfo>()->isSVECC()) + if (AFI.hasSVE_AAPCS(*MF)) return CSR_Darwin_AArch64_SVE_AAPCS_SaveList; return CSR_Darwin_AArch64_AAPCS_SaveList; } @@ -643,7 +642,7 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { if (ST.hasSVE() || ST.isStreaming()) { // Frames that have variable sized objects and scalable SVE objects, // should always use a basepointer. - if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeSVE()) + if (!AFI->hasCalculatedStackSizeSVE() || AFI->hasSVEStackSize()) return true; } @@ -783,7 +782,7 @@ AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { assert((!MF.getSubtarget<AArch64Subtarget>().hasSVE() || AFI->hasCalculatedStackSizeSVE()) && "Expected SVE area to be calculated by this point"); - return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->getStackSizeSVE() && + return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->hasSVEStackSize() && !AFI->hasStackHazardSlotIndex(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index f01d5f6..6efa78e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -608,6 +608,8 @@ public: ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot] : EmptySet; + const size_t HybridModuleRootKernelsSize = HybridModuleRootKernels.size(); + for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { // Each iteration of this loop assigns exactly one global variable to // exactly one of the implementation strategies. 
@@ -647,7 +649,8 @@ public: ModuleScopeVariables.insert(GV); } else if (K.second.size() == 1) { KernelAccessVariables.insert(GV); - } else if (set_is_subset(K.second, HybridModuleRootKernels)) { + } else if (K.second.size() == HybridModuleRootKernelsSize && + set_is_subset(K.second, HybridModuleRootKernels)) { ModuleScopeVariables.insert(GV); } else { TableLookupVariables.insert(GV); diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index b3fd8c7..84287b6 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -352,10 +352,12 @@ def S_XNOR_SAVEEXEC_B64 : SOP1_64 <"s_xnor_saveexec_b64">; } // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] +let Defs = [SCC] in { def S_QUADMASK_B32 : SOP1_32 <"s_quadmask_b32", [(set i32:$sdst, (int_amdgcn_s_quadmask i32:$src0))]>; def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64", [(set i64:$sdst, (int_amdgcn_s_quadmask i64:$src0))]>; +} let Uses = [M0] in { def S_MOVRELS_B32 : SOP1_32R <"s_movrels_b32">; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 3ac7c28..8c21746 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -638,6 +638,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // No support for these operations with v2f32/v2i32 setOperationAction(ISD::INSERT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand); + + setOperationAction(ISD::TRUNCATE, MVT::v2i16, Expand); + setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, + MVT::v2i32, Expand); + // Need custom lowering in case the index is dynamic. if (STI.hasF32x2Instructions()) setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index 19d5aff..af1ceb6 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -118,7 +118,7 @@ let Predicates = [HasAtomicLdSt] in { } let Predicates = [HasAtomicLdSt, IsRV64] in { - def : LdPat<atomic_load_nonext_32, LW, i32>; + // Load pattern is in RISCVInstrInfoA.td and shared with RV32. def : StPat<atomic_store_32, SW, GPR, i32>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index 99992d1..25accd9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -174,15 +174,14 @@ let Predicates = [HasAtomicLdSt] in { def : StPat<relaxed_store<atomic_store_8>, SB, GPR, XLenVT>; def : StPat<relaxed_store<atomic_store_16>, SH, GPR, XLenVT>; def : StPat<relaxed_store<atomic_store_32>, SW, GPR, XLenVT>; -} -let Predicates = [HasAtomicLdSt, IsRV32] in { - def : LdPat<relaxed_load<atomic_load_nonext_32>, LW>; + // Used by GISel for RV32 and RV64. 
+ def : LdPat<relaxed_load<atomic_load_nonext_32>, LW, i32>; } let Predicates = [HasAtomicLdSt, IsRV64] in { - def : LdPat<relaxed_load<atomic_load_asext_32>, LW>; - def : LdPat<relaxed_load<atomic_load_zext_32>, LWU>; + def : LdPat<relaxed_load<atomic_load_asext_32>, LW, i64>; + def : LdPat<relaxed_load<atomic_load_zext_32>, LWU, i64>; def : LdPat<relaxed_load<atomic_load_nonext_64>, LD, i64>; def : StPat<relaxed_store<atomic_store_64>, SD, GPR, i64>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index d998316..298d35a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -554,7 +554,8 @@ defset list<VTypeInfoToWide> AllWidenableBF16ToFloatVectors = { // This represents the information we need in codegen for each pseudo. // The definition should be consistent with `struct PseudoInfo` in // RISCVInstrInfo.h. -class RISCVVPseudo<dag outs, dag ins, list<dag> pattern = [], string opcodestr = "", string argstr = ""> +class RISCVVPseudo<dag outs, dag ins, list<dag> pattern = [], + string opcodestr = "", string argstr = ""> : Pseudo<outs, ins, pattern, opcodestr, argstr> { Pseudo Pseudo = !cast<Pseudo>(NAME); // Used as a key. Instruction BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); @@ -1010,8 +1011,7 @@ class VPseudoNullaryNoMask<VReg RegClass> : class VPseudoNullaryMask<VReg RegClass> : RISCVVPseudo<(outs GetVRegNoV0<RegClass>.R:$rd), (ins GetVRegNoV0<RegClass>.R:$passthru, - VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), - []> { + VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1190,8 +1190,7 @@ class VPseudoBinaryNoMask<VReg RetClass, bits<2> TargetConstraintType = 1, DAGOperand sewop = sew> : RISCVVPseudo<(outs RetClass:$rd), - (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, sewop:$sew), - []> { + (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, sewop:$sew)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1227,8 +1226,7 @@ class VPseudoBinaryNoMaskRoundingMode<VReg RetClass, bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, - vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy), - []> { + vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1320,7 +1318,7 @@ class VPseudoIStoreNoMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL, bit Ordered>: RISCVVPseudo<(outs), (ins StClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$rs2, - AVL:$vl, sew:$sew),[]>, + AVL:$vl, sew:$sew)>, RISCVVSX</*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 0; let mayStore = 1; @@ -1333,7 +1331,7 @@ class VPseudoIStoreMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL, bit Ordered>: RISCVVPseudo<(outs), (ins StClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$rs2, - VMaskOp:$vm, AVL:$vl, sew:$sew),[]>, + VMaskOp:$vm, AVL:$vl, sew:$sew)>, RISCVVSX</*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 0; let mayStore = 1; @@ -1351,8 +1349,7 @@ class VPseudoBinaryMaskPolicy<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, Op1Class:$rs2, Op2Class:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), - []> { + VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1371,8 +1368,7 @@ class VPseudoTernaryMaskPolicy<VReg 
RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, Op1Class:$rs2, Op2Class:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), - []> { + VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1414,8 +1410,7 @@ class VPseudoBinaryMOutMask<VReg RetClass, RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), - []> { + VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1438,8 +1433,7 @@ class VPseudoTiedBinaryMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, Op2Class:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), - []> { + VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1546,8 +1540,7 @@ class VPseudoTernaryNoMaskWithPolicyRoundingMode<VReg RetClass, bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2, - vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy), - []> { + vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1716,8 +1709,8 @@ class VPseudoUSSegStoreNoMask<VReg ValClass, int EEW, bits<4> NF> : RISCVVPseudo<(outs), - (ins ValClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl, sew:$sew), - []>, + (ins ValClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl, + sew:$sew)>, RISCVVSSEG<NF, /*Masked*/0, /*Strided*/0, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -6029,9 +6022,9 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 1 in { PseudoInstExpansion<(CSRRS GPR:$rd, SysRegVLENB.Encoding, X0)>, Sched<[WriteRdVLENB]>; let Defs = [VL, VTYPE] in { - def PseudoReadVLENBViaVSETVLIX0 : Pseudo<(outs GPRNoX0:$rd), (ins uimm5:$shamt), - []>, - Sched<[WriteVSETVLI, ReadVSETVLI]>; + def PseudoReadVLENBViaVSETVLIX0 : Pseudo<(outs GPRNoX0:$rd), + (ins uimm5:$shamt), []>, + Sched<[WriteVSETVLI, ReadVSETVLI]>; } } @@ -6694,14 +6687,14 @@ defm PseudoVID : VPseudoVID_V; let Predicates = [HasVInstructions] in { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { let HasSEWOp = 1, BaseInstr = VMV_X_S in - def PseudoVMV_X_S: + def PseudoVMV_X_S : RISCVVPseudo<(outs GPR:$rd), (ins VR:$rs2, sew:$sew)>, Sched<[WriteVMovXS, ReadVMovXS]>; let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, isReMaterializable = 1, Constraints = "$rd = $passthru" in - def PseudoVMV_S_X: RISCVVPseudo<(outs VR:$rd), - (ins VR:$passthru, GPR:$rs1, AVL:$vl, sew:$sew), - []>, + def PseudoVMV_S_X : + RISCVVPseudo<(outs VR:$rd), + (ins VR:$passthru, GPR:$rs1, AVL:$vl, sew:$sew)>, Sched<[WriteVMovSX, ReadVMovSX_V, ReadVMovSX_X]>; } } // Predicates = [HasVInstructions] @@ -6721,8 +6714,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { Constraints = "$rd = $passthru" in def "PseudoVFMV_S_" # f.FX : RISCVVPseudo<(outs VR:$rd), - (ins VR:$passthru, f.fprclass:$rs1, AVL:$vl, sew:$sew), - []>, + (ins VR:$passthru, f.fprclass:$rs1, AVL:$vl, sew:$sew)>, Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>; } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td index 5e013b4..680bca3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td @@ -63,13 +63,14 @@ defm SD : 
SRL_r_aq_rl<0b011, "sd">;
//===----------------------------------------------------------------------===//
 class PatLAQ<SDPatternOperator OpNode, RVInst Inst, ValueType vt = XLenVT>
-  : Pat<(vt (OpNode (vt GPRMemZeroOffset:$rs1))), (Inst GPRMemZeroOffset:$rs1)>;
+  : Pat<(vt (OpNode (XLenVT GPRMemZeroOffset:$rs1))),
+        (Inst GPRMemZeroOffset:$rs1)>;
 // n.b. this switches order of arguments
 // to deal with the fact that SRL has addr, data
 // while atomic_store has data, addr
 class PatSRL<SDPatternOperator OpNode, RVInst Inst, ValueType vt = XLenVT>
-  : Pat<(OpNode (vt GPR:$rs2), (vt GPRMemZeroOffset:$rs1)),
+  : Pat<(OpNode (vt GPR:$rs2), (XLenVT GPRMemZeroOffset:$rs1)),
         (Inst GPRMemZeroOffset:$rs1, GPR:$rs2)>;
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index c4f1b68..ddb95a4 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -3981,7 +3981,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
 void ModuleCallsiteContextGraph::updateAllocationCall(
     CallInfo &Call, AllocationType AllocType) {
   std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
-  removeAnyExistingAmbiguousAttribute(cast<CallBase>(Call.call()));
   auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
                                 "memprof", AllocTypeString);
   cast<CallBase>(Call.call())->addFnAttr(A);
@@ -5643,7 +5642,6 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
             // clone J-1 (J==0 is the original clone and does not have a VMaps
             // entry).
             CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
-            removeAnyExistingAmbiguousAttribute(CBClone);
             CBClone->addFnAttr(A);
             ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
                      << ore::NV("AllocationCall", CBClone) << " in clone "
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index e4cb4574..07ad65c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5780,6 +5780,45 @@ Instruction *InstCombinerImpl::foldICmpWithMinMax(Instruction &I,
   return nullptr;
 }
 
+/// Match and fold patterns like:
+///   icmp eq/ne X, min(max(X, Lo), Hi)
+/// which represents a range check and can be represented as a ConstantRange.
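+///
+/// For example (an illustrative instance; the constants below are
+/// hypothetical and not taken from this patch or its tests):
+///   %max = call i8 @llvm.smax.i8(i8 %x, i8 5)
+///   %clamp = call i8 @llvm.smin.i8(i8 %max, i8 10)
+///   %cmp = icmp eq i8 %x, %clamp
+/// is a check that %x lies in the range [5, 11) and folds to:
+///   %adj = add i8 %x, -5
+///   %cmp = icmp ult i8 %adj, 6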
+/// +/// For icmp eq, build ConstantRange [Lo, Hi + 1) and convert to: +/// (X - Lo) u< (Hi + 1 - Lo) +/// For icmp ne, build ConstantRange [Hi + 1, Lo) and convert to: +/// (X - (Hi + 1)) u< (Lo - (Hi + 1)) +Instruction *InstCombinerImpl::foldICmpWithClamp(ICmpInst &I, Value *X, + MinMaxIntrinsic *Min) { + if (!I.isEquality() || !Min->hasOneUse() || !Min->isMin()) + return nullptr; + + const APInt *Lo = nullptr, *Hi = nullptr; + if (Min->isSigned()) { + if (!match(Min->getLHS(), m_OneUse(m_SMax(m_Specific(X), m_APInt(Lo)))) || + !match(Min->getRHS(), m_APInt(Hi)) || !Lo->slt(*Hi)) + return nullptr; + } else { + if (!match(Min->getLHS(), m_OneUse(m_UMax(m_Specific(X), m_APInt(Lo)))) || + !match(Min->getRHS(), m_APInt(Hi)) || !Lo->ult(*Hi)) + return nullptr; + } + + ConstantRange CR = ConstantRange::getNonEmpty(*Lo, *Hi + 1); + ICmpInst::Predicate Pred; + APInt C, Offset; + if (I.getPredicate() == ICmpInst::ICMP_EQ) + CR.getEquivalentICmp(Pred, C, Offset); + else + CR.inverse().getEquivalentICmp(Pred, C, Offset); + + if (!Offset.isZero()) + X = Builder.CreateAdd(X, ConstantInt::get(X->getType(), Offset)); + + return replaceInstUsesWith( + I, Builder.CreateICmp(Pred, X, ConstantInt::get(X->getType(), C))); +} + // Canonicalize checking for a power-of-2-or-zero value: static Instruction *foldICmpPow2Test(ICmpInst &I, InstCombiner::BuilderTy &Builder) { @@ -7467,10 +7506,14 @@ Instruction *InstCombinerImpl::foldICmpCommutative(CmpPredicate Pred, if (Instruction *NI = foldSelectICmp(Pred, SI, Op1, CxtI)) return NI; - if (auto *MinMax = dyn_cast<MinMaxIntrinsic>(Op0)) + if (auto *MinMax = dyn_cast<MinMaxIntrinsic>(Op0)) { if (Instruction *Res = foldICmpWithMinMax(CxtI, MinMax, Op1, Pred)) return Res; + if (Instruction *Res = foldICmpWithClamp(CxtI, Op1, MinMax)) + return Res; + } + { Value *X; const APInt *C; @@ -8527,6 +8570,9 @@ static Instruction *foldFCmpFSubIntoFCmp(FCmpInst &I, Instruction *LHSI, DenormalMode::getIEEE()) { CI.replaceOperand(I, 0, X); CI.replaceOperand(I, 1, Y); + I.setHasNoInfs(LHSI->hasNoInfs()); + if (LHSI->hasNoNaNs()) + I.setHasNoNaNs(true); return &I; } break; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 4f94aa2..e01c145 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -725,6 +725,7 @@ public: Instruction *foldICmpBinOp(ICmpInst &Cmp, const SimplifyQuery &SQ); Instruction *foldICmpWithMinMax(Instruction &I, MinMaxIntrinsic *MinMax, Value *Z, CmpPredicate Pred); + Instruction *foldICmpWithClamp(ICmpInst &Cmp, Value *X, MinMaxIntrinsic *Min); Instruction *foldICmpEquality(ICmpInst &Cmp); Instruction *foldIRemByPowerOfTwoToBitTest(ICmpInst &I); Instruction *foldSignBitTest(ICmpInst &I); diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index 735bad1..e1dcaa85 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -883,84 +883,6 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, } } -struct WeightInfo { - // Weights for current iteration. - SmallVector<uint32_t> Weights; - // Weights to subtract after each iteration. - const SmallVector<uint32_t> SubWeights; -}; - -/// Update the branch weights of an exiting block of a peeled-off loop -/// iteration. -/// Let F is a weight of the edge to continue (fallthrough) into the loop. -/// Let E is a weight of the edge to an exit. 
-/// F/(F+E) is a probability to go to loop and E/(F+E) is a probability to -/// go to exit. -/// Then, Estimated ExitCount = F / E. -/// For I-th (counting from 0) peeled off iteration we set the weights for -/// the peeled exit as (EC - I, 1). It gives us reasonable distribution, -/// The probability to go to exit 1/(EC-I) increases. At the same time -/// the estimated exit count in the remainder loop reduces by I. -/// To avoid dealing with division rounding we can just multiple both part -/// of weights to E and use weight as (F - I * E, E). -static void updateBranchWeights(Instruction *Term, WeightInfo &Info) { - setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false); - for (auto [Idx, SubWeight] : enumerate(Info.SubWeights)) - if (SubWeight != 0) - // Don't set the probability of taking the edge from latch to loop header - // to less than 1:1 ratio (meaning Weight should not be lower than - // SubWeight), as this could significantly reduce the loop's hotness, - // which would be incorrect in the case of underestimating the trip count. - Info.Weights[Idx] = - Info.Weights[Idx] > SubWeight - ? std::max(Info.Weights[Idx] - SubWeight, SubWeight) - : SubWeight; -} - -/// Initialize the weights for all exiting blocks. -static void initBranchWeights(DenseMap<Instruction *, WeightInfo> &WeightInfos, - Loop *L) { - SmallVector<BasicBlock *> ExitingBlocks; - L->getExitingBlocks(ExitingBlocks); - for (BasicBlock *ExitingBlock : ExitingBlocks) { - Instruction *Term = ExitingBlock->getTerminator(); - SmallVector<uint32_t> Weights; - if (!extractBranchWeights(*Term, Weights)) - continue; - - // See the comment on updateBranchWeights() for an explanation of what we - // do here. - uint32_t FallThroughWeights = 0; - uint32_t ExitWeights = 0; - for (auto [Succ, Weight] : zip(successors(Term), Weights)) { - if (L->contains(Succ)) - FallThroughWeights += Weight; - else - ExitWeights += Weight; - } - - // Don't try to update weights for degenerate case. - if (FallThroughWeights == 0) - continue; - - SmallVector<uint32_t> SubWeights; - for (auto [Succ, Weight] : zip(successors(Term), Weights)) { - if (!L->contains(Succ)) { - // Exit weights stay the same. - SubWeights.push_back(0); - continue; - } - - // Subtract exit weights on each iteration, distributed across all - // fallthrough edges. - double W = (double)Weight / (double)FallThroughWeights; - SubWeights.push_back((uint32_t)(ExitWeights * W)); - } - - WeightInfos.insert({Term, {std::move(Weights), std::move(SubWeights)}}); - } -} - /// Clones the body of the loop L, putting it between \p InsertTop and \p /// InsertBot. /// \param IterNumber The serial number of the iteration currently being @@ -1332,11 +1254,6 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI, Instruction *LatchTerm = cast<Instruction>(cast<BasicBlock>(Latch)->getTerminator()); - // If we have branch weight information, we'll want to update it for the - // newly created branches. - DenseMap<Instruction *, WeightInfo> Weights; - initBranchWeights(Weights, L); - // Identify what noalias metadata is inside the loop: if it is inside the // loop, the associated metadata must be cloned for each iteration. 
SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes; @@ -1382,11 +1299,6 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI, assert(DT.verify(DominatorTree::VerificationLevel::Fast)); #endif - for (auto &[Term, Info] : Weights) { - auto *TermCopy = cast<Instruction>(VMap[Term]); - updateBranchWeights(TermCopy, Info); - } - // Remove Loop metadata from the latch branch instruction // because it is not the Loop's latch branch anymore. auto *LatchTermCopy = cast<Instruction>(VMap[LatchTerm]); @@ -1426,15 +1338,38 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI, } } - for (const auto &[Term, Info] : Weights) { - setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false); - } - // Update Metadata for count of peeled off iterations. unsigned AlreadyPeeled = 0; if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData)) AlreadyPeeled = *Peeled; - addStringMetadataToLoop(L, PeeledCountMetaData, AlreadyPeeled + PeelCount); + unsigned TotalPeeled = AlreadyPeeled + PeelCount; + addStringMetadataToLoop(L, PeeledCountMetaData, TotalPeeled); + + // Update metadata for the estimated trip count. The original branch weight + // metadata is already correct for both the remaining loop and the peeled loop + // iterations, so do not adjust it. + // + // For example, consider what happens when peeling 2 iterations from a loop + // with an estimated trip count of 10 and inserting them before the remaining + // loop. Each of the peeled iterations and each iteration in the remaining + // loop still has the same probability of exiting the *entire original* loop + // as it did when in the original loop, and thus it should still have the same + // branch weights. The peeled iterations' non-zero probabilities of exiting + // already appropriately reduce the probability of reaching the remaining + // iterations just as they did in the original loop. Trying to also adjust + // the remaining loop's branch weights to reflect its new trip count of 8 will + // erroneously further reduce its block frequencies. However, in case an + // analysis later needs to determine the trip count of the remaining loop + // while examining it in isolation without considering the probability of + // actually reaching it, we store the new trip count as separate metadata. 
+ if (auto EstimatedTripCount = getLoopEstimatedTripCount(L)) { + unsigned EstimatedTripCountNew = *EstimatedTripCount; + if (EstimatedTripCountNew < TotalPeeled) + EstimatedTripCountNew = 0; + else + EstimatedTripCountNew -= TotalPeeled; + setLoopEstimatedTripCount(L, EstimatedTripCountNew); + } if (Loop *ParentLoop = L->getParentLoop()) L = ParentLoop; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index acdb379..f76777b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1110,8 +1110,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { // x && !x -> 0 if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) - return Def->replaceAllUsesWith(Plan->getOrAddLiveIn( - ConstantInt::getFalse(VPTypeAnalysis(*Plan).inferScalarType(Def)))); + return Def->replaceAllUsesWith(Plan->getFalse()); if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) return Def->replaceAllUsesWith(X); @@ -3346,12 +3345,7 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy); } - [[maybe_unused]] auto *ConstStep = - ScalarStep->isLiveIn() - ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue()) - : nullptr; - assert(!ConstStep || ConstStep->getValue() != 1); - (void)ConstStep; + assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step"); if (TypeInfo.inferScalarType(ScalarStep) != IVTy) { ScalarStep = Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy); diff --git a/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll b/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll index a08f859..6d9aa8d 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll @@ -756,3 +756,129 @@ e.1: e.2: ret void } + +define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption_nofree_via_context(ptr %A, ptr %B) nosync { +; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption_nofree_via_context' +; CHECK-NEXT: loop.header: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group GRP0: +; CHECK-NEXT: %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv +; CHECK-NEXT: Against group GRP1: +; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %B High: inttoptr (i64 -1 to ptr)) +; CHECK-NEXT: Member: {%B,+,4}<nuw><%loop.header> +; CHECK-NEXT: Group GRP1: +; CHECK-NEXT: (Low: %A High: inttoptr (i64 -1 to ptr)) +; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop.header> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %A, i64 2000) ] + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %B, i64 2000) ] + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] + %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv + %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv + %l = load i32, ptr %gep.A, align 4 + store i32 0, ptr %gep.B, align 4 + %cntable.c.1 = icmp ult i64 %iv, 1000 + %iv.next = add nuw nsw i64 %iv, 1 + br i1 %cntable.c.1, label %b2, label %e.1 + +b2: + %uncntable.c.0 = icmp eq i32 %l, 0 + br i1 %uncntable.c.0, label %e.2, label %b3 + +b3: + %cntable.c.2 = icmp eq i64 %iv.next, 500 + br i1 %cntable.c.2, label %cleanup4, label %latch + +latch: + br label %loop.header + +cleanup4: + ret void + +e.1: + ret void + +e.2: + ret void +} + +define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption_missing_nofree_multiple_predecessors(ptr %A, ptr %B, i1 %c) nosync { +; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption_missing_nofree_multiple_predecessors' +; CHECK-NEXT: loop.header: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group GRP0: +; CHECK-NEXT: %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv +; CHECK-NEXT: Against group GRP1: +; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %B High: inttoptr (i64 -1 to ptr)) +; CHECK-NEXT: Member: {%B,+,4}<nuw><%loop.header> +; CHECK-NEXT: Group GRP1: +; CHECK-NEXT: (Low: %A High: inttoptr (i64 -1 to ptr)) +; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop.header> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %A, i64 2000) ] + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %B, i64 2000) ] + br i1 %c, label %then, label %else + +then: + br label %loop.header + +else: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %then ], [ 0, %else ], [ %iv.next, %latch ] + %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv + %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv + %l = load i32, ptr %gep.A, align 4 + store i32 0, ptr %gep.B, align 4 + %cntable.c.1 = icmp ult i64 %iv, 1000 + %iv.next = add nuw nsw i64 %iv, 1 + br i1 %cntable.c.1, label %b2, label %e.1 + +b2: + %uncntable.c.0 = icmp eq i32 %l, 0 + br i1 %uncntable.c.0, label %e.2, label %b3 + +b3: + %cntable.c.2 = icmp eq i64 %iv.next, 500 + br i1 %cntable.c.2, label %cleanup4, label %latch + +latch: + br label %loop.header + +cleanup4: + ret void + +e.1: + ret void + +e.2: + ret void +} diff --git a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir new file mode 100644 index 0000000..35eafe8 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir @@ -0,0 +1,587 @@ +# RUN: llc -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -start-before=prologepilog %s -o - | FileCheck %s --check-prefix=ASM +# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -start-before=prologepilog %s -filetype=obj -o %t +# RUN: llvm-objdump --dwarf=frames %t | FileCheck %s --check-prefix=UNWINDINFO +# RUN: rm -rf %t +# +# Test allocation and deallocation of SVE objects on the stack with +# split-sve-objects (and hazard padding) enabled. This also tests using a +# combination of scalable and non-scalable offsets to access the SVE on the +# stack. +# +# With split-sve-objects (which implies hazard padding) the SVE area is split +# into PPR and ZPR areas with (fixed-size) hazard padding between them. The PPR +# area holds all scalable predicate callee saves and locals, and the ZPR area +# holds all scalable vector callee saves and locals. Additionally, any FPR +# callee save is promoted to a ZPR callee save (to avoid needing additional +# hazard padding in the callee save area). +# +# +-------------+ +# | stack arg | +# +-------------+ <- SP before call +# | Callee Saves| +# | Frame record| (if available) +# |-------------| <- FP (if available) +# | PPR area | +# |-------------| +# |/////////////| hazard padding +# |-------------| +# | ZPR area | +# +-------------+ +# | : | +# | Stack objs | +# | : | +# +-------------+ <- SP after call and frame-setup +# +--- | + + define void @test_allocate_split_sve() uwtable { entry: unreachable } + define void @test_allocate_split_sve_realigned() uwtable { entry: unreachable } + define void @test_address_split_sve() uwtable { entry: unreachable } + define void @test_address_split_sve_fp() uwtable { entry: unreachable } + define aarch64_sve_vector_pcs void @save_restore_ppr_zpr() uwtable { entry: unreachable } + +... +--- +# +----------+ +# |scratchreg| // x29 is used as scratch reg. 
+# |----------| +# | %stack.0 | // scalable predicate of n * 12 bytes, aligned to 16 bytes +# | | // to be materialized with 1*ADDVL (<=> n * 16 bytes) +# |----------| +# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area +# |//////////| // Note: This is currently not included in the "stackSize" +# +----------+ +# | %stack.0 | // scalable SVE object of n * 18 bytes, aligned to 16 bytes, +# | | // to be materialized with 2*ADDVL (<=> 2 * n * 16 bytes) +# +----------+ +# |//////////| // hazard padding (1024 bytes) +# |----------| +# | %stack.1 | // not scalable +# +----------+ <- SP + +# CHECK-LABEL: name: test_allocate_split_sve +# CHECK: stackSize: 1056 + +# CHECK: bb.0.entry: +# CHECK: liveins: $z0, $p0, $fp +# CHECK: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.4) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 +# +# CHECK-NEXT: $x8 = ADDXri $sp, 1040, 0 +# CHECK-NEXT: $x8 = ADDPL_XXI $x8, 7, implicit $vg +# CHECK-NEXT: STR_ZXI $z0, killed $x8, 0 :: (store (<vscale x 1 x s128>) into %stack.0) +# CHECK-NEXT: $x8 = ADDXri $sp, 2064, 0 +# CHECK-NEXT: STR_PXI $p0, killed $x8, 18 :: (store (<vscale x 1 x s16>) into %stack.1) +# +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.4) +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 +# CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: test_allocate_split_sve: +# ASM: str x29, [sp, #-16]! 
+# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: .cfi_offset w29, -16 +# ASM-NEXT: sub sp, sp, #1024 +# ASM-NEXT: .cfi_def_cfa_offset 1040 +# ASM-NEXT: addvl sp, sp, #-1 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG +# ASM-NEXT: sub sp, sp, #1040 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG +# ASM-NEXT: addvl sp, sp, #-2 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG +# +# ASM: addvl sp, sp, #2 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG +# ASM-NEXT: add sp, sp, #1024 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG +# ASM-NEXT: addvl sp, sp, #1 +# ASM-NEXT: .cfi_def_cfa wsp, 1056 +# ASM-NEXT: add sp, sp, #1040 +# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: ldr x29, [sp], #16 +# ASM-NEXT: .cfi_def_cfa_offset 0 +# ASM-NEXT: .cfi_restore w29 + +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056 +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +0 +# UNWINDINFO-NEXT: DW_CFA_restore: reg29 + +name: test_allocate_split_sve +stack: + - { id: 0, stack-id: scalable-vector, size: 18, alignment: 2 } + - { id: 1, stack-id: scalable-vector, size: 12, alignment: 2 } + - { id: 2, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0.entry: + liveins: $z0, $p0 + STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0) + STR_PXI $p0, %stack.1, 0 :: (store (<vscale x 1 x s16>) into %stack.1) + RET_ReallyLR +... +--- + +# Stack realignment is not supported with split-sve-objects, so we fallback to +# the default hazard padding implementation. This does not prevent hazards +# between ZPRs and PPRs (TODO: support this case). 
+# +# +----------+ +# | lr, fp | // frame record +# |----------| +# |//////////| // hazard padding (1024 bytes) +# |----------| +# | %stack.0 | // scalable predicate of n * 12 bytes, aligned to 16 bytes +# | | // to be materialized with 1*ADDVL (<=> n * 16 bytes) +# +----------+ +# | %stack.0 | // scalable SVE object of n * 18 bytes, aligned to 16 bytes, +# | | // to be materialized with 2*ADDVL (<=> 2 * n * 16 bytes) +# +----------+ +# |//////////| // hazard padding (1024 bytes) +# |----------| +# | %stack.1 | // not scalable +# +----------+ <- SP + +name: test_allocate_split_sve_realigned +stack: + - { id: 0, stack-id: scalable-vector, size: 18, alignment: 2 } + - { id: 1, stack-id: scalable-vector, size: 12, alignment: 2 } + - { id: 2, stack-id: default, size: 16, alignment: 32 } +body: | + bb.0.entry: + liveins: $z0, $p0 + STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0) + STR_PXI $p0, %stack.1, 0 :: (store (<vscale x 1 x s16>) into %stack.1) + RET_ReallyLR + +# CHECK-LABEL: name: test_allocate_split_sve_realigned +# CHECK: stackSize: 2080 + +# CHECK: bb.0.entry: +# CHECK: liveins: $z0, $p0, $lr +# CHECK: $sp = frame-setup SUBXri $sp, 1040, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 +# CHECK-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.5) +# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 129 :: (store (s64) into %stack.4) +# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 1024, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0 +# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2, implicit $vg +# CHECK-NEXT: $sp = frame-setup ANDXri killed $x9, 7930 +# +# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 +# CHECK-NEXT: $x8 = ADDPL_XXI $x8, -1, implicit $vg +# CHECK-NEXT: STR_ZXI $z0, killed $x8, -1 :: (store (<vscale x 1 x s128>) into %stack.0) +# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 +# CHECK-NEXT: STR_PXI $p0, killed $x8, -15 :: (store (<vscale x 1 x s16>) into %stack.1) +# +# CHECK-NEXT: $sp = frame-destroy SUBXri $fp, 1024, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1040 +# CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 129 :: (load (s64) from %stack.4) +# CHECK-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.5) +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 +# CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: test_allocate_split_sve_realigned +# ASM: sub sp, sp, #1040 +# ASM-NEXT: .cfi_def_cfa_offset 1040 +# ASM-NEXT: str x29, [sp, #1024] +# ASM-NEXT: str x30, [sp, #1032] +# ASM-NEXT: add x29, sp, #1024 +# ASM-NEXT: .cfi_def_cfa w29, 16 +# ASM-NEXT: .cfi_offset w30, -8 +# ASM-NEXT: .cfi_offset w29, -16 +# +# ASM: sub sp, x29, #1024 +# ASM-NEXT: .cfi_def_cfa wsp, 1040 +# ASM-NEXT: ldr x30, [sp, #1032] +# ASM-NEXT: ldr x29, [sp, #1024] +# ASM-NEXT: add sp, sp, #1040 +# ASM-NEXT: .cfi_def_cfa_offset 0 +# ASM-NEXT: .cfi_restore w30 +# ASM-NEXT: .cfi_restore w29 + +# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 +# UNWINDINFO: DW_CFA_def_cfa: reg29 +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# +# UNWINDINFO: DW_CFA_def_cfa: reg31 +1040 +# UNWINDINFO: DW_CFA_def_cfa_offset: +0 
+# UNWINDINFO-NEXT: DW_CFA_restore: reg30 +# UNWINDINFO-NEXT: DW_CFA_restore: reg29 +... +--- + +# +----------+ +# |scratchreg| // x29 is used as scratch reg. +# +----------+ +# | %stack.2 | // scalable predicate @ SP + 2064b + 46 scalable bytes +# |----------| +# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area +# |//////////| // Note: This is currently not included in the "stackSize" +# |----------| +# | %stack.0 | // scalable vector @ SP + 1040b + 16 scalable bytes +# | %stack.1 | // scalable vector @ SP + 1040b +# +----------+ +# |//////////| // hazard padding (1024 bytes) +# |----------| +# | %stack.3 | // not scalable +# +----------+ <- SP + +# CHECK-LABEL: name: test_address_split_sve +# CHECK: stackSize: 1056 + +# CHECK: bb.0.entry: +# CHECK-NEXT: liveins: +# CHECK-NEXT: {{ $}} +# CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.5) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 +# +# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 1040, 0 +# CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], 1 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 1040, 0 +# CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], 0 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 2064, 0 +# CHECK-NEXT: STR_PXI $p0, killed $[[TMP]], 23 +# +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.5) +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 +# CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: test_address_split_sve +# ASM: str x29, [sp, #-16]! 
+# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: .cfi_offset w29, -16 +# ASM-NEXT: sub sp, sp, #1024 +# ASM-NEXT: .cfi_def_cfa_offset 1040 +# ASM-NEXT: addvl sp, sp, #-1 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG +# ASM-NEXT: sub sp, sp, #1040 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG +# ASM-NEXT: addvl sp, sp, #-2 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG +# +# ASM: addvl sp, sp, #2 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG +# ASM-NEXT: add sp, sp, #1024 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG +# ASM-NEXT: addvl sp, sp, #1 +# ASM-NEXT: .cfi_def_cfa wsp, 1056 +# ASM-NEXT: add sp, sp, #1040 +# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: ldr x29, [sp], #16 +# ASM-NEXT: .cfi_def_cfa_offset 0 +# ASM-NEXT: .cfi_restore w29 + +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056 +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +0 +# UNWINDINFO-NEXT: DW_CFA_restore: reg29 + +name: test_address_split_sve +frameInfo: + maxAlignment: 16 +stack: + - { id: 0, stack-id: scalable-vector, size: 16, alignment: 8 } + - { id: 1, stack-id: scalable-vector, size: 16, alignment: 8 } + - { id: 2, stack-id: scalable-vector, size: 2, alignment: 2 } + - { id: 3, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0.entry: + liveins: $z0, $z1, $p0 + + STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0) + STR_ZXI $z1, %stack.1, 0 :: (store (<vscale x 1 x s128>) into %stack.1) + STR_PXI $p0, %stack.2, 0 :: (store (<vscale x 1 x s16>) into %stack.2) + + RET_ReallyLR +... 
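The layout comments for test_address_split_sve above give each local as a fixed byte offset from SP plus a number of scalable bytes, and the CHECK lines realise that as an ADDXri of the fixed part followed by a mul-vl immediate on the store. A minimal sketch of how the two parts combine; the fixed offsets and immediates are transcribed from the checks above, while the 256-bit vector length is only an example value, not something the test assumes.

```python
# Resolve the SP-relative addresses used in test_address_split_sve for a
# concrete vector length. ZPR slots scale in 16-byte granules, PPR slots in
# 2-byte granules (the unit behind the "mul vl" immediate).

def sve_slot_offset(fixed_bytes, mul_vl, granule_bytes, vector_bits=256):
    """Byte offset from SP: fixed part plus mul_vl scalable granules."""
    vscale = vector_bits // 128           # number of 128-bit quadwords
    return fixed_bytes + mul_vl * granule_bytes * vscale

zpr0 = sve_slot_offset(1040, 1, 16)   # %stack.0: SP + 1040 + 16 * vscale
zpr1 = sve_slot_offset(1040, 0, 16)   # %stack.1: SP + 1040
ppr  = sve_slot_offset(2064, 23, 2)   # %stack.2: SP + 2064 + 46 * vscale
print(zpr0, zpr1, ppr)                # 1072 1040 2156 for a 256-bit VL
```

For any vector length, the PPR local stays above the hazard padding while both ZPR locals stay below it, which is the point of splitting the scalable objects into separate PPR and ZPR areas.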
+--- +# +----------+ +# | lr, fp | // frame record +# +----------+ <- FP +# | %stack.2 | // scalable predicate @ FP - 2 scalable bytes +# |----------| +# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area +# |//////////| // Note: This is currently not included in the "stackSize" +# |----------| +# | %stack.0 | // scalable vector @ FP - 1024b - 32 scalable bytes +# | %stack.1 | // scalable vector @ FP - 1024b - 48 scalable bytes +# +----------+ +# |//////////| // hazard padding (1024 bytes) +# |----------| +# | %stack.3 | // not scalable +# +----------+ <- SP + +# CHECK-LABEL: name: test_address_split_sve_fp +# CHECK: stackSize: 1056 +# +# CHECK: bb.0.entry: +# CHECK-NEXT: liveins: +# CHECK-NEXT: {{ $}} +# CHECK-NEXT: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.6), (store (s64) into %stack.5) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# +# CHECK-NEXT: $[[TMP:x[0-9]+]] = SUBXri $fp, 1024, 0 +# CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], -2 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = SUBXri $fp, 1024, 0 +# CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], -3 +# CHECK-NEXT: STR_PXI $p0, $fp, -1 +# +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 +# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.6), (load (s64) from %stack.5) +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 +# CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: test_address_split_sve_fp +# ASM: stp x29, x30, [sp, #-16]! 
+# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: mov x29, sp +# ASM-NEXT: .cfi_def_cfa w29, 16 +# ASM-NEXT: .cfi_offset w30, -8 +# ASM-NEXT: .cfi_offset w29, -16 +# ASM-NEXT: sub sp, sp, #1024 +# ASM-NEXT: addvl sp, sp, #-1 +# ASM-NEXT: sub sp, sp, #1040 +# ASM-NEXT: addvl sp, sp, #-2 +# +# ASM: addvl sp, sp, #2 +# ASM-NEXT: add sp, sp, #1024 +# ASM-NEXT: addvl sp, sp, #1 +# ASM-NEXT: add sp, sp, #1040 +# ASM-NEXT: .cfi_def_cfa wsp, 16 +# ASM-NEXT: ldp x29, x30, [sp], #16 +# ASM-NEXT: .cfi_def_cfa_offset 0 +# ASM-NEXT: .cfi_restore w30 +# ASM-NEXT: .cfi_restore w29 + +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO: DW_CFA_def_cfa: reg29 +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +0 +# UNWINDINFO-NEXT: DW_CFA_restore: reg30 +# UNWINDINFO-NEXT: DW_CFA_restore: reg29 + +name: test_address_split_sve_fp +frameInfo: + maxAlignment: 16 + isFrameAddressTaken: true +stack: + - { id: 0, stack-id: scalable-vector, size: 16, alignment: 8 } + - { id: 1, stack-id: scalable-vector, size: 16, alignment: 8 } + - { id: 2, stack-id: scalable-vector, size: 2, alignment: 2 } + - { id: 3, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0.entry: + liveins: $z0, $z1, $p0 + + STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0) + STR_ZXI $z1, %stack.1, 0 :: (store (<vscale x 1 x s128>) into %stack.1) + STR_PXI $p0, %stack.2, 0 :: (store (<vscale x 1 x s16>) into %stack.2) + + RET_ReallyLR +... +--- +# CHECK-LABEL: name: save_restore_ppr_zpr +# CHECK: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.8) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: frame-setup STR_PXI killed $p6, $sp, 5 :: (store (s16) into %stack.7) +# CHECK-NEXT: frame-setup STR_PXI killed $p5, $sp, 6 :: (store (s16) into %stack.6) +# CHECK-NEXT: frame-setup STR_PXI killed $p4, $sp, 7 :: (store (s16) into %stack.5) +# +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 +# CHECK-NEXT: frame-setup STR_ZXI killed $z10, $sp, 0 :: (store (s128) into %stack.4) +# CHECK-NEXT: frame-setup STR_ZXI killed $z9, $sp, 1 :: (store (s128) into %stack.3) +# CHECK-NEXT: frame-setup STR_ZXI killed $z8, $sp, 2 :: (store (s128) into %stack.2) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1056, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0a, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 +# +# +# CHECK: $sp = frame-destroy ADDXri $sp, 1056, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION 
escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 +# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.4) +# CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.3) +# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.2) +# +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 +# +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z9 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z10 +# CHECK-NEXT: $p6 = frame-destroy LDR_PXI $sp, 5 :: (load (s16) from %stack.7) +# CHECK-NEXT: $p5 = frame-destroy LDR_PXI $sp, 6 :: (load (s16) from %stack.6) +# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7 :: (load (s16) from %stack.5) +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 +# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.8) +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 +# CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: save_restore_ppr_zpr: +# ASM: str x29, [sp, #-16]! +# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: .cfi_offset w29, -16 +# ASM-NEXT: addvl sp, sp, #-1 +# ASM-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +# ASM-NEXT: str p6, [sp, #5, mul vl] +# ASM-NEXT: str p5, [sp, #6, mul vl] +# ASM-NEXT: str p4, [sp, #7, mul vl] +# ASM-NEXT: sub sp, sp, #1024 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG +# ASM-NEXT: addvl sp, sp, #-3 +# ASM-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 1040 + 32 * VG +# ASM-NEXT: str z10, [sp] +# ASM-NEXT: str z9, [sp, #1, mul vl] +# ASM-NEXT: str z8, [sp, #2, mul vl] +# ASM-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1040 +# ASM-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1040 +# ASM-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1040 +# ASM-NEXT: sub sp, sp, #1056 +# ASM-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 2096 + 32 * VG +# +# ASM: add sp, sp, #1056 +# ASM-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 1040 + 32 * VG +# ASM-NEXT: ldr z10, [sp] +# ASM-NEXT: ldr z9, [sp, #1, mul vl] +# ASM-NEXT: ldr z8, [sp, #2, mul vl] +# ASM-NEXT: add sp, sp, #1024 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 16 + 32 * VG +# ASM-NEXT: addvl sp, sp, #3 +# ASM-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +# ASM-NEXT: .cfi_restore z8 +# ASM-NEXT: .cfi_restore z9 +# ASM-NEXT: .cfi_restore z10 +# ASM-NEXT: ldr p6, [sp, #5, mul vl] +# ASM-NEXT: ldr p5, [sp, #6, mul vl] +# ASM-NEXT: ldr p4, [sp, #7, mul vl] +# ASM-NEXT: 
addvl sp, sp, #1 +# ASM-NEXT: .cfi_def_cfa wsp, 16 +# ASM-NEXT: ldr x29, [sp], #16 +# ASM-NEXT: .cfi_def_cfa_offset 0 +# ASM-NEXT: .cfi_restore w29 + +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_expression: reg72 DW_OP_bregx 0x2e +0, DW_OP_consts -16, DW_OP_mul, DW_OP_plus, DW_OP_consts -1040, DW_OP_plus +# UNWINDINFO: DW_CFA_expression: reg73 DW_OP_bregx 0x2e +0, DW_OP_consts -24, DW_OP_mul, DW_OP_plus, DW_OP_consts -1040, DW_OP_plus +# UNWINDINFO: DW_CFA_expression: reg74 DW_OP_bregx 0x2e +0, DW_OP_consts -32, DW_OP_mul, DW_OP_plus, DW_OP_consts -1040, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2096, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg104 +# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg105 +# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg106 +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +0 +# UNWINDINFO-NEXT: DW_CFA_restore: reg29 + +name: save_restore_ppr_zpr +stack: + - { id: 0, stack-id: default, size: 32, alignment: 16 } +body: | + bb.0.entry: + + $p4 = IMPLICIT_DEF + $p5 = IMPLICIT_DEF + $p6 = IMPLICIT_DEF + $z8 = IMPLICIT_DEF + $z9 = IMPLICIT_DEF + $z10 = IMPLICIT_DEF + + RET_ReallyLR diff --git a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir index bff0cac..0298168 100644 --- a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir +++ b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir @@ -983,26 +983,22 @@ body: | ; EXPAND-LABEL: name: zpr_predicate_spill_p4_saved ; EXPAND: liveins: $p0, $p1, $p2, $p3, $fp, $p8, $p4 ; EXPAND-NEXT: {{ $}} - ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 - ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.3) + ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2) ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0 - ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.2) + ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.1) ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0 - ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.1) - ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 + ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.0) ; ; EXPAND-NEXT: $p8 = IMPLICIT_DEF ; - ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 - ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.2) + ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.1) ; EXPAND-NEXT: 
$p4 = frame-destroy PTRUE_B 31, implicit $vg ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv - ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.1) + ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.0) ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg - ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.3) - ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 + ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2) ; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $p1, implicit $p2, implicit $p3 ; If we spill a register above p8, p4 must also be saved, so we can guarantee diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll new file mode 100644 index 0000000..690a39d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll @@ -0,0 +1,824 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -pass-remarks-analysis=stack-frame-layout 2>&1 >/dev/null | FileCheck %s --check-prefixes=CHECK-FRAMELAYOUT + +; CHECK-FRAMELAYOUT-LABEL: Function: zpr_and_ppr_local +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Variable, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024 + +; <GPRs> +; %ppr_local sp+2048+30*vscale (= #15, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding sp+2048+16*vscale +; <hazard padding> sp+1024+16*vscale +; %zpr_local sp+1024 +; <hazard padding> +; -> sp +define void @zpr_and_ppr_local(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: zpr_and_ppr_local: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add x8, sp, #2048 +; CHECK-NEXT: str p0, [x8, #15, mul vl] +; CHECK-NEXT: add x8, sp, #1024 +; CHECK-NEXT: str z0, [x8] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: zpr_and_ppr_local_fp +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Variable, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024 + +; <GPRs> +; -> fp +; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding fp-16*vscale +; <hazard padding> fp-1024-16*vscale +; %zpr_local fp-1024-32*vscale (= #-2, mul vl for str/ldr ZPR) +; <hazard padding> +; -> sp +define void @zpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) "aarch64_pstate_sm_compatible" "frame-pointer"="all" { +; CHECK-LABEL: zpr_and_ppr_local_fp: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: str p0, [x29, #-1, mul vl] +; CHECK-NEXT: str z0, [x8, #-2, mul vl] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: fpr_and_ppr_local +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-16 x vscale], Type: Variable, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-16 x vscale], Type: Variable, Align: 16, Size: 1024 + +; <GPRs> +; %ppr_local sp+2064+14*vscale (= #7, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding sp+2064 +; <hazard padding> sp+1040 +; %fpr_local sp+1032 +; 8 bytes of padding sp+1024 +; <hazard padding> +; -> sp +define void @fpr_and_ppr_local(<vscale x 16 x i1> %pred, double %double) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: fpr_and_ppr_local: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add x8, sp, #2064 +; CHECK-NEXT: str p0, [x8, #7, mul vl] +; CHECK-NEXT: str d0, [sp, #1032] +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca <vscale x 16 x i1> + %fpr_local = alloca double + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile double %double, ptr %fpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: fpr_and_ppr_local_fp +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-16 x vscale], Type: Variable, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-16 x vscale], Type: Variable, Align: 16, Size: 1024 + +; <GPRs> +; -> fp +; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding +; <hazard padding> +; %fpr_local sp+1032 +; 8 bytes of padding sp+1024 +; <hazard padding> +; -> sp +define void @fpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, double %double) "aarch64_pstate_sm_compatible" "frame-pointer"="all" { +; CHECK-LABEL: fpr_and_ppr_local_fp: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: str p0, [x29, #-1, mul vl] +; CHECK-NEXT: str d0, [sp, #1032] +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca <vscale x 16 x i1> + %fpr_local = alloca double + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile double %double, ptr %fpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: gpr_and_ppr_local +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2072-32 x vscale], Type: Variable, Align: 8, Size: 8 + +; <CS GPRs> +; %ppr_local sp+2064+30*vscale (= #15, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding +; <hazard padding> sp+1040+16*vscale +; <fpr callee save: z8> sp+1040 +; <hazard padding> sp+16 +; %gpr_local sp+8 +; 8 bytes of padding +; -> sp +define void @gpr_and_ppr_local(<vscale x 16 x i1> %pred, i64 %int) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: gpr_and_ppr_local: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2080 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1040 +; CHECK-NEXT: add x8, sp, #2064 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x8, #15, mul vl] +; CHECK-NEXT: str x0, [sp, #8] +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + tail call void asm sideeffect "", "~{d8}"() #1 ; Spill an FPR so hazard padding is needed + %ppr_local = alloca <vscale x 16 x i1> + %gpr_local = alloca i64 + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile i64 %int, ptr %gpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: gpr_and_ppr_local_fp +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2072-32 x vscale], Type: Variable, Align: 8, Size: 8 + +; <CS GPRs> +; -> fp +; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding +; <hazard padding> +; <fpr callee save: z8> +; <hazard padding> +; %gpr_local sp+8 +; 8 bytes of padding +; -> sp +define void @gpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, i64 %int) "aarch64_pstate_sm_compatible" "frame-pointer"="all" { +; CHECK-LABEL: gpr_and_ppr_local_fp: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1040 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-1, mul vl] +; CHECK-NEXT: str x0, [sp, #8] +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + tail call void asm sideeffect "", "~{d8}"() #1 ; Spill an FPR so hazard padding is needed + %ppr_local = alloca <vscale x 16 x i1> + %gpr_local = alloca i64 + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile i64 %int, ptr %gpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: all_stack_areas +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-34 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; 
CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-304 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-320 x vscale], Type: Variable, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-320 x vscale], Type: Variable, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-320 x vscale], Type: Variable, Align: 16, Size: 1024 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2088-320 x vscale], Type: Variable, Align: 8, Size: 8 + +; <CS GPRs> +; <CS PPRs> +; %ppr_local sp+2080+286*vscale (addvl #17, addpl #7) +; 14 * vscale bytes of padding sp+2080+272*vscale +; <hazard padding> sp+1056+272*vscale +; <CS ZPRs> sp+1056+16*vscale +; %zpr_local sp+1056 +; %fpr_local sp+1048 +; 8 bytes of padding sp+1040 +; <hazard padding> sp+16 +; %gpr_local sp+8 +; 8 bytes of padding sp +; -> sp +define void @all_stack_areas(<vscale x 16 x i1> %pred, double %fp) { +; CHECK-LABEL: all_stack_areas: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1056 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xb0, 
0x10, 0x92, 0x2e, 0x00, 0x11, 0xa0, 0x01, 0x1e, 0x22 // sp + 2096 + 160 * VG +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 32 * VG - 1040 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 40 * VG - 1040 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d10 @ cfa - 48 * VG - 1040 +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d11 @ cfa - 56 * VG - 1040 +; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d12 @ cfa - 64 * VG - 1040 +; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d13 @ cfa - 72 * VG - 1040 +; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d14 @ cfa - 80 * VG - 1040 +; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xa8, 0x7f, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d15 @ cfa - 88 * VG - 1040 +; CHECK-NEXT: add x0, sp, #2080 +; CHECK-NEXT: add x8, sp, #2080 +; CHECK-NEXT: add x1, sp, #1056 +; CHECK-NEXT: addvl x0, x0, #17 +; CHECK-NEXT: add x2, sp, #1048 +; CHECK-NEXT: add x3, sp, #8 +; CHECK-NEXT: addpl x0, x0, #7 +; CHECK-NEXT: str d0, [sp, #1048] +; CHECK-NEXT: str p0, [x8, #143, mul vl] +; CHECK-NEXT: bl foo +; CHECK-NEXT: add sp, sp, #1056 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] 
// 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + %fpr_local = alloca double + ; // Needed to sort %fpr_local into the FPR region + store double %fp, ptr %fpr_local + ; // Needed to sort %ppr_local into the PPR region + store <vscale x 16 x i1> %pred, ptr %ppr_local + %gpr_local = alloca i64 + call void @foo(ptr %ppr_local, ptr %zpr_local, ptr %fpr_local, ptr %gpr_local) + ret void +} +declare void @foo(ptr, ptr, ptr, ptr) + +; CHECK-FRAMELAYOUT-LABEL: Function: all_stack_areas_fp +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-34 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; 
CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-304 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-320 x vscale], Type: Variable, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1064-320 x vscale], Type: Variable, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2096-320 x vscale], Type: Variable, Align: 16, Size: 1024 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2104-320 x vscale], Type: Variable, Align: 8, Size: 8 + +; <CS GPRs> +; -> fp +; <CS PPRs> fp-32*vscale +; %ppr_local fp-34*vscale (addpl #-17) +; 14 * vscale bytes of padding fp-48*vscale +; <hazard padding> fp-1024-48*vscale +; <CS ZPRs> fp-1024-304*vscale +; %zpr_local sp-1024-320*vscale (addvl #-20) +; %fpr_local sp+1048 +; 8 bytes of padding sp+1040 +; <hazard padding> sp+16 +; %gpr_local sp+8 +; 8 bytes of padding sp +; -> sp +define void @all_stack_areas_fp(<vscale x 16 x i1> %pred, double %fp) "frame-pointer"="all" { +; CHECK-LABEL: all_stack_areas_fp: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x28, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1056 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w28, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 
0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 40 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 48 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d11 @ cfa - 56 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d12 @ cfa - 64 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d13 @ cfa - 72 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d14 @ cfa - 80 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xa8, 0x7f, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d15 @ cfa - 88 * VG - 1056 +; CHECK-NEXT: sub x1, x29, #1024 +; CHECK-NEXT: addpl x0, x29, #-17 +; CHECK-NEXT: add x2, sp, #1048 +; CHECK-NEXT: addvl x1, x1, #-20 +; CHECK-NEXT: add x3, sp, #8 +; CHECK-NEXT: str d0, [sp, #1048] +; CHECK-NEXT: str p0, [x29, #-17, mul vl] +; CHECK-NEXT: bl foo +; CHECK-NEXT: add sp, sp, #1056 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + %fpr_local = alloca double + ; // Needed to sort %fpr_local into the FPR region + store double %fp, 
ptr %fpr_local + ; // Needed to sort %ppr_local into the PPR region + store <vscale x 16 x i1> %pred, ptr %ppr_local + %gpr_local = alloca i64 + call void @foo(ptr %ppr_local, ptr %zpr_local, ptr %fpr_local, ptr %gpr_local) + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: svecc_call +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48], Type: Spill, Align: 16, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-56], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-48 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-288 x vscale], Type: Spill, Align: 
16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2112-288 x vscale], Type: Variable, Align: 16, Size: 1024 + +define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: svecc_call: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 64 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w26, -16 +; CHECK-NEXT: .cfi_offset w27, -24 +; CHECK-NEXT: .cfi_offset w28, -32 +; CHECK-NEXT: .cfi_offset vg, -48 +; CHECK-NEXT: .cfi_offset w30, -56 +; CHECK-NEXT: .cfi_offset w29, -64 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-16 +; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088 +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088 +; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 
0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088 +; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088 +; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088 +; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: tbz w19, #0, .LBB8_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB8_2: // %entry +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: mov w1, #45 // =0x2d +; CHECK-NEXT: mov w2, #37 // =0x25 +; CHECK-NEXT: bl memset +; CHECK-NEXT: tbz w19, #0, .LBB8_4 +; CHECK-NEXT: // %bb.3: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB8_4: // %entry +; CHECK-NEXT: mov w0, #22647 // =0x5877 +; CHECK-NEXT: movk w0, #59491, lsl #16 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #16 +; CHECK-NEXT: .cfi_restore z8 +; CHECK-NEXT: .cfi_restore z9 +; CHECK-NEXT: .cfi_restore z10 +; CHECK-NEXT: .cfi_restore z11 +; CHECK-NEXT: .cfi_restore z12 +; CHECK-NEXT: .cfi_restore z13 +; CHECK-NEXT: .cfi_restore z14 +; CHECK-NEXT: .cfi_restore z15 +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: .cfi_def_cfa wsp, 64 +; CHECK-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp x28, x27, [sp, #32] // 
16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w26 +; CHECK-NEXT: .cfi_restore w27 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37) + ret i32 -396142473 +} +declare ptr @memset(ptr, i32, i32) + +; FIXME: aarch64-split-sve-objects is currently not supported in this function +; as it requires stack realignment (for the 32-byte aligned alloca). +; GPR CSRs +; <hazard padding> +; FPR CSRs +; <hazard padding> +; <SVE locals (PPRs and ZPRs)> <--- hazard between PPRs and ZPRs here! +; <realignment padding> +; -> sp +define void @zpr_and_ppr_local_realignment(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: zpr_and_ppr_local_realignment: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: sub x9, sp, #1040 +; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK-NEXT: add x29, sp, #1024 +; CHECK-NEXT: addvl x9, x9, #-2 +; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: str p0, [x8, #-1, mul vl] +; CHECK-NEXT: str z0, [x8, #-2, mul vl] +; CHECK-NEXT: str x0, [sp] +; CHECK-NEXT: sub sp, x29, #1024 +; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: ret + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + %gpr_local = alloca i64, align 32 + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + store volatile i64 %gpr, ptr %gpr_local + ret void +} + +define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr) +; CHECK-LABEL: zpr_and_ppr_local_stack_probing: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]!
// 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1824 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xb0, 0x16, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2864 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add x8, sp, #2848 +; CHECK-NEXT: str p0, [x8, #15, mul vl] +; CHECK-NEXT: add x8, sp, #1824 +; CHECK-NEXT: str z0, [x8] +; CHECK-NEXT: str x0, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1824 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" "aarch64_pstate_sm_compatible" +{ + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + %gpr_local = alloca i64, i64 100, align 8 + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + store volatile i64 %gpr, ptr %gpr_local + ret void +} diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll index 5f52280..333a8be 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=0 | FileCheck %s --check-prefixes=CHECK,CHECK0 ; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=64 | FileCheck %s --check-prefixes=CHECK,CHECK64 -; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024 +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-NOSPLITSVE +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-split-sve-objects -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-SPLITSVE define i32 @basic(i32 noundef %num) { ; CHECK-LABEL: basic: @@ -1503,72 +1504,24 @@ define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1> } define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind "aarch64_pstate_sm_compatible" { -; CHECK0-LABEL: sve_signature_pred_2xv4i1_caller: -; CHECK0: // %bb.0: -; CHECK0-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK0-NEXT: addvl sp, sp, #-1 -; CHECK0-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: mov p5.b, p0.b -; CHECK0-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: mov p4.b, p1.b -; CHECK0-NEXT: mov p0.b, p2.b -; CHECK0-NEXT: mov p1.b, p3.b -; CHECK0-NEXT: mov p2.b, p5.b -; CHECK0-NEXT: mov p3.b, p4.b -; CHECK0-NEXT: bl sve_signature_pred_2xv4i1 -; CHECK0-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: addvl sp, sp, #1 -; CHECK0-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK0-NEXT: ret -; -; CHECK64-LABEL: sve_signature_pred_2xv4i1_caller: -; CHECK64: // %bb.0: -; CHECK64-NEXT: sub sp, sp, #80 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: addvl sp, sp, #-1 -; CHECK64-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: sub sp, sp, #64 -; CHECK64-NEXT: mov p4.b, p1.b -; CHECK64-NEXT: mov p5.b, p0.b -; CHECK64-NEXT: mov p0.b, p2.b -; CHECK64-NEXT: mov p1.b, p3.b -; CHECK64-NEXT: mov p2.b, p5.b -; CHECK64-NEXT: mov p3.b, p4.b -; CHECK64-NEXT: bl sve_signature_pred_2xv4i1 -; CHECK64-NEXT: add sp, sp, #64 -; CHECK64-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: addvl sp, sp, #1 -; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #80 -; CHECK64-NEXT: ret -; -; CHECK1024-LABEL: sve_signature_pred_2xv4i1_caller: -; CHECK1024: // %bb.0: -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: mov p4.b, p1.b -; CHECK1024-NEXT: mov p5.b, p0.b -; CHECK1024-NEXT: mov p0.b, p2.b -; CHECK1024-NEXT: mov p1.b, p3.b -; CHECK1024-NEXT: mov p2.b, p5.b -; CHECK1024-NEXT: mov p3.b, p4.b -; CHECK1024-NEXT: bl sve_signature_pred_2xv4i1 -; CHECK1024-NEXT: add sp, sp, #1024 -; CHECK1024-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #1 -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ret +; CHECK-LABEL: sve_signature_pred_2xv4i1_caller: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p5.b, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p4.b, p1.b +; CHECK-NEXT: mov p0.b, p2.b +; CHECK-NEXT: mov p1.b, p3.b +; CHECK-NEXT: mov p2.b, p5.b +; CHECK-NEXT: mov p3.b, p4.b +; CHECK-NEXT: bl sve_signature_pred_2xv4i1 +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret %res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1) ret [2 x <vscale x 4 x i1>] %res } @@ -2113,139 +2066,269 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, ; CHECK64-NEXT: .cfi_restore w29 ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 1088 -; CHECK1024-NEXT: cntd x9 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -16 -; CHECK1024-NEXT: .cfi_offset w26, -24 -; CHECK1024-NEXT: .cfi_offset w27, -32 -; CHECK1024-NEXT: .cfi_offset w28, -40 -; CHECK1024-NEXT: .cfi_offset vg, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded 
Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: mov x8, x0 -; CHECK1024-NEXT: bl __arm_sme_state -; CHECK1024-NEXT: mov x19, x0 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: tbz w19, #0, .LBB28_2 -; CHECK1024-NEXT: // %bb.1: // %entry -; CHECK1024-NEXT: smstop sm -; CHECK1024-NEXT: .LBB28_2: // %entry -; CHECK1024-NEXT: mov x0, x8 -; CHECK1024-NEXT: mov w1, #45 // =0x2d -; CHECK1024-NEXT: mov w2, #37 // =0x25 -; CHECK1024-NEXT: bl memset -; CHECK1024-NEXT: tbz w19, #0, .LBB28_4 -; CHECK1024-NEXT: // %bb.3: // %entry -; CHECK1024-NEXT: smstart sm -; CHECK1024-NEXT: .LBB28_4: // %entry -; CHECK1024-NEXT: mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: add sp, sp, #1024 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; 
CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #18 -; CHECK1024-NEXT: .cfi_restore z8 -; CHECK1024-NEXT: .cfi_restore z9 -; CHECK1024-NEXT: .cfi_restore z10 -; CHECK1024-NEXT: .cfi_restore z11 -; CHECK1024-NEXT: .cfi_restore z12 -; CHECK1024-NEXT: .cfi_restore z13 -; CHECK1024-NEXT: .cfi_restore z14 -; CHECK1024-NEXT: .cfi_restore z15 -; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088 -; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 0 -; CHECK1024-NEXT: .cfi_restore w19 -; CHECK1024-NEXT: .cfi_restore w26 -; CHECK1024-NEXT: .cfi_restore w27 -; CHECK1024-NEXT: .cfi_restore w28 -; CHECK1024-NEXT: .cfi_restore vg -; CHECK1024-NEXT: .cfi_restore w30 -; CHECK1024-NEXT: .cfi_restore w29 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_call: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088 +; CHECK1024-NOSPLITSVE-NEXT: cntd x9 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 
2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: mov x8, x0 +; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: tbz 
w19, #0, .LBB28_2 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstop sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB28_2: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov x0, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NOSPLITSVE-NEXT: mov w2, #37 // =0x25 +; CHECK1024-NOSPLITSVE-NEXT: bl memset +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB28_4 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstart sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB28_4: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #18 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, 
#1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64 +; CHECK1024-SPLITSVE-NEXT: cntd x9 +; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: 
str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: mov x8, x0 +; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-SPLITSVE-NEXT: mov x19, x0 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB28_2 +; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-SPLITSVE-NEXT: smstop sm +; CHECK1024-SPLITSVE-NEXT: .LBB28_2: // %entry +; CHECK1024-SPLITSVE-NEXT: mov x0, x8 +; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-SPLITSVE-NEXT: mov w2, #37 // =0x25 +; CHECK1024-SPLITSVE-NEXT: bl memset +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB28_4 +; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-SPLITSVE-NEXT: smstart sm +; CHECK1024-SPLITSVE-NEXT: .LBB28_4: // %entry +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte 
Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #16 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #2 +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64 +; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-SPLITSVE-NEXT: ret entry: tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37) @@ -2505,138 +2588,267 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8 ; CHECK64-NEXT: .cfi_restore w29 ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_alloca_call: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 1088 -; CHECK1024-NEXT: cntd x9 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill -; 
CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -16 -; CHECK1024-NEXT: .cfi_offset w26, -24 -; CHECK1024-NEXT: .cfi_offset w27, -32 -; CHECK1024-NEXT: .cfi_offset w28, -40 -; CHECK1024-NEXT: .cfi_offset vg, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 
0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 -; CHECK1024-NEXT: sub sp, sp, #1072 -; CHECK1024-NEXT: bl __arm_sme_state -; CHECK1024-NEXT: mov x19, x0 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: tbz w19, #0, .LBB29_2 -; CHECK1024-NEXT: // %bb.1: // %entry -; CHECK1024-NEXT: smstop sm -; CHECK1024-NEXT: .LBB29_2: // %entry -; CHECK1024-NEXT: mov x0, sp -; CHECK1024-NEXT: mov w1, #45 // =0x2d -; CHECK1024-NEXT: mov w2, #37 // =0x25 -; CHECK1024-NEXT: bl memset -; CHECK1024-NEXT: tbz w19, #0, .LBB29_4 -; CHECK1024-NEXT: // %bb.3: // %entry -; CHECK1024-NEXT: smstart sm -; CHECK1024-NEXT: .LBB29_4: // %entry -; CHECK1024-NEXT: mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: add sp, sp, #1072 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #18 -; CHECK1024-NEXT: .cfi_restore z8 -; CHECK1024-NEXT: .cfi_restore z9 -; CHECK1024-NEXT: .cfi_restore z10 -; CHECK1024-NEXT: .cfi_restore z11 -; CHECK1024-NEXT: .cfi_restore z12 -; CHECK1024-NEXT: .cfi_restore z13 -; CHECK1024-NEXT: .cfi_restore z14 -; CHECK1024-NEXT: .cfi_restore z15 -; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088 -; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; 
CHECK1024-NEXT: .cfi_def_cfa_offset 0 -; CHECK1024-NEXT: .cfi_restore w19 -; CHECK1024-NEXT: .cfi_restore w26 -; CHECK1024-NEXT: .cfi_restore w27 -; CHECK1024-NEXT: .cfi_restore w28 -; CHECK1024-NEXT: .cfi_restore vg -; CHECK1024-NEXT: .cfi_restore w30 -; CHECK1024-NEXT: .cfi_restore w29 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_alloca_call: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088 +; CHECK1024-NOSPLITSVE-NEXT: cntd x9 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte 
Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB29_2 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstop sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB29_2: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov x0, sp +; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NOSPLITSVE-NEXT: mov w2, #37 // =0x25 +; CHECK1024-NOSPLITSVE-NEXT: bl memset +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB29_4 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstart sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB29_4: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1072 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 
16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #18 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_alloca_call: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64 +; CHECK1024-SPLITSVE-NEXT: cntd x9 +; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 
@ cfa - 40 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-SPLITSVE-NEXT: mov x19, x0 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB29_2 +; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-SPLITSVE-NEXT: smstop sm +; CHECK1024-SPLITSVE-NEXT: .LBB29_2: // %entry +; CHECK1024-SPLITSVE-NEXT: mov x0, sp +; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-SPLITSVE-NEXT: mov w2, #37 // =0x25 +; CHECK1024-SPLITSVE-NEXT: bl memset +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB29_4 +; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-SPLITSVE-NEXT: smstart sm +; CHECK1024-SPLITSVE-NEXT: .LBB29_4: // %entry +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1072 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #16 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-SPLITSVE-NEXT: ldr 
p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #2 +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64 +; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-SPLITSVE-NEXT: ret entry: tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll index 2cbb29e..d8de12c 100644 --- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll @@ -672,5 +672,3 @@ entry: ret i32 %x } declare void @other() -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK-FRAMELAYOUT: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll index de7d234..b9bf76c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s declare i32 @llvm.amdgcn.s.quadmask.i32(i32) declare i64 @llvm.amdgcn.s.quadmask.i64(i64) @@ -172,3 +172,91 @@ entry: %qm = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %mask) ret i64 %qm } + +;; Ensure that AND/ICMP cannot be fused into an AND because s_quadmask_b32 implicitly defines SCC. 
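+;; (Explanatory note: s_and_b32 already sets SCC based on whether its result is non-zero, so the s_cmp_eq_u32 in the output below could normally be folded into it; the intervening s_quadmask_b32 clobbers SCC, so the explicit compare has to remain.)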
+define amdgpu_kernel void @test_scc_quadmask_32(i32 %val0, i32 %val1, ptr addrspace(1) %ptr) { +; GFX11-GISEL-LABEL: test_scc_quadmask_32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_and_b32 s0, s0, 1 +; GFX11-GISEL-NEXT: s_quadmask_b32 s1, s1 +; GFX11-GISEL-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, s1 +; GFX11-GISEL-NEXT: s_cselect_b32 s0, 1, 0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s0 +; GFX11-GISEL-NEXT: global_store_b32 v2, v3, s[2:3] +; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v4, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: test_scc_quadmask_32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 1 +; GFX11-SDAG-NEXT: s_quadmask_b32 s1, s1 +; GFX11-SDAG-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1 +; GFX11-SDAG-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX11-SDAG-NEXT: global_store_b32 v2, v3, s[2:3] +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v4, off +; GFX11-SDAG-NEXT: s_endpgm + %and = and i32 %val0, 1 + %result = call i32 @llvm.amdgcn.s.quadmask.i32(i32 %val1) nounwind readnone + store i32 %result, ptr addrspace(1) %ptr + %cmp = icmp eq i32 %and, 0 + %sel = select i1 %cmp, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) null, align 4 + ret void +} + +;; Ensure that AND/ICMP cannot be fused into an AND because s_quadmask_b64 implicitly defines SCC. 
+define amdgpu_kernel void @test_scc_quadmask_64(i32 %val0, i64 %val1, ptr addrspace(1) %ptr) { +; GFX11-GISEL-LABEL: test_scc_quadmask_64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_quadmask_b64 s[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_and_b32 s4, s4, 1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-GISEL-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-GISEL-NEXT: s_cselect_b32 s0, 1, 0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, s0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-GISEL-NEXT: global_store_b64 v4, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: global_store_b32 v[2:3], v5, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: test_scc_quadmask_64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_and_b32 s4, s6, 1 +; GFX11-SDAG-NEXT: s_quadmask_b64 s[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-SDAG-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v5, off +; GFX11-SDAG-NEXT: s_endpgm + %and = and i32 %val0, 1 + %result = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %val1) nounwind readnone + store i64 %result, ptr addrspace(1) %ptr + %cmp = icmp eq i32 %and, 0 + %sel = select i1 %cmp, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) null, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll index 0de7f8f..bd29e9e 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s ; Regression test for issue 160181 ; One variable is chosen to be assigned at zero. Here, that's @both @@ -22,12 +22,20 @@ ;. 
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol [[META0:![0-9]+]] ; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" +; CHECK: @llvm.amdgcn.kernel.kern_one.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_one.lds.t poison, align 4, !absolute_symbol [[META1:![0-9]+]] +; CHECK: @llvm.amdgcn.kernel.kern_two.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_two.lds.t poison, align 4, !absolute_symbol [[META1]] +; CHECK: @llvm.amdgcn.kernel.kern_block_direct_allocation.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_block_direct_allocation.lds.t poison, align 4, !absolute_symbol [[META1]] + ;. define void @func_one() { ; CHECK-LABEL: define {{[^@]+}}@func_one() { -; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1:![0-9]+]] -; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META18:![0-9]+]] -; CHECK-NEXT: store i16 10, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2:![0-9]+]] +; CHECK-NEXT: [[ONE:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ONE]], align 4 +; CHECK-NEXT: [[ONE1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) [[ONE1]], align 4 +; CHECK-NEXT: store i16 10, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11:![0-9]+]] ; CHECK-NEXT: ret void ; %val0 = load i32, ptr addrspace(3) @both @@ -38,9 +46,10 @@ define void @func_one() { define amdgpu_kernel void @kern_one() { ; CHECK-LABEL: define {{[^@]+}}@kern_one -; CHECK-SAME: () #[[ATTR0:[0-9]+]] { +; CHECK-SAME: () #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META16:![0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !noalias [[META24:![0-9]+]] +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_one.lds) ] +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !noalias [[META17:![0-9]+]] ; CHECK-NEXT: call void @func_one() ; CHECK-NEXT: ret void ; @@ -51,9 +60,13 @@ entry: define void @func_two() { ; CHECK-LABEL: define {{[^@]+}}@func_two() { -; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1]] -; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 2), align 4, !noalias [[META25:![0-9]+]] -; CHECK-NEXT: store i16 20, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23]] +; CHECK-NEXT: [[TMP1:%.*]] = call 
i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2]] +; CHECK-NEXT: [[TWO:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TWO]], align 4 +; CHECK-NEXT: [[TWO1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) [[TWO1]], align 4 +; CHECK-NEXT: store i16 20, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11]] ; CHECK-NEXT: ret void ; %val0 = load i32, ptr addrspace(3) @both @@ -64,9 +77,10 @@ define void @func_two() { define amdgpu_kernel void @kern_two() { ; CHECK-LABEL: define {{[^@]+}}@kern_two -; CHECK-SAME: () #[[ATTR0]] { +; CHECK-SAME: () #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META18:![0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META26:![0-9]+]], !noalias [[META27:![0-9]+]] +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_two.lds) ] +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]] ; CHECK-NEXT: call void @func_two() ; CHECK-NEXT: ret void ; @@ -82,11 +96,18 @@ entry: ; remains the best candidate for address zero allocation. define void @func_block_direct_allocation() { ; CHECK-LABEL: define {{[^@]+}}@func_block_direct_allocation() { -; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META18]] -; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 2), align 4, !noalias [[META25]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[ONE:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ONE]], align 4 +; CHECK-NEXT: [[ONE1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(3) [[ONE1]], align 4 +; CHECK-NEXT: [[TWO:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TWO]], align 4 +; CHECK-NEXT: [[TWO2:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3) +; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr addrspace(3) [[TWO2]], align 4 ; CHECK-NEXT: [[SUM:%.*]] = add i32 [[VAL1]], [[VAL2]] -; CHECK-NEXT: store i32 [[SUM]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1]] -; CHECK-NEXT: store i16 30, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23]] +; CHECK-NEXT: store i32 [[SUM]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2]] +; CHECK-NEXT: store i16 30, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11]] ; 
CHECK-NEXT: ret void ; %val1 = load i32, ptr addrspace(3) @one @@ -99,7 +120,8 @@ define void @func_block_direct_allocation() { define amdgpu_kernel void @kern_block_direct_allocation() { ; CHECK-LABEL: define {{[^@]+}}@kern_block_direct_allocation -; CHECK-SAME: () #[[ATTR0]] { +; CHECK-SAME: () #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META21:![0-9]+]] { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_block_direct_allocation.lds) ], !alias.scope [[META22:![0-9]+]], !noalias [[META25:![0-9]+]] ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; CHECK-NEXT: call void @func_block_direct_allocation() ; CHECK-NEXT: call void @func_one() @@ -112,35 +134,8 @@ define amdgpu_kernel void @kern_block_direct_allocation() { ret void } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="16" } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -;. -; CHECK: [[META0]] = !{i32 0, i32 1} -; CHECK: [[META1]] = !{[[META2:![0-9]+]], [[META4:![0-9]+]], [[META5:![0-9]+]], [[META6:![0-9]+]], [[META8:![0-9]+]], [[META9:![0-9]+]], [[META10:![0-9]+]], [[META12:![0-9]+]], [[META13:![0-9]+]], [[META14:![0-9]+]], [[META16:![0-9]+]], [[META17:![0-9]+]]} -; CHECK: [[META2]] = distinct !{[[META2]], [[META3:![0-9]+]]} -; CHECK: [[META3]] = distinct !{[[META3]]} -; CHECK: [[META4]] = distinct !{[[META4]], [[META3]]} -; CHECK: [[META5]] = distinct !{[[META5]], [[META3]]} -; CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]} -; CHECK: [[META7]] = distinct !{[[META7]]} -; CHECK: [[META8]] = distinct !{[[META8]], [[META7]]} -; CHECK: [[META9]] = distinct !{[[META9]], [[META7]]} -; CHECK: [[META10]] = distinct !{[[META10]], [[META11:![0-9]+]]} -; CHECK: [[META11]] = distinct !{[[META11]]} -; CHECK: [[META12]] = distinct !{[[META12]], [[META11]]} -; CHECK: [[META13]] = distinct !{[[META13]], [[META11]]} -; CHECK: [[META14]] = distinct !{[[META14]], [[META15:![0-9]+]]} -; CHECK: [[META15]] = distinct !{[[META15]]} -; CHECK: [[META16]] = distinct !{[[META16]], [[META15]]} -; CHECK: [[META17]] = distinct !{[[META17]], [[META15]]} -; CHECK: [[META18]] = !{[[META19:![0-9]+]], [[META2]], [[META5]], [[META20:![0-9]+]], [[META6]], [[META9]], [[META21:![0-9]+]], [[META10]], [[META13]], [[META22:![0-9]+]], [[META14]], [[META17]]} -; CHECK: [[META19]] = distinct !{[[META19]], [[META3]]} -; CHECK: [[META20]] = distinct !{[[META20]], [[META7]]} -; CHECK: [[META21]] = distinct !{[[META21]], [[META11]]} -; CHECK: [[META22]] = distinct !{[[META22]], [[META15]]} -; CHECK: [[META23]] = !{[[META19]], [[META4]], [[META5]], [[META20]], [[META8]], [[META9]], [[META21]], [[META12]], [[META13]], [[META22]], [[META16]], [[META17]]} -; CHECK: [[META24]] = !{[[META10]], [[META12]], [[META13]], [[META14]], [[META16]], [[META17]]} -; CHECK: [[META25]] = !{[[META19]], [[META2]], [[META4]], [[META20]], [[META6]], [[META8]], [[META21]], [[META10]], [[META12]], [[META22]], [[META14]], [[META16]]} -; CHECK: [[META26]] = !{[[META22]]} -; CHECK: [[META27]] = !{[[META14]], [[META16]], [[META17]]} +; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="12" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-lds-size"="16" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. 
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll index 18fb879..21ca041 100644 --- a/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll +++ b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll @@ -115,5 +115,150 @@ define ptx_kernel void @inlineasm(ptr %p) { store <2 x float> %mul, ptr %p, align 8 ret void } + +define ptx_kernel void @trunc_v2i32(<2 x i32> %0) { +; CHECK-SM90A-LABEL: trunc_v2i32( +; CHECK-SM90A: { +; CHECK-SM90A-NEXT: .reg .b32 %r<7>; +; CHECK-SM90A-NEXT: .reg .b64 %rd<2>; +; CHECK-SM90A-EMPTY: +; CHECK-SM90A-NEXT: // %bb.0: +; CHECK-SM90A-NEXT: ld.param.v2.b32 {%r1, %r2}, [trunc_v2i32_param_0]; +; CHECK-SM90A-NEXT: prmt.b32 %r3, %r1, %r2, 0x3340U; +; CHECK-SM90A-NEXT: mov.b32 %r4, 0; +; CHECK-SM90A-NEXT: prmt.b32 %r5, %r4, 0, 0x3340U; +; CHECK-SM90A-NEXT: prmt.b32 %r6, %r5, %r3, 0x5410U; +; CHECK-SM90A-NEXT: mov.b64 %rd1, 0; +; CHECK-SM90A-NEXT: st.b32 [%rd1], %r6; +; CHECK-SM90A-NEXT: ret; +; +; CHECK-SM100-LABEL: trunc_v2i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<7>; +; CHECK-SM100-NEXT: .reg .b64 %rd<3>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b64 %rd1, [trunc_v2i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-SM100-NEXT: mov.b32 %r3, 0; +; CHECK-SM100-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; +; CHECK-SM100-NEXT: prmt.b32 %r5, %r1, %r2, 0x3340U; +; CHECK-SM100-NEXT: prmt.b32 %r6, %r4, %r5, 0x5410U; +; CHECK-SM100-NEXT: mov.b64 %rd2, 0; +; CHECK-SM100-NEXT: st.b32 [%rd2], %r6; +; CHECK-SM100-NEXT: ret; + %2 = trunc <2 x i32> %0 to <2 x i8> + %3 = shufflevector <2 x i8> zeroinitializer, <2 x i8> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x i8> %3, ptr null, align 4 + ret void +} + +define ptx_kernel void @zextend_to_v2i32(<2 x i8> %0) { +; CHECK-SM90A-LABEL: zextend_to_v2i32( +; CHECK-SM90A: { +; CHECK-SM90A-NEXT: .reg .b16 %rs<3>; +; CHECK-SM90A-NEXT: .reg .b32 %r<4>; +; CHECK-SM90A-NEXT: .reg .b64 %rd<5>; +; CHECK-SM90A-EMPTY: +; CHECK-SM90A-NEXT: // %bb.0: +; CHECK-SM90A-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [zextend_to_v2i32_param_0]; +; CHECK-SM90A-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; CHECK-SM90A-NEXT: cvt.u32.u16 %r2, %rs1; +; CHECK-SM90A-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-SM90A-NEXT: mov.b64 %rd1, 12; +; CHECK-SM90A-NEXT: st.b32 [%rd1], %r3; +; CHECK-SM90A-NEXT: mov.b64 %rd2, 8; +; CHECK-SM90A-NEXT: st.b32 [%rd2], %r2; +; CHECK-SM90A-NEXT: mov.b64 %rd3, 4; +; CHECK-SM90A-NEXT: st.b32 [%rd3], 0; +; CHECK-SM90A-NEXT: mov.b64 %rd4, 0; +; CHECK-SM90A-NEXT: st.b32 [%rd4], 0; +; CHECK-SM90A-NEXT: ret; +; +; CHECK-SM100-LABEL: zextend_to_v2i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b16 %rs<3>; +; CHECK-SM100-NEXT: .reg .b32 %r<5>; +; CHECK-SM100-NEXT: .reg .b64 %rd<8>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [zextend_to_v2i32_param_0]; +; CHECK-SM100-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; CHECK-SM100-NEXT: cvt.u32.u16 %r2, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r3, %rs1; +; CHECK-SM100-NEXT: mov.b64 %rd1, {%r3, %r2}; +; CHECK-SM100-NEXT: mov.b32 %r4, 0; +; CHECK-SM100-NEXT: mov.b64 %rd2, {%r4, %r4}; +; CHECK-SM100-NEXT: mov.b64 %rd3, 4; +; CHECK-SM100-NEXT: st.b32 [%rd3], %rd2; +; CHECK-SM100-NEXT: mov.b64 %rd4, 0; +; CHECK-SM100-NEXT: st.b32 [%rd4], %rd2; +; CHECK-SM100-NEXT: mov.b64 %rd5, 8; +; CHECK-SM100-NEXT: st.b32 [%rd5], %rd1; +; CHECK-SM100-NEXT: shr.u64 %rd6, %rd1, 32; +; CHECK-SM100-NEXT: mov.b64 %rd7, 12; +; CHECK-SM100-NEXT: st.b32 [%rd7], %rd6; +; 
CHECK-SM100-NEXT: ret; + %2 = zext <2 x i8> %0 to <2 x i32> + %3 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x i32> %3, ptr null, align 4 + ret void +} + +define ptx_kernel void @sextend_to_v2i32(<2 x i8> %0) { +; CHECK-SM90A-LABEL: sextend_to_v2i32( +; CHECK-SM90A: { +; CHECK-SM90A-NEXT: .reg .b16 %rs<3>; +; CHECK-SM90A-NEXT: .reg .b32 %r<6>; +; CHECK-SM90A-NEXT: .reg .b64 %rd<5>; +; CHECK-SM90A-EMPTY: +; CHECK-SM90A-NEXT: // %bb.0: +; CHECK-SM90A-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [sextend_to_v2i32_param_0]; +; CHECK-SM90A-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; CHECK-SM90A-NEXT: cvt.u32.u16 %r2, %rs1; +; CHECK-SM90A-NEXT: cvt.s32.s8 %r3, %r2; +; CHECK-SM90A-NEXT: cvt.u32.u16 %r4, %rs2; +; CHECK-SM90A-NEXT: cvt.s32.s8 %r5, %r4; +; CHECK-SM90A-NEXT: mov.b64 %rd1, 12; +; CHECK-SM90A-NEXT: st.b32 [%rd1], %r5; +; CHECK-SM90A-NEXT: mov.b64 %rd2, 8; +; CHECK-SM90A-NEXT: st.b32 [%rd2], %r3; +; CHECK-SM90A-NEXT: mov.b64 %rd3, 4; +; CHECK-SM90A-NEXT: st.b32 [%rd3], 0; +; CHECK-SM90A-NEXT: mov.b64 %rd4, 0; +; CHECK-SM90A-NEXT: st.b32 [%rd4], 0; +; CHECK-SM90A-NEXT: ret; +; +; CHECK-SM100-LABEL: sextend_to_v2i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b16 %rs<3>; +; CHECK-SM100-NEXT: .reg .b32 %r<7>; +; CHECK-SM100-NEXT: .reg .b64 %rd<8>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [sextend_to_v2i32_param_0]; +; CHECK-SM100-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; CHECK-SM100-NEXT: cvt.u32.u16 %r2, %rs2; +; CHECK-SM100-NEXT: cvt.s32.s8 %r3, %r2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r4, %rs1; +; CHECK-SM100-NEXT: cvt.s32.s8 %r5, %r4; +; CHECK-SM100-NEXT: mov.b64 %rd1, {%r5, %r3}; +; CHECK-SM100-NEXT: mov.b32 %r6, 0; +; CHECK-SM100-NEXT: mov.b64 %rd2, {%r6, %r6}; +; CHECK-SM100-NEXT: mov.b64 %rd3, 4; +; CHECK-SM100-NEXT: st.b32 [%rd3], %rd2; +; CHECK-SM100-NEXT: mov.b64 %rd4, 0; +; CHECK-SM100-NEXT: st.b32 [%rd4], %rd2; +; CHECK-SM100-NEXT: mov.b64 %rd5, 8; +; CHECK-SM100-NEXT: st.b32 [%rd5], %rd1; +; CHECK-SM100-NEXT: shr.u64 %rd6, %rd1, 32; +; CHECK-SM100-NEXT: mov.b64 %rd7, 12; +; CHECK-SM100-NEXT: st.b32 [%rd7], %rd6; +; CHECK-SM100-NEXT: ret; + %2 = sext <2 x i8> %0 to <2 x i32> + %3 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x i32> %3, ptr null, align 4 + ret void +} ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/remat.ll b/llvm/test/CodeGen/RISCV/rvv/remat.ll index 06d54fa..95bff27 100644 --- a/llvm/test/CodeGen/RISCV/rvv/remat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/remat.ll @@ -301,3 +301,135 @@ define void @vfmv.s.f(ptr %p, double %x) { store volatile double %x, ptr %p ret void } + +; This test is fairly fragile, but it's trying to cover the case which +; caused the revert of bba9172 due to interaction with how rematerialized +; instructions are pruned from the original live interval. In the result +; below, we remat the vmv.v.x into the loop, but fail to remat the vmv.v.x +; a second time after further splitting its live range. We shouldn't need +; to spill it to the stack at all.
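+; (In the checks below, the vs8r.v "Folded Spill" and vl8r.v "Folded Reload" of the vmv.v.x result are the unnecessary stack traffic this refers to.)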
+define i64 @dual_remat(i64 %0, <vscale x 16 x i64> %1, <vscale x 16 x i64> %2, ptr %p) #0 { +; CHECK-LABEL: dual_remat: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 5 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a1, a2, 3 +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: .LBB8_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: add a5, a5, a4 +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: add a4, a4, a5 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a5, a4, 4 +; CHECK-NEXT: add a4, a5, a4 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: add a5, a5, a4 +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: add a4, a4, a5 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vand.vv v16, v16, v8 +; CHECK-NEXT: vmsne.vi v24, v16, 0 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs1r.v v24, (a4) # vscale x 8-byte Folded Spill +; CHECK-NEXT: vand.vv v16, v0, v8 +; CHECK-NEXT: vmsne.vi v8, v16, 0 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: add a5, a5, a4 +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: add a4, a4, a5 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vl1r.v v9, (a4) # vscale x 8-byte Folded Reload +; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a1 +; CHECK-NEXT: vsetvli a4, zero, e8, m2, ta, ma +; CHECK-NEXT: vcpop.m a4, v9 +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: slli a6, a5, 4 +; CHECK-NEXT: add a5, a6, a5 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vs8r.v v8, (a3) +; CHECK-NEXT: vs8r.v v8, (a2) +; CHECK-NEXT: addi a5, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsetvli a5, zero, e64, m8, ta, ma +; CHECK-NEXT: vor.vv v16, v16, v8 +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: slli a5, a5, 3 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload +; CHECK-NEXT: 
vor.vv v0, v0, v8 +; CHECK-NEXT: beqz a4, .LBB8_1 +; CHECK-NEXT: # %bb.2: # %middle.block +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 5 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: add sp, sp, a1 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <vscale x 16 x i64> zeroinitializer, i64 %0, i64 0 + %broadcast.splat = shufflevector <vscale x 16 x i64> %broadcast.splatinsert, <vscale x 16 x i64> zeroinitializer, <vscale x 16 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.ind = phi <vscale x 16 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ] + %3 = and <vscale x 16 x i64> %vec.ind, %broadcast.splat + %4 = icmp ne <vscale x 16 x i64> %3, zeroinitializer + store <vscale x 16 x i64> %broadcast.splat, ptr %p + %5 = tail call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> %4) + %vec.ind.next = or <vscale x 16 x i64> %vec.ind, %1 + br i1 %5, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %and.i = and i64 1, %0 + ret i64 %and.i +} diff --git a/llvm/test/DebugInfo/AArch64/asan-stack-vars.mir b/llvm/test/DebugInfo/AArch64/asan-stack-vars.mir index 5d644c3..718fa6f 100644 --- a/llvm/test/DebugInfo/AArch64/asan-stack-vars.mir +++ b/llvm/test/DebugInfo/AArch64/asan-stack-vars.mir @@ -366,7 +366,8 @@ frameInfo: maxCallFrameSize: 0 localFrameSize: 144 machineFunctionInfo: - stackSizeSVE: 0 + stackSizeZPR: 0 + stackSizePPR: 0 stack: - { id: 0, name: StackGuardSlot, offset: -40, size: 8, alignment: 8, stack-id: default, local-offset: -8 } diff --git a/llvm/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.mir b/llvm/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.mir index 013d933..b7a9892 100644 --- a/llvm/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.mir +++ b/llvm/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.mir @@ -69,7 +69,8 @@ frameInfo: hasCalls: true maxCallFrameSize: 0 machineFunctionInfo: - stackSizeSVE: 0 + stackSizeZPR: 0 + stackSizePPR: 0 stack: - { id: 0, type: spill-slot, offset: -20, size: 4, alignment: 4, stack-id: default } - { id: 1, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: default, diff --git a/llvm/test/DebugInfo/X86/dynamic-bitfield.ll b/llvm/test/DebugInfo/X86/dynamic-bitfield.ll index c9148ca4..f893597 100644 --- a/llvm/test/DebugInfo/X86/dynamic-bitfield.ll +++ b/llvm/test/DebugInfo/X86/dynamic-bitfield.ll @@ -27,7 +27,7 @@ source_filename = "bitfield.c" !6 = !{} !7 = !{!0, !2} !8 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "PackedBits", file: !5, line: 3, size: 40, elements: !9) -!9 = !{!10, !12, !16} +!9 = !{!10, !12, !16, !21} !10 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !8, file: !5, line: 5, baseType: !11, size: 8) ; CHECK: DW_TAG_member ; CHECK-NEXT: DW_AT_name{{.*}}"a" @@ -60,5 +60,14 @@ source_filename = "bitfield.c" ; CHECK: DW_AT_bit_size [DW_FORM_exprloc] (DW_OP_lit27) ; CHECK-NEXT: DW_AT_data_bit_offset [DW_FORM_exprloc] (DW_OP_lit13) ; CHECK-NOT: DW_AT_data_member_location -; CHECK: DW_TAG !20 = !{!"clang version 3.9.0 (trunk 267633)"} +!21 = !DIDerivedType(tag: DW_TAG_member, name: "d", scope: !8, file: !5, line: 7, baseType: !13, offset: !DIExpression(DW_OP_constu, 15), flags: DIFlagBitField) +; CHECK: DW_TAG_member +; CHECK-NEXT: DW_AT_name{{.*}}"d" +; CHECK-NOT: DW_TAG +; CHECK-NOT: DW_AT_bit_offset +; 
CHECK-NOT: DW_AT_byte_size +; CHECK-NOT: DW_AT_bit_size +; CHECK: DW_AT_data_bit_offset [DW_FORM_exprloc] (DW_OP_lit15) +; CHECK-NOT: DW_AT_data_member_location +; CHECK: DW_TAG diff --git a/llvm/test/Transforms/InstCombine/fcmp.ll b/llvm/test/Transforms/InstCombine/fcmp.ll index 119cffd..d94e78c 100644 --- a/llvm/test/Transforms/InstCombine/fcmp.ll +++ b/llvm/test/Transforms/InstCombine/fcmp.ll @@ -1812,6 +1812,46 @@ define i1 @fcmp_ule_fsub_const(float %x, float %y) { ret i1 %cmp } +define i1 @fcmp_ninf_ule_fsub_const(float %x, float %y) { +; CHECK-LABEL: @fcmp_ninf_ule_fsub_const( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ule float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %fs = fsub float %x, %y + %cmp = fcmp ninf ule float %fs, 0.000000e+00 + ret i1 %cmp +} + +define i1 @fcmp_nnan_ule_fsub_const(float %x, float %y) { +; CHECK-LABEL: @fcmp_nnan_ule_fsub_const( +; CHECK-NEXT: [[CMP:%.*]] = fcmp nnan ule float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %fs = fsub float %x, %y + %cmp = fcmp nnan ule float %fs, 0.000000e+00 + ret i1 %cmp +} + +define i1 @fcmp_ule_fsub_ninf_const(float %x, float %y) { +; CHECK-LABEL: @fcmp_ule_fsub_ninf_const( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ninf ule float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %fs = fsub ninf float %x, %y + %cmp = fcmp ule float %fs, 0.000000e+00 + ret i1 %cmp +} + +define i1 @fcmp_ule_fsub_nnan_const(float %x, float %y) { +; CHECK-LABEL: @fcmp_ule_fsub_nnan_const( +; CHECK-NEXT: [[CMP:%.*]] = fcmp nnan ule float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %fs = fsub nnan float %x, %y + %cmp = fcmp ule float %fs, 0.000000e+00 + ret i1 %cmp +} + define i1 @fcmp_ugt_fsub_const(float %x, float %y) { ; CHECK-LABEL: @fcmp_ugt_fsub_const( ; CHECK-NEXT: [[FS:%.*]] = fsub float [[X:%.*]], [[Y:%.*]] diff --git a/llvm/test/Transforms/InstCombine/icmp-clamp.ll b/llvm/test/Transforms/InstCombine/icmp-clamp.ll new file mode 100644 index 0000000..4866dbf --- /dev/null +++ b/llvm/test/Transforms/InstCombine/icmp-clamp.ll @@ -0,0 +1,295 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +declare void @use(i32) + +define i1 @test_i32_eq(i32 %x) { +; CHECK-LABEL: define i1 @test_i32_eq( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[X]], 95 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], 256 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 -95) + %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160) + %cmp = icmp eq i32 %v2, %x + ret i1 %cmp +} + +define i1 @test_i32_ne(i32 %x) { +; CHECK-LABEL: define i1 @test_i32_ne( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[X]], -161 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], -256 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 -95) + %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160) + %cmp = icmp ne i32 %v2, %x + ret i1 %cmp +} + +define i1 @test_i32_eq_no_add(i32 %x) { +; CHECK-LABEL: define i1 @test_i32_eq_no_add( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[X]], 161 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 0) + %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160) + %cmp = icmp eq i32 %v2, %x + ret i1 %cmp +} + +define i1 @test_i32_ne_no_add(i32 %x) { +; CHECK-LABEL: define i1 @test_i32_ne_no_add( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = 
icmp ugt i32 [[X]], 160 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 0) + %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160) + %cmp = icmp ne i32 %v2, %x + ret i1 %cmp +} + +define i1 @test_unsigned_eq(i32 %x) { +; CHECK-LABEL: define i1 @test_unsigned_eq( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[X]], -10 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], 91 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i32 @llvm.umax.i32(i32 %x, i32 10) + %v2 = tail call i32 @llvm.umin.i32(i32 %v1, i32 100) + %cmp = icmp eq i32 %v2, %x + ret i1 %cmp +} + +define i1 @test_unsigned_ne(i32 %x) { +; CHECK-LABEL: define i1 @test_unsigned_ne( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[X]], -101 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], -91 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i32 @llvm.umax.i32(i32 %x, i32 10) + %v2 = tail call i32 @llvm.umin.i32(i32 %v1, i32 100) + %cmp = icmp ne i32 %v2, %x + ret i1 %cmp +} + + +; Different bit widths +define i1 @test_i8_eq(i8 %x) { +; CHECK-LABEL: define i1 @test_i8_eq( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X]], 50 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[TMP1]], 101 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i8 @llvm.smax.i8(i8 %x, i8 -50) + %v2 = tail call i8 @llvm.smin.i8(i8 %v1, i8 50) + %cmp = icmp eq i8 %v2, %x + ret i1 %cmp +} + +define i1 @test_i16_eq(i16 %x) { +; CHECK-LABEL: define i1 @test_i16_eq( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[X]], 1000 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[TMP1]], 2001 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i16 @llvm.smax.i16(i16 %x, i16 -1000) + %v2 = tail call i16 @llvm.smin.i16(i16 %v1, i16 1000) + %cmp = icmp eq i16 %v2, %x + ret i1 %cmp +} + +define i1 @test_i64_eq(i64 %x) { +; CHECK-LABEL: define i1 @test_i64_eq( +; CHECK-SAME: i64 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[X]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP1]], -1 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i64 @llvm.smax.i64(i64 %x, i64 -1) + %v2 = tail call i64 @llvm.smin.i64(i64 %v1, i64 9223372036854775806) + %cmp = icmp eq i64 %v2, %x + ret i1 %cmp +} + +; Negative tests - wrong predicate +define i1 @test_wrong_pred_slt(i32 %x) { +; CHECK-LABEL: define i1 @test_wrong_pred_slt( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], 160 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 -95) + %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160) + %cmp = icmp slt i32 %v2, %x + ret i1 %cmp +} + + +; Negative tests - not a clamp pattern +define i1 @test_not_clamp_pattern(i32 %x, i32 %y) { +; CHECK-LABEL: define i1 @test_not_clamp_pattern( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.smax.i32(i32 [[Y]], i32 -95) +; CHECK-NEXT: [[V2:%.*]] = tail call i32 @llvm.smin.i32(i32 [[V1]], i32 160) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V2]], [[X]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i32 @llvm.smax.i32(i32 %y, i32 -95) + %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160) + %cmp = icmp eq i32 %v2, %x + ret i1 %cmp +} + +; Negative tests - Lo >= Hi +define i1 @test_invalid_range(i32 %x) { +; CHECK-LABEL: define i1 @test_invalid_range( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 50 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 100) + %v2 = 
tail call i32 @llvm.smin.i32(i32 %v1, i32 50) + %cmp = icmp eq i32 %v2, %x + ret i1 %cmp +} + +; Negative tests - Lo is minimum signed value +define i1 @test_lo_min_signed(i32 %x) { +; CHECK-LABEL: define i1 @test_lo_min_signed( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X]], 161 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 -2147483648) + %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160) + %cmp = icmp eq i32 %v2, %x + ret i1 %cmp +} + +; Negative tests - Hi is maximum signed value +define i1 @test_hi_max_signed(i32 %x) { +; CHECK-LABEL: define i1 @test_hi_max_signed( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], -96 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 -95) + %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 2147483647) + %cmp = icmp eq i32 %v2, %x + ret i1 %cmp +} + +; Negative tests - Hi is maximum unsigned value +define i1 @test_hi_max_unsigned(i32 %x) { +; CHECK-LABEL: define i1 @test_hi_max_unsigned( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[X]], 9 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i32 @llvm.umax.i32(i32 %x, i32 10) + %v2 = tail call i32 @llvm.umin.i32(i32 %v1, i32 4294967295) + %cmp = icmp eq i32 %v2, %x + ret i1 %cmp +} + +; Multi-use tests - multiple uses of max +define i1 @test_multi_use_max(i32 %x) { +; CHECK-LABEL: define i1 @test_multi_use_max( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.smax.i32(i32 [[X]], i32 -95) +; CHECK-NEXT: call void @use(i32 [[V1]]) +; CHECK-NEXT: [[V2:%.*]] = tail call i32 @llvm.smin.i32(i32 [[V1]], i32 160) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V2]], [[X]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 -95) + call void @use(i32 %v1) + %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160) + %cmp = icmp eq i32 %v2, %x + ret i1 %cmp +} + +; Multi-use tests - multiple uses of min +define i1 @test_multi_use_min(i32 %x) { +; CHECK-LABEL: define i1 @test_multi_use_min( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.smax.i32(i32 [[X]], i32 -95) +; CHECK-NEXT: [[V2:%.*]] = tail call i32 @llvm.smin.i32(i32 [[V1]], i32 160) +; CHECK-NEXT: call void @use(i32 [[V2]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V2]], [[X]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 -95) + %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160) + call void @use(i32 %v2) + %cmp = icmp eq i32 %v2, %x + ret i1 %cmp +} + +; Commuted tests +define i1 @test_commuted_eq(i32 %x) { +; CHECK-LABEL: define i1 @test_commuted_eq( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[X]], 95 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], 256 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 -95) + %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160) + %cmp = icmp eq i32 %x, %v2 + ret i1 %cmp +} + + +; Vector tests - splat constants +define <2 x i1> @test_vec_splat_eq(<2 x i32> %x) { +; CHECK-LABEL: define <2 x i1> @test_vec_splat_eq( +; CHECK-SAME: <2 x i32> [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[X]], splat (i32 50) +; CHECK-NEXT: [[CMP:%.*]] = icmp ult <2 x i32> [[TMP1]], splat (i32 101) +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %v1 = tail call <2 x i32> @llvm.smax.v2i32(<2 x i32> %x, <2 x i32> <i32 -50, i32 -50>) + %v2 = tail call <2 x i32> @llvm.smin.v2i32(<2 x i32> %v1, <2 x i32> 
<i32 50, i32 50>) + %cmp = icmp eq <2 x i32> %v2, %x + ret <2 x i1> %cmp +} + +; Vector tests - poison elements +define <2 x i1> @test_vec_poison_eq(<2 x i32> %x) { +; CHECK-LABEL: define <2 x i1> @test_vec_poison_eq( +; CHECK-SAME: <2 x i32> [[X:%.*]]) { +; CHECK-NEXT: [[V1:%.*]] = tail call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[X]], <2 x i32> <i32 -50, i32 poison>) +; CHECK-NEXT: [[V2:%.*]] = tail call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[V1]], <2 x i32> <i32 50, i32 poison>) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[V2]], [[X]] +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %v1 = tail call <2 x i32> @llvm.smax.v2i32(<2 x i32> %x, <2 x i32> <i32 -50, i32 poison>) + %v2 = tail call <2 x i32> @llvm.smin.v2i32(<2 x i32> %v1, <2 x i32> <i32 50, i32 poison>) + %cmp = icmp eq <2 x i32> %v2, %x + ret <2 x i1> %cmp +} + +; Vector tests - non-splat +define <2 x i1> @test_vec_non_splat_eq(<2 x i32> %x) { +; CHECK-LABEL: define <2 x i1> @test_vec_non_splat_eq( +; CHECK-SAME: <2 x i32> [[X:%.*]]) { +; CHECK-NEXT: [[V1:%.*]] = tail call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[X]], <2 x i32> <i32 -50, i32 -30>) +; CHECK-NEXT: [[V2:%.*]] = tail call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[V1]], <2 x i32> <i32 50, i32 70>) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[V2]], [[X]] +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %v1 = tail call <2 x i32> @llvm.smax.v2i32(<2 x i32> %x, <2 x i32> <i32 -50, i32 -30>) + %v2 = tail call <2 x i32> @llvm.smin.v2i32(<2 x i32> %v1, <2 x i32> <i32 50, i32 70>) + %cmp = icmp eq <2 x i32> %v2, %x + ret <2 x i1> %cmp +} diff --git a/llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll b/llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll new file mode 100644 index 0000000..1339afe --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll @@ -0,0 +1,75 @@ +; Test branch weight metadata, estimated trip count metadata, and block +; frequencies after loop peeling. + +; RUN: opt < %s -S -passes='print<block-freq>' 2>&1 | \ +; RUN: FileCheck -check-prefix=CHECK %s + +; The -implicit-check-not options make sure that no additional labels or calls +; to @f show up. +; RUN: opt < %s -S -passes='loop-unroll,print<block-freq>' \ +; RUN: -unroll-force-peel-count=2 2>&1 | \ +; RUN: FileCheck %s -check-prefix=CHECK-UR \ +; RUN: -implicit-check-not='{{^[^ ;]*:}}' \ +; RUN: -implicit-check-not='call void @f' + +; CHECK: block-frequency-info: test +; CHECK: do.body: float = 10.0, + +; The sum should still be ~10. 
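+; (The latch branch weights of 1:9 below give each iteration a ~10% chance of
+; exiting, so the two peeled iterations should come out around 1.0 and 0.9 and
+; the remaining loop body around 10 - 1.0 - 0.9 = 8.1.)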
+; +; CHECK-UR: block-frequency-info: test +; CHECK-UR: - [[DO_BODY_PEEL:.*]]: float = 1.0, +; CHECK-UR: - [[DO_BODY_PEEL2:.*]]: float = 0.9, +; CHECK-UR: - [[DO_BODY:.*]]: float = 8.1, + +declare void @f(i32) + +define void @test(i32 %n) { +; CHECK-UR-LABEL: define void @test( +; CHECK-UR: [[ENTRY:.*]]: +; CHECK-UR: br label %[[DO_BODY_PEEL_BEGIN:.*]] +; CHECK-UR: [[DO_BODY_PEEL_BEGIN]]: +; CHECK-UR: br label %[[DO_BODY_PEEL:.*]] +; CHECK-UR: [[DO_BODY_PEEL]]: +; CHECK-UR: call void @f +; CHECK-UR: br i1 %{{.*}}, label %[[DO_END:.*]], label %[[DO_BODY_PEEL_NEXT:.*]], !prof ![[#PROF:]] +; CHECK-UR: [[DO_BODY_PEEL_NEXT]]: +; CHECK-UR: br label %[[DO_BODY_PEEL2:.*]] +; CHECK-UR: [[DO_BODY_PEEL2]]: +; CHECK-UR: call void @f +; CHECK-UR: br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY_PEEL_NEXT1:.*]], !prof ![[#PROF]] +; CHECK-UR: [[DO_BODY_PEEL_NEXT1]]: +; CHECK-UR: br label %[[DO_BODY_PEEL_NEXT5:.*]] +; CHECK-UR: [[DO_BODY_PEEL_NEXT5]]: +; CHECK-UR: br label %[[ENTRY_PEEL_NEWPH:.*]] +; CHECK-UR: [[ENTRY_PEEL_NEWPH]]: +; CHECK-UR: br label %[[DO_BODY]] +; CHECK-UR: [[DO_BODY]]: +; CHECK-UR: call void @f +; CHECK-UR: br i1 %{{.*}}, label %[[DO_END_LOOPEXIT:.*]], label %[[DO_BODY]], !prof ![[#PROF]], !llvm.loop ![[#LOOP_UR_LATCH:]] +; CHECK-UR: [[DO_END_LOOPEXIT]]: +; CHECK-UR: br label %[[DO_END]] +; CHECK-UR: [[DO_END]]: +; CHECK-UR: ret void + +entry: + br label %do.body + +do.body: + %i = phi i32 [ 0, %entry ], [ %inc, %do.body ] + %inc = add i32 %i, 1 + call void @f(i32 %i) + %c = icmp sge i32 %inc, %n + br i1 %c, label %do.end, label %do.body, !prof !0 + +do.end: + ret void +} + +!0 = !{!"branch_weights", i32 1, i32 9} + +; CHECK-UR: ![[#PROF]] = !{!"branch_weights", i32 1, i32 9} +; CHECK-UR: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_PC:]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; CHECK-UR: ![[#LOOP_UR_PC]] = !{!"llvm.loop.peeled.count", i32 2} +; CHECK-UR: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 8} +; CHECK-UR: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"} diff --git a/llvm/test/Transforms/LoopUnroll/peel-branch-weights.ll b/llvm/test/Transforms/LoopUnroll/peel-branch-weights.ll index c58f8f1..63a0dd4 100644 --- a/llvm/test/Transforms/LoopUnroll/peel-branch-weights.ll +++ b/llvm/test/Transforms/LoopUnroll/peel-branch-weights.ll @@ -15,9 +15,9 @@ define void @test() { ; CHECK: loop.peel: ; CHECK-NEXT: [[X_PEEL:%.*]] = call i32 @get.x() ; CHECK-NEXT: switch i32 [[X_PEEL]], label [[LOOP_LATCH_PEEL:%.*]] [ -; CHECK-NEXT: i32 0, label [[LOOP_LATCH_PEEL]] -; CHECK-NEXT: i32 1, label [[LOOP_EXIT:%.*]] -; CHECK-NEXT: i32 2, label [[LOOP_EXIT]] +; CHECK-NEXT: i32 0, label [[LOOP_LATCH_PEEL]] +; CHECK-NEXT: i32 1, label [[LOOP_EXIT:%.*]] +; CHECK-NEXT: i32 2, label [[LOOP_EXIT]] ; CHECK-NEXT: ], !prof [[PROF0:![0-9]+]] ; CHECK: loop.latch.peel: ; CHECK-NEXT: br label [[LOOP_PEEL_NEXT:%.*]] @@ -26,10 +26,10 @@ define void @test() { ; CHECK: loop.peel2: ; CHECK-NEXT: [[X_PEEL3:%.*]] = call i32 @get.x() ; CHECK-NEXT: switch i32 [[X_PEEL3]], label [[LOOP_LATCH_PEEL4:%.*]] [ -; CHECK-NEXT: i32 0, label [[LOOP_LATCH_PEEL4]] -; CHECK-NEXT: i32 1, label [[LOOP_EXIT]] -; CHECK-NEXT: i32 2, label [[LOOP_EXIT]] -; CHECK-NEXT: ], !prof [[PROF1:![0-9]+]] +; CHECK-NEXT: i32 0, label [[LOOP_LATCH_PEEL4]] +; CHECK-NEXT: i32 1, label [[LOOP_EXIT]] +; CHECK-NEXT: i32 2, label [[LOOP_EXIT]] +; CHECK-NEXT: ], !prof [[PROF0]] ; CHECK: loop.latch.peel4: ; CHECK-NEXT: br label [[LOOP_PEEL_NEXT1:%.*]] ; CHECK: loop.peel.next1: @@ -41,31 +41,33 @@ define void @test() { ; 
CHECK: loop: ; CHECK-NEXT: [[X:%.*]] = call i32 @get.x() ; CHECK-NEXT: switch i32 [[X]], label [[LOOP_LATCH:%.*]] [ -; CHECK-NEXT: i32 0, label [[LOOP_LATCH]] -; CHECK-NEXT: i32 1, label [[LOOP_EXIT_LOOPEXIT:%.*]] -; CHECK-NEXT: i32 2, label [[LOOP_EXIT_LOOPEXIT]] -; CHECK-NEXT: ], !prof [[PROF2:![0-9]+]] +; CHECK-NEXT: i32 0, label [[LOOP_LATCH]] +; CHECK-NEXT: i32 1, label [[LOOP_EXIT_LOOPEXIT:%.*]] +; CHECK-NEXT: i32 2, label [[LOOP_EXIT_LOOPEXIT]] +; CHECK-NEXT: ], !prof [[PROF0]] ; CHECK: loop.latch: -; CHECK-NEXT: br label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br label [[LOOP]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: loop.exit.loopexit: ; CHECK-NEXT: br label [[LOOP_EXIT]] ; CHECK: loop.exit: ; CHECK-NEXT: ret void +; +; DISABLEADV-LABEL: @test( +; DISABLEADV-NEXT: entry: +; DISABLEADV-NEXT: br label [[LOOP:%.*]] +; DISABLEADV: loop: +; DISABLEADV-NEXT: [[X:%.*]] = call i32 @get.x() +; DISABLEADV-NEXT: switch i32 [[X]], label [[LOOP_LATCH:%.*]] [ +; DISABLEADV-NEXT: i32 0, label [[LOOP_LATCH]] +; DISABLEADV-NEXT: i32 1, label [[LOOP_EXIT:%.*]] +; DISABLEADV-NEXT: i32 2, label [[LOOP_EXIT]] +; DISABLEADV-NEXT: ], !prof [[PROF0:![0-9]+]] +; DISABLEADV: loop.latch: +; DISABLEADV-NEXT: br label [[LOOP]] +; DISABLEADV: loop.exit: +; DISABLEADV-NEXT: ret void +; -; DISABLEADV-LABEL: @test() -; DISABLEADV-NEXT: entry: -; DISABLEADV-NEXT: br label %loop -; DISABLEADV: loop -; DISABLEADV-NEXT: %x = call i32 @get.x() -; DISABLEADV-NEXT: switch i32 %x, label %loop.latch [ -; DISABLEADV-NEXT: i32 0, label %loop.latch -; DISABLEADV-NEXT: i32 1, label %loop.exit -; DISABLEADV-NEXT: i32 2, label %loop.exit -; DISABLEADV-NEXT: ], !prof !0 -; DISABLEADV: loop.latch: -; DISABLEADV-NEXT: br label %loop -; DISABLEADV: loop.exit: -; DISABLEADV-NEXT: ret void entry: br label %loop @@ -89,9 +91,9 @@ loop.exit: ;. ; CHECK: [[PROF0]] = !{!"branch_weights", i32 100, i32 200, i32 20, i32 10} -; CHECK: [[PROF1]] = !{!"branch_weights", i32 90, i32 180, i32 20, i32 10} -; CHECK: [[PROF2]] = !{!"branch_weights", i32 80, i32 160, i32 20, i32 10} -; CHECK: [[LOOP3]] = distinct !{!3, !4, !5} -; CHECK: [[META4:![0-9]+]] = !{!"llvm.loop.peeled.count", i32 2} -; CHECK: [[META5:![0-9]+]] = !{!"llvm.loop.unroll.disable"} +; CHECK: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[META2]] = !{!"llvm.loop.peeled.count", i32 2} +; CHECK: [[META3]] = !{!"llvm.loop.unroll.disable"} +;. +; DISABLEADV: [[PROF0]] = !{!"branch_weights", i32 100, i32 200, i32 20, i32 10} ;. 
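For reference, one way to read the updated checks just above: the switch weights in [[PROF0]] give each iteration

  P(reach loop.latch) = (100 + 200) / (100 + 200 + 20 + 10) = 300 / 330 ~ 0.91

and with this change every peeled copy reuses that same !prof node instead of the previously re-scaled 90/180/20/10 and 80/160/20/10 variants.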
diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll index d91cb5b..e951215 100644 --- a/llvm/test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll +++ b/llvm/test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll @@ -15,13 +15,13 @@ ; CHECK: br i1 %{{.*}}, label %[[NEXT0:.*]], label %for.cond.for.end_crit_edge, !prof !16 ; CHECK: [[NEXT0]]: ; CHECK: br i1 %c, label %{{.*}}, label %side_exit, !prof !15 -; CHECK: br i1 %{{.*}}, label %[[NEXT1:.*]], label %for.cond.for.end_crit_edge, !prof !17 +; CHECK: br i1 %{{.*}}, label %[[NEXT1:.*]], label %for.cond.for.end_crit_edge, !prof !16 ; CHECK: [[NEXT1]]: ; CHECK: br i1 %c, label %{{.*}}, label %side_exit, !prof !15 -; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !18 +; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !16 ; CHECK: [[NEXT2]]: ; CHECK: br i1 %c, label %{{.*}}, label %side_exit.loopexit, !prof !15 -; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !18 +; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !16, !llvm.loop !17 define i32 @basic(ptr %p, i32 %k, i1 %c) #0 !prof !15 { entry: @@ -84,6 +84,7 @@ attributes #1 = { nounwind optsize } ;CHECK: !15 = !{!"branch_weights", i32 1, i32 0} ; This is a weights of latch and its copies. ;CHECK: !16 = !{!"branch_weights", i32 3001, i32 1001} -;CHECK: !17 = !{!"branch_weights", i32 2000, i32 1001} -;CHECK: !18 = !{!"branch_weights", i32 1001, i32 1001} +;CHECK: !17 = distinct !{!17, !18, !19, {{.*}}} +;CHECK: !18 = !{!"llvm.loop.peeled.count", i32 4} +;CHECK: !19 = !{!"llvm.loop.estimated_trip_count", i32 0} diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop-pgo.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-pgo.ll index 15dce234..dec126f 100644 --- a/llvm/test/Transforms/LoopUnroll/peel-loop-pgo.ll +++ b/llvm/test/Transforms/LoopUnroll/peel-loop-pgo.ll @@ -5,7 +5,7 @@ ; RUN: opt < %s -S -profile-summary-huge-working-set-size-threshold=9 -debug-only=loop-unroll -passes='require<profile-summary>,function(require<opt-remark-emit>,loop-unroll)' 2>&1 | FileCheck %s --check-prefix=NOPEEL ; REQUIRES: asserts -; Make sure we use the profile information correctly to peel-off 3 iterations +; Make sure we use the profile information correctly to peel-off 4 iterations ; from the loop, and update the branch weights for the peeled loop properly. 
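; The peel count of 4 is consistent with the latch weights this test checks below
; (3001 to continue, 1001 to exit), read as an average trip count. A rough sketch
; of that estimate:
;
;   estimated trip count ~ (3001 + 1001) / 1001 ~ 4.0   => peel 4 iterations
;   remaining estimate after peeling: 4 - 4 = 0          (matches the
;                                      llvm.loop.estimated_trip_count i32 0 check)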
; CHECK: Loop Unroll: F[basic] @@ -20,11 +20,11 @@ ; CHECK-LABEL: @basic ; CHECK: br i1 %{{.*}}, label %[[NEXT0:.*]], label %for.cond.for.end_crit_edge, !prof !15 ; CHECK: [[NEXT0]]: -; CHECK: br i1 %{{.*}}, label %[[NEXT1:.*]], label %for.cond.for.end_crit_edge, !prof !16 +; CHECK: br i1 %{{.*}}, label %[[NEXT1:.*]], label %for.cond.for.end_crit_edge, !prof !15 ; CHECK: [[NEXT1]]: -; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !17 +; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !15 ; CHECK: [[NEXT2]]: -; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !17 +; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !15, !llvm.loop !16 define void @basic(ptr %p, i32 %k) #0 !prof !15 { entry: @@ -104,6 +104,7 @@ attributes #1 = { nounwind optsize } !16 = !{!"branch_weights", i32 3001, i32 1001} ;CHECK: !15 = !{!"branch_weights", i32 3001, i32 1001} -;CHECK: !16 = !{!"branch_weights", i32 2000, i32 1001} -;CHECK: !17 = !{!"branch_weights", i32 1001, i32 1001} +;CHECK: !16 = distinct !{!16, !17, !18, {{.*}}} +;CHECK: !17 = !{!"llvm.loop.peeled.count", i32 4} +;CHECK: !18 = !{!"llvm.loop.estimated_trip_count", i32 0} diff --git a/llvm/test/Transforms/LoopUnroll/scev-invalidation-lcssa.ll b/llvm/test/Transforms/LoopUnroll/scev-invalidation-lcssa.ll index ec71c67..0a3d201 100644 --- a/llvm/test/Transforms/LoopUnroll/scev-invalidation-lcssa.ll +++ b/llvm/test/Transforms/LoopUnroll/scev-invalidation-lcssa.ll @@ -3,7 +3,7 @@ define i32 @f(i1 %cond1) #0 !prof !0 { ; CHECK-LABEL: define i32 @f -; CHECK-SAME: (i1 [[COND1:%.*]]) !prof [[PROF0:![0-9]+]] { +; CHECK-SAME: (i1 [[COND1:%.*]]) {{.*}}{ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP1_PEEL_BEGIN:%.*]] ; CHECK: loop1.peel.begin: @@ -19,7 +19,7 @@ define i32 @f(i1 %cond1) #0 !prof !0 { ; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop1: ; CHECK-NEXT: [[LD:%.*]] = load i64, ptr null, align 8 -; CHECK-NEXT: br i1 [[COND1]], label [[LOOP1]], label [[EXIT1_LOOPEXIT:%.*]], !prof [[PROF2:![0-9]+]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[COND1]], label [[LOOP1]], label [[EXIT1_LOOPEXIT:%.*]], !prof [[PROF1]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: exit1.loopexit: ; CHECK-NEXT: [[LD_LCSSA_PH:%.*]] = phi i64 [ [[LD]], [[LOOP1]] ] ; CHECK-NEXT: br label [[EXIT1]] diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll new file mode 100644 index 0000000..e97d6e66d --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll @@ -0,0 +1,244 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S %s | FileCheck --check-prefix=VF2IC1 %s +; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S %s | FileCheck --check-prefix=VF2IC2 %s +; RUN: opt -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S %s | FileCheck --check-prefix=VF1IC2 %s + +define i32 @FOR_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) { +; VF2IC1-LABEL: define i32 @FOR_used_outside( +; VF2IC1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { +; VF2IC1-NEXT: 
[[ENTRY:.*]]: +; VF2IC1-NEXT: br label %[[LOOP:.*]] +; VF2IC1: [[LOOP]]: +; VF2IC1-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF2IC1-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ] +; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP10]] = load i32, ptr [[TMP9]], align 4 +; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[FOR]], [[TMP10]] +; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4 +; VF2IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1 +; VF2IC1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF2IC1-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC1: [[FOR_END]]: +; VF2IC1-NEXT: [[TMP32:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] +; VF2IC1-NEXT: ret i32 [[TMP32]] +; +; VF2IC2-LABEL: define i32 @FOR_used_outside( +; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { +; VF2IC2-NEXT: [[ENTRY:.*]]: +; VF2IC2-NEXT: br label %[[LOOP:.*]] +; VF2IC2: [[LOOP]]: +; VF2IC2-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF2IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP23:%.*]], %[[LOOP]] ] +; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP23]] = load i32, ptr [[TMP22]], align 4 +; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[FOR]], [[TMP23]] +; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: store i32 [[TMP47]], ptr [[TMP44]], align 4 +; VF2IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP3]], 1 +; VF2IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF2IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC2: [[FOR_END]]: +; VF2IC2-NEXT: [[TMP66:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] +; VF2IC2-NEXT: ret i32 [[TMP66]] +; +; VF1IC2-LABEL: define i32 @FOR_used_outside( +; VF1IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { +; VF1IC2-NEXT: [[ENTRY:.*]]: +; VF1IC2-NEXT: br label %[[LOOP:.*]] +; VF1IC2: [[LOOP]]: +; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF1IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[LOOP]] ] +; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP7]] = load i32, ptr [[TMP6]], align 4 +; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[FOR]], [[TMP7]] +; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 +; VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF1IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF1IC2: [[FOR_END]]: +; VF1IC2-NEXT: [[TMP30:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] +; VF1IC2-NEXT: ret i32 [[TMP30]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %for = phi i32 [ 33, %entry ], [ %for.next, %loop ] + %gep.A = getelementptr inbounds nuw i32, ptr %A, i64 %iv + %for.next = load i32, ptr %gep.A, align 4 + %add = add nsw i32 %for, %for.next + %gep.B = getelementptr inbounds nuw i32, ptr %B, i64 %iv + store i32 %add, ptr %gep.B, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %for.end, label %loop + +for.end: + ret i32 %for +} + 
+define i32 @FOR_next_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) { +; VF2IC1-LABEL: define i32 @FOR_next_used_outside( +; VF2IC1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { +; VF2IC1-NEXT: [[ENTRY:.*]]: +; VF2IC1-NEXT: br label %[[LOOP:.*]] +; VF2IC1: [[LOOP]]: +; VF2IC1-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF2IC1-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ] +; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP10]] = load i32, ptr [[TMP9]], align 4 +; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[FOR]], [[TMP10]] +; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4 +; VF2IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1 +; VF2IC1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF2IC1-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC1: [[FOR_END]]: +; VF2IC1-NEXT: [[TMP28:%.*]] = phi i32 [ [[TMP10]], %[[LOOP]] ] +; VF2IC1-NEXT: ret i32 [[TMP28]] +; +; VF2IC2-LABEL: define i32 @FOR_next_used_outside( +; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { +; VF2IC2-NEXT: [[ENTRY:.*]]: +; VF2IC2-NEXT: br label %[[LOOP:.*]] +; VF2IC2: [[LOOP]]: +; VF2IC2-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF2IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP23:%.*]], %[[LOOP]] ] +; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP23]] = load i32, ptr [[TMP22]], align 4 +; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[FOR]], [[TMP23]] +; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: store i32 [[TMP47]], ptr [[TMP44]], align 4 +; VF2IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP3]], 1 +; VF2IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF2IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC2: [[FOR_END]]: +; VF2IC2-NEXT: [[TMP62:%.*]] = phi i32 [ [[TMP23]], %[[LOOP]] ] +; VF2IC2-NEXT: ret i32 [[TMP62]] +; +; VF1IC2-LABEL: define i32 @FOR_next_used_outside( +; VF1IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { +; VF1IC2-NEXT: [[ENTRY:.*]]: +; VF1IC2-NEXT: br label %[[LOOP:.*]] +; VF1IC2: [[LOOP]]: +; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF1IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[LOOP]] ] +; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP7]] = load i32, ptr [[TMP6]], align 4 +; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[FOR]], [[TMP7]] +; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 +; VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF1IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF1IC2: [[FOR_END]]: +; VF1IC2-NEXT: [[TMP27:%.*]] = phi i32 [ [[TMP7]], %[[LOOP]] ] +; VF1IC2-NEXT: ret i32 [[TMP27]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %for = phi i32 [ 33, %entry ], [ %for.next, %loop ] + %gep.A = getelementptr inbounds nuw i32, ptr %A, i64 %iv + %for.next = load i32, ptr %gep.A, align 4 + %add = add nsw i32 %for, %for.next 
+ %gep.B = getelementptr inbounds nuw i32, ptr %B, i64 %iv + store i32 %add, ptr %gep.B, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %for.end, label %loop + +for.end: + ret i32 %for.next +} + +define i32 @FOR_and_next_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) { +; VF2IC1-LABEL: define i32 @FOR_and_next_used_outside( +; VF2IC1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { +; VF2IC1-NEXT: [[ENTRY:.*]]: +; VF2IC1-NEXT: br label %[[LOOP:.*]] +; VF2IC1: [[LOOP]]: +; VF2IC1-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF2IC1-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ] +; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP10]] = load i32, ptr [[TMP9]], align 4 +; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[FOR]], [[TMP10]] +; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4 +; VF2IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1 +; VF2IC1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF2IC1-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC1: [[FOR_END]]: +; VF2IC1-NEXT: [[TMP32:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] +; VF2IC1-NEXT: [[TMP33:%.*]] = phi i32 [ [[TMP10]], %[[LOOP]] ] +; VF2IC1-NEXT: [[RES:%.*]] = add i32 [[TMP32]], [[TMP33]] +; VF2IC1-NEXT: ret i32 [[RES]] +; +; VF2IC2-LABEL: define i32 @FOR_and_next_used_outside( +; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { +; VF2IC2-NEXT: [[ENTRY:.*]]: +; VF2IC2-NEXT: br label %[[LOOP:.*]] +; VF2IC2: [[LOOP]]: +; VF2IC2-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF2IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP23:%.*]], %[[LOOP]] ] +; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP23]] = load i32, ptr [[TMP22]], align 4 +; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[FOR]], [[TMP23]] +; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: store i32 [[TMP47]], ptr [[TMP44]], align 4 +; VF2IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP3]], 1 +; VF2IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VF2IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC2: [[FOR_END]]: +; VF2IC2-NEXT: [[TMP66:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] +; VF2IC2-NEXT: [[TMP71:%.*]] = phi i32 [ [[TMP23]], %[[LOOP]] ] +; VF2IC2-NEXT: [[RES:%.*]] = add i32 [[TMP66]], [[TMP71]] +; VF2IC2-NEXT: ret i32 [[RES]] +; +; VF1IC2-LABEL: define i32 @FOR_and_next_used_outside( +; VF1IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { +; VF1IC2-NEXT: [[ENTRY:.*]]: +; VF1IC2-NEXT: br label %[[LOOP:.*]] +; VF1IC2: [[LOOP]]: +; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF1IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[LOOP]] ] +; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP7]] = load i32, ptr [[TMP6]], align 4 +; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[FOR]], [[TMP7]] +; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 +; VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[EC:%.*]] = 
icmp eq i64 [[IV_NEXT]], [[N]] +; VF1IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF1IC2: [[FOR_END]]: +; VF1IC2-NEXT: [[TMP30:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] +; VF1IC2-NEXT: [[TMP33:%.*]] = phi i32 [ [[TMP7]], %[[LOOP]] ] +; VF1IC2-NEXT: [[RES:%.*]] = add i32 [[TMP30]], [[TMP33]] +; VF1IC2-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %for = phi i32 [ 33, %entry ], [ %for.next, %loop ] + %gep.A = getelementptr inbounds nuw i32, ptr %A, i64 %iv + %for.next = load i32, ptr %gep.A, align 4 + %add = add nsw i32 %for, %for.next + %gep.B = getelementptr inbounds nuw i32, ptr %B, i64 %iv + store i32 %add, ptr %gep.B, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %for.end, label %loop + +for.end: + %res = add i32 %for, %for.next + ret i32 %res +} + + diff --git a/llvm/test/Transforms/LoopVectorize/reduction-order.ll b/llvm/test/Transforms/LoopVectorize/reduction-order.ll index b07c3833..b51db48 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-order.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-order.ll @@ -1,63 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 ; RUN: opt -passes='loop-vectorize' -force-vector-width=4 -force-vector-interleave=1 -S < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" ; Make sure the selects generated from reduction are always emitted ; in deterministic order. -; CHECK-LABEL: @foo( -; CHECK: vector.body: -; CHECK: [[VEC_PHI_1:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD_5:%.+]], %vector.body ] -; CHECK: [[VEC_PHI_2:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD_3:%.+]], %vector.body ] -; CHECK: icmp ule <4 x i64> -; CHECK-NEXT: [[ADD_3]] = add <4 x i32> splat (i32 3), [[VEC_PHI_2]] -; CHECK-NEXT: [[ADD_5]] = add <4 x i32> [[VEC_PHI_1]], splat (i32 5) -; CHECK: select <4 x i1> {{.*}}, <4 x i32> [[ADD_5]], <4 x i32> -; CHECK-NEXT: select <4 x i1> {{.*}}, <4 x i32> [[ADD_3]], <4 x i32> -; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body ; -define internal i64 @foo(ptr %t0) !prof !1 { -t16: - br label %t20 - -t17: ; preds = %t20 - %t18 = phi i32 [ %t24, %t20 ] - %t19 = phi i32 [ %t28, %t20 ] - br label %t31 +define i32 @foo() !prof !1 { +; CHECK-LABEL: define i32 @foo() {{.*}}{ +; CHECK-NEXT: [[T16:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI_1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[ADD_5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI_2:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[ADD_3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3> +; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 9) +; CHECK-NEXT: [[ADD_3]] = add <4 x i32> splat (i32 3), [[VEC_PHI_2]] +; CHECK-NEXT: [[ADD_5]] = add <4 x i32> [[VEC_PHI_1]], splat (i32 5) +; 
CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[ADD_5]], <4 x i32> [[VEC_PHI_1]] +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[ADD_3]], <4 x i32> [[VEC_PHI_2]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: ret i32 [[ADD]] +; +entry: + br label %loop -t20: ; preds = %t20, %t16 - %t21 = phi i64 [ 0, %t16 ], [ %t29, %t20 ] - %t22 = phi i32 [ 0, %t16 ], [ %t28, %t20 ] - %t23 = phi i32 [ 0, %t16 ], [ %t24, %t20 ] - %t24 = add i32 3, %t23 - %t28 = add i32 %t22, 5 - %t29 = add nuw nsw i64 %t21, 1 - %t30 = icmp eq i64 %t29, 10 - br i1 %t30, label %t17, label %t20, !prof !2 +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red.1 = phi i32 [ 0, %entry ], [ %red.1.next, %loop ] + %red.2 = phi i32 [ 0, %entry ], [ %red.2.next, %loop ] + %red.2.next = add i32 3, %red.2 + %red.1.next = add i32 %red.1, 5 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 10 + br i1 %ec, label %exit, label %loop, !prof !2 -t31: - ret i64 undef +exit: + %r.2 = phi i32 [ %red.2.next, %loop ] + %r.1 = phi i32 [ %red.1.next, %loop ] + %add = add i32 %r.2, %r.1 + ret i32 %add } ; Make sure we do not fail when checking for ordered reduction. This test just ; exercises the path and bails out without performing vectorization. 
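; As a reminder of why an ordered (in-order) floating-point reduction cannot simply
; be reassociated across vector lanes, a small double-precision example (not taken
; from this test):
;
;   (1e16 + 1.0) + 1.0 = 1e16          ; each +1.0 is absorbed by rounding
;   1e16 + (1.0 + 1.0) = 1e16 + 2.0    ; reassociation changes the result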
-; CHECK-LABEL: quux -; CHECK-NOT: fadd <4 x -define void @quux(i1 %arg) { -bb: +define double @quux(i1 %arg) { +; CHECK-LABEL: define double @quux( +; CHECK-SAME: i1 [[ARG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[HEADER:.*]] +; CHECK: [[HEADER]]: +; CHECK-NEXT: [[TMP5:%.*]] = phi double [ 1.300000e+01, %[[ENTRY]] ], [ [[TMP:%.*]], %[[LATCH:.*]] ] +; CHECK-NEXT: [[TMP6:%.*]] = fadd double [[TMP5]], 1.000000e+00 +; CHECK-NEXT: br label %[[LATCH]] +; CHECK: [[LATCH]]: +; CHECK-NEXT: [[TMP]] = phi double [ [[TMP6]], %[[HEADER]] ] +; CHECK-NEXT: br i1 [[ARG]], label %[[HEADER]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[R:%.*]] = phi double [ [[TMP]], %[[LATCH]] ] +; CHECK-NEXT: ret double [[R]] +; +entry: br label %header -latch: ; preds = %header - %tmp = phi double [ %tmp6, %header ] - br i1 %arg, label %header, label %bb2 - -bb2: ; preds = %latch - %tmp3 = phi double [ %tmp, %latch ] - ret void - -header: ; preds = %latch, %bb - %tmp5 = phi double [ 1.300000e+01, %bb ], [ %tmp, %latch ] +header: + %tmp5 = phi double [ 1.300000e+01, %entry ], [ %tmp, %latch ] %tmp6 = fadd double %tmp5, 1.000000e+00 br label %latch + +latch: + %tmp = phi double [ %tmp6, %header ] + br i1 %arg, label %header, label %exit + +exit: + %r = phi double [ %tmp, %latch ] + ret double %r } !1 = !{!"function_entry_count", i64 801} diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll index 644900d..9620697 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll @@ -500,3 +500,50 @@ exit: %first.addr.0.lcssa.i = phi ptr [ %first, %entry ], [ %iv, %loop.header ], [ %iv.next, %loop.latch ] ret ptr %first.addr.0.lcssa.i } + +define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_size_nofree_via_context(ptr noalias %p1, ptr noalias %p2) nosync { +; CHECK-LABEL: define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_size_nofree_via_context( +; CHECK-SAME: ptr noalias [[P1:%.*]], ptr noalias [[P2:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P1]], i64 4), "dereferenceable"(ptr [[P1]], i64 1024) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P2]], i64 4), "dereferenceable"(ptr [[P2]], i64 1024) ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[LOOP_INC:.*]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX1]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX1]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label %[[LOOP_INC]], label %[[LOOP_END:.*]] +; CHECK: [[LOOP_INC]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX1]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[LOOP_END]] +; CHECK: [[LOOP_END]]: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX1]], %[[LOOP]] ], [ -1, %[[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %p1, i64 4), "dereferenceable"(ptr %p1, 
i64 1024) ] + call void @llvm.assume(i1 true) [ "align"(ptr %p2, i64 4), "dereferenceable"(ptr %p2, i64 1024) ] + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 1024 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ -1, %loop.inc ] + ret i64 %retval +} diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp index 141282e..30f0a8e5 100644 --- a/llvm/unittests/ADT/APFloatTest.cpp +++ b/llvm/unittests/ADT/APFloatTest.cpp @@ -10176,4 +10176,11 @@ TEST(APFloatTest, hasSignBitInMSB) { EXPECT_FALSE(APFloat::hasSignBitInMSB(APFloat::Float8E8M0FNU())); } +TEST(APFloatTest, FrexpQuietSNaN) { + APFloat SNaN = APFloat::getSNaN(APFloat::PPCDoubleDouble()); + int Exp; + APFloat Result = frexp(SNaN, Exp, APFloat::rmNearestTiesToEven); + EXPECT_FALSE(Result.isSignaling()); +} + } // namespace diff --git a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp index d1c0f64..d8457a3 100644 --- a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp +++ b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp @@ -230,8 +230,7 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) CallBase *Call = findCall(*Func, "call"); Trie.buildAndAttachMIBMetadata(Call); - EXPECT_TRUE(Call->hasFnAttr("memprof")); - EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous"); + EXPECT_FALSE(Call->hasFnAttr("memprof")); EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof)); MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof); ASSERT_EQ(MemProfMD->getNumOperands(), 2u); @@ -280,8 +279,7 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) CallBase *Call = findCall(*Func, "call"); Trie.buildAndAttachMIBMetadata(Call); - EXPECT_TRUE(Call->hasFnAttr("memprof")); - EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous"); + EXPECT_FALSE(Call->hasFnAttr("memprof")); EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof)); MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof); ASSERT_EQ(MemProfMD->getNumOperands(), 2u); @@ -335,8 +333,7 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) CallBase *Call = findCall(*Func, "call"); Trie.buildAndAttachMIBMetadata(Call); - EXPECT_TRUE(Call->hasFnAttr("memprof")); - EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous"); + EXPECT_FALSE(Call->hasFnAttr("memprof")); EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof)); MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof); ASSERT_EQ(MemProfMD->getNumOperands(), 2u); @@ -395,8 +392,7 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) CallBase *Call = findCall(*Func, "call"); Trie.buildAndAttachMIBMetadata(Call); - EXPECT_TRUE(Call->hasFnAttr("memprof")); - EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous"); + EXPECT_FALSE(Call->hasFnAttr("memprof")); EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof)); MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof); ASSERT_EQ(MemProfMD->getNumOperands(), 2u); @@ -467,8 +463,7 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) 
ASSERT_NE(Call, nullptr); Trie.buildAndAttachMIBMetadata(Call); - EXPECT_TRUE(Call->hasFnAttr("memprof")); - EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous"); + EXPECT_FALSE(Call->hasFnAttr("memprof")); EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof)); MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof); EXPECT_THAT(MemProfMD, MemprofMetadataEquals(ExpectedVals)); @@ -541,8 +536,7 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) // Restore original option value. MemProfKeepAllNotColdContexts = OrigMemProfKeepAllNotColdContexts; - EXPECT_TRUE(Call->hasFnAttr("memprof")); - EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous"); + EXPECT_FALSE(Call->hasFnAttr("memprof")); EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof)); MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof); EXPECT_THAT(MemProfMD, MemprofMetadataEquals(ExpectedVals)); @@ -670,8 +664,7 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) // The hot allocations will be converted to NotCold and pruned as they // are unnecessary to determine how to clone the cold allocation. - EXPECT_TRUE(Call->hasFnAttr("memprof")); - EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous"); + EXPECT_FALSE(Call->hasFnAttr("memprof")); EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof)); MemProfMD = Call->getMetadata(LLVMContext::MD_memprof); ASSERT_EQ(MemProfMD->getNumOperands(), 2u); diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td index 1eda5e4..8e43c42 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td @@ -996,6 +996,35 @@ class OpenMP_NumTeamsClauseSkip< def OpenMP_NumTeamsClause : OpenMP_NumTeamsClauseSkip<>; //===----------------------------------------------------------------------===// +// V5.1: [10.1.2] `sizes` clause +//===----------------------------------------------------------------------===// + +class OpenMP_SizesClauseSkip< + bit traits = false, bit arguments = false, bit assemblyFormat = false, + bit description = false, bit extraClassDeclaration = false + > : OpenMP_Clause<traits, arguments, assemblyFormat, description, + extraClassDeclaration> { + let arguments = (ins + Variadic<IntLikeType>:$sizes + ); + + let optAssemblyFormat = [{ + `sizes` `(` $sizes `:` type($sizes) `)` + }]; + + let description = [{ + The `sizes` clauses defines the size of a grid over a multi-dimensional + logical iteration space. This grid is used for loop transformations such as + `tile` and `strip`. The size per dimension can be a variable, but only + values that are not at least 2 make sense. It is not specified what happens + when smaller values are used, but should still result in a loop nest that + executes each logical iteration once. 
+ }]; +} + +def OpenMP_SizesClause : OpenMP_SizesClauseSkip<>; + +//===----------------------------------------------------------------------===// // V5.2: [10.1.2] `num_threads` clause //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpBase.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpBase.td index bbcfb87f..5ad4e4b 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpBase.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpBase.td @@ -38,6 +38,44 @@ def OpenMP_MapBoundsType : OpenMP_Type<"MapBounds", "map_bounds_ty"> { let summary = "Type for representing omp map clause bounds information"; } +//===---------------------------------------------------------------------===// +// OpenMP Canonical Loop Info Type +//===---------------------------------------------------------------------===// + +def CanonicalLoopInfoType : OpenMP_Type<"CanonicalLoopInfo", "cli"> { + let summary = "Type for representing a reference to a canonical loop"; + let description = [{ + A variable of type CanonicalLoopInfo refers to an OpenMP-compatible + canonical loop in the same function. Values of this type are not + available at runtime and therefore cannot be used by the program itself, + i.e. an opaque type. It is similar to the transform dialect's + `!transform.interface` type, but instead of implementing an interface + for each transformation, the OpenMP dialect itself defines possible + operations on this type. + + A value of type CanonicalLoopInfoType (in the following: CLI) value can be + + 1. created by omp.new_cli. + 2. passed to omp.canonical_loop to associate the loop to that CLI. A CLI + can only be associated once. + 3. passed to an omp loop transformation operation that modifies the loop + associated with the CLI. The CLI is the "applyee" and the operation is + the consumer. A CLI can only be consumed once. + 4. passed to an omp loop transformation operation to associate the cli with + a result of that transformation. The CLI is the "generatee" and the + operation is the generator. + + A CLI cannot + + 1. be returned from a function. + 2. be passed to operations that are not specifically designed to take a + CanonicalLoopInfoType, including AnyType. + + A CLI directly corresponds to an object of + OpenMPIRBuilder's CanonicalLoopInfo struct when lowering to LLVM-IR. + }]; +} + //===----------------------------------------------------------------------===// // Base classes for OpenMP dialect operations. //===----------------------------------------------------------------------===// @@ -211,8 +249,35 @@ class OpenMP_Op<string mnemonic, list<Trait> traits = [], // Doesn't actually create a C++ base class (only defines default values for // tablegen classes that derive from this). Use LoopTransformationInterface // instead for common operations. -class OpenMPTransform_Op<string mnemonic, list<Trait> traits = []> : - OpenMP_Op<mnemonic, !listconcat([DeclareOpInterfaceMethods<LoopTransformationInterface>], traits) > { +class OpenMPTransform_Op<string mnemonic, + list<Trait> traits = [], + list<OpenMP_Clause> clauses = []> : + OpenMP_Op<mnemonic, + traits = !listconcat([DeclareOpInterfaceMethods<LoopTransformationInterface>], traits), + clauses = clauses> { +} + +// Base clause for loop transformations using the standard syntax. +// +// omp.opname ($generatees) <- ($applyees) clause(...) clause(...) ... <attr-dicr> +// omp.opname ($applyees) clause(...) clause(...) ... 
<attr-dict> +// +// $generatees is optional and is assumed to be empty if omitted +class OpenMPTransformBase_Op<string mnemonic, + list<Trait> traits = [], + list<OpenMP_Clause> clauses = []> : + OpenMPTransform_Op<mnemonic, + traits = !listconcat(traits, [AttrSizedOperandSegments]), + clauses = clauses> { + + let arguments = !con( + (ins Variadic<CanonicalLoopInfoType>:$generatees, + Variadic<CanonicalLoopInfoType>:$applyees + ), clausesArgs); + + let assemblyFormat = [{ custom<LoopTransformClis>($generatees, $applyees) }] + # clausesAssemblyFormat + # [{ attr-dict }]; } #endif // OPENMP_OP_BASE diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 5c77e21..b73091e 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -358,44 +358,6 @@ def SingleOp : OpenMP_Op<"single", traits = [ } //===---------------------------------------------------------------------===// -// OpenMP Canonical Loop Info Type -//===---------------------------------------------------------------------===// - -def CanonicalLoopInfoType : OpenMP_Type<"CanonicalLoopInfo", "cli"> { - let summary = "Type for representing a reference to a canonical loop"; - let description = [{ - A variable of type CanonicalLoopInfo refers to an OpenMP-compatible - canonical loop in the same function. Values of this type are not - available at runtime and therefore cannot be used by the program itself, - i.e. an opaque type. It is similar to the transform dialect's - `!transform.interface` type, but instead of implementing an interface - for each transformation, the OpenMP dialect itself defines possible - operations on this type. - - A value of type CanonicalLoopInfoType (in the following: CLI) value can be - - 1. created by omp.new_cli. - 2. passed to omp.canonical_loop to associate the loop to that CLI. A CLI - can only be associated once. - 3. passed to an omp loop transformation operation that modifies the loop - associated with the CLI. The CLI is the "applyee" and the operation is - the consumer. A CLI can only be consumed once. - 4. passed to an omp loop transformation operation to associate the cli with - a result of that transformation. The CLI is the "generatee" and the - operation is the generator. - - A CLI cannot - - 1. be returned from a function. - 2. be passed to operations that are not specifically designed to take a - CanonicalLoopInfoType, including AnyType. - - A CLI directly corresponds to an object of - OpenMPIRBuilder's CanonicalLoopInfo struct when lowering to LLVM-IR. - }]; -} - -//===---------------------------------------------------------------------===// // OpenMP Canonical Loop Info Creation //===---------------------------------------------------------------------===// @@ -564,6 +526,31 @@ def UnrollHeuristicOp : OpenMPTransform_Op<"unroll_heuristic", []> { } //===----------------------------------------------------------------------===// +// OpenMP tile operation +//===----------------------------------------------------------------------===// + +def TileOp : OpenMPTransformBase_Op<"tile", + clauses = [OpenMP_SizesClause]> { + let summary = "OpenMP tile operation"; + let description = [{ + Represents the OpenMP tile directive introduced in OpenMP 5.1. + + The construct partitions the logical iteration space of the affected loops + into equally-sized tiles, then creates two sets of nested loops. The outer + loops, called the grid loops, iterate over all tiles. 
The inner loops, + called the intratile loops, iterate over the logical iterations of a tile. + The sizes clause determines the size of a tile. + + Currently, the affected loops must be rectangular (the tripcount of the + inner loop must not depend on any iv of an surrounding affected loop) and + perfectly nested (except for the innermost affected loop, no operations + other than the nested loop and the terminator in the loop body). + }] # clausesDescription; + + let hasVerifier = 1; +} + +//===----------------------------------------------------------------------===// // 2.8.3 Workshare Construct //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp b/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp index 7626d35..c64e10f5 100644 --- a/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp @@ -123,7 +123,8 @@ void mlir::arith::populateEmulateUnsupportedFloatsLegality( vector::OuterProductOp, vector::ScanOp>( [&](Operation *op) { return converter.isLegal(op); }); target.addLegalOp<arith::BitcastOp, arith::ExtFOp, arith::TruncFOp, - arith::ConstantOp, vector::SplatOp, vector::BroadcastOp>(); + arith::ConstantOp, arith::SelectOp, vector::SplatOp, + vector::BroadcastOp>(); } void EmulateUnsupportedFloatsPass::runOnOperation() { diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 32ebe06..5672942 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -33,6 +33,7 @@ #include "llvm/ADT/TypeSwitch.h" #include "llvm/ADT/bit.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Support/InterleavedRange.h" #include <cstddef> #include <iterator> #include <optional> @@ -3385,6 +3386,9 @@ void NewCliOp::getAsmResultNames(OpAsmSetValueNameFn setNameFn) { Value result = getResult(); auto [newCli, gen, cons] = decodeCli(result); + // Structured binding `gen` cannot be captured in lambdas before C++20 + OpOperand *generator = gen; + // Derive the CLI variable name from its generator: // * "canonloop" for omp.canonical_loop // * custom name for loop transformation generatees @@ -3403,6 +3407,24 @@ void NewCliOp::getAsmResultNames(OpAsmSetValueNameFn setNameFn) { .Case([&](UnrollHeuristicOp op) -> std::string { llvm_unreachable("heuristic unrolling does not generate a loop"); }) + .Case([&](TileOp op) -> std::string { + auto [generateesFirst, generateesCount] = + op.getGenerateesODSOperandIndexAndLength(); + unsigned firstGrid = generateesFirst; + unsigned firstIntratile = generateesFirst + generateesCount / 2; + unsigned end = generateesFirst + generateesCount; + unsigned opnum = generator->getOperandNumber(); + // In the OpenMP apply and looprange clauses, indices are 1-based + if (firstGrid <= opnum && opnum < firstIntratile) { + unsigned gridnum = opnum - firstGrid + 1; + return ("grid" + Twine(gridnum)).str(); + } + if (firstIntratile <= opnum && opnum < end) { + unsigned intratilenum = opnum - firstIntratile + 1; + return ("intratile" + Twine(intratilenum)).str(); + } + llvm_unreachable("Unexpected generatee argument"); + }) .Default([&](Operation *op) { assert(false && "TODO: Custom name for this operation"); return "transformed"; @@ -3632,6 +3654,138 @@ UnrollHeuristicOp::getGenerateesODSOperandIndexAndLength() { } 
//===----------------------------------------------------------------------===// +// TileOp +//===----------------------------------------------------------------------===// + +static void printLoopTransformClis(OpAsmPrinter &p, TileOp op, + OperandRange generatees, + OperandRange applyees) { + if (!generatees.empty()) + p << '(' << llvm::interleaved(generatees) << ')'; + + if (!applyees.empty()) + p << " <- (" << llvm::interleaved(applyees) << ')'; +} + +static ParseResult parseLoopTransformClis( + OpAsmParser &parser, + SmallVectorImpl<OpAsmParser::UnresolvedOperand> &generateesOperands, + SmallVectorImpl<OpAsmParser::UnresolvedOperand> &applyeesOperands) { + if (parser.parseOptionalLess()) { + // Syntax 1: generatees present + + if (parser.parseOperandList(generateesOperands, + mlir::OpAsmParser::Delimiter::Paren)) + return failure(); + + if (parser.parseLess()) + return failure(); + } else { + // Syntax 2: generatees omitted + } + + // Parse `<-` (`<` has already been parsed) + if (parser.parseMinus()) + return failure(); + + if (parser.parseOperandList(applyeesOperands, + mlir::OpAsmParser::Delimiter::Paren)) + return failure(); + + return success(); +} + +LogicalResult TileOp::verify() { + if (getApplyees().empty()) + return emitOpError() << "must apply to at least one loop"; + + if (getSizes().size() != getApplyees().size()) + return emitOpError() << "there must be one tile size for each applyee"; + + if (!getGeneratees().empty() && + 2 * getSizes().size() != getGeneratees().size()) + return emitOpError() + << "expecting two times the number of generatees than applyees"; + + DenseSet<Value> parentIVs; + + Value parent = getApplyees().front(); + for (auto &&applyee : llvm::drop_begin(getApplyees())) { + auto [parentCreate, parentGen, parentCons] = decodeCli(parent); + auto [create, gen, cons] = decodeCli(applyee); + + if (!parentGen) + return emitOpError() << "applyee CLI has no generator"; + + auto parentLoop = dyn_cast_or_null<CanonicalLoopOp>(parentGen->getOwner()); + if (!parentGen) + return emitOpError() + << "currently only supports omp.canonical_loop as applyee"; + + parentIVs.insert(parentLoop.getInductionVar()); + + if (!gen) + return emitOpError() << "applyee CLI has no generator"; + auto loop = dyn_cast_or_null<CanonicalLoopOp>(gen->getOwner()); + if (!loop) + return emitOpError() + << "currently only supports omp.canonical_loop as applyee"; + + // Canonical loop must be perfectly nested, i.e. the body of the parent must + // only contain the omp.canonical_loop of the nested loops, and + // omp.terminator + bool isPerfectlyNested = [&]() { + auto &parentBody = parentLoop.getRegion(); + if (!parentBody.hasOneBlock()) + return false; + auto &parentBlock = parentBody.getBlocks().front(); + + auto nestedLoopIt = parentBlock.begin(); + if (nestedLoopIt == parentBlock.end() || + (&*nestedLoopIt != loop.getOperation())) + return false; + + auto termIt = std::next(nestedLoopIt); + if (termIt == parentBlock.end() || !isa<TerminatorOp>(termIt)) + return false; + + if (std::next(termIt) != parentBlock.end()) + return false; + + return true; + }(); + if (!isPerfectlyNested) + return emitOpError() << "tiled loop nest must be perfectly nested"; + + if (parentIVs.contains(loop.getTripCount())) + return emitOpError() << "tiled loop nest must be rectangular"; + + parent = applyee; + } + + // TODO: The tile sizes must be computed before the loop, but checking this + // requires dominance analysis. 
For instance: + // + // %canonloop = omp.new_cli + // omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + // // write to %x + // omp.terminator + // } + // %ts = llvm.load %x + // omp.tile <- (%canonloop) sizes(%ts : i32) + + return success(); +} + +std::pair<unsigned, unsigned> TileOp ::getApplyeesODSOperandIndexAndLength() { + return getODSOperandIndexAndLength(odsIndex_applyees); +} + +std::pair<unsigned, unsigned> TileOp::getGenerateesODSOperandIndexAndLength() { + return getODSOperandIndexAndLength(odsIndex_generatees); +} + +//===----------------------------------------------------------------------===// // Critical construct (2.17.1) //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 9413a92..784e5d6 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -824,7 +824,7 @@ struct WgToSgStoreScatterOpWithOffset return failure(); xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getValue()); + xegpu::getDistributeLayoutAttr(op.getOperand(0)); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -844,12 +844,19 @@ struct WgToSgStoreScatterOpWithOffset auto chunkSizeAttr = rewriter.getI64IntegerAttr(chunkSize); for (auto [val, offs, mask] : llvm::zip( adaptor.getValue(), adaptor.getOffsets(), adaptor.getMask())) { - xegpu::StoreScatterOp::create(rewriter, loc, val, op.getDest(), offs, - mask, chunkSizeAttr, op.getL1HintAttr(), - op.getL2HintAttr(), op.getL3HintAttr()); + auto store = xegpu::StoreScatterOp::create( + rewriter, loc, val, op.getDest(), offs, mask, chunkSizeAttr, + op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr()); // Update the layout attribute to drop sg_layout and sg_data. - if (auto newLayout = layout.dropSgLayoutAndData()) - op->setAttr("layout", newLayout); + if (!layout.getEffectiveLaneLayoutAsInt().empty() || + !layout.getEffectiveInstDataAsInt().empty()) { + for (OpOperand &operand : store->getOpOperands()) { + // Skip for operand one (memref) + if (operand.getOperandNumber() == 1) + continue; + xegpu::setDistributeLayoutAttr(operand, layout.dropSgLayoutAndData()); + } + } } rewriter.eraseOp(op); return success(); @@ -1247,10 +1254,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { target.addDynamicallyLegalOp<xegpu::StoreScatterOp>( [=](xegpu::StoreScatterOp op) -> bool { - // Check if the layout attribute is present on the result. 
- auto layout = op->getAttrOfType<xegpu::LayoutAttr>("layout"); - if (!layout) - return true; + auto layout = xegpu::getDistributeLayoutAttr(op.getOperand(0)); return isLegal(layout); }); diff --git a/mlir/lib/Target/IRDLToCpp/IRDLToCpp.cpp b/mlir/lib/Target/IRDLToCpp/IRDLToCpp.cpp index d6b8a8a..e3f075f 100644 --- a/mlir/lib/Target/IRDLToCpp/IRDLToCpp.cpp +++ b/mlir/lib/Target/IRDLToCpp/IRDLToCpp.cpp @@ -54,6 +54,7 @@ struct OpStrings { std::string opCppName; SmallVector<std::string> opResultNames; SmallVector<std::string> opOperandNames; + SmallVector<std::string> opRegionNames; }; static std::string joinNameList(llvm::ArrayRef<std::string> names) { @@ -87,8 +88,8 @@ static TypeStrings getStrings(irdl::TypeOp type) { /// Generates OpStrings from an OperatioOp static OpStrings getStrings(irdl::OperationOp op) { auto operandOp = op.getOp<irdl::OperandsOp>(); - auto resultOp = op.getOp<irdl::ResultsOp>(); + auto regionsOp = op.getOp<irdl::RegionsOp>(); OpStrings strings; strings.opName = op.getSymName(); @@ -108,6 +109,13 @@ static OpStrings getStrings(irdl::OperationOp op) { })); } + if (regionsOp) { + strings.opRegionNames = SmallVector<std::string>( + llvm::map_range(regionsOp->getNames(), [](Attribute attr) { + return llvm::formatv("{0}", cast<StringAttr>(attr)); + })); + } + return strings; } @@ -122,6 +130,7 @@ static void fillDict(irdl::detail::dictionary &dict, static void fillDict(irdl::detail::dictionary &dict, const OpStrings &strings) { const auto operandCount = strings.opOperandNames.size(); const auto resultCount = strings.opResultNames.size(); + const auto regionCount = strings.opRegionNames.size(); dict["OP_NAME"] = strings.opName; dict["OP_CPP_NAME"] = strings.opCppName; @@ -131,6 +140,7 @@ static void fillDict(irdl::detail::dictionary &dict, const OpStrings &strings) { operandCount ? joinNameList(strings.opOperandNames) : "{\"\"}"; dict["OP_RESULT_INITIALIZER_LIST"] = resultCount ? 
joinNameList(strings.opResultNames) : "{\"\"}"; + dict["OP_REGION_COUNT"] = std::to_string(regionCount); } /// Fills a dictionary with values from DialectStrings @@ -179,6 +189,8 @@ static void generateOpGetterDeclarations(irdl::detail::dictionary &dict, const OpStrings &opStrings) { auto opGetters = std::string{}; auto resGetters = std::string{}; + auto regionGetters = std::string{}; + auto regionAdaptorGetters = std::string{}; for (size_t i = 0, end = opStrings.opOperandNames.size(); i < end; ++i) { const auto op = @@ -196,8 +208,23 @@ static void generateOpGetterDeclarations(irdl::detail::dictionary &dict, op, i); } + for (size_t i = 0, end = opStrings.opRegionNames.size(); i < end; ++i) { + const auto op = + llvm::convertToCamelFromSnakeCase(opStrings.opRegionNames[i], true); + regionAdaptorGetters += llvm::formatv( + R"(::mlir::Region &get{0}() { return *getRegions()[{1}]; } + )", + op, i); + regionGetters += llvm::formatv( + R"(::mlir::Region &get{0}() { return (*this)->getRegion({1}); } + )", + op, i); + } + dict["OP_OPERAND_GETTER_DECLS"] = opGetters; dict["OP_RESULT_GETTER_DECLS"] = resGetters; + dict["OP_REGION_ADAPTER_GETTER_DECLS"] = regionAdaptorGetters; + dict["OP_REGION_GETTER_DECLS"] = regionGetters; } static void generateOpBuilderDeclarations(irdl::detail::dictionary &dict, @@ -238,6 +265,22 @@ static void generateOpBuilderDeclarations(irdl::detail::dictionary &dict, dict["OP_BUILD_DECLS"] = buildDecls; } +// add traits to the dictionary, return true if any were added +static SmallVector<std::string> generateTraits(irdl::OperationOp op, + const OpStrings &strings) { + SmallVector<std::string> cppTraitNames; + if (!strings.opRegionNames.empty()) { + cppTraitNames.push_back( + llvm::formatv("::mlir::OpTrait::NRegions<{0}>::Impl", + strings.opRegionNames.size()) + .str()); + + // Requires verifyInvariantsImpl is implemented on the op + cppTraitNames.emplace_back("::mlir::OpTrait::OpInvariants"); + } + return cppTraitNames; +} + static LogicalResult generateOperationInclude(irdl::OperationOp op, raw_ostream &output, irdl::detail::dictionary &dict) { @@ -247,6 +290,13 @@ static LogicalResult generateOperationInclude(irdl::OperationOp op, const auto opStrings = getStrings(op); fillDict(dict, opStrings); + SmallVector<std::string> traitNames = generateTraits(op, opStrings); + if (traitNames.empty()) + dict["OP_TEMPLATE_ARGS"] = opStrings.opCppName; + else + dict["OP_TEMPLATE_ARGS"] = llvm::formatv("{0}, {1}", opStrings.opCppName, + llvm::join(traitNames, ", ")); + generateOpGetterDeclarations(dict, opStrings); generateOpBuilderDeclarations(dict, opStrings); @@ -301,6 +351,110 @@ static LogicalResult generateInclude(irdl::DialectOp dialect, return success(); } +static void generateRegionConstraintVerifiers( + irdl::detail::dictionary &dict, irdl::OperationOp op, + const OpStrings &strings, SmallVectorImpl<std::string> &verifierHelpers, + SmallVectorImpl<std::string> &verifierCalls) { + auto regionsOp = op.getOp<irdl::RegionsOp>(); + if (strings.opRegionNames.empty() || !regionsOp) + return; + + for (size_t i = 0; i < strings.opRegionNames.size(); ++i) { + std::string regionName = strings.opRegionNames[i]; + std::string helperFnName = + llvm::formatv("__mlir_irdl_local_region_constraint_{0}_{1}", + strings.opCppName, regionName) + .str(); + + // Extract the actual region constraint from the IRDL RegionOp + std::string condition = "true"; + std::string textualConditionName = "any region"; + + if (auto regionDefOp = + dyn_cast<irdl::RegionOp>(regionsOp->getArgs()[i].getDefiningOp())) 
{ + // Generate constraint condition based on RegionOp attributes + SmallVector<std::string> conditionParts; + SmallVector<std::string> descriptionParts; + + // Check number of blocks constraint + if (auto blockCount = regionDefOp.getNumberOfBlocks()) { + conditionParts.push_back( + llvm::formatv("region.getBlocks().size() == {0}", + blockCount.value()) + .str()); + descriptionParts.push_back( + llvm::formatv("exactly {0} block(s)", blockCount.value()).str()); + } + + // Check entry block arguments constraint + if (regionDefOp.getConstrainedArguments()) { + size_t expectedArgCount = regionDefOp.getEntryBlockArgs().size(); + conditionParts.push_back( + llvm::formatv("region.getNumArguments() == {0}", expectedArgCount) + .str()); + descriptionParts.push_back( + llvm::formatv("{0} entry block argument(s)", expectedArgCount) + .str()); + } + + // Combine conditions + if (!conditionParts.empty()) { + condition = llvm::join(conditionParts, " && "); + } + + // Generate descriptive error message + if (!descriptionParts.empty()) { + textualConditionName = + llvm::formatv("region with {0}", + llvm::join(descriptionParts, " and ")) + .str(); + } + } + + verifierHelpers.push_back(llvm::formatv( + R"(static ::llvm::LogicalResult {0}(::mlir::Operation *op, ::mlir::Region ®ion, ::llvm::StringRef regionName, unsigned regionIndex) {{ + if (!({1})) {{ + return op->emitOpError("region #") << regionIndex + << (regionName.empty() ? " " : " ('" + regionName + "') ") + << "failed to verify constraint: {2}"; + } + return ::mlir::success(); +})", + helperFnName, condition, textualConditionName)); + + verifierCalls.push_back(llvm::formatv(R"( + if (::mlir::failed({0}(*this, (*this)->getRegion({1}), "{2}", {1}))) + return ::mlir::failure();)", + helperFnName, i, regionName) + .str()); + } +} + +static void generateVerifiers(irdl::detail::dictionary &dict, + irdl::OperationOp op, const OpStrings &strings) { + SmallVector<std::string> verifierHelpers; + SmallVector<std::string> verifierCalls; + + generateRegionConstraintVerifiers(dict, op, strings, verifierHelpers, + verifierCalls); + + // Add an overall verifier that sequences the helper calls + std::string verifierDef = + llvm::formatv(R"( +::llvm::LogicalResult {0}::verifyInvariantsImpl() {{ + if(::mlir::failed(verify())) + return ::mlir::failure(); + + {1} + + return ::mlir::success(); +})", + strings.opCppName, llvm::join(verifierCalls, "\n")); + + dict["OP_VERIFIER_HELPERS"] = llvm::join(verifierHelpers, "\n"); + dict["OP_VERIFIER"] = verifierDef; +} + static std::string generateOpDefinition(irdl::detail::dictionary &dict, irdl::OperationOp op) { static const auto perOpDefTemplate = mlir::irdl::detail::Template{ @@ -370,6 +524,8 @@ void {0}::build(::mlir::OpBuilder &opBuilder, ::mlir::OperationState &opState, { dict["OP_BUILD_DEFS"] = buildDefinition; + generateVerifiers(dict, op, opStrings); + std::string str; llvm::raw_string_ostream stream{str}; perOpDefTemplate.render(stream, dict); @@ -427,7 +583,7 @@ static LogicalResult generateLib(irdl::DialectOp dialect, raw_ostream &output, dict["TYPE_PARSER"] = llvm::formatv( R"(static ::mlir::OptionalParseResult generatedTypeParser(::mlir::AsmParser &parser, ::llvm::StringRef *mnemonic, ::mlir::Type &value) { return ::mlir::AsmParser::KeywordSwitch<::mlir::OptionalParseResult>(parser) - {0} + {0} .Default([&](llvm::StringRef keyword, llvm::SMLoc) {{ *mnemonic = keyword; return std::nullopt; @@ -520,6 +676,8 @@ static LogicalResult verifySupported(irdl::DialectOp dialect) { "IRDL C++ translation does not yet support 
variadic results"); })) .Case<irdl::AnyOp>(([](irdl::AnyOp) { return success(); })) + .Case<irdl::RegionOp>(([](irdl::RegionOp) { return success(); })) + .Case<irdl::RegionsOp>(([](irdl::RegionsOp) { return success(); })) .Default([](mlir::Operation *op) -> LogicalResult { return op->emitError("IRDL C++ translation does not yet support " "translation of ") diff --git a/mlir/lib/Target/IRDLToCpp/Templates/PerOperationDecl.txt b/mlir/lib/Target/IRDLToCpp/Templates/PerOperationDecl.txt index e9068e9..93ce0be 100644 --- a/mlir/lib/Target/IRDLToCpp/Templates/PerOperationDecl.txt +++ b/mlir/lib/Target/IRDLToCpp/Templates/PerOperationDecl.txt @@ -12,15 +12,15 @@ public: struct Properties { }; public: - __OP_CPP_NAME__GenericAdaptorBase(::mlir::Operation *op) - : odsAttrs(op->getRawDictionaryAttrs()), odsOpName(op->getName()), - odsRegions(op->getRegions()) + __OP_CPP_NAME__GenericAdaptorBase(::mlir::Operation *op) + : odsAttrs(op->getRawDictionaryAttrs()), odsOpName(op->getName()), + odsRegions(op->getRegions()) {} /// Return the unstructured operand index of a structured operand along with // the amount of unstructured operands it contains. std::pair<unsigned, unsigned> - getStructuredOperandIndexAndLength (unsigned index, + getStructuredOperandIndexAndLength (unsigned index, unsigned odsOperandsSize) { return {index, 1}; } @@ -32,6 +32,12 @@ public: ::mlir::DictionaryAttr getAttributes() { return odsAttrs; } + + __OP_REGION_ADAPTER_GETTER_DECLS__ + + ::mlir::RegionRange getRegions() { + return odsRegions; + } protected: ::mlir::DictionaryAttr odsAttrs; ::std::optional<::mlir::OperationName> odsOpName; @@ -42,28 +48,28 @@ protected: } // namespace detail template <typename RangeT> -class __OP_CPP_NAME__GenericAdaptor +class __OP_CPP_NAME__GenericAdaptor : public detail::__OP_CPP_NAME__GenericAdaptorBase { using ValueT = ::llvm::detail::ValueOfRange<RangeT>; using Base = detail::__OP_CPP_NAME__GenericAdaptorBase; public: __OP_CPP_NAME__GenericAdaptor(RangeT values, ::mlir::DictionaryAttr attrs, - ::mlir::OpaqueProperties properties, - ::mlir::RegionRange regions = {}) - : __OP_CPP_NAME__GenericAdaptor(values, attrs, - (properties ? *properties.as<::mlir::EmptyProperties *>() + ::mlir::OpaqueProperties properties, + ::mlir::RegionRange regions = {}) + : __OP_CPP_NAME__GenericAdaptor(values, attrs, + (properties ? *properties.as<::mlir::EmptyProperties *>() : ::mlir::EmptyProperties{}), regions) {} - __OP_CPP_NAME__GenericAdaptor(RangeT values, + __OP_CPP_NAME__GenericAdaptor(RangeT values, const __OP_CPP_NAME__GenericAdaptorBase &base) : Base(base), odsOperands(values) {} - // This template parameter allows using __OP_CPP_NAME__ which is declared + // This template parameter allows using __OP_CPP_NAME__ which is declared // later. 
template <typename LateInst = __OP_CPP_NAME__, typename = std::enable_if_t< std::is_same_v<LateInst, __OP_CPP_NAME__>>> - __OP_CPP_NAME__GenericAdaptor(RangeT values, LateInst op) + __OP_CPP_NAME__GenericAdaptor(RangeT values, LateInst op) : Base(op), odsOperands(values) {} /// Return the unstructured operand index of a structured operand along with @@ -77,7 +83,7 @@ public: RangeT getStructuredOperands(unsigned index) { auto valueRange = getStructuredOperandIndexAndLength(index); return {std::next(odsOperands.begin(), valueRange.first), - std::next(odsOperands.begin(), + std::next(odsOperands.begin(), valueRange.first + valueRange.second)}; } @@ -91,7 +97,7 @@ private: RangeT odsOperands; }; -class __OP_CPP_NAME__Adaptor +class __OP_CPP_NAME__Adaptor : public __OP_CPP_NAME__GenericAdaptor<::mlir::ValueRange> { public: using __OP_CPP_NAME__GenericAdaptor::__OP_CPP_NAME__GenericAdaptor; @@ -100,7 +106,7 @@ public: ::llvm::LogicalResult verify(::mlir::Location loc); }; -class __OP_CPP_NAME__ : public ::mlir::Op<__OP_CPP_NAME__> { +class __OP_CPP_NAME__ : public ::mlir::Op<__OP_TEMPLATE_ARGS__> { public: using Op::Op; using Op::print; @@ -112,6 +118,8 @@ public: return {}; } + ::llvm::LogicalResult verifyInvariantsImpl(); + static constexpr ::llvm::StringLiteral getOperationName() { return ::llvm::StringLiteral("__DIALECT_NAME__.__OP_NAME__"); } @@ -147,7 +155,7 @@ public: ::mlir::Operation::operand_range getStructuredOperands(unsigned index) { auto valueRange = getStructuredOperandIndexAndLength(index); return {std::next(getOperation()->operand_begin(), valueRange.first), - std::next(getOperation()->operand_begin(), + std::next(getOperation()->operand_begin(), valueRange.first + valueRange.second)}; } @@ -162,18 +170,19 @@ public: ::mlir::Operation::result_range getStructuredResults(unsigned index) { auto valueRange = getStructuredResultIndexAndLength(index); return {std::next(getOperation()->result_begin(), valueRange.first), - std::next(getOperation()->result_begin(), + std::next(getOperation()->result_begin(), valueRange.first + valueRange.second)}; } __OP_OPERAND_GETTER_DECLS__ __OP_RESULT_GETTER_DECLS__ - + __OP_REGION_GETTER_DECLS__ + __OP_BUILD_DECLS__ - static void build(::mlir::OpBuilder &odsBuilder, - ::mlir::OperationState &odsState, - ::mlir::TypeRange resultTypes, - ::mlir::ValueRange operands, + static void build(::mlir::OpBuilder &odsBuilder, + ::mlir::OperationState &odsState, + ::mlir::TypeRange resultTypes, + ::mlir::ValueRange operands, ::llvm::ArrayRef<::mlir::NamedAttribute> attributes = {}); static __OP_CPP_NAME__ create(::mlir::OpBuilder &odsBuilder, diff --git a/mlir/lib/Target/IRDLToCpp/Templates/PerOperationDef.txt b/mlir/lib/Target/IRDLToCpp/Templates/PerOperationDef.txt index 30ca420..f4a1b7a 100644 --- a/mlir/lib/Target/IRDLToCpp/Templates/PerOperationDef.txt +++ b/mlir/lib/Target/IRDLToCpp/Templates/PerOperationDef.txt @@ -6,12 +6,14 @@ R"( __NAMESPACE_OPEN__ +__OP_VERIFIER_HELPERS__ + __OP_BUILD_DEFS__ -void __OP_CPP_NAME__::build(::mlir::OpBuilder &odsBuilder, - ::mlir::OperationState &odsState, - ::mlir::TypeRange resultTypes, - ::mlir::ValueRange operands, +void __OP_CPP_NAME__::build(::mlir::OpBuilder &odsBuilder, + ::mlir::OperationState &odsState, + ::mlir::TypeRange resultTypes, + ::mlir::ValueRange operands, ::llvm::ArrayRef<::mlir::NamedAttribute> attributes) { assert(operands.size() == __OP_OPERAND_COUNT__); @@ -19,6 +21,9 @@ void __OP_CPP_NAME__::build(::mlir::OpBuilder &odsBuilder, odsState.addOperands(operands); odsState.addAttributes(attributes); 
odsState.addTypes(resultTypes); + for (unsigned i = 0; i != __OP_REGION_COUNT__; ++i) { + (void)odsState.addRegion(); + } } __OP_CPP_NAME__ @@ -44,6 +49,7 @@ __OP_CPP_NAME__::create(::mlir::ImplicitLocOpBuilder &odsBuilder, return create(odsBuilder, odsBuilder.getLoc(), resultTypes, operands, attributes); } +__OP_VERIFIER__ __NAMESPACE_CLOSE__ diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 53209a4..9fcb02e 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -3175,6 +3175,45 @@ applyUnrollHeuristic(omp::UnrollHeuristicOp op, llvm::IRBuilderBase &builder, return success(); } +/// Apply a `#pragma omp tile` / `!$omp tile` transformation using the +/// OpenMPIRBuilder. +static LogicalResult applyTile(omp::TileOp op, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + llvm::OpenMPIRBuilder::LocationDescription loc(builder); + + SmallVector<llvm::CanonicalLoopInfo *> translatedLoops; + SmallVector<llvm::Value *> translatedSizes; + + for (Value size : op.getSizes()) { + llvm::Value *translatedSize = moduleTranslation.lookupValue(size); + assert(translatedSize && + "sizes clause arguments must already be translated"); + translatedSizes.push_back(translatedSize); + } + + for (Value applyee : op.getApplyees()) { + llvm::CanonicalLoopInfo *consBuilderCLI = + moduleTranslation.lookupOMPLoop(applyee); + assert(applyee && "Canonical loop must already been translated"); + translatedLoops.push_back(consBuilderCLI); + } + + auto generatedLoops = + ompBuilder->tileLoops(loc.DL, translatedLoops, translatedSizes); + if (!op.getGeneratees().empty()) { + for (auto [mlirLoop, genLoop] : + zip_equal(op.getGeneratees(), generatedLoops)) + moduleTranslation.mapOmpLoop(mlirLoop, genLoop); + } + + // CLIs can only be consumed once + for (Value applyee : op.getApplyees()) + moduleTranslation.invalidateOmpLoop(applyee); + + return success(); +} + /// Convert an Atomic Ordering attribute to llvm::AtomicOrdering. static llvm::AtomicOrdering convertAtomicOrdering(std::optional<omp::ClauseMemoryOrderKind> ao) { @@ -6227,6 +6266,9 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder, // the omp.canonical_loop. 
return applyUnrollHeuristic(op, builder, moduleTranslation); }) + .Case([&](omp::TileOp op) { + return applyTile(op, builder, moduleTranslation); + }) .Case([&](omp::TargetAllocMemOp) { return convertTargetAllocMemOp(*op, builder, moduleTranslation); }) diff --git a/mlir/test/Dialect/Arith/emulate-unsupported-floats.mlir b/mlir/test/Dialect/Arith/emulate-unsupported-floats.mlir index 99790cc..fcd004a 100644 --- a/mlir/test/Dialect/Arith/emulate-unsupported-floats.mlir +++ b/mlir/test/Dialect/Arith/emulate-unsupported-floats.mlir @@ -85,3 +85,14 @@ func.func @no_expansion(%x: f32) -> f32 { %y = arith.addf %x, %c : f32 func.return %y : f32 } + +// ----- + +func.func @no_promote_select(%c: i1, %x: bf16, %y: bf16) -> bf16 { +// CHECK-LABEL: @no_promote_select +// CHECK-SAME: (%[[C:.+]]: i1, %[[X:.+]]: bf16, %[[Y:.+]]: bf16) +// CHECK: %[[Z:.+]] = arith.select %[[C]], %[[X]], %[[Y]] : bf16 +// CHECK: return %[[Z]] + %z = arith.select %c, %x, %y : bf16 + func.return %z : bf16 +} diff --git a/mlir/test/Dialect/OpenMP/cli-tile.mlir b/mlir/test/Dialect/OpenMP/cli-tile.mlir new file mode 100644 index 0000000..73d5478 --- /dev/null +++ b/mlir/test/Dialect/OpenMP/cli-tile.mlir @@ -0,0 +1,138 @@ +// RUN: mlir-opt %s | FileCheck %s --enable-var-scope +// RUN: mlir-opt %s | mlir-opt | FileCheck %s --enable-var-scope + + +// Raw syntax check (MLIR output is always pretty-printed) +// CHECK-LABEL: @omp_tile_raw( +// CHECK-SAME: %[[tc:.+]]: i32, %[[ts:.+]]: i32) { +func.func @omp_tile_raw(%tc : i32, %ts : i32) -> () { + // CHECK-NEXT: %canonloop = omp.new_cli + %canonloop = "omp.new_cli" () : () -> (!omp.cli) + // CHECK-NEXT: %grid1 = omp.new_cli + %grid = "omp.new_cli" () : () -> (!omp.cli) + // CHECK-NEXT: %intratile1 = omp.new_cli + %intratile = "omp.new_cli" () : () -> (!omp.cli) + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { + "omp.canonical_loop" (%tc, %canonloop) ({ + ^bb0(%iv: i32): + // CHECK: omp.terminator + omp.terminator + }) : (i32, !omp.cli) -> () + // CHECK: omp.tile (%grid1, %intratile1) <- (%canonloop) sizes(%[[ts]] : i32) + "omp.tile"(%grid, %intratile, %canonloop, %ts) <{operandSegmentSizes = array<i32: 2, 1, 1>}> : (!omp.cli, !omp.cli, !omp.cli, i32) -> () + //"omp.tile" (%canonloop) : (!omp.cli) -> () + return +} + + +// Pretty syntax check +// CHECK-LABEL: @omp_tile_pretty( +// CHECK-SAME: %[[tc:.+]]: i32, %[[ts:.+]]: i32) { +func.func @omp_tile_pretty(%tc : i32, %ts : i32) -> () { + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %canonloop = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %grid = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %intratile = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.tile (%grid1, %intratile1) <- (%canonloop) sizes(%[[ts]] : i32) + omp.tile(%grid, %intratile) <- (%canonloop) sizes(%ts : i32) + return +} + + +// Specifying the generatees for omp.tile is optional +// CHECK-LABEL: @omp_tile_optionalgen_pretty( +// CHECK-SAME: %[[tc:.+]]: i32, %[[ts:.+]]: i32) { +func.func @omp_tile_optionalgen_pretty(%tc : i32, %ts : i32) -> () { + // CHECK-NEXT: %canonloop = omp.new_cli + %canonloop = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.tile <- (%canonloop) 
sizes(%[[ts]] : i32) + omp.tile <- (%canonloop) sizes(%ts : i32) + return +} + + +// Two-dimensional tiling +// CHECK-LABEL: @omp_tile_2d_pretty( +// CHECK-SAME: %[[tc1:.+]]: i32, %[[tc2:.+]]: i32, %[[ts1:.+]]: i32, %[[ts2:.+]]: i32) { +func.func @omp_tile_2d_pretty(%tc1 : i32, %tc2 : i32, %ts1 : i32, %ts2 : i32) -> () { + // CHECK-NEXT: %canonloop = omp.new_cli + %cli_outer = omp.new_cli + // CHECK-NEXT: %canonloop_d1 = omp.new_cli + %cli_inner = omp.new_cli + // CHECK-NEXT: %grid1 = omp.new_cli + %grid1 = omp.new_cli + // CHECK-NEXT: %grid2 = omp.new_cli + %grid2 = omp.new_cli + // CHECK-NEXT: %intratile1 = omp.new_cli + %intratile1 = omp.new_cli + // CHECK-NEXT: %intratile2 = omp.new_cli + %intratile2 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc1]]) { + omp.canonical_loop(%cli_outer) %iv_outer : i32 in range(%tc1) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%[[tc2]]) { + omp.canonical_loop(%cli_inner) %iv_inner : i32 in range(%tc2) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.tile (%grid1, %grid2, %intratile1, %intratile2) <- (%canonloop, %canonloop_d1) sizes(%[[ts1]], %[[ts2]] : i32, i32) + omp.tile (%grid1, %grid2, %intratile1, %intratile2) <- (%cli_outer, %cli_inner) sizes(%ts1, %ts2 : i32, i32) + return +} + + +// Three-dimensional tiling +// CHECK-LABEL: @omp_tile_3d_pretty( +// CHECK-SAME: %[[tc:.+]]: i32, %[[ts:.+]]: i32) { +func.func @omp_tile_3d_pretty(%tc : i32, %ts : i32) -> () { + // CHECK-NEXT: %canonloop = omp.new_cli + %cli_outer = omp.new_cli + // CHECK-NEXT: %canonloop_d1 = omp.new_cli + %cli_middle = omp.new_cli + // CHECK-NEXT: %canonloop_d2 = omp.new_cli + %cli_inner = omp.new_cli + // CHECK-NEXT: %grid1 = omp.new_cli + %grid1 = omp.new_cli + // CHECK-NEXT: %grid2 = omp.new_cli + %grid2 = omp.new_cli + // CHECK-NEXT: %grid3 = omp.new_cli + %grid3 = omp.new_cli + // CHECK-NEXT: %intratile1 = omp.new_cli + %intratile1 = omp.new_cli + // CHECK-NEXT: %intratile2 = omp.new_cli + %intratile2 = omp.new_cli + // CHECK-NEXT: %intratile3 = omp.new_cli + %intratile3 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { + omp.canonical_loop(%cli_outer) %iv_outer : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%[[tc]]) { + omp.canonical_loop(%cli_middle) %iv_middle : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d2) %iv_d2 : i32 in range(%[[tc]]) { + omp.canonical_loop(%cli_inner) %iv_inner : i32 in range(%tc) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.tile (%grid1, %grid2, %grid3, %intratile1, %intratile2, %intratile3) <- (%canonloop, %canonloop_d1, %canonloop_d2) sizes(%[[ts]], %[[ts]], %[[ts]] : i32, i32, i32) + omp.tile (%grid1, %grid2, %grid3, %intratile1, %intratile2, %intratile3) <- (%cli_outer, %cli_middle, %cli_inner) sizes(%ts, %ts, %ts: i32, i32, i32) + return +} diff --git a/mlir/test/Dialect/OpenMP/invalid-tile.mlir b/mlir/test/Dialect/OpenMP/invalid-tile.mlir new file mode 100644 index 0000000..e63a062 --- /dev/null +++ b/mlir/test/Dialect/OpenMP/invalid-tile.mlir @@ -0,0 +1,119 @@ +// RUN: mlir-opt -split-input-file -verify-diagnostics %s + + +func.func @missing_sizes(%tc : i32, %ts : i32) { + %canonloop = omp.new_cli + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + omp.terminator + } + + // 
expected-error@+1 {{'omp.tile' op there must be one tile size for each applyee}} + omp.tile <-(%canonloop) + + llvm.return +} + +// ----- + +func.func @no_loop(%tc : i32, %ts : i32) { + // expected-error@+1 {{'omp.tile' op must apply to at least one loop}} + omp.tile <-() + + return +} + +// ----- + +func.func @missing_generator(%tc : i32, %ts : i32) { + // expected-error@+1 {{'omp.new_cli' op CLI has no generator}} + %canonloop = omp.new_cli + + // expected-note@+1 {{see consumer here: "omp.tile"(%0, %arg1) <{operandSegmentSizes = array<i32: 0, 1, 1>}> : (!omp.cli, i32) -> ()}} + omp.tile <-(%canonloop) sizes(%ts : i32) + + return +} + +// ----- + +func.func @insufficient_sizes(%tc : i32, %ts : i32) { + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + omp.canonical_loop(%canonloop1) %iv : i32 in range(%tc) { + omp.terminator + } + omp.canonical_loop(%canonloop2) %iv : i32 in range(%tc) { + omp.terminator + } + + // expected-error@+1 {{'omp.tile' op there must be one tile size for each applyee}} + omp.tile <-(%canonloop1, %canonloop2) sizes(%ts : i32) + + llvm.return +} + +// ----- + +func.func @insufficient_applyees(%tc : i32, %ts : i32) { + %canonloop = omp.new_cli + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + omp.terminator + } + + // expected-error@+1 {{omp.tile' op there must be one tile size for each applyee}} + omp.tile <- (%canonloop) sizes(%ts, %ts : i32, i32) + + return +} + +// ----- + +func.func @insufficient_generatees(%tc : i32, %ts : i32) { + %canonloop = omp.new_cli + %grid = omp.new_cli + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + omp.terminator + } + + // expected-error@+1 {{'omp.tile' op expecting two times the number of generatees than applyees}} + omp.tile (%grid) <- (%canonloop) sizes(%ts : i32) + + return +} + +// ----- + +func.func @not_perfectly_nested(%tc : i32, %ts : i32) { + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + omp.canonical_loop(%canonloop1) %iv1 : i32 in range(%tc) { + %v = arith.constant 42 : i32 + omp.canonical_loop(%canonloop2) %iv2 : i32 in range(%tc) { + omp.terminator + } + omp.terminator + } + + // expected-error@+1 {{'omp.tile' op tiled loop nest must be perfectly nested}} + omp.tile <-(%canonloop1, %canonloop2) sizes(%ts, %ts : i32, i32) + + llvm.return +} + +// ----- + +func.func @non_nectangular(%tc : i32, %ts : i32) { + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + omp.canonical_loop(%canonloop1) %iv1 : i32 in range(%tc) { + omp.canonical_loop(%canonloop2) %iv2 : i32 in range(%iv1) { + omp.terminator + } + omp.terminator + } + + // expected-error@+1 {{'omp.tile' op tiled loop nest must be rectangular}} + omp.tile <-(%canonloop1, %canonloop2) sizes(%ts, %ts : i32, i32) + + llvm.return +} diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 03c6386..38392fd 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -282,15 +282,20 @@ gpu.module @test_distribution { // CHECK-LABEL: @store_scatter // CHECK-SAME: %[[ARG0:.*]]: memref<256xf16> gpu.func @store_scatter(%dest : memref<256xf16>) { - // CHECK: %[[VAL:.*]] = arith.constant dense<2.550000e+01> : vector<8xf16> - // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xindex> - // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8xi1> + // CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<2.550000e+01> : vector<8xf16> + // CHECK: 
%[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<0> : vector<8xindex> + // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<true> : vector<8xi1> // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> + // CHECK-SAME: {layout_operand_0 = #xegpu.layout<inst_data = [8]>, layout_operand_2 = #xegpu.layout<inst_data = [8]>, + // CHECK-SAME: layout_operand_3 = #xegpu.layout<inst_data = [8]>} // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> - %val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<25.5> : vector<256xf16> - %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<0> : vector<256xindex> - %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<1> : vector<256xi1> - xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [32], sg_data = [8]>, l1_hint = #xegpu.cache_hint<cached>} + %val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<25.5> : vector<256xf16> + %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<0> : vector<256xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<1> : vector<256xi1> + xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout_operand_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>, + layout_operand_2 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>, + layout_operand_3 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>, + l1_hint = #xegpu.cache_hint<cached>} : vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1> gpu.return } diff --git a/mlir/test/Target/LLVMIR/openmp-cli-tile01.mlir b/mlir/test/Target/LLVMIR/openmp-cli-tile01.mlir new file mode 100644 index 0000000..4ac4f02 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-cli-tile01.mlir @@ -0,0 +1,101 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + + + +llvm.func @tile_trivial_loop(%baseptr: !llvm.ptr, %tc: i32, %ts: i32) -> () { + %literal_cli = omp.new_cli + omp.canonical_loop(%literal_cli) %iv : i32 in range(%tc) { + %ptr = llvm.getelementptr inbounds %baseptr[%iv] : (!llvm.ptr, i32) -> !llvm.ptr, f32 + %val = llvm.mlir.constant(42.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + omp.tile <- (%literal_cli) sizes(%ts : i32) + llvm.return +} + + +// CHECK: ; ModuleID = 'LLVMDialectModule' +// CHECK-NEXT: source_filename = "LLVMDialectModule" +// CHECK-EMPTY: +// CHECK-NEXT: define void @tile_trivial_loop(ptr %0, i32 %1, i32 %2) { +// CHECK-NEXT: br label %omp_omp.loop.preheader +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.preheader: ; preds = %3 +// CHECK-NEXT: %4 = udiv i32 %1, %2 +// CHECK-NEXT: %5 = urem i32 %1, %2 +// CHECK-NEXT: %6 = icmp ne i32 %5, 0 +// CHECK-NEXT: %7 = zext i1 %6 to i32 +// CHECK-NEXT: %omp_floor0.tripcount = add nuw i32 %4, %7 +// CHECK-NEXT: br label %omp_floor0.preheader +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor0.preheader: ; preds = %omp_omp.loop.preheader +// CHECK-NEXT: br label %omp_floor0.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor0.header: ; preds = %omp_floor0.inc, %omp_floor0.preheader +// CHECK-NEXT: %omp_floor0.iv = phi 
i32 [ 0, %omp_floor0.preheader ], [ %omp_floor0.next, %omp_floor0.inc ] +// CHECK-NEXT: br label %omp_floor0.cond +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor0.cond: ; preds = %omp_floor0.header +// CHECK-NEXT: %omp_floor0.cmp = icmp ult i32 %omp_floor0.iv, %omp_floor0.tripcount +// CHECK-NEXT: br i1 %omp_floor0.cmp, label %omp_floor0.body, label %omp_floor0.exit +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor0.body: ; preds = %omp_floor0.cond +// CHECK-NEXT: %8 = icmp eq i32 %omp_floor0.iv, %4 +// CHECK-NEXT: %9 = select i1 %8, i32 %5, i32 %2 +// CHECK-NEXT: br label %omp_tile0.preheader +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile0.preheader: ; preds = %omp_floor0.body +// CHECK-NEXT: br label %omp_tile0.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile0.header: ; preds = %omp_tile0.inc, %omp_tile0.preheader +// CHECK-NEXT: %omp_tile0.iv = phi i32 [ 0, %omp_tile0.preheader ], [ %omp_tile0.next, %omp_tile0.inc ] +// CHECK-NEXT: br label %omp_tile0.cond +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile0.cond: ; preds = %omp_tile0.header +// CHECK-NEXT: %omp_tile0.cmp = icmp ult i32 %omp_tile0.iv, %9 +// CHECK-NEXT: br i1 %omp_tile0.cmp, label %omp_tile0.body, label %omp_tile0.exit +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile0.body: ; preds = %omp_tile0.cond +// CHECK-NEXT: %10 = mul nuw i32 %2, %omp_floor0.iv +// CHECK-NEXT: %11 = add nuw i32 %10, %omp_tile0.iv +// CHECK-NEXT: br label %omp_omp.loop.body +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.body: ; preds = %omp_tile0.body +// CHECK-NEXT: br label %omp.loop.region +// CHECK-EMPTY: +// CHECK-NEXT: omp.loop.region: ; preds = %omp_omp.loop.body +// CHECK-NEXT: %12 = getelementptr inbounds float, ptr %0, i32 %11 +// CHECK-NEXT: store float 4.200000e+01, ptr %12, align 4 +// CHECK-NEXT: br label %omp.region.cont +// CHECK-EMPTY: +// CHECK-NEXT: omp.region.cont: ; preds = %omp.loop.region +// CHECK-NEXT: br label %omp_tile0.inc +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile0.inc: ; preds = %omp.region.cont +// CHECK-NEXT: %omp_tile0.next = add nuw i32 %omp_tile0.iv, 1 +// CHECK-NEXT: br label %omp_tile0.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile0.exit: ; preds = %omp_tile0.cond +// CHECK-NEXT: br label %omp_tile0.after +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile0.after: ; preds = %omp_tile0.exit +// CHECK-NEXT: br label %omp_floor0.inc +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor0.inc: ; preds = %omp_tile0.after +// CHECK-NEXT: %omp_floor0.next = add nuw i32 %omp_floor0.iv, 1 +// CHECK-NEXT: br label %omp_floor0.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor0.exit: ; preds = %omp_floor0.cond +// CHECK-NEXT: br label %omp_floor0.after +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor0.after: ; preds = %omp_floor0.exit +// CHECK-NEXT: br label %omp_omp.loop.after +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.after: ; preds = %omp_floor0.after +// CHECK-NEXT: ret void +// CHECK-NEXT: } +// CHECK-EMPTY: +// CHECK-NEXT: !llvm.module.flags = !{!0} +// CHECK-EMPTY: +// CHECK-NEXT: !0 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/mlir/test/Target/LLVMIR/openmp-cli-tile02.mlir b/mlir/test/Target/LLVMIR/openmp-cli-tile02.mlir new file mode 100644 index 0000000..6fad81c --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-cli-tile02.mlir @@ -0,0 +1,190 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + + +llvm.func @tile_2d_loop(%baseptr: !llvm.ptr, %tc1: i32, %tc2: i32, %ts1: i32, %ts2: i32) -> () { + %literal_outer = omp.new_cli + %literal_inner = omp.new_cli + omp.canonical_loop(%literal_outer) %iv1 : i32 in range(%tc1) { + 
omp.canonical_loop(%literal_inner) %iv2 : i32 in range(%tc2) { + %idx = llvm.add %iv1, %iv2 : i32 + %ptr = llvm.getelementptr inbounds %baseptr[%idx] : (!llvm.ptr, i32) -> !llvm.ptr, f32 + %val = llvm.mlir.constant(42.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + omp.terminator + } + omp.tile <- (%literal_outer, %literal_inner) sizes(%ts1, %ts2 : i32,i32) + llvm.return +} + + +// CHECK: ; ModuleID = 'LLVMDialectModule' +// CHECK-NEXT: source_filename = "LLVMDialectModule" +// CHECK-EMPTY: +// CHECK-NEXT: define void @tile_2d_loop(ptr %0, i32 %1, i32 %2, i32 %3, i32 %4) { +// CHECK-NEXT: br label %omp_omp.loop.preheader +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.preheader: ; preds = %5 +// CHECK-NEXT: %6 = udiv i32 %1, %3 +// CHECK-NEXT: %7 = urem i32 %1, %3 +// CHECK-NEXT: %8 = icmp ne i32 %7, 0 +// CHECK-NEXT: %9 = zext i1 %8 to i32 +// CHECK-NEXT: %omp_floor0.tripcount = add nuw i32 %6, %9 +// CHECK-NEXT: %10 = udiv i32 %2, %4 +// CHECK-NEXT: %11 = urem i32 %2, %4 +// CHECK-NEXT: %12 = icmp ne i32 %11, 0 +// CHECK-NEXT: %13 = zext i1 %12 to i32 +// CHECK-NEXT: %omp_floor1.tripcount = add nuw i32 %10, %13 +// CHECK-NEXT: br label %omp_floor0.preheader +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.header: ; preds = %omp_omp.loop.inc +// CHECK-NEXT: %omp_omp.loop.iv = phi i32 [ %omp_omp.loop.next, %omp_omp.loop.inc ] +// CHECK-NEXT: br label %omp_omp.loop.cond +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.cond: ; preds = %omp_omp.loop.header +// CHECK-NEXT: %omp_omp.loop.cmp = icmp ult i32 %19, %1 +// CHECK-NEXT: br i1 %omp_omp.loop.cmp, label %omp_omp.loop.body, label %omp_omp.loop.exit +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.body: ; preds = %omp_tile1.body, %omp_omp.loop.cond +// CHECK-NEXT: br label %omp.loop.region +// CHECK-EMPTY: +// CHECK-NEXT: omp.loop.region: ; preds = %omp_omp.loop.body +// CHECK-NEXT: br label %omp_omp.loop.preheader1 +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.preheader1: ; preds = %omp.loop.region +// CHECK-NEXT: br label %omp_omp.loop.body4 +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor0.preheader: ; preds = %omp_omp.loop.preheader +// CHECK-NEXT: br label %omp_floor0.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor0.header: ; preds = %omp_floor0.inc, %omp_floor0.preheader +// CHECK-NEXT: %omp_floor0.iv = phi i32 [ 0, %omp_floor0.preheader ], [ %omp_floor0.next, %omp_floor0.inc ] +// CHECK-NEXT: br label %omp_floor0.cond +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor0.cond: ; preds = %omp_floor0.header +// CHECK-NEXT: %omp_floor0.cmp = icmp ult i32 %omp_floor0.iv, %omp_floor0.tripcount +// CHECK-NEXT: br i1 %omp_floor0.cmp, label %omp_floor0.body, label %omp_floor0.exit +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor0.body: ; preds = %omp_floor0.cond +// CHECK-NEXT: br label %omp_floor1.preheader +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor1.preheader: ; preds = %omp_floor0.body +// CHECK-NEXT: br label %omp_floor1.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor1.header: ; preds = %omp_floor1.inc, %omp_floor1.preheader +// CHECK-NEXT: %omp_floor1.iv = phi i32 [ 0, %omp_floor1.preheader ], [ %omp_floor1.next, %omp_floor1.inc ] +// CHECK-NEXT: br label %omp_floor1.cond +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor1.cond: ; preds = %omp_floor1.header +// CHECK-NEXT: %omp_floor1.cmp = icmp ult i32 %omp_floor1.iv, %omp_floor1.tripcount +// CHECK-NEXT: br i1 %omp_floor1.cmp, label %omp_floor1.body, label %omp_floor1.exit +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor1.body: ; preds = %omp_floor1.cond +// CHECK-NEXT: %14 = icmp eq i32 
%omp_floor0.iv, %6 +// CHECK-NEXT: %15 = select i1 %14, i32 %7, i32 %3 +// CHECK-NEXT: %16 = icmp eq i32 %omp_floor1.iv, %10 +// CHECK-NEXT: %17 = select i1 %16, i32 %11, i32 %4 +// CHECK-NEXT: br label %omp_tile0.preheader +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile0.preheader: ; preds = %omp_floor1.body +// CHECK-NEXT: br label %omp_tile0.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile0.header: ; preds = %omp_tile0.inc, %omp_tile0.preheader +// CHECK-NEXT: %omp_tile0.iv = phi i32 [ 0, %omp_tile0.preheader ], [ %omp_tile0.next, %omp_tile0.inc ] +// CHECK-NEXT: br label %omp_tile0.cond +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile0.cond: ; preds = %omp_tile0.header +// CHECK-NEXT: %omp_tile0.cmp = icmp ult i32 %omp_tile0.iv, %15 +// CHECK-NEXT: br i1 %omp_tile0.cmp, label %omp_tile0.body, label %omp_tile0.exit +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile0.body: ; preds = %omp_tile0.cond +// CHECK-NEXT: br label %omp_tile1.preheader +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile1.preheader: ; preds = %omp_tile0.body +// CHECK-NEXT: br label %omp_tile1.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile1.header: ; preds = %omp_tile1.inc, %omp_tile1.preheader +// CHECK-NEXT: %omp_tile1.iv = phi i32 [ 0, %omp_tile1.preheader ], [ %omp_tile1.next, %omp_tile1.inc ] +// CHECK-NEXT: br label %omp_tile1.cond +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile1.cond: ; preds = %omp_tile1.header +// CHECK-NEXT: %omp_tile1.cmp = icmp ult i32 %omp_tile1.iv, %17 +// CHECK-NEXT: br i1 %omp_tile1.cmp, label %omp_tile1.body, label %omp_tile1.exit +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile1.body: ; preds = %omp_tile1.cond +// CHECK-NEXT: %18 = mul nuw i32 %3, %omp_floor0.iv +// CHECK-NEXT: %19 = add nuw i32 %18, %omp_tile0.iv +// CHECK-NEXT: %20 = mul nuw i32 %4, %omp_floor1.iv +// CHECK-NEXT: %21 = add nuw i32 %20, %omp_tile1.iv +// CHECK-NEXT: br label %omp_omp.loop.body +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.body4: ; preds = %omp_omp.loop.preheader1 +// CHECK-NEXT: br label %omp.loop.region12 +// CHECK-EMPTY: +// CHECK-NEXT: omp.loop.region12: ; preds = %omp_omp.loop.body4 +// CHECK-NEXT: %22 = add i32 %19, %21 +// CHECK-NEXT: %23 = getelementptr inbounds float, ptr %0, i32 %22 +// CHECK-NEXT: store float 4.200000e+01, ptr %23, align 4 +// CHECK-NEXT: br label %omp.region.cont11 +// CHECK-EMPTY: +// CHECK-NEXT: omp.region.cont11: ; preds = %omp.loop.region12 +// CHECK-NEXT: br label %omp_tile1.inc +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile1.inc: ; preds = %omp.region.cont11 +// CHECK-NEXT: %omp_tile1.next = add nuw i32 %omp_tile1.iv, 1 +// CHECK-NEXT: br label %omp_tile1.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile1.exit: ; preds = %omp_tile1.cond +// CHECK-NEXT: br label %omp_tile1.after +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile1.after: ; preds = %omp_tile1.exit +// CHECK-NEXT: br label %omp_tile0.inc +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile0.inc: ; preds = %omp_tile1.after +// CHECK-NEXT: %omp_tile0.next = add nuw i32 %omp_tile0.iv, 1 +// CHECK-NEXT: br label %omp_tile0.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile0.exit: ; preds = %omp_tile0.cond +// CHECK-NEXT: br label %omp_tile0.after +// CHECK-EMPTY: +// CHECK-NEXT: omp_tile0.after: ; preds = %omp_tile0.exit +// CHECK-NEXT: br label %omp_floor1.inc +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor1.inc: ; preds = %omp_tile0.after +// CHECK-NEXT: %omp_floor1.next = add nuw i32 %omp_floor1.iv, 1 +// CHECK-NEXT: br label %omp_floor1.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor1.exit: ; preds = %omp_floor1.cond +// CHECK-NEXT: br label %omp_floor1.after +// 
CHECK-EMPTY: +// CHECK-NEXT: omp_floor1.after: ; preds = %omp_floor1.exit +// CHECK-NEXT: br label %omp_floor0.inc +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor0.inc: ; preds = %omp_floor1.after +// CHECK-NEXT: %omp_floor0.next = add nuw i32 %omp_floor0.iv, 1 +// CHECK-NEXT: br label %omp_floor0.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor0.exit: ; preds = %omp_floor0.cond +// CHECK-NEXT: br label %omp_floor0.after +// CHECK-EMPTY: +// CHECK-NEXT: omp_floor0.after: ; preds = %omp_floor0.exit +// CHECK-NEXT: br label %omp_omp.loop.after +// CHECK-EMPTY: +// CHECK-NEXT: omp.region.cont: ; No predecessors! +// CHECK-NEXT: br label %omp_omp.loop.inc +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.inc: ; preds = %omp.region.cont +// CHECK-NEXT: %omp_omp.loop.next = add nuw i32 %19, 1 +// CHECK-NEXT: br label %omp_omp.loop.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.exit: ; preds = %omp_omp.loop.cond +// CHECK-NEXT: br label %omp_omp.loop.after +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.after: ; preds = %omp_floor0.after, %omp_omp.loop.exit +// CHECK-NEXT: ret void +// CHECK-NEXT: } +// CHECK-EMPTY: +// CHECK-NEXT: !llvm.module.flags = !{!0} +// CHECK-EMPTY: +// CHECK-NEXT: !0 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/mlir/test/lib/Dialect/TestIRDLToCpp/CMakeLists.txt b/mlir/test/lib/Dialect/TestIRDLToCpp/CMakeLists.txt index 103bc94..7d32577 100644 --- a/mlir/test/lib/Dialect/TestIRDLToCpp/CMakeLists.txt +++ b/mlir/test/lib/Dialect/TestIRDLToCpp/CMakeLists.txt @@ -12,5 +12,7 @@ add_mlir_library(MLIRTestIRDLToCppDialect mlir_target_link_libraries(MLIRTestIRDLToCppDialect PUBLIC MLIRIR MLIRPass + MLIRSCFDialect MLIRTransforms + MLIRTestDialect ) diff --git a/mlir/test/lib/Dialect/TestIRDLToCpp/TestIRDLToCppDialect.cpp b/mlir/test/lib/Dialect/TestIRDLToCpp/TestIRDLToCppDialect.cpp index 9550e4c..421db7e 100644 --- a/mlir/test/lib/Dialect/TestIRDLToCpp/TestIRDLToCppDialect.cpp +++ b/mlir/test/lib/Dialect/TestIRDLToCpp/TestIRDLToCppDialect.cpp @@ -13,6 +13,7 @@ // #include "mlir/IR/Dialect.h" #include "mlir/IR/Region.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/DialectImplementation.h" #include "mlir/Interfaces/InferTypeOpInterface.h" @@ -54,16 +55,34 @@ struct TestOpConversion : public OpConversionPattern<test_irdl_to_cpp::BeefOp> { } }; +struct TestRegionConversion + : public OpConversionPattern<test_irdl_to_cpp::ConditionalOp> { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(mlir::test_irdl_to_cpp::ConditionalOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // Just exercising the C++ API even though these are not enforced in the + // dialect definition + assert(op.getThen().getBlocks().size() == 1); + assert(adaptor.getElse().getBlocks().size() == 1); + auto ifOp = scf::IfOp::create(rewriter, op.getLoc(), op.getInput()); + rewriter.replaceOp(op, ifOp); + return success(); + } +}; + struct ConvertTestDialectToSomethingPass : PassWrapper<ConvertTestDialectToSomethingPass, OperationPass<ModuleOp>> { void runOnOperation() override { MLIRContext *ctx = &getContext(); RewritePatternSet patterns(ctx); - patterns.add<TestOpConversion>(ctx); + patterns.add<TestOpConversion, TestRegionConversion>(ctx); ConversionTarget target(getContext()); - target.addIllegalOp<test_irdl_to_cpp::BeefOp>(); - target.addLegalOp<test_irdl_to_cpp::BarOp>(); - target.addLegalOp<test_irdl_to_cpp::HashOp>(); + target.addIllegalOp<test_irdl_to_cpp::BeefOp, + 
test_irdl_to_cpp::ConditionalOp>(); + target.addLegalOp<test_irdl_to_cpp::BarOp, test_irdl_to_cpp::HashOp, + scf::IfOp, scf::YieldOp>(); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) signalPassFailure(); @@ -73,6 +92,10 @@ struct ConvertTestDialectToSomethingPass StringRef getDescription() const final { return "Checks the convertability of an irdl dialect"; } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert<scf::SCFDialect>(); + } }; void registerIrdlTestDialect(mlir::DialectRegistry ®istry) { diff --git a/mlir/test/lib/Dialect/TestIRDLToCpp/test_conversion.testd.mlir b/mlir/test/lib/Dialect/TestIRDLToCpp/test_conversion.testd.mlir index f6233ee..1915324 100644 --- a/mlir/test/lib/Dialect/TestIRDLToCpp/test_conversion.testd.mlir +++ b/mlir/test/lib/Dialect/TestIRDLToCpp/test_conversion.testd.mlir @@ -1,15 +1,29 @@ // RUN: mlir-opt %s --pass-pipeline="builtin.module(test-irdl-conversion-check)" | FileCheck %s // CHECK-LABEL: module { module { - // CHECK: func.func @test() { + // CHECK: func.func @test(%[[test_arg:[^ ]*]]: i1) { // CHECK: %[[v0:[^ ]*]] = "test_irdl_to_cpp.bar"() : () -> i32 // CHECK: %[[v1:[^ ]*]] = "test_irdl_to_cpp.bar"() : () -> i32 // CHECK: %[[v2:[^ ]*]] = "test_irdl_to_cpp.hash"(%[[v0]], %[[v0]]) : (i32, i32) -> i32 + // CHECK: scf.if %[[test_arg]] // CHECK: return // CHECK: } - func.func @test() { + func.func @test(%test_arg: i1) { %0 = "test_irdl_to_cpp.bar"() : () -> i32 %1 = "test_irdl_to_cpp.beef"(%0, %0) : (i32, i32) -> i32 + "test_irdl_to_cpp.conditional"(%test_arg) ({ + ^cond(%test: i1): + %3 = "test_irdl_to_cpp.bar"() : () -> i32 + "test.terminator"() : ()->() + }, { + ^then(%what: i1, %ever: i32): + %4 = "test_irdl_to_cpp.bar"() : () -> i32 + "test.terminator"() : ()->() + }, { + ^else(): + %5 = "test_irdl_to_cpp.bar"() : () -> i32 + "test.terminator"() : ()->() + }) : (i1) -> () return } diff --git a/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp.irdl.mlir b/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp.irdl.mlir index 42e713e..85fb8cb 100644 --- a/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp.irdl.mlir +++ b/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp.irdl.mlir @@ -2,7 +2,7 @@ // CHECK: class TestIrdlToCpp irdl.dialect @test_irdl_to_cpp { - + // CHECK: class FooType irdl.type @foo @@ -32,4 +32,53 @@ irdl.dialect @test_irdl_to_cpp { irdl.operands(lhs: %0, rhs: %0) irdl.results(res: %0) } + + // CHECK: ConditionalOp declarations + // CHECK: ConditionalOpGenericAdaptorBase + // CHECK: ::mlir::Region &getCond() { return *getRegions()[0]; } + // CHECK: ::mlir::Region &getThen() { return *getRegions()[1]; } + // CHECK: ::mlir::Region &getElse() { return *getRegions()[2]; } + // + // CHECK: class ConditionalOp : public ::mlir::Op<ConditionalOp, ::mlir::OpTrait::NRegions<3>::Impl, ::mlir::OpTrait::OpInvariants> + // CHECK: ::mlir::Region &getCond() { return (*this)->getRegion(0); } + // CHECK: ::mlir::Region &getThen() { return (*this)->getRegion(1); } + // CHECK: ::mlir::Region &getElse() { return (*this)->getRegion(2); } + + // CHECK: ConditionalOp definitions + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_cond + // CHECK: if (!(region.getNumArguments() == 1)) { + // CHECK: failed to verify constraint: region with 1 entry block argument(s) + + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_then + // CHECK: if (!(true)) { + + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_else + // CHECK: if (!(region.getNumArguments() == 
0)) { + // CHECK: failed to verify constraint: region with 0 entry block argument(s) + + // CHECK: ConditionalOp::build + // CHECK: for (unsigned i = 0; i != 3; ++i) + // CHECK-NEXT: (void)odsState.addRegion(); + + // CHECK: ConditionalOp::verifyInvariantsImpl + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_cond + // CHECK: failure + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_then + // CHECK: failure + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_else + // CHECK: failure + // CHECK: success + irdl.operation @conditional { + %r0 = irdl.region // Unconstrained region + %r1 = irdl.region() // Region with no entry block arguments + + // TODO(#161018): support irdl.is in irdl-to-cpp + // %v0 = irdl.is i1 // Type constraint: i1 (boolean) + %v0 = irdl.any + %r2 = irdl.region(%v0) // Region with one i1 entry block argument + irdl.regions(cond: %r2, then: %r0, else: %r1) + + %0 = irdl.any + irdl.operands(input: %0) + } } diff --git a/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp_invalid_unsupported_types.irdl.mlir b/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp_invalid_unsupported_types.irdl.mlir index 403b492..cc27456 100644 --- a/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp_invalid_unsupported_types.irdl.mlir +++ b/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp_invalid_unsupported_types.irdl.mlir @@ -7,7 +7,7 @@ irdl.dialect @test_irdl_to_cpp { irdl.results(res: %1) } } -// ----- +// ----- irdl.dialect @test_irdl_to_cpp { irdl.operation @operands_no_any_of { @@ -42,7 +42,7 @@ irdl.dialect @test_irdl_to_cpp { irdl.dialect @test_irdl_to_cpp { irdl.type @ty { - %0 = irdl.any + %0 = irdl.any // expected-error@+1 {{IRDL C++ translation does not yet support translation of irdl.parameters operation}} irdl.parameters(ty: %0) } @@ -51,29 +51,8 @@ irdl.dialect @test_irdl_to_cpp { // ----- irdl.dialect @test_irdl_to_cpp { - irdl.operation @test_op { - // expected-error@+1 {{IRDL C++ translation does not yet support translation of irdl.region operation}} - %0 = irdl.region() - irdl.regions(reg: %0) - } - -} - -// ----- - -irdl.dialect @test_irdl_to_cpp { - irdl.operation @test_op { - // expected-error@+1 {{IRDL C++ translation does not yet support translation of irdl.regions operation}} - irdl.regions() - } - -} - -// ----- - -irdl.dialect @test_irdl_to_cpp { irdl.type @test_derived { // expected-error@+1 {{IRDL C++ translation does not yet support translation of irdl.base operation}} %0 = irdl.base "!builtin.integer" - } + } } diff --git a/mlir/test/mlir-tblgen/op-format-invalid.td b/mlir/test/mlir-tblgen/op-format-invalid.td index 2f29543..0a022ad 100644 --- a/mlir/test/mlir-tblgen/op-format-invalid.td +++ b/mlir/test/mlir-tblgen/op-format-invalid.td @@ -307,7 +307,7 @@ def DirectiveTypeZOperandInvalidI : TestFormat_Op<[{ def LiteralInvalidA : TestFormat_Op<[{ `a:` }]>; -// CHECK: error: expected valid literal but got '1': single character literal must be a letter or one of '_:,=<>()[]{}?+*' +// CHECK: error: expected valid literal but got '1': single character literal must be a letter or one of '_:,=<>()[]{}?+-*' def LiteralInvalidB : TestFormat_Op<[{ `1` }]>; diff --git a/mlir/test/mlir-tblgen/op-format-spec.td b/mlir/test/mlir-tblgen/op-format-spec.td index 1541cd0..1ac2311 100644 --- a/mlir/test/mlir-tblgen/op-format-spec.td +++ b/mlir/test/mlir-tblgen/op-format-spec.td @@ -123,7 +123,7 @@ def DirectiveTypeValid : TestFormat_Op<[{ // CHECK-NOT: error def LiteralValid : TestFormat_Op<[{ - `_` `:` `,` `=` `<` `>` `(` `)` `[` `]` 
`?` `+` `*` ` ` `` `->` `\n` `abc$._` + `_` `:` `,` `=` `<` `>` `(` `)` `[` `]` `?` `+` `-` `*` ` ` `` `->` `\n` `abc$._` attr-dict }]>; diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeFormatGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeFormatGen.cpp index a1899a8..8dd9713 100644 --- a/mlir/tools/mlir-tblgen/AttrOrTypeFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/AttrOrTypeFormatGen.cpp @@ -403,6 +403,7 @@ void DefFormat::genLiteralParser(StringRef value, FmtContext &ctx, .Case("]", "RSquare") .Case("?", "Question") .Case("+", "Plus") + .Case("-", "Minus") .Case("*", "Star") .Case("...", "Ellipsis") << "()"; diff --git a/mlir/tools/mlir-tblgen/FormatGen.cpp b/mlir/tools/mlir-tblgen/FormatGen.cpp index 4dfdde2..04d3ed1 100644 --- a/mlir/tools/mlir-tblgen/FormatGen.cpp +++ b/mlir/tools/mlir-tblgen/FormatGen.cpp @@ -518,7 +518,7 @@ bool mlir::tblgen::isValidLiteral(StringRef value, // If there is only one character, this must either be punctuation or a // single character bare identifier. if (value.size() == 1) { - StringRef bare = "_:,=<>()[]{}?+*"; + StringRef bare = "_:,=<>()[]{}?+-*"; if (isalpha(front) || bare.contains(front)) return true; if (emitError) diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp index 0d113b3..ccf21d1 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -852,6 +852,7 @@ static void genLiteralParser(StringRef value, MethodBody &body) { .Case("]", "RSquare()") .Case("?", "Question()") .Case("+", "Plus()") + .Case("-", "Minus()") .Case("*", "Star()") .Case("...", "Ellipsis()"); } diff --git a/offload/libomptarget/OpenMP/InteropAPI.cpp b/offload/libomptarget/OpenMP/InteropAPI.cpp index eb5425e..c55ef2c 100644 --- a/offload/libomptarget/OpenMP/InteropAPI.cpp +++ b/offload/libomptarget/OpenMP/InteropAPI.cpp @@ -124,7 +124,7 @@ void *getProperty<void *>(omp_interop_val_t &InteropVal, case omp_ipr_device_context: return InteropVal.device_info.Context; case omp_ipr_targetsync: - return InteropVal.async_info->Queue; + return InteropVal.async_info ? 
InteropVal.async_info->Queue : nullptr; default:; } getTypeMismatch(Property, Err); @@ -167,7 +167,6 @@ bool getPropertyCheck(omp_interop_val_t **InteropPtr, omp_interop_property_t property_id, \ int *err) { \ omp_interop_val_t *interop_val = (omp_interop_val_t *)interop; \ - assert((interop_val)->interop_type == kmp_interop_type_targetsync); \ if (!getPropertyCheck(&interop_val, property_id, err)) { \ return (RETURN_TYPE)(0); \ } \ @@ -275,8 +274,8 @@ omp_interop_val_t *__tgt_interop_get(ident_t *LocRef, int32_t InteropType, return Interop; } -int __tgt_interop_use(ident_t *LocRef, omp_interop_val_t *Interop, - interop_ctx_t *Ctx, dep_pack_t *Deps) { +int __tgt_interop_use60(ident_t *LocRef, omp_interop_val_t *Interop, + interop_ctx_t *Ctx, dep_pack_t *Deps) { bool Nowait = Ctx->flags.nowait; DP("Call to %s with interop " DPxMOD ", nowait %" PRId32 "\n", __func__, DPxPTR(Interop), Nowait); @@ -359,6 +358,40 @@ EXTERN int ompx_interop_add_completion_callback(omp_interop_val_t *Interop, return omp_irc_success; } +// Backwards compatibility wrappers +void __tgt_interop_init(ident_t *LocRef, int32_t Gtid, + omp_interop_val_t *&InteropPtr, int32_t InteropType, + int32_t DeviceId, int32_t Ndeps, + kmp_depend_info_t *DepList, int32_t HaveNowait) { + constexpr int32_t old_kmp_interop_type_targetsync = 2; + interop_ctx_t Ctx = {0, {false, (bool)HaveNowait, 0}, Gtid}; + dep_pack_t Deps = {Ndeps, 0, DepList, nullptr}; + InteropPtr = + __tgt_interop_get(LocRef, + InteropType == old_kmp_interop_type_targetsync + ? kmp_interop_type_targetsync + : kmp_interop_type_target, + DeviceId, 0, nullptr, &Ctx, Ndeps ? &Deps : nullptr); +} + +void __tgt_interop_use(ident_t *LocRef, int32_t Gtid, + omp_interop_val_t *&InteropPtr, int32_t DeviceId, + int32_t Ndeps, kmp_depend_info_t *DepList, + int32_t HaveNowait) { + interop_ctx_t Ctx = {0, {false, (bool)HaveNowait, 0}, Gtid}; + dep_pack_t Deps = {Ndeps, 0, DepList, nullptr}; + __tgt_interop_use60(LocRef, InteropPtr, &Ctx, Ndeps ? &Deps : nullptr); +} + +void __tgt_interop_destroy(ident_t *LocRef, int32_t Gtid, + omp_interop_val_t *&InteropPtr, int32_t DeviceId, + int32_t Ndeps, kmp_depend_info_t *DepList, + int32_t HaveNowait) { + interop_ctx_t Ctx = {0, {false, (bool)HaveNowait, 0}, Gtid}; + dep_pack_t Deps = {Ndeps, 0, DepList, nullptr}; + __tgt_interop_release(LocRef, InteropPtr, &Ctx, Ndeps ? 
&Deps : nullptr); +} + } // extern "C" llvm::Expected<DeviceTy &> omp_interop_val_t::getDevice() const { diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports index 8e2db6b..1374bfe 100644 --- a/offload/libomptarget/exports +++ b/offload/libomptarget/exports @@ -68,8 +68,11 @@ VERS1.0 { omp_get_interop_int; omp_get_interop_name; omp_get_interop_type_desc; - __tgt_interop_get; + __tgt_interop_init; __tgt_interop_use; + __tgt_interop_destroy; + __tgt_interop_get; + __tgt_interop_use60; __tgt_interop_release; __tgt_target_sync; __llvmPushCallConfiguration; diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 7b834ee..f73fa047 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2712,6 +2712,37 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Plugin::success(); } + interop_spec_t selectInteropPreference(int32_t InteropType, + int32_t NumPrefers, + interop_spec_t *Prefers) override { + // TODO: update once targetsync is supported + if (InteropType == kmp_interop_type_target) + return interop_spec_t{tgt_fr_hsa, {false, 0}, 0}; + return interop_spec_t{tgt_fr_none, {false, 0}, 0}; + } + + Expected<omp_interop_val_t *> + createInterop(int32_t InteropType, interop_spec_t &InteropSpec) override { + auto *Ret = new omp_interop_val_t( + DeviceId, static_cast<kmp_interop_type_t>(InteropType)); + Ret->fr_id = tgt_fr_hsa; + Ret->vendor_id = omp_vendor_amd; + + // TODO: implement targetsync support + + Ret->device_info.Platform = nullptr; + Ret->device_info.Device = reinterpret_cast<void *>(Agent.handle); + Ret->device_info.Context = nullptr; + + return Ret; + } + + Error releaseInterop(omp_interop_val_t *Interop) override { + if (Interop) + delete Interop; + return Plugin::success(); + } + Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData, AsyncInfoWrapperTy &AsyncInfo) override { AMDGPUStreamTy *Stream = nullptr; diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index b30c651..e5c4a1b 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -917,6 +917,50 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::success(); } + interop_spec_t selectInteropPreference(int32_t InteropType, + int32_t NumPrefers, + interop_spec_t *Prefers) override { + return interop_spec_t{tgt_fr_cuda, {true, 0}, 0}; + } + + Expected<omp_interop_val_t *> + createInterop(int32_t InteropType, interop_spec_t &InteropSpec) override { + auto *Ret = new omp_interop_val_t( + DeviceId, static_cast<kmp_interop_type_t>(InteropType)); + Ret->fr_id = tgt_fr_cuda; + Ret->vendor_id = omp_vendor_nvidia; + + if (InteropType == kmp_interop_type_target || + InteropType == kmp_interop_type_targetsync) { + Ret->device_info.Platform = nullptr; + Ret->device_info.Device = reinterpret_cast<void *>(Device); + Ret->device_info.Context = Context; + } + + if (InteropType == kmp_interop_type_targetsync) { + Ret->async_info = new __tgt_async_info(); + if (auto Err = setContext()) + return Err; + CUstream Stream; + if (auto Err = CUDAStreamManager.getResource(Stream)) + return Err; + + Ret->async_info->Queue = Stream; + } + return Ret; + } + + Error releaseInterop(omp_interop_val_t *Interop) override { + if (!Interop) + return Plugin::success(); + + if (Interop->async_info) + delete Interop->async_info; + + delete Interop; + return Plugin::success(); + } + Error 
enqueueHostCallImpl(void (*Callback)(void *), void *UserData, AsyncInfoWrapperTy &AsyncInfo) override { if (auto Err = setContext()) diff --git a/offload/test/offloading/fortran/target-declare-mapper-parent-allocatable.f90 b/offload/test/offloading/fortran/target-declare-mapper-parent-allocatable.f90 new file mode 100644 index 0000000..65e04af --- /dev/null +++ b/offload/test/offloading/fortran/target-declare-mapper-parent-allocatable.f90 @@ -0,0 +1,43 @@ +! This test validates that declare mapper for a derived type that extends +! a parent type with an allocatable component correctly maps the nested +! allocatable payload via the mapper when the whole object is mapped on +! target. + +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic + +program target_declare_mapper_parent_allocatable + implicit none + + type, abstract :: base_t + real, allocatable :: base_arr(:) + end type base_t + + type, extends(base_t) :: real_t + real, allocatable :: real_arr(:) + end type real_t + !$omp declare mapper(custommapper: real_t :: t) map(t%base_arr, t%real_arr) + + type(real_t) :: r + integer :: i + allocate(r%base_arr(10), source=1.0) + allocate(r%real_arr(10), source=1.0) + + !$omp target map(mapper(custommapper), tofrom: r) + do i = 1, size(r%base_arr) + r%base_arr(i) = 2.0 + r%real_arr(i) = 3.0 + r%real_arr(i) = r%base_arr(1) + end do + !$omp end target + + + !CHECK: base_arr: 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. + print*, "base_arr: ", r%base_arr + !CHECK: real_arr: 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. + print*, "real_arr: ", r%real_arr + + deallocate(r%real_arr) + deallocate(r%base_arr) +end program target_declare_mapper_parent_allocatable |