Diffstat (limited to 'llvm')
52 files changed, 2981 insertions, 786 deletions
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index fd72a38..9855444 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -115,14 +115,17 @@ struct LegalityQuery { struct MemDesc { LLT MemoryTy; uint64_t AlignInBits; - AtomicOrdering Ordering; + AtomicOrdering Ordering; //< For cmpxchg this is the success ordering. + AtomicOrdering FailureOrdering; //< For cmpxchg, otherwise NotAtomic. MemDesc() = default; - MemDesc(LLT MemoryTy, uint64_t AlignInBits, AtomicOrdering Ordering) - : MemoryTy(MemoryTy), AlignInBits(AlignInBits), Ordering(Ordering) {} + MemDesc(LLT MemoryTy, uint64_t AlignInBits, AtomicOrdering Ordering, + AtomicOrdering FailureOrdering) + : MemoryTy(MemoryTy), AlignInBits(AlignInBits), Ordering(Ordering), + FailureOrdering(FailureOrdering) {} MemDesc(const MachineMemOperand &MMO) : MemDesc(MMO.getMemoryType(), MMO.getAlign().value() * 8, - MMO.getSuccessOrdering()) {} + MMO.getSuccessOrdering(), MMO.getFailureOrdering()) {} }; /// Operations which require memory can use this to place requirements on the diff --git a/llvm/include/llvm/CodeGen/MIR2Vec.h b/llvm/include/llvm/CodeGen/MIR2Vec.h index ea68b45..7b1b5d9 100644 --- a/llvm/include/llvm/CodeGen/MIR2Vec.h +++ b/llvm/include/llvm/CodeGen/MIR2Vec.h @@ -38,6 +38,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" #include <map> #include <set> @@ -92,46 +93,31 @@ public: /// Get the string key for a vocabulary entry at the given position std::string getStringKey(unsigned Pos) const; - MIRVocabulary() = delete; - MIRVocabulary(VocabMap &&Entries, const TargetInstrInfo *TII); - MIRVocabulary(ir2vec::VocabStorage &&Storage, const TargetInstrInfo &TII) - : Storage(std::move(Storage)), TII(TII) {} - - bool isValid() const { - return UniqueBaseOpcodeNames.size() > 0 && - Layout.TotalEntries == Storage.size() && Storage.isValid(); - } - - unsigned getDimension() const { - if (!isValid()) - return 0; - return Storage.getDimension(); - } + unsigned getDimension() const { return Storage.getDimension(); } // Accessor methods const Embedding &operator[](unsigned Opcode) const { - assert(isValid() && "MIR2Vec Vocabulary is invalid"); unsigned LocalIndex = getCanonicalOpcodeIndex(Opcode); return Storage[static_cast<unsigned>(Section::Opcodes)][LocalIndex]; } // Iterator access using const_iterator = ir2vec::VocabStorage::const_iterator; - const_iterator begin() const { - assert(isValid() && "MIR2Vec Vocabulary is invalid"); - return Storage.begin(); - } + const_iterator begin() const { return Storage.begin(); } - const_iterator end() const { - assert(isValid() && "MIR2Vec Vocabulary is invalid"); - return Storage.end(); - } + const_iterator end() const { return Storage.end(); } /// Total number of entries in the vocabulary - size_t getCanonicalSize() const { - assert(isValid() && "Invalid vocabulary"); - return Storage.size(); - } + size_t getCanonicalSize() const { return Storage.size(); } + + MIRVocabulary() = delete; + + /// Factory method to create MIRVocabulary from vocabulary map + static Expected<MIRVocabulary> create(VocabMap &&Entries, + const TargetInstrInfo &TII); + +private: + MIRVocabulary(VocabMap &&Entries, const TargetInstrInfo &TII); }; } // namespace mir2vec @@ -145,7 +131,6 @@ class MIR2VecVocabLegacyAnalysis : public ImmutablePass { StringRef getPassName() const 
override; Error readVocabulary(); - void emitError(Error Err, LLVMContext &Ctx); protected: void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -156,7 +141,7 @@ protected: public: static char ID; MIR2VecVocabLegacyAnalysis() : ImmutablePass(ID) {} - mir2vec::MIRVocabulary getMIR2VecVocabulary(const Module &M); + Expected<mir2vec::MIRVocabulary> getMIR2VecVocabulary(const Module &M); }; /// This pass prints the embeddings in the MIR2Vec vocabulary diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h index 6529412..f3839c9 100644 --- a/llvm/include/llvm/IR/DIBuilder.h +++ b/llvm/include/llvm/IR/DIBuilder.h @@ -729,7 +729,8 @@ namespace llvm { /// \param Subscripts Subscripts. LLVM_ABI DICompositeType *createVectorType(uint64_t Size, uint32_t AlignInBits, DIType *Ty, - DINodeArray Subscripts); + DINodeArray Subscripts, + Metadata *BitStride = nullptr); /// Create debugging information entry for an /// enumeration. diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 96da698..8856eda 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1983,16 +1983,16 @@ def int_experimental_vector_match : DefaultAttrsIntrinsic< [ llvm_anyvector_ty, llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ], // Mask - [ IntrNoMem ]>; + [ IntrNoMem, IntrSpeculatable ]>; // Extract based on mask bits def int_experimental_vector_extract_last_active: DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - LLVMVectorElementType<0>], [IntrNoMem]>; + LLVMVectorElementType<0>], [IntrNoMem, IntrSpeculatable]>; // Operators -let IntrProperties = [IntrNoMem] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable] in { // Integer arithmetic def int_vp_add : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], [ LLVMMatchType<0>, @@ -2039,26 +2039,6 @@ let IntrProperties = [IntrNoMem] in { LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty]>; - def int_vp_sdiv : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; - def int_vp_udiv : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; - def int_vp_srem : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; - def int_vp_urem : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; def int_vp_abs : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], [ LLVMMatchType<0>, llvm_i1_ty, @@ -2390,7 +2370,29 @@ let IntrProperties = [IntrNoMem] in { llvm_i32_ty]>; } -let IntrProperties = [IntrNoMem, ImmArg<ArgIndex<1>>] in { +// Integer VP division and remainder: not speculatable. 
+def int_vp_sdiv : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty], [IntrNoMem]>; +def int_vp_udiv : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty], [IntrNoMem]>; +def int_vp_srem : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty], [IntrNoMem]>; +def int_vp_urem : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty], [IntrNoMem]>; + +let IntrProperties = [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>] in { def int_vp_ctlz : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], [ LLVMMatchType<0>, llvm_i1_ty, @@ -2422,18 +2424,18 @@ def int_loop_dependence_war_mask: def int_get_active_lane_mask: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyint_ty, LLVMMatchType<1>], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; def int_experimental_get_vector_length: DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_anyint_ty, llvm_i32_ty, llvm_i1_ty], - [IntrNoMem, + [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; def int_experimental_cttz_elts: DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyvector_ty, llvm_i1_ty], - [IntrNoMem, ImmArg<ArgIndex<1>>]>; + [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>]>; def int_experimental_vp_splice: DefaultAttrsIntrinsic<[llvm_anyvector_ty], @@ -2442,21 +2444,21 @@ def int_experimental_vp_splice: llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, ImmArg<ArgIndex<2>>]>; + [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]>; def int_experimental_vp_reverse: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; def int_experimental_vp_splat: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMVectorElementType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; def int_vp_is_fpclass: DefaultAttrsIntrinsic<[ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], @@ -2753,16 +2755,22 @@ def int_preserve_static_offset : DefaultAttrsIntrinsic<[llvm_ptr_ty], def int_vector_reverse : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], - [IntrNoMem]>; + [IntrNoMem, + IntrSpeculatable]>; def int_vector_splice : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem, ImmArg<ArgIndex<2>>]>; + [IntrNoMem, + IntrSpeculatable, + ImmArg<ArgIndex<2>>]>; //===---------- Intrinsics to query properties of scalable vectors --------===// -def int_vscale : DefaultAttrsIntrinsic<[llvm_anyint_ty], [], [IntrNoMem]>; +def int_vscale : DefaultAttrsIntrinsic<[llvm_anyint_ty], + [], + [IntrNoMem, + IntrSpeculatable]>; //===---------- Intrinsics to perform subvector insertion/extraction ------===// def int_vector_insert : DefaultAttrsIntrinsic<[llvm_anyvector_ty], @@ -2776,18 +2784,22 @@ def int_vector_extract : DefaultAttrsIntrinsic<[llvm_anyvector_ty], foreach n = 2...8 in { def int_vector_interleave#n : DefaultAttrsIntrinsic<[llvm_anyvector_ty], !listsplat(LLVMOneNthElementsVectorType<0, n>, n), - [IntrNoMem]>; + [IntrNoMem, + IntrSpeculatable]>; def int_vector_deinterleave#n : 
DefaultAttrsIntrinsic<!listsplat(LLVMOneNthElementsVectorType<0, n>, n), [llvm_anyvector_ty], - [IntrNoMem]>; + [IntrNoMem, + IntrSpeculatable]>; } //===-------------- Intrinsics to perform partial reduction ---------------===// def int_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>], - [llvm_anyvector_ty, llvm_anyvector_ty], - [IntrNoMem]>; + [llvm_anyvector_ty, + llvm_anyvector_ty], + [IntrNoMem, + IntrSpeculatable]>; //===----------------- Pointer Authentication Intrinsics ------------------===// // diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index b744537..31546e6 100755 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1495,22 +1495,22 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, default: llvm_unreachable("Missing case"); case Instruction::PtrToAddr: - // TODO: Add some of the ptrtoint folds here as well. - break; case Instruction::PtrToInt: if (auto *CE = dyn_cast<ConstantExpr>(C)) { Constant *FoldedValue = nullptr; - // If the input is a inttoptr, eliminate the pair. This requires knowing + // If the input is an inttoptr, eliminate the pair. This requires knowing // the width of a pointer, so it can't be done in ConstantExpr::getCast. if (CE->getOpcode() == Instruction::IntToPtr) { - // zext/trunc the inttoptr to pointer size. - FoldedValue = ConstantFoldIntegerCast(CE->getOperand(0), - DL.getIntPtrType(CE->getType()), + // zext/trunc the inttoptr to pointer/address size. + Type *MidTy = Opcode == Instruction::PtrToInt + ? DL.getAddressType(CE->getType()) + : DL.getIntPtrType(CE->getType()); + FoldedValue = ConstantFoldIntegerCast(CE->getOperand(0), MidTy, /*IsSigned=*/false, DL); } else if (auto *GEP = dyn_cast<GEPOperator>(CE)) { // If we have GEP, we can perform the following folds: - // (ptrtoint (gep null, x)) -> x - // (ptrtoint (gep (gep null, x), y) -> x + y, etc. + // (ptrtoint/ptrtoaddr (gep null, x)) -> x + // (ptrtoint/ptrtoaddr (gep (gep null, x), y) -> x + y, etc. unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType()); APInt BaseOffset(BitWidth, 0); auto *Base = cast<Constant>(GEP->stripAndAccumulateConstantOffsets( @@ -1518,7 +1518,8 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, if (Base->isNullValue()) { FoldedValue = ConstantInt::get(CE->getContext(), BaseOffset); } else { - // ptrtoint (gep i8, Ptr, (sub 0, V)) -> sub (ptrtoint Ptr), V + // ptrtoint/ptrtoaddr (gep i8, Ptr, (sub 0, V)) + // -> sub (ptrtoint/ptrtoaddr Ptr), V if (GEP->getNumIndices() == 1 && GEP->getSourceElementType()->isIntegerTy(8)) { auto *Ptr = cast<Constant>(GEP->getPointerOperand()); @@ -1528,12 +1529,13 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, Sub->getOpcode() == Instruction::Sub && Sub->getOperand(0)->isNullValue()) FoldedValue = ConstantExpr::getSub( - ConstantExpr::getPtrToInt(Ptr, IntIdxTy), Sub->getOperand(1)); + ConstantExpr::getCast(Opcode, Ptr, IntIdxTy), + Sub->getOperand(1)); } } } if (FoldedValue) { - // Do a zext or trunc to get to the ptrtoint dest size. + // Do a zext or trunc to get to the ptrtoint/ptrtoaddr dest size. 
return ConstantFoldIntegerCast(FoldedValue, DestTy, /*IsSigned=*/false, DL); } diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 4c2e1fe..54f55b2 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -812,7 +812,9 @@ static bool isPointerUseReplacable(const Use &U) { auto *User = Worklist.pop_back_val(); if (!Visited.insert(User).second) continue; - if (isa<ICmpInst, PtrToIntInst>(User)) + // FIXME: The PtrToIntInst case here is not strictly correct, as it + // changes which provenance is exposed. + if (isa<ICmpInst, PtrToIntInst, PtrToAddrInst>(User)) continue; if (isa<PHINode, SelectInst>(User)) Worklist.append(User->user_begin(), User->user_end()); diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 6f6776c..30bcff7 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -15749,51 +15749,11 @@ void ScalarEvolution::LoopGuards::collectFromBlock( return RewriteMap.lookup_or(S, S); }; - // Check for the SCEV expression (A /u B) * B while B is a constant, inside - // \p Expr. The check is done recuresively on \p Expr, which is assumed to - // be a composition of Min/Max SCEVs. Return whether the SCEV expression (A - // /u B) * B was found, and return the divisor B in \p DividesBy. For - // example, if Expr = umin (umax ((A /u 8) * 8, 16), 64), return true since - // (A /u 8) * 8 matched the pattern, and return the constant SCEV 8 in \p - // DividesBy. - std::function<bool(const SCEV *, const SCEV *&)> HasDivisibiltyInfo = - [&](const SCEV *Expr, const SCEV *&DividesBy) { - if (auto *Mul = dyn_cast<SCEVMulExpr>(Expr)) { - if (Mul->getNumOperands() != 2) - return false; - auto *MulLHS = Mul->getOperand(0); - auto *MulRHS = Mul->getOperand(1); - if (isa<SCEVConstant>(MulLHS)) - std::swap(MulLHS, MulRHS); - if (auto *Div = dyn_cast<SCEVUDivExpr>(MulLHS)) - if (Div->getOperand(1) == MulRHS) { - DividesBy = MulRHS; - return true; - } - } - if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr)) - return HasDivisibiltyInfo(MinMax->getOperand(0), DividesBy) || - HasDivisibiltyInfo(MinMax->getOperand(1), DividesBy); - return false; - }; - - // Return true if Expr known to divide by \p DividesBy. - std::function<bool(const SCEV *, const SCEV *&)> IsKnownToDivideBy = - [&](const SCEV *Expr, const SCEV *DividesBy) { - if (SE.getURemExpr(Expr, DividesBy)->isZero()) - return true; - if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr)) - return IsKnownToDivideBy(MinMax->getOperand(0), DividesBy) && - IsKnownToDivideBy(MinMax->getOperand(1), DividesBy); - return false; - }; - const SCEV *RewrittenLHS = GetMaybeRewritten(LHS); const SCEV *DividesBy = nullptr; - if (HasDivisibiltyInfo(RewrittenLHS, DividesBy)) - // Check that the whole expression is divided by DividesBy - DividesBy = - IsKnownToDivideBy(RewrittenLHS, DividesBy) ? DividesBy : nullptr; + const APInt &Multiple = SE.getConstantMultiple(RewrittenLHS); + if (!Multiple.isOne()) + DividesBy = SE.getConstant(Multiple); // Collect rewrites for LHS and its transitive operands based on the // condition. 
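Note on the ScalarEvolution change above: it drops the bespoke HasDivisibiltyInfo / IsKnownToDivideBy helpers and instead asks ScalarEvolution directly for the largest constant the rewritten LHS is known to be a multiple of. A minimal sketch of the idea, using a hypothetical helper name (getKnownDivisor is not part of the patch) and assuming llvm/Analysis/ScalarEvolution.h; for the worked example from the deleted comment, umin(umax((A /u 8) * 8, 16), 64), the divisor 8 should fall out of the constant-multiple query:

    // Sketch: derive the known constant divisor of a SCEV expression.
    // getConstantMultiple(S) returns the largest constant M such that S is
    // known to be a multiple of M; it already looks through min/max, mul and
    // udiv operands, so no recursive pattern match is needed.
    static const SCEV *getKnownDivisor(ScalarEvolution &SE, const SCEV *S) {
      const APInt &Multiple = SE.getConstantMultiple(S);
      return Multiple.isOne() ? nullptr : SE.getConstant(Multiple);
    }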
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index fa0ccd6..906d62a3 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1215,7 +1215,7 @@ bool CombinerHelper::isIndexedLoadStoreLegal(GLoadStore &LdSt) const { LLT MemTy = LdSt.getMMO().getMemoryType(); SmallVector<LegalityQuery::MemDesc, 2> MemDescrs( {{MemTy, MemTy.getSizeInBits().getKnownMinValue(), - AtomicOrdering::NotAtomic}}); + AtomicOrdering::NotAtomic, AtomicOrdering::NotAtomic}}); unsigned IndexedOpc = getIndexedOpc(LdSt.getOpcode()); SmallVector<LLT> OpTys; if (IndexedOpc == TargetOpcode::G_INDEXED_STORE) diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp index b2f8435..cdc1f64 100644 --- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp @@ -958,7 +958,8 @@ void LoadStoreOpt::initializeStoreMergeTargetInfo(unsigned AddrSpace) { for (unsigned Size = 2; Size <= MaxStoreSizeToForm; Size *= 2) { LLT Ty = LLT::scalar(Size); SmallVector<LegalityQuery::MemDesc, 2> MemDescrs( - {{Ty, Ty.getSizeInBits(), AtomicOrdering::NotAtomic}}); + {{Ty, Ty.getSizeInBits(), AtomicOrdering::NotAtomic, + AtomicOrdering::NotAtomic}}); SmallVector<LLT> StoreTys({Ty, PtrTy}); LegalityQuery Q(TargetOpcode::G_STORE, StoreTys, MemDescrs); LegalizeActionStep ActionStep = LI.getAction(Q); diff --git a/llvm/lib/CodeGen/MIR2Vec.cpp b/llvm/lib/CodeGen/MIR2Vec.cpp index 87565c0..e859765 100644 --- a/llvm/lib/CodeGen/MIR2Vec.cpp +++ b/llvm/lib/CodeGen/MIR2Vec.cpp @@ -49,14 +49,8 @@ cl::opt<float> OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0), //===----------------------------------------------------------------------===// MIRVocabulary::MIRVocabulary(VocabMap &&OpcodeEntries, - const TargetInstrInfo *TII) - : TII(*TII) { - // Fixme: Use static factory methods for creating vocabularies instead of - // public constructors - // Early return for invalid inputs - creates empty/invalid vocabulary - if (!TII || OpcodeEntries.empty()) - return; - + const TargetInstrInfo &TII) + : TII(TII) { buildCanonicalOpcodeMapping(); unsigned CanonicalOpcodeCount = UniqueBaseOpcodeNames.size(); @@ -67,6 +61,15 @@ MIRVocabulary::MIRVocabulary(VocabMap &&OpcodeEntries, Layout.TotalEntries = Storage.size(); } +Expected<MIRVocabulary> MIRVocabulary::create(VocabMap &&Entries, + const TargetInstrInfo &TII) { + if (Entries.empty()) + return createStringError(errc::invalid_argument, + "Empty vocabulary entries provided"); + + return MIRVocabulary(std::move(Entries), TII); +} + std::string MIRVocabulary::extractBaseOpcodeName(StringRef InstrName) { // Extract base instruction name using regex to capture letters and // underscores Examples: "ADD32rr" -> "ADD", "ARITH_FENCE" -> "ARITH_FENCE" @@ -107,13 +110,11 @@ unsigned MIRVocabulary::getCanonicalIndexForBaseName(StringRef BaseName) const { } unsigned MIRVocabulary::getCanonicalOpcodeIndex(unsigned Opcode) const { - assert(isValid() && "MIR2Vec Vocabulary is invalid"); auto BaseOpcode = extractBaseOpcodeName(TII.getName(Opcode)); return getCanonicalIndexForBaseName(BaseOpcode); } std::string MIRVocabulary::getStringKey(unsigned Pos) const { - assert(isValid() && "MIR2Vec Vocabulary is invalid"); assert(Pos < Layout.TotalEntries && "Position out of bounds in vocabulary"); // For now, all entries are opcodes since we only have one section @@ -232,16 +233,11 @@ Error 
MIR2VecVocabLegacyAnalysis::readVocabulary() { return Error::success(); } -void MIR2VecVocabLegacyAnalysis::emitError(Error Err, LLVMContext &Ctx) { - Ctx.emitError(toString(std::move(Err))); -} - -mir2vec::MIRVocabulary +Expected<mir2vec::MIRVocabulary> MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) { if (StrVocabMap.empty()) { if (Error Err = readVocabulary()) { - emitError(std::move(Err), M.getContext()); - return mir2vec::MIRVocabulary(std::move(StrVocabMap), nullptr); + return std::move(Err); } } @@ -255,15 +251,13 @@ MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) { if (auto *MF = MMI.getMachineFunction(F)) { const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - return mir2vec::MIRVocabulary(std::move(StrVocabMap), TII); + return mir2vec::MIRVocabulary::create(std::move(StrVocabMap), *TII); } } - // No machine functions available - return invalid vocabulary - emitError(make_error<StringError>("No machine functions found in module", - inconvertibleErrorCode()), - M.getContext()); - return mir2vec::MIRVocabulary(std::move(StrVocabMap), nullptr); + // No machine functions available - return error + return createStringError(errc::invalid_argument, + "No machine functions found in module"); } //===----------------------------------------------------------------------===// @@ -284,13 +278,15 @@ bool MIR2VecVocabPrinterLegacyPass::runOnMachineFunction(MachineFunction &MF) { bool MIR2VecVocabPrinterLegacyPass::doFinalization(Module &M) { auto &Analysis = getAnalysis<MIR2VecVocabLegacyAnalysis>(); - auto MIR2VecVocab = Analysis.getMIR2VecVocabulary(M); + auto MIR2VecVocabOrErr = Analysis.getMIR2VecVocabulary(M); - if (!MIR2VecVocab.isValid()) { - OS << "MIR2Vec Vocabulary Printer: Invalid vocabulary\n"; + if (!MIR2VecVocabOrErr) { + OS << "MIR2Vec Vocabulary Printer: Failed to get vocabulary - " + << toString(MIR2VecVocabOrErr.takeError()) << "\n"; return false; } + auto &MIR2VecVocab = *MIR2VecVocabOrErr; unsigned Pos = 0; for (const auto &Entry : MIR2VecVocab) { OS << "Key: " << MIR2VecVocab.getStringKey(Pos++) << ": "; diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 3a9651c..89ed4da 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -110,6 +110,7 @@ STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII"); STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found"); STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage"); STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages"); +STATISTIC(NumFailTooManyStores, "Pipeliner abort due to too many stores"); /// A command line option to turn software pipelining on or off. static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), @@ -193,6 +194,13 @@ static cl::opt<bool> MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining")); +/// A command line argument to limit the number of store instructions in the +/// target basic block. +static cl::opt<unsigned> SwpMaxNumStores( + "pipeliner-max-num-stores", + cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden, + cl::init(200)); + namespace llvm { // A command line option to enable the CopyToPhi DAG mutation. 
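Usage note on the new pipeliner-max-num-stores option above: it caps the number of store instructions canPipelineLoop() accepts in the loop header before bailing out (the check itself is in the next hunk). Assuming a standard LLVM build where cl::opt flags are reachable from the tool command line, the default of 200 can be overridden with, e.g., llc -pipeliner-max-num-stores=500, or -mllvm -pipeliner-max-num-stores=500 when invoked through clang.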
@@ -544,6 +552,23 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) { return false; } + unsigned NumStores = 0; + for (MachineInstr &MI : *L.getHeader()) + if (MI.mayStore()) + ++NumStores; + if (NumStores > SwpMaxNumStores) { + LLVM_DEBUG(dbgs() << "Too many stores\n"); + NumFailTooManyStores++; + ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop", + L.getStartLoc(), L.getHeader()) + << "Too many store instructions in the loop: " + << ore::NV("NumStores", NumStores) << " > " + << ore::NV("SwpMaxNumStores", SwpMaxNumStores) << "."; + }); + return false; + } + // Remove any subregisters from inputs to phi nodes. preprocessPhiNodes(*L.getHeader()); return true; diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 1ae20a9f..07a870f 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -715,11 +715,20 @@ DICompositeType *DIBuilder::createArrayType( DICompositeType *DIBuilder::createVectorType(uint64_t Size, uint32_t AlignInBits, DIType *Ty, - DINodeArray Subscripts) { - auto *R = DICompositeType::get(VMContext, dwarf::DW_TAG_array_type, "", - nullptr, 0, nullptr, Ty, Size, AlignInBits, 0, - DINode::FlagVector, Subscripts, 0, - /*EnumKind=*/std::nullopt, nullptr); + DINodeArray Subscripts, + Metadata *BitStride) { + auto *R = DICompositeType::get( + VMContext, dwarf::DW_TAG_array_type, /*Name=*/"", + /*File=*/nullptr, /*Line=*/0, /*Scope=*/nullptr, /*BaseType=*/Ty, + /*SizeInBits=*/Size, /*AlignInBits=*/AlignInBits, /*OffsetInBits=*/0, + /*Flags=*/DINode::FlagVector, /*Elements=*/Subscripts, + /*RuntimeLang=*/0, /*EnumKind=*/std::nullopt, /*VTableHolder=*/nullptr, + /*TemplateParams=*/nullptr, /*Identifier=*/"", + /*Discriminator=*/nullptr, /*DataLocation=*/nullptr, + /*Associated=*/nullptr, /*Allocated=*/nullptr, /*Rank=*/nullptr, + /*Annotations=*/nullptr, /*Specification=*/nullptr, + /*NumExtraInhabitants=*/0, + /*BitStride=*/BitStride); trackIfUnresolved(R); return R; } diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 91e64e6..bd0a17d 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -315,6 +315,8 @@ public: } void setStackSizeSVE(uint64_t ZPR, uint64_t PPR) { + assert(isAligned(Align(16), ZPR) && isAligned(Align(16), PPR) && + "expected SVE stack sizes to be aligned to 16-bytes"); StackSizeZPR = ZPR; StackSizePPR = PPR; HasCalculatedStackSizeSVE = true; @@ -425,6 +427,8 @@ public: // Saves the CalleeSavedStackSize for SVE vectors in 'scalable bytes' void setSVECalleeSavedStackSize(unsigned ZPR, unsigned PPR) { + assert(isAligned(Align(16), ZPR) && isAligned(Align(16), PPR) && + "expected SVE callee-save sizes to be aligned to 16-bytes"); ZPRCalleeSavedStackSize = ZPR; PPRCalleeSavedStackSize = PPR; HasSVECalleeSavedStackSize = true; diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 1568161..f110558 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -60,7 +60,6 @@ static bool isPartOfZPRCalleeSaves(MachineBasicBlock::iterator I) { case AArch64::PTRUE_C_B: return I->getFlag(MachineInstr::FrameSetup) || I->getFlag(MachineInstr::FrameDestroy); - case AArch64::SEH_SavePReg: case AArch64::SEH_SaveZReg: return true; } @@ -75,6 +74,8 @@ static bool isPartOfPPRCalleeSaves(MachineBasicBlock::iterator I) { case 
AArch64::LDR_PXI: return I->getFlag(MachineInstr::FrameSetup) || I->getFlag(MachineInstr::FrameDestroy); + case AArch64::SEH_SavePReg: + return true; } } @@ -94,6 +95,26 @@ AArch64PrologueEpilogueCommon::AArch64PrologueEpilogueCommon( HasFP = AFL.hasFP(MF); NeedsWinCFI = AFL.needsWinCFI(MF); + + // Windows unwind can't represent the required stack adjustments if we have + // both SVE callee-saves and dynamic stack allocations, and the frame pointer + // is before the SVE spills. The allocation of the frame pointer must be the + // last instruction in the prologue so the unwinder can restore the stack + // pointer correctly. (And there isn't any unwind opcode for `addvl sp, x29, + // -17`.) + // + // Because of this, we do spills in the opposite order on Windows: first SVE, + // then GPRs. The main side-effect of this is that it makes accessing + // parameters passed on the stack more expensive. + // + // We could consider rearranging the spills for simpler cases. + if (Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize()) { + if (AFI->hasStackHazardSlotIndex()) + reportFatalUsageError("SME hazard padding is not supported on Windows"); + SVELayout = SVEStackLayout::CalleeSavesAboveFrameRecord; + } else if (AFI->hasSplitSVEObjects()) { + SVELayout = SVEStackLayout::Split; + } } MachineBasicBlock::iterator @@ -334,6 +355,55 @@ bool AArch64PrologueEpilogueCommon::shouldCombineCSRLocalStackBump( return true; } +SVEFrameSizes AArch64PrologueEpilogueCommon::getSVEStackFrameSizes() const { + StackOffset PPRCalleeSavesSize = + StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize()); + StackOffset ZPRCalleeSavesSize = + StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize()); + StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize; + StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize; + if (SVELayout == SVEStackLayout::Split) + return {{PPRCalleeSavesSize, PPRLocalsSize}, + {ZPRCalleeSavesSize, ZPRLocalsSize}}; + // For simplicity, attribute all locals to ZPRs when split SVE is disabled. + return {{PPRCalleeSavesSize, StackOffset{}}, + {ZPRCalleeSavesSize, PPRLocalsSize + ZPRLocalsSize}}; +} + +struct SVEPartitions { + struct { + MachineBasicBlock::iterator Begin, End; + } PPR, ZPR; +}; + +static SVEPartitions partitionSVECS(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + StackOffset PPRCalleeSavesSize, + StackOffset ZPRCalleeSavesSize, + bool IsEpilogue) { + MachineBasicBlock::iterator PPRsI = MBBI; + MachineBasicBlock::iterator End = + IsEpilogue ? MBB.begin() : MBB.getFirstTerminator(); + auto AdjustI = [&](auto MBBI) { return IsEpilogue ? std::prev(MBBI) : MBBI; }; + // Process the SVE CS to find the starts/ends of the ZPR and PPR areas. + if (PPRCalleeSavesSize) { + PPRsI = AdjustI(PPRsI); + assert(isPartOfPPRCalleeSaves(*PPRsI) && "Unexpected instruction"); + while (PPRsI != End && isPartOfPPRCalleeSaves(AdjustI(PPRsI))) + IsEpilogue ? (--PPRsI) : (++PPRsI); + } + MachineBasicBlock::iterator ZPRsI = PPRsI; + if (ZPRCalleeSavesSize) { + ZPRsI = AdjustI(ZPRsI); + assert(isPartOfZPRCalleeSaves(*ZPRsI) && "Unexpected instruction"); + while (ZPRsI != End && isPartOfZPRCalleeSaves(AdjustI(ZPRsI))) + IsEpilogue ? 
(--ZPRsI) : (++ZPRsI); + } + if (IsEpilogue) + return {{PPRsI, MBBI}, {ZPRsI, PPRsI}}; + return {{MBBI, PPRsI}, {PPRsI, ZPRsI}}; +} + AArch64PrologueEmitter::AArch64PrologueEmitter(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64FrameLowering &AFL) @@ -613,30 +683,12 @@ void AArch64PrologueEmitter::emitPrologue() { bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg()); unsigned FixedObject = AFL.getFixedObjectSize(MF, AFI, IsWin64, IsFunclet); - // Windows unwind can't represent the required stack adjustments if we have - // both SVE callee-saves and dynamic stack allocations, and the frame - // pointer is before the SVE spills. The allocation of the frame pointer - // must be the last instruction in the prologue so the unwinder can restore - // the stack pointer correctly. (And there isn't any unwind opcode for - // `addvl sp, x29, -17`.) - // - // Because of this, we do spills in the opposite order on Windows: first SVE, - // then GPRs. The main side-effect of this is that it makes accessing - // parameters passed on the stack more expensive. - // - // We could consider rearranging the spills for simpler cases. - bool FPAfterSVECalleeSaves = - Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize(); - - if (FPAfterSVECalleeSaves && AFI->hasStackHazardSlotIndex()) - reportFatalUsageError("SME hazard padding is not supported on Windows"); - auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; // All of the remaining stack allocations are for locals. determineLocalsStackSize(NumBytes, PrologueSaveSize); MachineBasicBlock::iterator FirstGPRSaveI = PrologueBeginI; - if (FPAfterSVECalleeSaves) { + if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { // If we're doing SVE saves first, we need to immediately allocate space // for fixed objects, then space for the SVE callee saves. // @@ -712,110 +764,66 @@ void AArch64PrologueEmitter::emitPrologue() { if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding); - StackOffset PPRCalleeSavesSize = - StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize()); - StackOffset ZPRCalleeSavesSize = - StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize()); - StackOffset SVECalleeSavesSize = PPRCalleeSavesSize + ZPRCalleeSavesSize; - StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize; - StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize; - - std::optional<MachineBasicBlock::iterator> ZPRCalleeSavesBegin, - ZPRCalleeSavesEnd, PPRCalleeSavesBegin, PPRCalleeSavesEnd; - + auto [PPR, ZPR] = getSVEStackFrameSizes(); + StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; + StackOffset NonSVELocalsSize = StackOffset::getFixed(NumBytes); StackOffset CFAOffset = - StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); + StackOffset::getFixed(MFI.getStackSize()) - NonSVELocalsSize; + MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI; - if (!FPAfterSVECalleeSaves) { - // Process the SVE callee-saves to find the starts/ends of the ZPR and PPR - // areas. 
- PPRCalleeSavesBegin = AfterGPRSavesI; - if (PPRCalleeSavesSize) { - LLVM_DEBUG(dbgs() << "PPRCalleeSavedStackSize = " - << PPRCalleeSavesSize.getScalable() << "\n"); - - assert(isPartOfPPRCalleeSaves(*PPRCalleeSavesBegin) && - "Unexpected instruction"); - while (isPartOfPPRCalleeSaves(AfterSVESavesI) && - AfterSVESavesI != MBB.getFirstTerminator()) - ++AfterSVESavesI; + // Allocate space for the callee saves and PPR locals (if any). + if (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord) { + auto [PPRRange, ZPRRange] = + partitionSVECS(MBB, AfterGPRSavesI, PPR.CalleeSavesSize, + ZPR.CalleeSavesSize, /*IsEpilogue=*/false); + AfterSVESavesI = ZPRRange.End; + if (EmitAsyncCFI) + emitCalleeSavedSVELocations(AfterSVESavesI); + + StackOffset AllocateBeforePPRs = SVECalleeSavesSize; + StackOffset AllocateAfterPPRs = PPR.LocalsSize; + if (SVELayout == SVEStackLayout::Split) { + AllocateBeforePPRs = PPR.CalleeSavesSize; + AllocateAfterPPRs = PPR.LocalsSize + ZPR.CalleeSavesSize; } - PPRCalleeSavesEnd = ZPRCalleeSavesBegin = AfterSVESavesI; - if (ZPRCalleeSavesSize) { - LLVM_DEBUG(dbgs() << "ZPRCalleeSavedStackSize = " - << ZPRCalleeSavesSize.getScalable() << "\n"); - assert(isPartOfZPRCalleeSaves(*ZPRCalleeSavesBegin) && - "Unexpected instruction"); - while (isPartOfZPRCalleeSaves(AfterSVESavesI) && - AfterSVESavesI != MBB.getFirstTerminator()) - ++AfterSVESavesI; - } - ZPRCalleeSavesEnd = AfterSVESavesI; - } - - if (EmitAsyncCFI) - emitCalleeSavedSVELocations(AfterSVESavesI); - - if (AFI->hasSplitSVEObjects()) { - assert(!FPAfterSVECalleeSaves && - "Cannot use FPAfterSVECalleeSaves with aarch64-split-sve-objects"); - assert(!AFL.canUseRedZone(MF) && - "Cannot use redzone with aarch64-split-sve-objects"); - // TODO: Handle HasWinCFI/NeedsWinCFI? - assert(!NeedsWinCFI && - "WinCFI with aarch64-split-sve-objects is not supported"); - - // Split ZPR and PPR allocation. - // Allocate PPR callee saves - allocateStackSpace(*PPRCalleeSavesBegin, 0, PPRCalleeSavesSize, + allocateStackSpace(PPRRange.Begin, 0, AllocateBeforePPRs, EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || ZPRCalleeSavesSize || - ZPRLocalsSize || PPRLocalsSize); - CFAOffset += PPRCalleeSavesSize; - - // Allocate PPR locals + ZPR callee saves - assert(PPRCalleeSavesEnd == ZPRCalleeSavesBegin && + MFI.hasVarSizedObjects() || AllocateAfterPPRs || + ZPR.LocalsSize || NonSVELocalsSize); + CFAOffset += AllocateBeforePPRs; + assert(PPRRange.End == ZPRRange.Begin && "Expected ZPR callee saves after PPR locals"); - allocateStackSpace(*PPRCalleeSavesEnd, RealignmentPadding, - PPRLocalsSize + ZPRCalleeSavesSize, - EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || ZPRLocalsSize); - CFAOffset += PPRLocalsSize + ZPRCalleeSavesSize; - - // Allocate ZPR locals - allocateStackSpace(*ZPRCalleeSavesEnd, RealignmentPadding, - ZPRLocalsSize + StackOffset::getFixed(NumBytes), + allocateStackSpace(PPRRange.End, RealignmentPadding, AllocateAfterPPRs, EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects()); + MFI.hasVarSizedObjects() || ZPR.LocalsSize || + NonSVELocalsSize); + CFAOffset += AllocateAfterPPRs; } else { - // Allocate space for the callee saves (if any). 
- StackOffset LocalsSize = - PPRLocalsSize + ZPRLocalsSize + StackOffset::getFixed(NumBytes); - if (!FPAfterSVECalleeSaves) - allocateStackSpace(AfterGPRSavesI, 0, SVECalleeSavesSize, - EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || LocalsSize); + assert(SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord); + // Note: With CalleeSavesAboveFrameRecord, the SVE CS have already been + // allocated (and separate PPR locals are not supported, all SVE locals, + // both PPR and ZPR, are within the ZPR locals area). + assert(!PPR.LocalsSize && "Unexpected PPR locals!"); CFAOffset += SVECalleeSavesSize; + } - // Allocate space for the rest of the frame including SVE locals. Align the - // stack as necessary. - assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) && - "Cannot use redzone with stack realignment"); - if (!AFL.canUseRedZone(MF)) { - // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have - // the correct value here, as NumBytes also includes padding bytes, - // which shouldn't be counted here. - StackOffset SVELocalsSize = PPRLocalsSize + ZPRLocalsSize; - allocateStackSpace(AfterSVESavesI, RealignmentPadding, - SVELocalsSize + StackOffset::getFixed(NumBytes), - EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects()); - } + // Allocate space for the rest of the frame including ZPR locals. Align the + // stack as necessary. + assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) && + "Cannot use redzone with stack realignment"); + if (!AFL.canUseRedZone(MF)) { + // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have the + // correct value here, as NumBytes also includes padding bytes, which + // shouldn't be counted here. + allocateStackSpace( + AfterSVESavesI, RealignmentPadding, ZPR.LocalsSize + NonSVELocalsSize, + EmitAsyncCFI && !HasFP, CFAOffset, MFI.hasVarSizedObjects()); } // If we need a base pointer, set it up here. It's whatever the value of the - // stack pointer is at this point. Any variable size objects will be allocated - // after this, so we can still use the base pointer to reference locals. + // stack pointer is at this point. Any variable size objects will be + // allocated after this, so we can still use the base pointer to reference + // locals. // // FIXME: Clarify FrameSetup flags here. // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is @@ -1270,7 +1278,9 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations( StackOffset::getScalable(MFI.getObjectOffset(FI)) - StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI)); - if (AFI->hasSplitSVEObjects() && + // The scalable vectors are below (lower address) the scalable predicates + // with split SVE objects, so we must subtract the size of the predicates. + if (SVELayout == SVEStackLayout::Split && MFI.getStackID(FI) == TargetStackID::ScalableVector) Offset -= PPRStackSize; @@ -1349,13 +1359,10 @@ void AArch64EpilogueEmitter::emitEpilogue() { return; } - bool FPAfterSVECalleeSaves = - Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize(); - bool CombineSPBump = shouldCombineCSRLocalStackBump(NumBytes); // Assume we can't combine the last pop with the sp restore. 
bool CombineAfterCSRBump = false; - if (FPAfterSVECalleeSaves) { + if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { AfterCSRPopSize += FixedObject; } else if (!CombineSPBump && PrologueSaveSize != 0) { MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator()); @@ -1390,7 +1397,8 @@ void AArch64EpilogueEmitter::emitEpilogue() { while (FirstGPRRestoreI != Begin) { --FirstGPRRestoreI; if (!FirstGPRRestoreI->getFlag(MachineInstr::FrameDestroy) || - (!FPAfterSVECalleeSaves && isPartOfSVECalleeSaves(FirstGPRRestoreI))) { + (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord && + isPartOfSVECalleeSaves(FirstGPRRestoreI))) { ++FirstGPRRestoreI; break; } else if (CombineSPBump) @@ -1414,13 +1422,9 @@ void AArch64EpilogueEmitter::emitEpilogue() { if (HasFP && AFI->hasSwiftAsyncContext()) emitSwiftAsyncContextFramePointer(EpilogueEndI, DL); - StackOffset ZPRStackSize = AFL.getZPRStackSize(MF); - StackOffset PPRStackSize = AFL.getPPRStackSize(MF); - StackOffset SVEStackSize = ZPRStackSize + PPRStackSize; - // If there is a single SP update, insert it before the ret and we're done. if (CombineSPBump) { - assert(!SVEStackSize && "Cannot combine SP bump with SVE"); + assert(!AFI->hasSVEStackSize() && "Cannot combine SP bump with SVE"); // When we are about to restore the CSRs, the CFA register is SP again. if (EmitCFI && HasFP) @@ -1437,188 +1441,122 @@ void AArch64EpilogueEmitter::emitEpilogue() { NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); - if (!AFI->hasSplitSVEObjects()) { - // Process the SVE callee-saves to determine what space needs to be - // deallocated. - StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; - MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI, - RestoreEnd = FirstGPRRestoreI; - int64_t ZPRCalleeSavedSize = AFI->getZPRCalleeSavedStackSize(); - int64_t PPRCalleeSavedSize = AFI->getPPRCalleeSavedStackSize(); - int64_t SVECalleeSavedSize = ZPRCalleeSavedSize + PPRCalleeSavedSize; - - if (SVECalleeSavedSize) { - if (FPAfterSVECalleeSaves) - RestoreEnd = MBB.getFirstTerminator(); - - RestoreBegin = std::prev(RestoreEnd); - while (RestoreBegin != MBB.begin() && - isPartOfSVECalleeSaves(std::prev(RestoreBegin))) - --RestoreBegin; - - assert(isPartOfSVECalleeSaves(RestoreBegin) && - isPartOfSVECalleeSaves(std::prev(RestoreEnd)) && - "Unexpected instruction"); - - StackOffset CalleeSavedSizeAsOffset = - StackOffset::getScalable(SVECalleeSavedSize); - DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; - DeallocateAfter = CalleeSavedSizeAsOffset; + auto [PPR, ZPR] = getSVEStackFrameSizes(); + auto [PPRRange, ZPRRange] = partitionSVECS( + MBB, + SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord + ? MBB.getFirstTerminator() + : FirstGPRRestoreI, + PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true); + + StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; + StackOffset SVEStackSize = + SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize; + MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin; + MachineBasicBlock::iterator RestoreEnd = PPRRange.End; + + // Deallocate the SVE area. + if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { + StackOffset SVELocalsSize = ZPR.LocalsSize + PPR.LocalsSize; + // If the callee-save area is before FP, restoring the FP implicitly + // deallocates non-callee-save SVE allocations. Otherwise, deallocate them + // explicitly. 
+ if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { + emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, + SVELocalsSize, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI); } - // Deallocate the SVE area. - if (FPAfterSVECalleeSaves) { - // If the callee-save area is before FP, restoring the FP implicitly - // deallocates non-callee-save SVE allocations. Otherwise, deallocate - // them explicitly. - if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { - emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, - DeallocateBefore, TII, MachineInstr::FrameDestroy, - false, NeedsWinCFI, &HasWinCFI); - } + // Deallocate callee-save non-SVE registers. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - // Deallocate callee-save non-SVE registers. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(AFI->getCalleeSavedStackSize()), - TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI); - - // Deallocate fixed objects. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(FixedObject), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI); - - // Deallocate callee-save SVE registers. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - DeallocateAfter, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); - } else if (SVEStackSize) { - int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize(); - // If we have stack realignment or variable-sized objects we must use the - // FP to restore SVE callee saves (as there is an unknown amount of - // data/padding between the SP and SVE CS area). - Register BaseForSVEDealloc = - (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP - : AArch64::SP; - if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) { - Register CalleeSaveBase = AArch64::FP; - if (int64_t CalleeSaveBaseOffset = - AFI->getCalleeSaveBaseToFrameRecordOffset()) { - // If we have have an non-zero offset to the non-SVE CS base we need - // to compute the base address by subtracting the offest in a - // temporary register first (to avoid briefly deallocating the SVE - // CS). - CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); - emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, - StackOffset::getFixed(-CalleeSaveBaseOffset), TII, - MachineInstr::FrameDestroy); - } - // The code below will deallocate the stack space space by moving the - // SP to the start of the SVE callee-save area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, - StackOffset::getScalable(-SVECalleeSavedSize), TII, + // Deallocate fixed objects. + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(FixedObject), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); + + // Deallocate callee-save SVE registers. + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI); + } else if (AFI->hasSVEStackSize()) { + // If we have stack realignment or variable-sized objects we must use the FP + // to restore SVE callee saves (as there is an unknown amount of + // data/padding between the SP and SVE CS area). 
+ Register BaseForSVEDealloc = + (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP + : AArch64::SP; + if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) { + // TODO: Support stack realigment and variable-sized objects. + assert( + SVELayout != SVEStackLayout::Split && + "unexpected stack realignment or variable sized objects with split " + "SVE stack objects"); + + Register CalleeSaveBase = AArch64::FP; + if (int64_t CalleeSaveBaseOffset = + AFI->getCalleeSaveBaseToFrameRecordOffset()) { + // If we have have an non-zero offset to the non-SVE CS base we need to + // compute the base address by subtracting the offest in a temporary + // register first (to avoid briefly deallocating the SVE CS). + CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( + &AArch64::GPR64RegClass); + emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, + StackOffset::getFixed(-CalleeSaveBaseOffset), TII, MachineInstr::FrameDestroy); - } else if (BaseForSVEDealloc == AArch64::SP) { - if (SVECalleeSavedSize) { - // Deallocate the non-SVE locals first before we can deallocate (and - // restore callee saves) from the SVE area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(NumBytes), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI, EmitCFI && !HasFP, - SVEStackSize + StackOffset::getFixed( - NumBytes + PrologueSaveSize)); - NumBytes = 0; - } - - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - DeallocateBefore, TII, MachineInstr::FrameDestroy, - false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, - SVEStackSize + - StackOffset::getFixed(NumBytes + PrologueSaveSize)); - - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - DeallocateAfter, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, - DeallocateAfter + - StackOffset::getFixed(NumBytes + PrologueSaveSize)); + } + // The code below will deallocate the stack space space by moving the SP + // to the start of the SVE callee-save area. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, + -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy); + } else if (BaseForSVEDealloc == AArch64::SP) { + auto CFAOffset = + SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize); + + if (SVECalleeSavesSize) { + // Deallocate the non-SVE locals first before we can deallocate (and + // restore callee saves) from the SVE area. + auto NonSVELocals = StackOffset::getFixed(NumBytes); + emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, + NonSVELocals, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset); + CFAOffset -= NonSVELocals; + NumBytes = 0; } - if (EmitCFI) - emitCalleeSavedSVERestores(RestoreEnd); - } - } else if (AFI->hasSplitSVEObjects() && SVEStackSize) { - // TODO: Support stack realigment and variable-sized objects. - assert(!AFI->isStackRealigned() && !MFI.hasVarSizedObjects() && - "unexpected stack realignment or variable sized objects with split " - "SVE stack objects"); - // SplitSVEObjects. Determine the sizes and starts/ends of the ZPR and PPR - // areas. 
- auto ZPRCalleeSavedSize = - StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize()); - auto PPRCalleeSavedSize = - StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize()); - StackOffset PPRLocalsSize = PPRStackSize - PPRCalleeSavedSize; - StackOffset ZPRLocalsSize = ZPRStackSize - ZPRCalleeSavedSize; - - MachineBasicBlock::iterator PPRRestoreBegin = FirstGPRRestoreI, - PPRRestoreEnd = FirstGPRRestoreI; - if (PPRCalleeSavedSize) { - PPRRestoreBegin = std::prev(PPRRestoreEnd); - while (PPRRestoreBegin != MBB.begin() && - isPartOfPPRCalleeSaves(std::prev(PPRRestoreBegin))) - --PPRRestoreBegin; - } - - MachineBasicBlock::iterator ZPRRestoreBegin = PPRRestoreBegin, - ZPRRestoreEnd = PPRRestoreBegin; - if (ZPRCalleeSavedSize) { - ZPRRestoreBegin = std::prev(ZPRRestoreEnd); - while (ZPRRestoreBegin != MBB.begin() && - isPartOfZPRCalleeSaves(std::prev(ZPRRestoreBegin))) - --ZPRRestoreBegin; - } - - auto CFAOffset = - SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize); - if (PPRCalleeSavedSize || ZPRCalleeSavedSize) { - // Deallocate the non-SVE locals first before we can deallocate (and - // restore callee saves) from the SVE area. - auto NonSVELocals = StackOffset::getFixed(NumBytes); - emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP, - NonSVELocals, TII, MachineInstr::FrameDestroy, false, - false, nullptr, EmitCFI && !HasFP, CFAOffset); - NumBytes = 0; - CFAOffset -= NonSVELocals; - } + if (ZPR.LocalsSize) { + emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, + ZPR.LocalsSize, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset); + CFAOffset -= ZPR.LocalsSize; + } - if (ZPRLocalsSize) { - emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP, - ZPRLocalsSize, TII, MachineInstr::FrameDestroy, false, - false, nullptr, EmitCFI && !HasFP, CFAOffset); - CFAOffset -= ZPRLocalsSize; - } + StackOffset SVECalleeSavesToDealloc = SVECalleeSavesSize; + if (SVELayout == SVEStackLayout::Split && + (PPR.LocalsSize || ZPR.CalleeSavesSize)) { + assert(PPRRange.Begin == ZPRRange.End && + "Expected PPR restores after ZPR"); + emitFrameOffset(MBB, PPRRange.Begin, DL, AArch64::SP, AArch64::SP, + PPR.LocalsSize + ZPR.CalleeSavesSize, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI, EmitCFI && !HasFP, CFAOffset); + CFAOffset -= PPR.LocalsSize + ZPR.CalleeSavesSize; + SVECalleeSavesToDealloc -= ZPR.CalleeSavesSize; + } - if (PPRLocalsSize || ZPRCalleeSavedSize) { - assert(PPRRestoreBegin == ZPRRestoreEnd && - "Expected PPR restores after ZPR"); - emitFrameOffset(MBB, PPRRestoreBegin, DL, AArch64::SP, AArch64::SP, - PPRLocalsSize + ZPRCalleeSavedSize, TII, - MachineInstr::FrameDestroy, false, false, nullptr, - EmitCFI && !HasFP, CFAOffset); - CFAOffset -= PPRLocalsSize + ZPRCalleeSavedSize; - } - if (PPRCalleeSavedSize) { - emitFrameOffset(MBB, PPRRestoreEnd, DL, AArch64::SP, AArch64::SP, - PPRCalleeSavedSize, TII, MachineInstr::FrameDestroy, - false, false, nullptr, EmitCFI && !HasFP, CFAOffset); + // If split SVE is on, this dealloc PPRs, otherwise, deallocs ZPRs + PPRs: + if (SVECalleeSavesToDealloc) + emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP, + SVECalleeSavesToDealloc, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI, EmitCFI && !HasFP, CFAOffset); } - // We only emit CFI information for ZPRs so emit CFI after the ZPR restores. 
if (EmitCFI) - emitCalleeSavedSVERestores(ZPRRestoreEnd); + emitCalleeSavedSVERestores( + SVELayout == SVEStackLayout::Split ? ZPRRange.End : PPRRange.End); } if (!HasFP) { diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h index a1c9b34..bccadda 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h @@ -27,11 +27,23 @@ class AArch64Subtarget; class AArch64FunctionInfo; class AArch64FrameLowering; +struct SVEFrameSizes { + struct { + StackOffset CalleeSavesSize, LocalsSize; + } PPR, ZPR; +}; + class AArch64PrologueEpilogueCommon { public: AArch64PrologueEpilogueCommon(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64FrameLowering &AFL); + enum class SVEStackLayout { + Default, + Split, + CalleeSavesAboveFrameRecord, + }; + protected: bool requiresGetVGCall() const; @@ -53,6 +65,8 @@ protected: bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const; + SVEFrameSizes getSVEStackFrameSizes() const; + MachineFunction &MF; MachineBasicBlock &MBB; @@ -68,6 +82,7 @@ protected: bool IsFunclet = false; // Note: Set in derived constructors. bool NeedsWinCFI = false; // Note: Can be changed in emitFramePointerSetup. bool HomPrologEpilog = false; // Note: Set in derived constructors. + SVEStackLayout SVELayout = SVEStackLayout::Default; // Note: "HasWinCFI" is mutable as it can change in any "emit" function. mutable bool HasWinCFI = false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 0f2c335..ce2b4a5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -562,6 +562,11 @@ public: void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &); extern char &AMDGPURewriteAGPRCopyMFMALegacyID; +struct AMDGPUUniformIntrinsicCombinePass + : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 9449e70..a6074ea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -30,6 +30,7 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass( MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) +MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c7a91f4c..4958a20 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -526,6 +526,11 @@ static cl::opt<bool> HasClosedWorldAssumption( cl::desc("Whether has closed-world assumption at link time"), cl::init(false), cl::Hidden); +static cl::opt<bool> EnableUniformIntrinsicCombine( + "amdgpu-enable-uniform-intrinsic-combine", + cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"), + cl::init(true), cl::Hidden); + extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(getTheR600Target()); @@ 
-879,6 +884,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (EarlyInlineAll && !EnableFunctionCalls) PM.addPass(AMDGPUAlwaysInlinePass()); + + if (EnableUniformIntrinsicCombine) + PM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerPeepholeEPCallback( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp new file mode 100644 index 0000000..50c78d8 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -0,0 +1,159 @@ +//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass simplifies certain intrinsic calls when the arguments are uniform. +/// It's true that this pass has transforms that can lead to a situation where +/// some instruction whose operand was previously recognized as statically +/// uniform is later on no longer recognized as statically uniform. However, the +/// semantics of how programs execute don't (and must not, for this precise +/// reason) care about static uniformity, they only ever care about dynamic +/// uniformity. And every instruction that's downstream and cares about dynamic +/// uniformity must be convergent (and isel will introduce v_readfirstlane for +/// them if their operands can't be proven statically uniform). +/// +/// This pass is implemented as a ModulePass because intrinsic declarations +/// exist at the module scope, allowing us to skip processing entirely if no +/// declarations are present and to traverse their user lists directly when +/// they are. A FunctionPass would instead require scanning every instruction +/// in every function to find relevant intrinsics, which is far less efficient. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/UniformityAnalysis.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine" + +using namespace llvm; +using namespace llvm::AMDGPU; +using namespace llvm::PatternMatch; + +/// Wrapper for querying uniformity info that first checks locally tracked +/// instructions. +static bool +isDivergentUseWithNew(const Use &U, const UniformityInfo &UI, + const ValueMap<const Value *, bool> &Tracker) { + Value *V = U.get(); + if (auto It = Tracker.find(V); It != Tracker.end()) + return !It->second; // divergent if marked false + return UI.isDivergentUse(U); +} + +/// Optimizes uniform intrinsics calls if their operand can be proven uniform. 
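A sketch of the fold implemented below for the lane intrinsics (permlane64, readfirstlane, readlane), with illustrative value names: when the operand is provably uniform, the call is simply replaced by its argument, as in the readfirstlane_with_argument test added later in this patch.

  ; %src is a uniform kernel argument
  %v = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %src)
  store i32 %v, ptr addrspace(1) %out
  ; after the combine the intrinsic call is gone:
  store i32 %src, ptr addrspace(1) %out

Calls whose operand is divergent, for example one derived from a workitem id, are left untouched, as the *_nonuniform tests below check.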
+static bool optimizeUniformIntrinsic(IntrinsicInst &II, + const UniformityInfo &UI, + ValueMap<const Value *, bool> &Tracker) { + llvm::Intrinsic::ID IID = II.getIntrinsicID(); + + switch (IID) { + case Intrinsic::amdgcn_permlane64: + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: { + Value *Src = II.getArgOperand(0); + if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker)) + return false; + LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n'); + II.replaceAllUsesWith(Src); + II.eraseFromParent(); + return true; + } + case Intrinsic::amdgcn_ballot: { + Value *Src = II.getArgOperand(0); + if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker)) + return false; + LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n'); + + bool Changed = false; + for (User *U : make_early_inc_range(II.users())) { + if (auto *ICmp = dyn_cast<ICmpInst>(U)) { + Value *Op0 = ICmp->getOperand(0); + Value *Op1 = ICmp->getOperand(1); + ICmpInst::Predicate Pred = ICmp->getPredicate(); + Value *OtherOp = Op0 == &II ? Op1 : Op0; + + if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) { + // Case: (icmp eq %ballot, 0) -> xor %ballot_arg, 1 + Instruction *NotOp = + BinaryOperator::CreateNot(Src, "", ICmp->getIterator()); + Tracker[NotOp] = true; // NOT preserves uniformity + LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n'); + ICmp->replaceAllUsesWith(NotOp); + ICmp->eraseFromParent(); + Changed = true; + } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) { + // Case: (icmp ne %ballot, 0) -> %ballot_arg + LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: " + << *Src << '\n'); + ICmp->replaceAllUsesWith(Src); + ICmp->eraseFromParent(); + Changed = true; + } + } + } + // Erase the intrinsic if it has no remaining uses. + if (II.use_empty()) + II.eraseFromParent(); + return Changed; + } + default: + llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic"); + } + return false; +} + +/// Iterates over intrinsic declarations in the module to optimize their uses. 
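A minimal IR sketch of the ballot case handled above, with illustrative value names (this is the shape of the trivial-waterfall tests added below): the ballot itself is erased only once it has no remaining uses; its comparisons against zero are rewritten in terms of the uniform ballot argument.

  ; %not_done is provably uniform
  %ballot  = call i64 @llvm.amdgcn.ballot.i64(i1 %not_done)
  %is_done = icmp eq i64 %ballot, 0
  ; (icmp eq %ballot, 0) is replaced by:  %is_done = xor i1 %not_done, true
  ; (icmp ne %ballot, 0) is instead replaced by %not_done itself

In the waterfall tests the -O3 runs then fold the leftover xor chain and collapse the loop entirely once this rewrite has been applied.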
+static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { + bool IsChanged = false; + ValueMap<const Value *, bool> Tracker; + + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + for (Function &F : M) { + switch (F.getIntrinsicID()) { + case Intrinsic::amdgcn_permlane64: + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_ballot: + break; + default: + continue; + } + + for (User *U : make_early_inc_range(F.users())) { + auto *II = cast<IntrinsicInst>(U); + Function *ParentF = II->getFunction(); + const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF); + IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); + } + } + return IsChanged; +} + +PreservedAnalyses +AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) { + if (!runUniformIntrinsicCombine(M, AM)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve<UniformityInfoAnalysis>(); + return PA; +} diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index aae56ee..13f727b68 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -64,6 +64,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUHSAMetadataStreamer.cpp AMDGPUInsertDelayAlu.cpp AMDGPUInstCombineIntrinsic.cpp + AMDGPUUniformIntrinsicCombine.cpp AMDGPUInstrInfo.cpp AMDGPUInstructionSelector.cpp AMDGPUISelDAGToDAG.cpp diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td index 6d0529f..fb0928b8 100644 --- a/llvm/lib/Target/Hexagon/Hexagon.td +++ b/llvm/lib/Target/Hexagon/Hexagon.td @@ -110,8 +110,6 @@ def FeatureSmallData: SubtargetFeature<"small-data", "UseSmallData", "true", "Allow GP-relative addressing of global variables">; def FeatureDuplex: SubtargetFeature<"duplex", "EnableDuplex", "true", "Enable generation of duplex instruction">; -def FeatureUnsafeFP: SubtargetFeature<"unsafe-fp", "UseUnsafeMath", "true", - "Use unsafe FP math">; def FeatureReservedR19: SubtargetFeature<"reserved-r19", "ReservedR19", "true", "Reserve register R19">; def FeatureNoreturnStackElim: SubtargetFeature<"noreturn-stack-elim", @@ -167,7 +165,6 @@ def UseHVXQFloat : Predicate<"HST->useHVXQFloatOps()">, def UseHVXFloatingPoint: Predicate<"HST->useHVXFloatingPoint()">; def HasMemNoShuf : Predicate<"HST->hasMemNoShuf()">, AssemblerPredicate<(all_of FeatureMemNoShuf)>; -def UseUnsafeMath : Predicate<"HST->useUnsafeMath()">; def NotOptTinyCore : Predicate<"!HST->isTinyCore() ||" "MF->getFunction().hasOptSize()"> { let RecomputePerFunction = 1; diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index 4b23670..a0acfcf 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -1611,8 +1611,11 @@ def DfMpy: OutPatFrag<(ops node:$Rs, node:$Rt), $Rt, $Rs), $Rs, $Rt)>; -let Predicates = [HasV67,UseUnsafeMath], AddedComplexity = 50 in { - def: Pat<(fmul F64:$Rs, F64:$Rt), (DfMpy $Rs, $Rt)>; +def fmul_afn : PatFrag<(ops node:$a, node:$b), (fmul node:$a, node:$b), [{ + return N->getFlags().hasApproximateFuncs(); +}]>; +let Predicates = [HasV67], AddedComplexity = 50 in { + def : Pat<(fmul_afn F64:$Rs, F64:$Rt), (DfMpy $Rs, $Rt)>; } let Predicates = [HasV67] in { def: OpR_RR_pat<F2_dfmin, pf2<fminimumnum>, f64, F64>; diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index b111471..7430567 
100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -54,7 +54,6 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { bool UseNewValueJumps = false; bool UseNewValueStores = false; bool UseSmallData = false; - bool UseUnsafeMath = false; bool UseZRegOps = false; bool UseHVXIEEEFPOps = false; bool UseHVXQFloatOps = false; @@ -234,7 +233,6 @@ public: bool useNewValueJumps() const { return UseNewValueJumps; } bool useNewValueStores() const { return UseNewValueStores; } bool useSmallData() const { return UseSmallData; } - bool useUnsafeMath() const { return UseUnsafeMath; } bool useZRegOps() const { return UseZRegOps; } bool useCabac() const { return UseCabac; } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 0afa04a..f5d8b69 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -250,13 +250,6 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const { CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU; std::string FS = FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS; - // Append the preexisting target features last, so that +mattr overrides - // the "unsafe-fp-math" function attribute. - // Creating a separate target feature is not strictly necessary, it only - // exists to make "unsafe-fp-math" force creating a new subtarget. - - if (F.getFnAttribute("unsafe-fp-math").getValueAsBool()) - FS = FS.empty() ? "+unsafe-fp" : "+unsafe-fp," + FS; auto &I = SubtargetMap[CPU + FS]; if (!I) { diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 5b8ea15..b74a070 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -1084,8 +1084,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, auto ThenTerm = SplitBlockAndInsertIfThen( IRB.CreateIsNull(Load), &*IP, false, MDBuilder(IRB.getContext()).createUnlikelyBranchWeights()); - IRBuilder<> ThenIRB(ThenTerm); + InstrumentationIRBuilder ThenIRB(ThenTerm); auto Store = ThenIRB.CreateStore(ConstantInt::getTrue(Int1Ty), FlagPtr); + if (EntryLoc) + Store->setDebugLoc(EntryLoc); Load->setNoSanitizeMetadata(); Store->setNoSanitizeMetadata(); } @@ -1131,7 +1133,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, EstimatedStackSize >= Options.StackDepthCallbackMin) { if (InsertBefore) IRB.SetInsertPoint(InsertBefore); - IRB.CreateCall(SanCovStackDepthCallback)->setCannotMerge(); + auto Call = IRB.CreateCall(SanCovStackDepthCallback); + if (EntryLoc) + Call->setDebugLoc(EntryLoc); + Call->setCannotMerge(); } } else { // Check stack depth. If it's the deepest so far, record it. 
@@ -1144,8 +1149,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, auto ThenTerm = SplitBlockAndInsertIfThen( IsStackLower, &*IP, false, MDBuilder(IRB.getContext()).createUnlikelyBranchWeights()); - IRBuilder<> ThenIRB(ThenTerm); + InstrumentationIRBuilder ThenIRB(ThenTerm); auto Store = ThenIRB.CreateStore(FrameAddrInt, SanCovLowestStack); + if (EntryLoc) + Store->setDebugLoc(EntryLoc); LowestStack->setNoSanitizeMetadata(); Store->setNoSanitizeMetadata(); } diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index e448230..3f7003d 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -61,6 +61,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DomTreeUpdater.h" @@ -382,16 +383,9 @@ typedef DenseMap<BasicBlock *, CloneList> DuplicateBlockMap; typedef MapVector<Instruction *, std::vector<Instruction *>> DefMap; inline raw_ostream &operator<<(raw_ostream &OS, const PathType &Path) { - OS << "< "; - for (const BasicBlock *BB : Path) { - std::string BBName; - if (BB->hasName()) - raw_string_ostream(BBName) << BB->getName(); - else - raw_string_ostream(BBName) << BB; - OS << BBName << " "; - } - OS << ">"; + auto BBNames = llvm::map_range( + Path, [](const BasicBlock *BB) { return BB->getNameOrAsOperand(); }); + OS << "< " << llvm::join(BBNames, ", ") << " >"; return OS; } @@ -423,7 +417,7 @@ struct ThreadingPath { } void print(raw_ostream &OS) const { - OS << Path << " [ " << ExitVal << ", " << DBB->getName() << " ]"; + OS << Path << " [ " << ExitVal << ", " << DBB->getNameOrAsOperand() << " ]"; } private: diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-minmax.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-minmax.ll index 8d091a0..d380104 100644 --- a/llvm/test/Analysis/ScalarEvolution/trip-count-minmax.ll +++ b/llvm/test/Analysis/ScalarEvolution/trip-count-minmax.ll @@ -61,7 +61,7 @@ define void @umin(i32 noundef %a, i32 noundef %b) { ; CHECK-NEXT: Loop %for.body: backedge-taken count is (-1 + ((2 * %a) umin (4 * %b))) ; CHECK-NEXT: Loop %for.body: constant max backedge-taken count is i32 2147483646 ; CHECK-NEXT: Loop %for.body: symbolic max backedge-taken count is (-1 + ((2 * %a) umin (4 * %b))) -; CHECK-NEXT: Loop %for.body: Trip multiple is 1 +; CHECK-NEXT: Loop %for.body: Trip multiple is 2 ; ; void umin(unsigned a, unsigned b) { ; a *= 2; @@ -157,7 +157,7 @@ define void @smin(i32 noundef %a, i32 noundef %b) { ; CHECK-NEXT: Loop %for.body: backedge-taken count is (-1 + ((2 * %a)<nsw> smin (4 * %b)<nsw>)) ; CHECK-NEXT: Loop %for.body: constant max backedge-taken count is i32 2147483646 ; CHECK-NEXT: Loop %for.body: symbolic max backedge-taken count is (-1 + ((2 * %a)<nsw> smin (4 * %b)<nsw>)) -; CHECK-NEXT: Loop %for.body: Trip multiple is 1 +; CHECK-NEXT: Loop %for.body: Trip multiple is 2 ; ; void smin(signed a, signed b) { ; a *= 2; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll new file mode 100644 index 0000000..6c4f504 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll @@ -0,0 +1,452 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-enable-uniform-intrinsic-combine=0 -O3 -S < %s | FileCheck %s -check-prefix=CURRENT-CHECK +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -O3 -S < %s | FileCheck %s -check-prefix=O3-CHECK + +define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CURRENT-CHECK-NEXT: [[ENTRY:.*:]] +; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CURRENT-CHECK-NEXT: [[IS_DONE_PEEL:%.*]] = icmp eq i32 [[TMP0]], 0 +; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_PEEL]], label %[[EXIT:.*]], label %[[IF_PEEL:.*]] +; CURRENT-CHECK: [[IF_PEEL]]: +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: br label %[[EXIT]] +; CURRENT-CHECK: [[EXIT]]: +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: br label %[[WHILE:.*]] +; PASS-CHECK: [[WHILE]]: +; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] +; PASS-CHECK: [[IF]]: +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: br label %[[WHILE]] +; PASS-CHECK: [[EXIT]]: +; PASS-CHECK-NEXT: ret void +; +; O3-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero( +; O3-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; O3-CHECK-NEXT: [[ENTRY:.*:]] +; O3-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; O3-CHECK-NEXT: ret void +; +entry: + br label %while + +while: + %done = phi i1 [ 0, %entry ], [ 1, %if ] + %not_done = xor i1 %done, true + %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done) + %is_done = icmp eq i64 %ballot, 0 ; in this case is_done = !not_done + br i1 %is_done, label %exit, label %if + +if: + store i32 5, ptr addrspace(1) %out + br label %while + +exit: + ret void +} + +define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: [[ENTRY:.*:]] +; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CURRENT-CHECK-NEXT: [[IS_DONE_PEEL:%.*]] = icmp eq i32 [[TMP0]], 0 +; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_PEEL]], label %[[EXIT:.*]], label %[[IF_PEEL:.*]] +; CURRENT-CHECK: [[IF_PEEL]]: +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: br label %[[EXIT]] +; CURRENT-CHECK: [[EXIT]]: +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { 
+; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: br label %[[WHILE:.*]] +; PASS-CHECK: [[WHILE]]: +; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] +; PASS-CHECK: [[IF]]: +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: br label %[[WHILE]] +; PASS-CHECK: [[EXIT]]: +; PASS-CHECK-NEXT: ret void +; +; O3-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op( +; O3-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; O3-CHECK-NEXT: [[ENTRY:.*:]] +; O3-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; O3-CHECK-NEXT: ret void +; +entry: + br label %while + +while: + %done = phi i1 [ 0, %entry ], [ 1, %if ] + %not_done = xor i1 %done, true + %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done) + %is_done = icmp eq i64 0, %ballot ; in this case is_done = !not_done + br i1 %is_done, label %exit, label %if + +if: + store i32 5, ptr addrspace(1) %out + br label %while + +exit: + ret void +} + +define protected amdgpu_kernel void @trivial_waterfall_ne_zero(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CURRENT-CHECK-NEXT: [[ENTRY:.*:]] +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: br label %[[WHILE:.*]] +; CURRENT-CHECK: [[WHILE]]: +; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CURRENT-CHECK-NEXT: [[IS_DONE_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 +; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_NOT]], label %[[WHILE]], label %[[EXIT:.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CURRENT-CHECK: [[EXIT]]: +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: br label %[[WHILE:.*]] +; PASS-CHECK: [[WHILE]]: +; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] +; PASS-CHECK: [[IF]]: +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: br label %[[WHILE]] +; PASS-CHECK: [[EXIT]]: +; PASS-CHECK-NEXT: ret void +; +; O3-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero( +; O3-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; O3-CHECK-NEXT: [[ENTRY:.*:]] +; O3-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; O3-CHECK-NEXT: ret void +; +entry: + br label %while + +while: + %done = phi i1 [ 0, %entry ], [ 1, %if ] + %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %done) + %is_done = icmp ne i64 0, %ballot ; in this case is_done = done + br i1 %is_done, label %exit, label %if + +if: + store i32 5, ptr addrspace(1) %out + br label %while + +exit: + ret void +} + +define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void 
@trivial_waterfall_ne_zero_swap( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[ENTRY:.*:]] +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: br label %[[WHILE:.*]] +; CURRENT-CHECK: [[WHILE]]: +; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CURRENT-CHECK-NEXT: [[IS_DONE_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 +; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_NOT]], label %[[WHILE]], label %[[EXIT:.*]], !llvm.loop [[LOOP2:![0-9]+]] +; CURRENT-CHECK: [[EXIT]]: +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: br label %[[WHILE:.*]] +; PASS-CHECK: [[WHILE]]: +; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] +; PASS-CHECK: [[IF]]: +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: br label %[[WHILE]] +; PASS-CHECK: [[EXIT]]: +; PASS-CHECK-NEXT: ret void +; +; O3-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap( +; O3-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; O3-CHECK-NEXT: [[ENTRY:.*:]] +; O3-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; O3-CHECK-NEXT: ret void +; +entry: + br label %while + +while: + %done = phi i1 [ 0, %entry ], [ 1, %if ] + %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %done) + %is_done = icmp ne i64 %ballot, 0 ; in this case is_done = done + br i1 %is_done, label %exit, label %if + +if: + store i32 5, ptr addrspace(1) %out + br label %while + +exit: + ret void +} + +define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_uniform_waterfall( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: [[ENTRY:.*:]] +; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CURRENT-CHECK-NEXT: [[IS_DONE_PEEL:%.*]] = icmp eq i32 [[TMP0]], 0 +; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_PEEL]], label %[[EXIT:.*]], label %[[WORK_PEEL:.*]] +; CURRENT-CHECK: [[WORK_PEEL]]: +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: br label %[[EXIT]] +; CURRENT-CHECK: [[EXIT]]: +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_uniform_waterfall( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: br label %[[WHILE:.*]] +; PASS-CHECK: [[WHILE]]: +; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ] +; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]] +; PASS-CHECK: [[IF]]: +; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0 +; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]] +; PASS-CHECK: [[WORK]]: +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; 
PASS-CHECK-NEXT: br label %[[TAIL]] +; PASS-CHECK: [[TAIL]]: +; PASS-CHECK-NEXT: [[NEW_DONE]] = phi i1 [ true, %[[WORK]] ], [ false, %[[IF]] ] +; PASS-CHECK-NEXT: br label %[[WHILE]] +; PASS-CHECK: [[EXIT]]: +; PASS-CHECK-NEXT: ret void +; +; O3-CHECK-LABEL: define protected amdgpu_kernel void @trivial_uniform_waterfall( +; O3-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; O3-CHECK-NEXT: [[ENTRY:.*:]] +; O3-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; O3-CHECK-NEXT: ret void +; +entry: + br label %while + +while: + %done = phi i1 [ false, %entry ], [ %new_done, %tail ] + %not_done = xor i1 %done, true + %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done) + %is_done = icmp eq i64 %ballot, 0 + br i1 %is_done, label %exit, label %if + +if: + %first_active_id = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 0) + %is_first_active_id = icmp eq i32 0, %first_active_id + br i1 %is_first_active_id, label %work, label %tail + +work: + store i32 5, ptr addrspace(1) %out + br label %tail + +tail: + %new_done = phi i1 [ true, %work ], [ false, %if ] + br label %while + +exit: + ret void +} + +define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i32 %mymask) { +; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @uniform_waterfall( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]], i32 [[MYMASK:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: [[ENTRY:.*:]] +; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CURRENT-CHECK-NEXT: [[IS_DONE_PEEL:%.*]] = icmp eq i32 [[TMP0]], 0 +; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_PEEL]], label %[[EXIT:.*]], label %[[WORK_PEEL:.*]] +; CURRENT-CHECK: [[WORK_PEEL]]: +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: br label %[[EXIT]] +; CURRENT-CHECK: [[EXIT]]: +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define protected amdgpu_kernel void @uniform_waterfall( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[MYMASK:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: br label %[[WHILE:.*]] +; PASS-CHECK: [[WHILE]]: +; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ] +; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]] +; PASS-CHECK: [[IF]]: +; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]] +; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]] +; PASS-CHECK: [[WORK]]: +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: br label %[[TAIL]] +; PASS-CHECK: [[TAIL]]: +; PASS-CHECK-NEXT: [[NEW_DONE]] = phi i1 [ true, %[[WORK]] ], [ false, %[[IF]] ] +; PASS-CHECK-NEXT: br label %[[WHILE]] +; PASS-CHECK: [[EXIT]]: +; PASS-CHECK-NEXT: ret void +; +; O3-CHECK-LABEL: define protected amdgpu_kernel void @uniform_waterfall( +; O3-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]], i32 [[MYMASK:%.*]]) local_unnamed_addr #[[ATTR0]] { +; O3-CHECK-NEXT: [[ENTRY:.*:]] +; O3-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; O3-CHECK-NEXT: ret void +; +entry: + br label %while + +while: + %done = phi i1 [ false, %entry ], [ %new_done, %tail ] + %not_done = xor i1 
%done, true + %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done) + %is_done = icmp eq i64 %ballot, 0 + br i1 %is_done, label %exit, label %if + +if: + %first_active_id = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 %mymask) + %is_first_active_id = icmp eq i32 %mymask, %first_active_id + br i1 %is_first_active_id, label %work, label %tail + +work: + store i32 5, ptr addrspace(1) %out + br label %tail + +tail: + %new_done = phi i1 [ true, %work ], [ false, %if ] + br label %while + +exit: + ret void +} + +define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: [[ENTRY:.*:]] +; CURRENT-CHECK-NEXT: [[BALLOT_PEEL:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CURRENT-CHECK-NEXT: [[IS_DONE_PEEL:%.*]] = icmp eq i32 [[BALLOT_PEEL]], 0 +; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_PEEL]], label %[[EXIT:.*]], label %[[IF_PEEL:.*]] +; CURRENT-CHECK: [[IF_PEEL]]: +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: br label %[[EXIT]] +; CURRENT-CHECK: [[EXIT]]: +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: br label %[[WHILE:.*]] +; PASS-CHECK: [[WHILE]]: +; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] +; PASS-CHECK: [[IF]]: +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: br label %[[WHILE]] +; PASS-CHECK: [[EXIT]]: +; PASS-CHECK-NEXT: ret void +; +; O3-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32( +; O3-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; O3-CHECK-NEXT: [[ENTRY:.*:]] +; O3-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; O3-CHECK-NEXT: ret void +; +entry: + br label %while + +while: + %done = phi i1 [ 0, %entry ], [ 1, %if ] + %not_done = xor i1 %done, true + %ballot = tail call i32 @llvm.amdgcn.ballot.i32(i1 %not_done) + %is_done = icmp eq i32 %ballot, 0 ; in this case is_done = !not_done + br i1 %is_done, label %exit, label %if + +if: + store i32 5, ptr addrspace(1) %out + br label %while + +exit: + ret void +} + +define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[ENTRY:.*:]] +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: br label %[[WHILE:.*]] +; CURRENT-CHECK: [[WHILE]]: +; CURRENT-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CURRENT-CHECK-NEXT: [[IS_DONE_NOT:%.*]] = icmp eq i32 [[BALLOT]], 0 +; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_NOT]], label %[[WHILE]], label %[[EXIT:.*]], !llvm.loop [[LOOP3:![0-9]+]] +; CURRENT-CHECK: 
[[EXIT]]: +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: br label %[[WHILE:.*]] +; PASS-CHECK: [[WHILE]]: +; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] +; PASS-CHECK: [[IF]]: +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: br label %[[WHILE]] +; PASS-CHECK: [[EXIT]]: +; PASS-CHECK-NEXT: ret void +; +; O3-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32( +; O3-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; O3-CHECK-NEXT: [[ENTRY:.*:]] +; O3-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; O3-CHECK-NEXT: ret void +; +entry: + br label %while + +while: + %done = phi i1 [ 0, %entry ], [ 1, %if ] + %ballot = tail call i32 @llvm.amdgcn.ballot.i32(i1 %done) + %is_done = icmp ne i32 0, %ballot ; in this case is_done = done + br i1 %is_done, label %exit, label %if + +if: + store i32 5, ptr addrspace(1) %out + br label %while + +exit: + ret void +} + +declare i64 @llvm.amdgcn.ballot.i64(i1) #1 +!6 = !{i64 690} +!7 = distinct !{!7, !8} +!8 = !{!"llvm.loop.mustprogress"} +;. +; CURRENT-CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; CURRENT-CHECK: [[META1]] = !{!"llvm.loop.peeled.count", i32 1} +; CURRENT-CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]} +; CURRENT-CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll new file mode 100644 index 0000000..aa11574 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll @@ -0,0 +1,790 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-enable-uniform-intrinsic-combine=0 -O3 -S < %s | FileCheck %s -check-prefix=CURRENT-CHECK +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s -check-prefix=DCE-CHECK + +define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CURRENT-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; PASS-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; DCE-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %v = call i32 @llvm.amdgcn.permlane64(i32 77) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) 
%out, i32 %src) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]], i32 [[SRC:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %v = call i32 @llvm.amdgcn.permlane64(i32 %src) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CURRENT-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]]) +; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TID]] to i64 +; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]] +; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]]) +; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]] +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]]) +; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]] +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane64(i32 %tid) + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + store i32 %v, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[TID2:%.*]] = add nuw nsw i32 [[TID]], 1 +; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]]) +; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TID]] to i64 +; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]] +; 
CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1 +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]]) +; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]] +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1 +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]]) +; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]] +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid2 = add i32 %tid, 1 + %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2) + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + store i32 %v, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_constant( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_constant( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_constant( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %v = call i32 @llvm.amdgcn.readlane(i32 7, i32 5) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %v = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel 
void @readlane_nonuniform_workitem(i32 addrspace(1)* %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TIDX]] to i64 +; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]] +; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]] +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]] +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy) + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tidx + store i32 %v, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { +; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; CURRENT-CHECK-NEXT: [[TIDX2:%.*]] = add nuw nsw i32 [[TIDX]], 1 +; CURRENT-CHECK-NEXT: [[TIDY2:%.*]] = add nuw nsw i32 [[TIDY]], 2 +; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]]) +; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TIDX]] to i64 +; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]] +; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; PASS-CHECK-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1 +; 
PASS-CHECK-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2 +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]]) +; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]] +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; DCE-CHECK-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1 +; DCE-CHECK-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2 +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]]) +; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]] +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %tidx2 = add i32 %tidx, 1 + %tidy2 = add i32 %tidy, 2 + %v = call i32 @llvm.amdgcn.readlane(i32 %tidx2, i32 %tidy2) + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tidx + store i32 %v, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %v = call i32 @llvm.amdgcn.readfirstlane(i32 7) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]], i32 [[SRC0:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void 
@readfirstlane_with_workitem_id( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]]) +; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TID]] to i64 +; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]] +; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]]) +; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]] +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]]) +; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]] +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid) + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + store i32 %v, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_expression( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[TID2:%.*]] = add nuw nsw i32 [[TID]], 1 +; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]]) +; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TID2]] to i64 +; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]] +; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_expression( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1 +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]]) +; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID2]] +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_expression( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1 +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]]) +; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, 
ptr addrspace(1) [[OUT]], i32 [[TID2]] +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid2 = add i32 %tid, 1 + %v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid2) + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2 + store i32 %v, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5) + %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1) + store i32 %v2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { +; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; CURRENT-CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; CURRENT-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy) + %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1) + store i32 %v2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_with_firstlane( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly 
captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]]) +; CURRENT-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_with_firstlane( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]]) +; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_with_firstlane( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]]) +; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx) + %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3) + store i32 %v2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_readlane( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { +; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; CURRENT-CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; CURRENT-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_readlane( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_readlane( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy) + %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2) + store i32 %v2, ptr addrspace(1) %out + ret void +} + + +define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr addrspace(1) %out_max) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT_MIN:%.*]], ptr addrspace(1) writeonly captures(none) 
initializes((0, 4)) [[OUT_MAX:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4 +; CURRENT-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4 +; PASS-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4 +; DCE-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4 +; DCE-CHECK-NEXT: ret void +; + %min_v = call i32 @llvm.amdgcn.permlane64(i32 -2147483648) + store i32 %min_v, ptr addrspace(1) %out_min + %max_v = call i32 @llvm.amdgcn.permlane64(i32 2147483647) + store i32 %max_v, ptr addrspace(1) %out_max + ret void +} + +define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_cross_lane( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = add nuw nsw i32 [[TIDX]], 5 +; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_cross_lane( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5 +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_cross_lane( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5 +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = add i32 %tidx, 5 + %v = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456 +; PASS-CHECK-NEXT: store i32 
[[RANDOM]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456 +; DCE-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %random = xor i32 123, 456 + %v = call i32 @llvm.amdgcn.readfirstlane(i32 %random) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_expression( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[IDX1:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[IDX2:%.*]] = shl nuw nsw i32 [[IDX1]], 1 +; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]]) +; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_expression( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[IDX2:%.*]] = mul i32 [[IDX1]], 2 +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]]) +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_expression( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[IDX2:%.*]] = mul i32 [[IDX1]], 2 +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]]) +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %idx1 = call i32 @llvm.amdgcn.workitem.id.x() + %idx2 = mul i32 %idx1, 2 + %v = call i32 @llvm.amdgcn.readlane(i32 %idx1, i32 %idx2) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ballot_i32(i32 %v, ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @ballot_i32( +; CURRENT-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) writeonly captures(none) initializes((0, 1)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; CURRENT-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[C]]) +; CURRENT-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[BALLOT]], 0 +; CURRENT-CHECK-NEXT: store i1 [[BALLOT_NE_ZERO]], ptr addrspace(1) [[OUT]], align 1 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i32( +; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @ballot_i32( +; DCE-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; DCE-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1 +; DCE-CHECK-NEXT: ret void +; + %c = trunc i32 %v to i1 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_ne_zero = icmp ne i32 
%ballot, 0 + store i1 %ballot_ne_zero, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ballot_i64(i32 %v, ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @ballot_i64( +; CURRENT-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) writeonly captures(none) initializes((0, 1)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[C]]) +; CURRENT-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[TMP1]], 0 +; CURRENT-CHECK-NEXT: store i1 [[BALLOT_NE_ZERO]], ptr addrspace(1) [[OUT]], align 1 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i64( +; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @ballot_i64( +; DCE-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; DCE-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1 +; DCE-CHECK-NEXT: ret void +; + %c = trunc i32 %v to i1 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + store i1 %ballot_ne_zero, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_readlane_i16(i16 %src0, i32 %src1) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i16( +; CURRENT-CHECK-SAME: i16 [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +; CURRENT-CHECK-NEXT: tail call void asm sideeffect " +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i16( +; PASS-CHECK-SAME: i16 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: call void asm sideeffect " +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i16( +; DCE-CHECK-SAME: i16 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: call void asm sideeffect " +; DCE-CHECK-NEXT: ret void +; + %readlane = call i16 @llvm.amdgcn.readlane.i16(i16 %src0, i32 %src1) + call void asm sideeffect "; use $0", "s"(i16 %readlane) + ret void +} + +define amdgpu_kernel void @test_readlane_i64(i64 %src0, i32 %src1) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i64( +; CURRENT-CHECK-SAME: i64 [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] { +; CURRENT-CHECK-NEXT: tail call void asm sideeffect " +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i64( +; PASS-CHECK-SAME: i64 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: call void asm sideeffect " +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i64( +; DCE-CHECK-SAME: i64 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: call void asm sideeffect " +; DCE-CHECK-NEXT: ret void +; + %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %src0, i32 %src1) + call void asm sideeffect "; use $0", "s"(i64 %readlane) + ret void +} + +define amdgpu_kernel void @test_readlane_bf16(bfloat %src0, i32 %src1) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_bf16( +; CURRENT-CHECK-SAME: bfloat [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] { +; CURRENT-CHECK-NEXT: tail call void asm sideeffect " +; 
CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_bf16( +; PASS-CHECK-SAME: bfloat [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: call void asm sideeffect " +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_bf16( +; DCE-CHECK-SAME: bfloat [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: call void asm sideeffect " +; DCE-CHECK-NEXT: ret void +; + %readlane = call bfloat @llvm.amdgcn.readlane.bf16(bfloat %src0, i32 %src1) + call void asm sideeffect "; use $0", "s"(bfloat %readlane) + ret void +} + +define amdgpu_kernel void @test_readlane_f16(half %src0, i32 %src1) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f16( +; CURRENT-CHECK-SAME: half [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] { +; CURRENT-CHECK-NEXT: tail call void asm sideeffect " +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f16( +; PASS-CHECK-SAME: half [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: call void asm sideeffect " +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f16( +; DCE-CHECK-SAME: half [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: call void asm sideeffect " +; DCE-CHECK-NEXT: ret void +; + %readlane = call half @llvm.amdgcn.readlane.f16(half %src0, i32 %src1) + call void asm sideeffect "; use $0", "s"(half %readlane) + ret void +} + +define amdgpu_kernel void @test_readlane_f32(float %src0, i32 %src1) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f32( +; CURRENT-CHECK-SAME: float [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] { +; CURRENT-CHECK-NEXT: tail call void asm sideeffect " +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f32( +; PASS-CHECK-SAME: float [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: call void asm sideeffect " +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f32( +; DCE-CHECK-SAME: float [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: call void asm sideeffect " +; DCE-CHECK-NEXT: ret void +; + %readlane = call float @llvm.amdgcn.readlane.f32(float %src0, i32 %src1) + call void asm sideeffect "; use $0", "s"(float %readlane) + ret void +} + +define amdgpu_kernel void @test_readlane_f64(double %src0, i32 %src1) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f64( +; CURRENT-CHECK-SAME: double [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] { +; CURRENT-CHECK-NEXT: tail call void asm sideeffect " +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f64( +; PASS-CHECK-SAME: double [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: call void asm sideeffect " +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f64( +; DCE-CHECK-SAME: double [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: call void asm sideeffect " +; DCE-CHECK-NEXT: ret void +; + %readlane = call double @llvm.amdgcn.readlane.f64(double %src0, i32 %src1) + call void asm sideeffect "; use $0", "s"(double %readlane) + ret void +} +; All such cases can be optimized, given a generic way to query getDeclarationIfExists() +define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) { +; CURRENT-CHECK-LABEL: define void 
@test_readlane_v8i16( +; CURRENT-CHECK-SAME: ptr addrspace(1) readnone captures(none) [[OUT:%.*]], <8 x i16> [[SRC:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] { +; CURRENT-CHECK-NEXT: [[X:%.*]] = tail call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> [[SRC]], i32 [[SRC1]]) +; CURRENT-CHECK-NEXT: tail call void asm sideeffect " +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define void @test_readlane_v8i16( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], <8 x i16> [[SRC:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[X:%.*]] = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> [[SRC]], i32 [[SRC1]]) +; PASS-CHECK-NEXT: call void asm sideeffect " +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define void @test_readlane_v8i16( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], <8 x i16> [[SRC:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[X:%.*]] = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> [[SRC]], i32 [[SRC1]]) +; DCE-CHECK-NEXT: call void asm sideeffect " +; DCE-CHECK-NEXT: ret void +; + %x = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<8 x i16> %x) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll new file mode 100644 index 0000000..2fde3e3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,instcombine,early-cse,simplifycfg -S < %s | FileCheck %s -check-prefix=COMB-CHECK + +; This should not be optimized +define amdgpu_cs void @temporal_divergence(ptr addrspace(1) %out, i32 %n) { +; PASS-CHECK-LABEL: define amdgpu_cs void @temporal_divergence( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: br label %[[H:.*]] +; PASS-CHECK: [[H]]: +; PASS-CHECK-NEXT: [[UNI_MERGE_H:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[UNI_INC:%.*]], %[[H]] ] +; PASS-CHECK-NEXT: [[UNI_INC]] = add i32 [[UNI_MERGE_H]], 1 +; PASS-CHECK-NEXT: [[DIV_EXITX:%.*]] = icmp eq i32 [[TID]], 0 +; PASS-CHECK-NEXT: br i1 [[DIV_EXITX]], label %[[X:.*]], label %[[H]] +; PASS-CHECK: [[X]]: +; PASS-CHECK-NEXT: [[UNI_JOIN:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[UNI_INC]]) +; PASS-CHECK-NEXT: [[JOIN_USER:%.*]] = add i32 [[UNI_JOIN]], 5 +; PASS-CHECK-NEXT: store i32 [[JOIN_USER]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; COMB-CHECK-LABEL: define amdgpu_cs void @temporal_divergence( +; COMB-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; COMB-CHECK-NEXT: [[ENTRY:.*]]: +; COMB-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; COMB-CHECK-NEXT: br label %[[H:.*]] +; COMB-CHECK: [[H]]: +; COMB-CHECK-NEXT: [[UNI_MERGE_H:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[UNI_INC:%.*]], %[[H]] ] +; COMB-CHECK-NEXT: [[UNI_INC]] = add i32 [[UNI_MERGE_H]], 1 +; COMB-CHECK-NEXT: [[DIV_EXITX:%.*]] = icmp eq i32 [[TID]], 0 +; COMB-CHECK-NEXT: br i1 [[DIV_EXITX]], label %[[X:.*]], label %[[H]] +; COMB-CHECK: 
[[X]]: +; COMB-CHECK-NEXT: [[UNI_JOIN:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[UNI_INC]]) +; COMB-CHECK-NEXT: [[JOIN_USER:%.*]] = add i32 [[UNI_JOIN]], 5 +; COMB-CHECK-NEXT: store i32 [[JOIN_USER]], ptr addrspace(1) [[OUT]], align 4 +; COMB-CHECK-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp eq i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch + +X: + %uni.join = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %uni.inc) + %join.user = add i32 %uni.join, 5 + store i32 %join.user, ptr addrspace(1) %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() +declare i32 @llvm.amdgcn.readfirstlane.i32(i32) diff --git a/llvm/test/CodeGen/Hexagon/fmul-v67.ll b/llvm/test/CodeGen/Hexagon/fmul-v67.ll index 49098cd..fc0b7f7 100644 --- a/llvm/test/CodeGen/Hexagon/fmul-v67.ll +++ b/llvm/test/CodeGen/Hexagon/fmul-v67.ll @@ -29,7 +29,7 @@ b2: ; CHECK: [[R22]] += dfmpylh([[R20]],[[R21]]) ; CHECK: [[R22]] += dfmpylh([[R21]],[[R20]]) ; CHECK: [[R22]] += dfmpyhh([[R20]],[[R21]]) -define double @test_02(double %a0, double %a1) #2 { +define double @test_02(double %a0, double %a1) #1 { b2: %v3 = fmul double %a0, %a1 ret double %v3 @@ -40,13 +40,11 @@ b2: ; CHECK: [[R30]] += dfmpylh(r1:0,r3:2) ; CHECK: [[R30]] += dfmpylh(r3:2,r1:0) ; CHECK: [[R30]] += dfmpyhh(r1:0,r3:2) -define double @test_03(double %a0, double %a1) #3 { +define double @test_03(double %a0, double %a1) #1 { b2: - %v3 = fmul double %a0, %a1 + %v3 = fmul afn double %a0, %a1 ret double %v3 } attributes #0 = { nounwind } attributes #1 = { nounwind "target-cpu"="hexagonv67" } -attributes #2 = { nounwind "target-cpu"="hexagonv67" "unsafe-fp-math"="false" } -attributes #3 = { nounwind "target-cpu"="hexagonv67" "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll b/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll index 1da516a..80b4048 100644 --- a/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll +++ b/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll @@ -1,15 +1,15 @@ ; REQUIRES: x86_64-linux -; RUN: not llc -o /dev/null -print-mir2vec-vocab %s 2>&1 | FileCheck %s --check-prefix=CHECK-INVALID -; RUN: not llc -o /dev/null -print-mir2vec-vocab -mir2vec-vocab-path=%S/Inputs/mir2vec_zero_vocab.json %s 2>&1 | FileCheck %s --check-prefix=CHECK-ZERO-DIM -; RUN: not llc -o /dev/null -print-mir2vec-vocab -mir2vec-vocab-path=%S/Inputs/mir2vec_invalid_vocab.json %s 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ENTITIES -; RUN: not llc -o /dev/null -print-mir2vec-vocab -mir2vec-vocab-path=%S/Inputs/mir2vec_inconsistent_dims.json %s 2>&1 | FileCheck %s --check-prefix=CHECK-INCONSISTENT-DIMS +; RUN: llc -o /dev/null -print-mir2vec-vocab %s 2>&1 | FileCheck %s --check-prefix=CHECK-INVALID +; RUN: llc -o /dev/null -print-mir2vec-vocab -mir2vec-vocab-path=%S/Inputs/mir2vec_zero_vocab.json %s 2>&1 | FileCheck %s --check-prefix=CHECK-ZERO-DIM +; RUN: llc -o /dev/null -print-mir2vec-vocab -mir2vec-vocab-path=%S/Inputs/mir2vec_invalid_vocab.json %s 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ENTITIES +; RUN: llc -o /dev/null -print-mir2vec-vocab -mir2vec-vocab-path=%S/Inputs/mir2vec_inconsistent_dims.json %s 2>&1 | FileCheck %s --check-prefix=CHECK-INCONSISTENT-DIMS define dso_local void @test() { entry: ret void } -; CHECK-INVALID: error: MIR2Vec vocabulary file path not specified; set it using --mir2vec-vocab-path -; CHECK-ZERO-DIM: error: 
Dimension of 'entities' section of the vocabulary is zero -; CHECK-NO-ENTITIES: error: Missing 'entities' section in vocabulary file -; CHECK-INCONSISTENT-DIMS: error: All vectors in the 'entities' section of the vocabulary are not of the same dimension +; CHECK-INVALID: MIR2Vec Vocabulary Printer: Failed to get vocabulary - MIR2Vec vocabulary file path not specified; set it using --mir2vec-vocab-path +; CHECK-ZERO-DIM: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Dimension of 'entities' section of the vocabulary is zero +; CHECK-NO-ENTITIES: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Missing 'entities' section in vocabulary file +; CHECK-INCONSISTENT-DIMS: MIR2Vec Vocabulary Printer: Failed to get vocabulary - All vectors in the 'entities' section of the vocabulary are not of the same dimension diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll index 41a0e81..1edb387 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll @@ -12,63 +12,104 @@ declare void @llvm.nvvm.tcgen05.alloc.cg2(ptr %addr, i32 %ncols) declare void @llvm.nvvm.tcgen05.alloc.shared.cg1(ptr addrspace(3) %addr, i32 %ncols) declare void @llvm.nvvm.tcgen05.alloc.shared.cg2(ptr addrspace(3) %addr, i32 %ncols) -; CHECK-LABEL: test_tcgen05_alloc -define void @test_tcgen05_alloc(ptr %addr, i32 %ncols) { -; CHECK_PTX64-LABEL: test_tcgen05_alloc( +define void @test_tcgen05_alloc_cg1(ptr %addr, i32 %ncols) { +; CHECK_PTX64-LABEL: test_tcgen05_alloc_cg1( ; CHECK_PTX64: { ; CHECK_PTX64-NEXT: .reg .b32 %r<2>; ; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64-EMPTY: ; CHECK_PTX64-NEXT: // %bb.0: -; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_param_0]; -; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_param_1]; +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_cg1_param_0]; +; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_cg1_param_1]; ; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.b32 [%rd1], %r1; -; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.b32 [%rd1], %r1; ; CHECK_PTX64-NEXT: ret; ; -; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_alloc( +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_alloc_cg1( ; CHECK_PTX64_SHARED32: { ; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>; ; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64_SHARED32-EMPTY: ; CHECK_PTX64_SHARED32-NEXT: // %bb.0: -; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_param_0]; -; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_param_1]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_cg1_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_cg1_param_1]; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.b32 [%rd1], %r1; -; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.b32 [%rd1], %r1; ; CHECK_PTX64_SHARED32-NEXT: ret; call void @llvm.nvvm.tcgen05.alloc.cg1(ptr %addr, i32 %ncols) - call void @llvm.nvvm.tcgen05.alloc.cg2(ptr %addr, i32 %ncols) + ret void +} +define void @test_tcgen05_alloc_cg2(ptr %addr, i32 %ncols) { +; CHECK_PTX64-LABEL: test_tcgen05_alloc_cg2( +; CHECK_PTX64: { +; CHECK_PTX64-NEXT: .reg .b32 %r<2>; +; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-NEXT: // %bb.0: +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_cg2_param_0]; +; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_cg2_param_1]; +; CHECK_PTX64-NEXT: 
tcgen05.alloc.cta_group::2.sync.aligned.b32 [%rd1], %r1; +; CHECK_PTX64-NEXT: ret; +; +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_alloc_cg2( +; CHECK_PTX64_SHARED32: { +; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>; +; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-NEXT: // %bb.0: +; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_cg2_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_cg2_param_1]; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.b32 [%rd1], %r1; +; CHECK_PTX64_SHARED32-NEXT: ret; + call void @llvm.nvvm.tcgen05.alloc.cg2(ptr %addr, i32 %ncols) ret void } -; CHECK-LABEL: test_tcgen05_alloc_shared -define void @test_tcgen05_alloc_shared(ptr addrspace(3) %addr, i32 %ncols) { -; CHECK_PTX64-LABEL: test_tcgen05_alloc_shared( +define void @test_tcgen05_alloc_shared_cg1(ptr addrspace(3) %addr, i32 %ncols) { +; CHECK_PTX64-LABEL: test_tcgen05_alloc_shared_cg1( ; CHECK_PTX64: { ; CHECK_PTX64-NEXT: .reg .b32 %r<2>; ; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64-EMPTY: ; CHECK_PTX64-NEXT: // %bb.0: -; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_shared_param_0]; -; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_shared_param_1]; +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_shared_cg1_param_0]; +; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_shared_cg1_param_1]; ; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%rd1], %r1; -; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%rd1], %r1; ; CHECK_PTX64-NEXT: ret; ; -; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_alloc_shared( +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_alloc_shared_cg1( ; CHECK_PTX64_SHARED32: { ; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<3>; ; CHECK_PTX64_SHARED32-EMPTY: ; CHECK_PTX64_SHARED32-NEXT: // %bb.0: -; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_shared_param_0]; -; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r2, [test_tcgen05_alloc_shared_param_1]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_shared_cg1_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r2, [test_tcgen05_alloc_shared_cg1_param_1]; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%r1], %r2; -; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%r1], %r2; ; CHECK_PTX64_SHARED32-NEXT: ret; call void @llvm.nvvm.tcgen05.alloc.shared.cg1(ptr addrspace(3) %addr, i32 %ncols) + ret void +} +define void @test_tcgen05_alloc_shared_cg2(ptr addrspace(3) %addr, i32 %ncols) { +; CHECK_PTX64-LABEL: test_tcgen05_alloc_shared_cg2( +; CHECK_PTX64: { +; CHECK_PTX64-NEXT: .reg .b32 %r<2>; +; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-NEXT: // %bb.0: +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_shared_cg2_param_0]; +; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_shared_cg2_param_1]; +; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%rd1], %r1; +; CHECK_PTX64-NEXT: ret; +; +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_alloc_shared_cg2( +; CHECK_PTX64_SHARED32: { +; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<3>; +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-NEXT: // %bb.0: +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_shared_cg2_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r2, [test_tcgen05_alloc_shared_cg2_param_1]; +; 
CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%r1], %r2; +; CHECK_PTX64_SHARED32-NEXT: ret; call void @llvm.nvvm.tcgen05.alloc.shared.cg2(ptr addrspace(3) %addr, i32 %ncols) ret void } @@ -76,31 +117,50 @@ define void @test_tcgen05_alloc_shared(ptr addrspace(3) %addr, i32 %ncols) { declare void @llvm.nvvm.tcgen05.dealloc.cg1(ptr addrspace(6) %tmem_addr, i32 %ncols) declare void @llvm.nvvm.tcgen05.dealloc.cg2(ptr addrspace(6) %tmem_addr, i32 %ncols) -; CHECK-LABEL: test_tcgen05_dealloc -define void @test_tcgen05_dealloc(ptr addrspace(6) %tmem_addr, i32 %ncols) { -; CHECK_PTX64-LABEL: test_tcgen05_dealloc( +define void @test_tcgen05_dealloc_cg1(ptr addrspace(6) %tmem_addr, i32 %ncols) { +; CHECK_PTX64-LABEL: test_tcgen05_dealloc_cg1( ; CHECK_PTX64: { ; CHECK_PTX64-NEXT: .reg .b32 %r<3>; ; CHECK_PTX64-EMPTY: ; CHECK_PTX64-NEXT: // %bb.0: -; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_dealloc_param_0]; -; CHECK_PTX64-NEXT: ld.param.b32 %r2, [test_tcgen05_dealloc_param_1]; +; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_dealloc_cg1_param_0]; +; CHECK_PTX64-NEXT: ld.param.b32 %r2, [test_tcgen05_dealloc_cg1_param_1]; ; CHECK_PTX64-NEXT: tcgen05.dealloc.cta_group::1.sync.aligned.b32 %r1, %r2; -; CHECK_PTX64-NEXT: tcgen05.dealloc.cta_group::2.sync.aligned.b32 %r1, %r2; ; CHECK_PTX64-NEXT: ret; ; -; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_dealloc( +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_dealloc_cg1( ; CHECK_PTX64_SHARED32: { ; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<3>; ; CHECK_PTX64_SHARED32-EMPTY: ; CHECK_PTX64_SHARED32-NEXT: // %bb.0: -; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_dealloc_param_0]; -; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r2, [test_tcgen05_dealloc_param_1]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_dealloc_cg1_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r2, [test_tcgen05_dealloc_cg1_param_1]; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.dealloc.cta_group::1.sync.aligned.b32 %r1, %r2; -; CHECK_PTX64_SHARED32-NEXT: tcgen05.dealloc.cta_group::2.sync.aligned.b32 %r1, %r2; ; CHECK_PTX64_SHARED32-NEXT: ret; call void @llvm.nvvm.tcgen05.dealloc.cg1(ptr addrspace(6) %tmem_addr, i32 %ncols) + ret void +} +define void @test_tcgen05_dealloc_cg2(ptr addrspace(6) %tmem_addr, i32 %ncols) { +; CHECK_PTX64-LABEL: test_tcgen05_dealloc_cg2( +; CHECK_PTX64: { +; CHECK_PTX64-NEXT: .reg .b32 %r<3>; +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-NEXT: // %bb.0: +; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_dealloc_cg2_param_0]; +; CHECK_PTX64-NEXT: ld.param.b32 %r2, [test_tcgen05_dealloc_cg2_param_1]; +; CHECK_PTX64-NEXT: tcgen05.dealloc.cta_group::2.sync.aligned.b32 %r1, %r2; +; CHECK_PTX64-NEXT: ret; +; +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_dealloc_cg2( +; CHECK_PTX64_SHARED32: { +; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<3>; +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-NEXT: // %bb.0: +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_dealloc_cg2_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r2, [test_tcgen05_dealloc_cg2_param_1]; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.dealloc.cta_group::2.sync.aligned.b32 %r1, %r2; +; CHECK_PTX64_SHARED32-NEXT: ret; call void @llvm.nvvm.tcgen05.dealloc.cg2(ptr addrspace(6) %tmem_addr, i32 %ncols) ret void } @@ -108,27 +168,42 @@ define void @test_tcgen05_dealloc(ptr addrspace(6) %tmem_addr, i32 %ncols) { declare void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg1() declare void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg2() -; CHECK-LABEL: 
test_tcgen05_relinquish_alloc_permit -define void @test_tcgen05_relinquish_alloc_permit() { -; CHECK_PTX64-LABEL: test_tcgen05_relinquish_alloc_permit( +define void @test_tcgen05_relinquish_alloc_permit_cg1() { +; CHECK_PTX64-LABEL: test_tcgen05_relinquish_alloc_permit_cg1( ; CHECK_PTX64: { ; CHECK_PTX64-EMPTY: ; CHECK_PTX64-EMPTY: ; CHECK_PTX64-NEXT: // %bb.0: ; CHECK_PTX64-NEXT: tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned; -; CHECK_PTX64-NEXT: tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned; ; CHECK_PTX64-NEXT: ret; ; -; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_relinquish_alloc_permit( +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_relinquish_alloc_permit_cg1( ; CHECK_PTX64_SHARED32: { ; CHECK_PTX64_SHARED32-EMPTY: ; CHECK_PTX64_SHARED32-EMPTY: ; CHECK_PTX64_SHARED32-NEXT: // %bb.0: ; CHECK_PTX64_SHARED32-NEXT: tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned; -; CHECK_PTX64_SHARED32-NEXT: tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned; ; CHECK_PTX64_SHARED32-NEXT: ret; call void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg1() + ret void +} +define void @test_tcgen05_relinquish_alloc_permit_cg2() { +; CHECK_PTX64-LABEL: test_tcgen05_relinquish_alloc_permit_cg2( +; CHECK_PTX64: { +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-NEXT: // %bb.0: +; CHECK_PTX64-NEXT: tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned; +; CHECK_PTX64-NEXT: ret; +; +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_relinquish_alloc_permit_cg2( +; CHECK_PTX64_SHARED32: { +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-NEXT: // %bb.0: +; CHECK_PTX64_SHARED32-NEXT: tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned; +; CHECK_PTX64_SHARED32-NEXT: ret; call void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg2() ret void } diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll index 7981feb..2e80c4c 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll @@ -11,57 +11,93 @@ declare void @llvm.nvvm.tcgen05.commit.cg2(ptr %bar_addr) declare void @llvm.nvvm.tcgen05.commit.shared.cg1(ptr addrspace(3) %bar_addr) declare void @llvm.nvvm.tcgen05.commit.shared.cg2(ptr addrspace(3) %bar_addr) -; CHECK-LABEL: test_tcgen05_commit -define void @test_tcgen05_commit(ptr %bar_addr) { -; CHECK_PTX64-LABEL: test_tcgen05_commit( +define void @test_tcgen05_commit_cg1(ptr %bar_addr) { +; CHECK_PTX64-LABEL: test_tcgen05_commit_cg1( ; CHECK_PTX64: { ; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64-EMPTY: ; CHECK_PTX64-NEXT: // %bb.0: -; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_param_0]; +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_cg1_param_0]; ; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; -; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; ; CHECK_PTX64-NEXT: ret; ; -; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit( +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_cg1( ; CHECK_PTX64_SHARED32: { ; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64_SHARED32-EMPTY: ; CHECK_PTX64_SHARED32-NEXT: // %bb.0: -; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_cg1_param_0]; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; -; CHECK_PTX64_SHARED32-NEXT: 
tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; ; CHECK_PTX64_SHARED32-NEXT: ret; call void @llvm.nvvm.tcgen05.commit.cg1(ptr %bar_addr) + ret void +} + +define void @test_tcgen05_commit_cg2(ptr %bar_addr) { +; CHECK_PTX64-LABEL: test_tcgen05_commit_cg2( +; CHECK_PTX64: { +; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-NEXT: // %bb.0: +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_cg2_param_0]; +; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; +; CHECK_PTX64-NEXT: ret; +; +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_cg2( +; CHECK_PTX64_SHARED32: { +; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-NEXT: // %bb.0: +; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_cg2_param_0]; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; +; CHECK_PTX64_SHARED32-NEXT: ret; call void @llvm.nvvm.tcgen05.commit.cg2(ptr %bar_addr) ret void } -; CHECK-LABEL: test_tcgen05_commit_shared -define void @test_tcgen05_commit_shared(ptr addrspace(3) %bar_addr) { -; CHECK_PTX64-LABEL: test_tcgen05_commit_shared( +define void @test_tcgen05_commit_shared_cg1(ptr addrspace(3) %bar_addr) { +; CHECK_PTX64-LABEL: test_tcgen05_commit_shared_cg1( ; CHECK_PTX64: { ; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64-EMPTY: ; CHECK_PTX64-NEXT: // %bb.0: -; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_shared_param_0]; +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_shared_cg1_param_0]; ; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; -; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; ; CHECK_PTX64-NEXT: ret; ; -; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_shared( +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_shared_cg1( ; CHECK_PTX64_SHARED32: { ; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>; ; CHECK_PTX64_SHARED32-EMPTY: ; CHECK_PTX64_SHARED32-NEXT: // %bb.0: -; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_commit_shared_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_commit_shared_cg1_param_0]; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%r1]; -; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%r1]; ; CHECK_PTX64_SHARED32-NEXT: ret; call void @llvm.nvvm.tcgen05.commit.shared.cg1(ptr addrspace(3) %bar_addr) + ret void +} + +define void @test_tcgen05_commit_shared_cg2(ptr addrspace(3) %bar_addr) { +; CHECK_PTX64-LABEL: test_tcgen05_commit_shared_cg2( +; CHECK_PTX64: { +; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-NEXT: // %bb.0: +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_shared_cg2_param_0]; +; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; +; CHECK_PTX64-NEXT: ret; +; +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_shared_cg2( +; CHECK_PTX64_SHARED32: { +; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>; +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-NEXT: // %bb.0: +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_commit_shared_cg2_param_0]; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%r1]; +; CHECK_PTX64_SHARED32-NEXT: ret; call void 
@llvm.nvvm.tcgen05.commit.shared.cg2(ptr addrspace(3) %bar_addr) ret void @@ -72,66 +108,106 @@ declare void @llvm.nvvm.tcgen05.commit.mc.cg2(ptr %bar_addr, i16 %cta_mask) declare void @llvm.nvvm.tcgen05.commit.mc.shared.cg1(ptr addrspace(3) %bar_addr, i16 %cta_mask) declare void @llvm.nvvm.tcgen05.commit.mc.shared.cg2(ptr addrspace(3) %bar_addr, i16 %cta_mask) -; CHECK-LABEL: test_tcgen05_commit_mc -define void @test_tcgen05_commit_mc(ptr %bar_addr, i16 %cta_mask) { -; CHECK_PTX64-LABEL: test_tcgen05_commit_mc( +define void @test_tcgen05_commit_mc_cg1(ptr %bar_addr, i16 %cta_mask) { +; CHECK_PTX64-LABEL: test_tcgen05_commit_mc_cg1( ; CHECK_PTX64: { ; CHECK_PTX64-NEXT: .reg .b16 %rs<2>; ; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64-EMPTY: ; CHECK_PTX64-NEXT: // %bb.0: -; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_param_0]; -; CHECK_PTX64-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_param_1]; +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_cg1_param_0]; +; CHECK_PTX64-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_cg1_param_1]; ; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; -; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; ; CHECK_PTX64-NEXT: ret; ; -; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_mc( +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_mc_cg1( ; CHECK_PTX64_SHARED32: { ; CHECK_PTX64_SHARED32-NEXT: .reg .b16 %rs<2>; ; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64_SHARED32-EMPTY: ; CHECK_PTX64_SHARED32-NEXT: // %bb.0: -; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_param_0]; -; CHECK_PTX64_SHARED32-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_param_1]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_cg1_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_cg1_param_1]; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; -; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; ; CHECK_PTX64_SHARED32-NEXT: ret; call void @llvm.nvvm.tcgen05.commit.mc.cg1(ptr %bar_addr, i16 %cta_mask) + ret void +} +define void @test_tcgen05_commit_mc_cg2(ptr %bar_addr, i16 %cta_mask) { +; CHECK_PTX64-LABEL: test_tcgen05_commit_mc_cg2( +; CHECK_PTX64: { +; CHECK_PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-NEXT: // %bb.0: +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_cg2_param_0]; +; CHECK_PTX64-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_cg2_param_1]; +; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; +; CHECK_PTX64-NEXT: ret; +; +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_mc_cg2( +; CHECK_PTX64_SHARED32: { +; CHECK_PTX64_SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-NEXT: // %bb.0: +; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_cg2_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_cg2_param_1]; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; +; 
CHECK_PTX64_SHARED32-NEXT: ret; call void @llvm.nvvm.tcgen05.commit.mc.cg2(ptr %bar_addr, i16 %cta_mask) - ret void } -; CHECK-LABEL: test_tcgen05_commit_mc_shared -define void @test_tcgen05_commit_mc_shared(ptr addrspace(3) %bar_addr, i16 %cta_mask) { -; CHECK_PTX64-LABEL: test_tcgen05_commit_mc_shared( +define void @test_tcgen05_commit_mc_shared_cg1(ptr addrspace(3) %bar_addr, i16 %cta_mask) { +; CHECK_PTX64-LABEL: test_tcgen05_commit_mc_shared_cg1( ; CHECK_PTX64: { ; CHECK_PTX64-NEXT: .reg .b16 %rs<2>; ; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64-EMPTY: ; CHECK_PTX64-NEXT: // %bb.0: -; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_shared_param_0]; -; CHECK_PTX64-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_shared_param_1]; +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_shared_cg1_param_0]; +; CHECK_PTX64-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_shared_cg1_param_1]; ; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; -; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; ; CHECK_PTX64-NEXT: ret; ; -; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_mc_shared( +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_mc_shared_cg1( ; CHECK_PTX64_SHARED32: { ; CHECK_PTX64_SHARED32-NEXT: .reg .b16 %rs<2>; ; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>; ; CHECK_PTX64_SHARED32-EMPTY: ; CHECK_PTX64_SHARED32-NEXT: // %bb.0: -; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_commit_mc_shared_param_0]; -; CHECK_PTX64_SHARED32-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_shared_param_1]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_commit_mc_shared_cg1_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_shared_cg1_param_1]; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%r1], %rs1; -; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%r1], %rs1; ; CHECK_PTX64_SHARED32-NEXT: ret; call void @llvm.nvvm.tcgen05.commit.mc.shared.cg1(ptr addrspace(3) %bar_addr, i16 %cta_mask) + ret void +} +define void @test_tcgen05_commit_mc_shared_cg2(ptr addrspace(3) %bar_addr, i16 %cta_mask) { +; CHECK_PTX64-LABEL: test_tcgen05_commit_mc_shared_cg2( +; CHECK_PTX64: { +; CHECK_PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-NEXT: // %bb.0: +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_shared_cg2_param_0]; +; CHECK_PTX64-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_shared_cg2_param_1]; +; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; +; CHECK_PTX64-NEXT: ret; +; +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_mc_shared_cg2( +; CHECK_PTX64_SHARED32: { +; CHECK_PTX64_SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>; +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-NEXT: // %bb.0: +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_commit_mc_shared_cg2_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_shared_cg2_param_1]; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%r1], %rs1; +; CHECK_PTX64_SHARED32-NEXT: ret; call void 
@llvm.nvvm.tcgen05.commit.mc.shared.cg2(ptr addrspace(3) %bar_addr, i16 %cta_mask) - ret void } diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll index c540f78..817b1d5 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll @@ -4,346 +4,580 @@ ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} -; CHECK-LABEL: test_tcgen05_cp_64x128_v1 -define void @test_tcgen05_cp_64x128_v1(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_64x128_v1( +define void @test_tcgen05_cp_64x128_v1_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_64x128_v1_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_cg2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } -; CHECK-LABEL: test_tcgen05_cp_64x128_v2 -define void @test_tcgen05_cp_64x128_v2(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_64x128_v2( +define void @test_tcgen05_cp_64x128_v2_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_64x128_v2_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_cg2_param_1]; +; CHECK-NEXT: 
tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } -; CHECK-LABEL: test_tcgen05_cp_32x128 -define void @test_tcgen05_cp_32x128(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_32x128( +define void @test_tcgen05_cp_32x128_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4 [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4 [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_32x128_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_cg2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4 [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } -; CHECK-LABEL: test_tcgen05_cp_128x128b -define void @test_tcgen05_cp_128x128b(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_128x128b( +define void @test_tcgen05_cp_128x128b_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.128x128b.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_128x128b_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_cg2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.128x128b.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } -; CHECK-LABEL: test_tcgen05_cp_128x256b -define void @test_tcgen05_cp_128x256b(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_128x256b( +define void @test_tcgen05_cp_128x256b_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; 
CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.128x256b.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_128x256b_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_cg2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.128x256b.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } -; CHECK-LABEL: test_tcgen05_cp_4x256b -define void @test_tcgen05_cp_4x256b(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_4x256b( +define void @test_tcgen05_cp_4x256b_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.4x256b.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_4x256b_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_cg2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.4x256b.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } ; With src_fmt as b6x16_p32 -; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32 -define void @test_tcgen05_cp_128x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32( +define void @test_tcgen05_cp_128x256b_b6x16_p32_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b6x16_p32_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b6x16_p32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b6x16_p32_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b6x16_p32_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: ret; call void 
@llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_128x256b_b6x16_p32_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b6x16_p32_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b6x16_p32_cg2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } -; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32 -define void @test_tcgen05_cp_4x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32( +define void @test_tcgen05_cp_4x256b_b6x16_p32_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b6x16_p32_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b6x16_p32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b6x16_p32_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b6x16_p32_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_4x256b_b6x16_p32_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b6x16_p32_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b6x16_p32_cg2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } -; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32 -define void @test_tcgen05_cp_128x128b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32( +define void @test_tcgen05_cp_128x128b_b6x16_p32_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b6x16_p32_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b6x16_p32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b6x16_p32_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b6x16_p32_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_128x128b_b6x16_p32_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: 
test_tcgen05_cp_128x128b_b6x16_p32_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b6x16_p32_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b6x16_p32_cg2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } -; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32 -define void @test_tcgen05_cp_64x128_v1_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32( +define void @test_tcgen05_cp_64x128_v1_b6x16_p32_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b6x16_p32_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b6x16_p32_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_64x128_v1_b6x16_p32_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b6x16_p32_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b6x16_p32_cg2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } -; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32 -define void @test_tcgen05_cp_64x128_v2_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32( +define void @test_tcgen05_cp_64x128_v2_b6x16_p32_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b6x16_p32_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b6x16_p32_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_64x128_v2_b6x16_p32_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32_cg2( +; CHECK: { 
+; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b6x16_p32_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b6x16_p32_cg2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } -; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32 -define void @test_tcgen05_cp_32x128_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32( +define void @test_tcgen05_cp_32x128_b6x16_p32_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_b6x16_p32_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b6x16_p32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_b6x16_p32_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b6x16_p32_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_32x128_b6x16_p32_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_b6x16_p32_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b6x16_p32_cg2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } ; With src_fmt as b4x16_p64 -; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64 -define void @test_tcgen05_cp_128x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64( +define void @test_tcgen05_cp_128x256b_b4x16_p64_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b4x16_p64_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b4x16_p64_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b4x16_p64_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b4x16_p64_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_128x256b_b4x16_p64_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; 
CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b4x16_p64_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b4x16_p64_cg2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } -; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64 -define void @test_tcgen05_cp_4x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64( +define void @test_tcgen05_cp_4x256b_b4x16_p64_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b4x16_p64_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b4x16_p64_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b4x16_p64_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b4x16_p64_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_4x256b_b4x16_p64_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b4x16_p64_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b4x16_p64_cg2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } -; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64 -define void @test_tcgen05_cp_128x128b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64( +define void @test_tcgen05_cp_128x128b_b4x16_p64_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b4x16_p64_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b4x16_p64_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b4x16_p64_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b4x16_p64_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_128x128b_b4x16_p64_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b4x16_p64_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b4x16_p64_cg2_param_1]; +; CHECK-NEXT: 
tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } -; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64 -define void @test_tcgen05_cp_64x128_v1_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64( +define void @test_tcgen05_cp_64x128_v1_b4x16_p64_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b4x16_p64_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b4x16_p64_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_64x128_v1_b4x16_p64_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b4x16_p64_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b4x16_p64_cg2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } -; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64 -define void @test_tcgen05_cp_64x128_v2_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64( +define void @test_tcgen05_cp_64x128_v2_b4x16_p64_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b4x16_p64_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b4x16_p64_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_64x128_v2_b4x16_p64_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b4x16_p64_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b4x16_p64_cg2_param_1]; +; CHECK-NEXT: 
tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void } -; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64 -define void @test_tcgen05_cp_32x128_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { -; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64( +define void @test_tcgen05_cp_32x128_b4x16_p64_cg1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_b4x16_p64_param_0]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b4x16_p64_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_b4x16_p64_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b4x16_p64_cg1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1; -; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +define void @test_tcgen05_cp_32x128_b4x16_p64_cg2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_b4x16_p64_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b4x16_p64_cg2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) ret void diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll index 8ca6a2a0..bf2adac 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll @@ -7,18 +7,29 @@ declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr) -; CHECK-LABEL: test_tcgen05_shift -define void @test_tcgen05_shift(ptr addrspace(6) %tmem_addr) { -; CHECK-LABEL: test_tcgen05_shift( +define void @test_tcgen05_shift_cg1(ptr addrspace(6) %tmem_addr) { +; CHECK-LABEL: test_tcgen05_shift_cg1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_shift_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_shift_cg1_param_0]; ; CHECK-NEXT: tcgen05.shift.cta_group::1.down [%r1]; -; CHECK-NEXT: tcgen05.shift.cta_group::2.down [%r1]; ; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) + + ret void +} + +define void @test_tcgen05_shift_cg2(ptr addrspace(6) %tmem_addr) { +; CHECK-LABEL: test_tcgen05_shift_cg2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_shift_cg2_param_0]; +; CHECK-NEXT: tcgen05.shift.cta_group::2.down [%r1]; +; CHECK-NEXT: ret; call void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr) ret void diff --git a/llvm/test/CodeGen/RISCV/double-arith.ll b/llvm/test/CodeGen/RISCV/double-arith.ll index 911692e..f960bc1 100644 --- a/llvm/test/CodeGen/RISCV/double-arith.ll +++ 
b/llvm/test/CodeGen/RISCV/double-arith.ll @@ -305,9 +305,6 @@ define i32 @fneg_d(double %a, double %b) nounwind { } define double @fsgnjn_d(double %a, double %b) nounwind { -; TODO: fsgnjn.s isn't selected on RV64 because DAGCombiner::visitBITCAST will -; convert (bitconvert (fneg x)) to a xor. -; ; CHECKIFD-LABEL: fsgnjn_d: ; CHECKIFD: # %bb.0: ; CHECKIFD-NEXT: fsgnjn.d fa0, fa0, fa1 diff --git a/llvm/test/Instrumentation/AddressSanitizer/alloca-offset-lifetime.ll b/llvm/test/Instrumentation/AddressSanitizer/alloca-offset-lifetime.ll deleted file mode 100644 index a4846176..0000000 --- a/llvm/test/Instrumentation/AddressSanitizer/alloca-offset-lifetime.ll +++ /dev/null @@ -1,27 +0,0 @@ -; Test that ASAN will not instrument lifetime markers on alloca offsets. -; -; RUN: opt < %s -passes=asan --asan-use-after-scope -S | FileCheck %s - -target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.15.0" - -%t = type { ptr, ptr, %sub, i64 } -%sub = type { i32 } - -define void @foo() sanitize_address { -entry: - %0 = alloca %t, align 8 - %x = getelementptr inbounds %t, ptr %0, i64 0, i32 2 - call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %x) - call void @bar(ptr nonnull %x) - call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %x) #3 - ret void -} - -declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) -declare void @bar(ptr) -declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) - -; CHECK: store i64 %[[STACK_BASE:.+]], ptr %asan_local_stack_base, align 8 -; CHECK-NOT: store i8 0 -; CHECK: call void @bar(ptr nonnull %x) diff --git a/llvm/test/Instrumentation/SanitizerCoverage/missing_dbg.ll b/llvm/test/Instrumentation/SanitizerCoverage/missing_dbg.ll index 3568434..07b9a1c 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/missing_dbg.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/missing_dbg.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=2 -S | FileCheck %s +; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-stack-depth -sanitizer-coverage-stack-depth-callback-min=1 -S | FileCheck %s --check-prefix=CHECK-STACK-CALLBACK +; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-stack-depth -S | FileCheck %s --check-prefix=CHECK-STACK-DEPTH target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -55,6 +57,86 @@ entry: ret i32 %t } +define i32 @with_dbg_stack_callback(ptr %a) !dbg !8 { +; CHECK-STACK-CALLBACK-LABEL: define i32 @with_dbg_stack_callback( +; CHECK-STACK-CALLBACK-SAME: ptr [[A:%.*]]) !dbg [[DBG8:![0-9]+]] { +; CHECK-STACK-CALLBACK-NEXT: entry: +; CHECK-STACK-CALLBACK-NEXT: [[BUF:%.*]] = alloca [64 x i8], align 1 +; CHECK-STACK-CALLBACK-NEXT: call void @__sanitizer_cov_stack_depth() #[[ATTR1:[0-9]+]], !dbg [[DBG9:![0-9]+]] +; CHECK-STACK-CALLBACK-NEXT: %t = load i32, ptr [[A]], align 4 +; CHECK-STACK-CALLBACK-NEXT: call void @external_func() +; CHECK-STACK-CALLBACK-NEXT: ret i32 %t +; +entry: + %buf = alloca [64 x i8], align 1 + %t = load i32, ptr %a, align 4 + call void @external_func() + ret i32 %t +} + +define i32 @with_dbg_stack_depth(ptr %a) !dbg !10 { +; CHECK-STACK-DEPTH-LABEL: define i32 @with_dbg_stack_depth( +; CHECK-STACK-DEPTH-SAME: 
ptr [[A:%.*]]) !dbg [[DBG10:![0-9]+]] { +; CHECK-STACK-DEPTH-NEXT: entry: +; CHECK-STACK-DEPTH-NEXT: [[BUF:%.*]] = alloca [64 x i8], align 1 +; CHECK-STACK-DEPTH-NEXT: [[TMP1:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) +; CHECK-STACK-DEPTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-STACK-DEPTH-NEXT: [[TMP3:%.*]] = load i64, ptr @__sancov_lowest_stack, align 8 +; CHECK-STACK-DEPTH-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP2]], [[TMP3]] +; CHECK-STACK-DEPTH-NEXT: br i1 [[TMP4]], label {{%.*}}, label {{%.*}} +; CHECK-STACK-DEPTH: store i64 [[TMP2]], ptr @__sancov_lowest_stack, align 8, !dbg [[DBG11:![0-9]+]], {{.*}}!nosanitize +; CHECK-STACK-DEPTH: %t = load i32, ptr [[A]], align 4 +; CHECK-STACK-DEPTH-NEXT: call void @external_func() +; CHECK-STACK-DEPTH-NEXT: ret i32 %t +; +entry: + %buf = alloca [64 x i8], align 1 + %t = load i32, ptr %a, align 4 + call void @external_func() + ret i32 %t +} + +define i32 @without_dbg_stack_callback(ptr %a) { +; CHECK-STACK-CALLBACK-LABEL: define i32 @without_dbg_stack_callback( +; CHECK-STACK-CALLBACK-SAME: ptr [[A:%.*]]) { +; CHECK-STACK-CALLBACK-NEXT: entry: +; CHECK-STACK-CALLBACK-NEXT: [[BUF:%.*]] = alloca [64 x i8], align 1 +; CHECK-STACK-CALLBACK-NEXT: call void @__sanitizer_cov_stack_depth() #[[ATTR1]] +; CHECK-STACK-CALLBACK-NEXT: %t = load i32, ptr [[A]], align 4 +; CHECK-STACK-CALLBACK-NEXT: call void @external_func() +; CHECK-STACK-CALLBACK-NEXT: ret i32 %t +; +entry: + %buf = alloca [64 x i8], align 1 + %t = load i32, ptr %a, align 4 + call void @external_func() + ret i32 %t +} + +define i32 @without_dbg_stack_depth(ptr %a) { +; CHECK-STACK-DEPTH-LABEL: define i32 @without_dbg_stack_depth( +; CHECK-STACK-DEPTH-SAME: ptr [[A:%.*]]) { +; CHECK-STACK-DEPTH-NEXT: entry: +; CHECK-STACK-DEPTH-NEXT: [[BUF:%.*]] = alloca [64 x i8], align 1 +; CHECK-STACK-DEPTH-NEXT: [[TMP1:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) +; CHECK-STACK-DEPTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-STACK-DEPTH-NEXT: [[TMP3:%.*]] = load i64, ptr @__sancov_lowest_stack, align 8 +; CHECK-STACK-DEPTH-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP2]], [[TMP3]] +; CHECK-STACK-DEPTH-NEXT: br i1 [[TMP4]], label {{%.*}}, label {{%.*}} +; CHECK-STACK-DEPTH: store i64 [[TMP2]], ptr @__sancov_lowest_stack, align 8, {{.*}}!nosanitize +; CHECK-STACK-DEPTH: %t = load i32, ptr [[A]], align 4 +; CHECK-STACK-DEPTH-NEXT: call void @external_func() +; CHECK-STACK-DEPTH-NEXT: ret i32 %t +; +entry: + %buf = alloca [64 x i8], align 1 + %t = load i32, ptr %a, align 4 + call void @external_func() + ret i32 %t +} + +declare void @external_func() + !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!2} @@ -66,6 +148,10 @@ entry: !5 = !{} !6 = !DILocation(line: 192, scope: !3) !7 = !DILocation(line: 0, scope: !3) +!8 = distinct !DISubprogram(name: "with_dbg_stack_callback", scope: !1, file: !1, line: 200, type: !4, scopeLine: 200, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!9 = !DILocation(line: 200, scope: !8) +!10 = distinct !DISubprogram(name: "with_dbg_stack_depth", scope: !1, file: !1, line: 210, type: !4, scopeLine: 210, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!11 = !DILocation(line: 210, scope: !10) ;. 
; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C89, file: [[META1:![0-9]+]], isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None) @@ -76,3 +162,9 @@ entry: ; CHECK: [[DBG6]] = !DILocation(line: 192, scope: [[DBG3]]) ; CHECK: [[DBG7]] = !DILocation(line: 0, scope: [[DBG3]]) ;. +; CHECK-STACK-CALLBACK: [[DBG8]] = distinct !DISubprogram(name: "with_dbg_stack_callback", scope: {{.*}}, file: {{.*}}, line: 200 +; CHECK-STACK-CALLBACK: [[DBG9]] = !DILocation(line: 200, scope: [[DBG8]]) +;. +; CHECK-STACK-DEPTH: [[DBG10]] = distinct !DISubprogram(name: "with_dbg_stack_depth", scope: {{.*}}, file: {{.*}}, line: 210 +; CHECK-STACK-DEPTH: [[DBG11]] = !DILocation(line: 210, scope: [[DBG10]]) +;. diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll index 4173c32..f45798b 100644 --- a/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll +++ b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll @@ -7,10 +7,10 @@ ; state, and the block that determines the next state. ; < path of BBs that form a cycle > [ state, determinator ] define i32 @test1(i32 %num) !prof !0{ -; CHECK: < case2 for.inc for.body > [ 1, for.inc ] -; CHECK-NEXT: < for.inc for.body > [ 1, for.inc ] -; CHECK-NEXT: < case1 for.inc for.body > [ 2, for.inc ] -; CHECK-NEXT: < case2 sel.si.unfold.false for.inc for.body > [ 2, sel.si.unfold.false ] +; CHECK: < case2, for.inc, for.body > [ 1, for.inc ] +; CHECK-NEXT: < for.inc, for.body > [ 1, for.inc ] +; CHECK-NEXT: < case1, for.inc, for.body > [ 2, for.inc ] +; CHECK-NEXT: < case2, sel.si.unfold.false, for.inc, for.body > [ 2, sel.si.unfold.false ] entry: br label %for.body @@ -47,12 +47,12 @@ for.end: ; complicated CFG. Here the FSM is represented as a nested loop, with ; fallthrough cases. define i32 @test2(i32 %init) { -; CHECK: < loop.1.backedge loop.1 loop.2 loop.3 > [ 1, loop.1 ] -; CHECK-NEXT: < case4 loop.1.backedge state.1.be2.si.unfold.false loop.1 loop.2 loop.3 > [ 2, loop.1.backedge ] -; CHECK-NEXT: < case2 loop.1.backedge state.1.be2.si.unfold.false loop.1 loop.2 loop.3 > [ 4, loop.1.backedge ] -; CHECK-NEXT: < case4 loop.2.backedge loop.2 loop.3 > [ 3, loop.2.backedge ] -; CHECK-NEXT: < case3 loop.2.backedge loop.2 loop.3 > [ 0, loop.2.backedge ] -; CHECK-NEXT: < case2 loop.3 > [ 3, loop.3 ] +; CHECK: < loop.1.backedge, loop.1, loop.2, loop.3 > [ 1, loop.1 ] +; CHECK-NEXT: < case4, loop.1.backedge, state.1.be2.si.unfold.false, loop.1, loop.2, loop.3 > [ 2, loop.1.backedge ] +; CHECK-NEXT: < case2, loop.1.backedge, state.1.be2.si.unfold.false, loop.1, loop.2, loop.3 > [ 4, loop.1.backedge ] +; CHECK-NEXT: < case4, loop.2.backedge, loop.2, loop.3 > [ 3, loop.2.backedge ] +; CHECK-NEXT: < case3, loop.2.backedge, loop.2, loop.3 > [ 0, loop.2.backedge ] +; CHECK-NEXT: < case2, loop.3 > [ 3, loop.3 ] entry: %cmp = icmp eq i32 %init, 0 %sel = select i1 %cmp, i32 0, i32 2 @@ -187,12 +187,12 @@ bb66: ; preds = %bb59 ; Value %init is not predictable but it's okay since it is the value initial to the switch. 
define i32 @initial.value.positive1(i32 %init) !prof !0 { -; CHECK: < loop.1.backedge loop.1 loop.2 loop.3 > [ 1, loop.1 ] -; CHECK-NEXT: < case4 loop.1.backedge state.1.be2.si.unfold.false loop.1 loop.2 loop.3 > [ 2, loop.1.backedge ] -; CHECK-NEXT: < case2 loop.1.backedge state.1.be2.si.unfold.false loop.1 loop.2 loop.3 > [ 4, loop.1.backedge ] -; CHECK-NEXT: < case4 loop.2.backedge loop.2 loop.3 > [ 3, loop.2.backedge ] -; CHECK-NEXT: < case3 loop.2.backedge loop.2 loop.3 > [ 0, loop.2.backedge ] -; CHECK-NEXT: < case2 loop.3 > [ 3, loop.3 ] +; CHECK: < loop.1.backedge, loop.1, loop.2, loop.3 > [ 1, loop.1 ] +; CHECK-NEXT: < case4, loop.1.backedge, state.1.be2.si.unfold.false, loop.1, loop.2, loop.3 > [ 2, loop.1.backedge ] +; CHECK-NEXT: < case2, loop.1.backedge, state.1.be2.si.unfold.false, loop.1, loop.2, loop.3 > [ 4, loop.1.backedge ] +; CHECK-NEXT: < case4, loop.2.backedge, loop.2, loop.3 > [ 3, loop.2.backedge ] +; CHECK-NEXT: < case3, loop.2.backedge, loop.2, loop.3 > [ 0, loop.2.backedge ] +; CHECK-NEXT: < case2, loop.3 > [ 3, loop.3 ] entry: %cmp = icmp eq i32 %init, 0 br label %loop.1 diff --git a/llvm/test/Transforms/DFAJumpThreading/max-path-length.ll b/llvm/test/Transforms/DFAJumpThreading/max-path-length.ll index 92747629..cb7c46e 100644 --- a/llvm/test/Transforms/DFAJumpThreading/max-path-length.ll +++ b/llvm/test/Transforms/DFAJumpThreading/max-path-length.ll @@ -9,9 +9,9 @@ ; too long so that it is not jump-threaded. define i32 @max_path_length(i32 %num) { ; CHECK-NOT: 3, case1 -; CHECK: < case2 for.inc for.body > [ 1, for.inc ] -; CHECK-NEXT: < for.inc for.body > [ 1, for.inc ] -; CHECK-NEXT: < case2 sel.si.unfold.false for.inc for.body > [ 2, sel.si.unfold.false ] +; CHECK: < case2, for.inc, for.body > [ 1, for.inc ] +; CHECK-NEXT: < for.inc, for.body > [ 1, for.inc ] +; CHECK-NEXT: < case2, sel.si.unfold.false, for.inc, for.body > [ 2, sel.si.unfold.false ] ; CHECK-NEXT: DFA-JT: Renaming non-local uses of: entry: br label %for.body diff --git a/llvm/test/Transforms/GVN/assume-equal.ll b/llvm/test/Transforms/GVN/assume-equal.ll index 0c922da..bbbc5c5 100644 --- a/llvm/test/Transforms/GVN/assume-equal.ll +++ b/llvm/test/Transforms/GVN/assume-equal.ll @@ -221,21 +221,22 @@ define i32 @_Z1ii(i32 %p) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[P]], 42 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) -; CHECK-NEXT: br i1 true, label %[[BB2:.*]], label %[[BB2]] -; CHECK: [[BB2]]: -; CHECK-NEXT: br i1 true, label %[[BB2]], label %[[BB2]] -; CHECK: [[BB0:.*:]] +; CHECK-NEXT: br i1 true, label %[[COMMON:.*]], label %[[COMMON]] +; CHECK: [[COMMON]]: +; CHECK-NEXT: br i1 true, label %[[COMMON]], label %[[COMMON]] +; CHECK: [[EXIT:.*:]] ; CHECK-NEXT: ret i32 42 ; entry: %cmp = icmp eq i32 %p, 42 call void @llvm.assume(i1 %cmp) - br i1 %cmp, label %bb2, label %bb2 -bb2: + br i1 %cmp, label %common, label %common +common: call void @llvm.assume(i1 true) - br i1 %cmp, label %bb2, label %bb2 + br i1 %cmp, label %common, label %common +exit: ret i32 %p } @@ -357,8 +358,8 @@ define i8 @assume_ptr_eq_different_prov_matters(ptr %p, ptr %p2) { ret i8 %v } -define i1 @assume_ptr_eq_different_prov_does_not_matter(ptr %p, ptr %p2) { -; CHECK-LABEL: define i1 @assume_ptr_eq_different_prov_does_not_matter( +define i1 @assume_ptr_eq_different_prov_does_not_matter_icmp(ptr %p, ptr %p2) { +; CHECK-LABEL: define i1 @assume_ptr_eq_different_prov_does_not_matter_icmp( ; CHECK-SAME: ptr [[P:%.*]], ptr [[P2:%.*]]) { ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], [[P2]] ; 
CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) @@ -371,6 +372,36 @@ define i1 @assume_ptr_eq_different_prov_does_not_matter(ptr %p, ptr %p2) { ret i1 %c } +; This is not correct, as it may change the provenance exposed by ptrtoint. +; We still allow it for now. +define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoint(ptr %p, ptr %p2) { +; CHECK-LABEL: define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoint( +; CHECK-SAME: ptr [[P:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], [[P2]] +; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: [[INT:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: ret i64 [[INT]] +; + %cmp = icmp eq ptr %p, %p2 + call void @llvm.assume(i1 %cmp) + %int = ptrtoint ptr %p2 to i64 + ret i64 %int +} + +define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoaddr(ptr %p, ptr %p2) { +; CHECK-LABEL: define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoaddr( +; CHECK-SAME: ptr [[P:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], [[P2]] +; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: [[INT:%.*]] = ptrtoaddr ptr [[P]] to i64 +; CHECK-NEXT: ret i64 [[INT]] +; + %cmp = icmp eq ptr %p, %p2 + call void @llvm.assume(i1 %cmp) + %int = ptrtoaddr ptr %p2 to i64 + ret i64 %int +} + define i8 @assume_ptr_eq_same_prov(ptr %p, i64 %x) { ; CHECK-LABEL: define i8 @assume_ptr_eq_same_prov( ; CHECK-SAME: ptr [[P:%.*]], i64 [[X:%.*]]) { diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll index 61b1331..49b9b7e 100644 --- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll +++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll @@ -1,6 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s -target datalayout = "p1:64:64:64:32" + +; The ptrtoaddr folds are also valid for pointers that have external state. 
+target datalayout = "pe1:64:64:64:32" + +@g = external global i8 +@g2 = external global i8 + +@g.as1 = external addrspace(1) global i8 +@g2.as1 = external addrspace(1) global i8 define i32 @ptrtoaddr_inttoptr_arg(i32 %a) { ; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg( @@ -24,14 +32,14 @@ define i32 @ptrtoaddr_inttoptr() { define i32 @ptrtoaddr_inttoptr_diff_size1() { ; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size1() { -; CHECK-NEXT: ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i64 -1 to ptr addrspace(1)) to i32) +; CHECK-NEXT: ret i32 -1 ; ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i64 -1 to ptr addrspace(1)) to i32) } define i32 @ptrtoaddr_inttoptr_diff_size2() { ; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size2() { -; CHECK-NEXT: ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i16 -1 to ptr addrspace(1)) to i32) +; CHECK-NEXT: ret i32 65535 ; ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i16 -1 to ptr addrspace(1)) to i32) } @@ -52,14 +60,52 @@ define i64 @ptr2addr2_inttoptr_noas2() { define i64 @ptrtoaddr_inttoptr_noas_diff_size1() { ; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size1() { -; CHECK-NEXT: ret i64 ptrtoaddr (ptr inttoptr (i32 -1 to ptr) to i64) +; CHECK-NEXT: ret i64 4294967295 ; ret i64 ptrtoaddr (ptr inttoptr (i32 -1 to ptr) to i64) } define i64 @ptrtoaddr_inttoptr_noas_diff_size2() { ; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size2() { -; CHECK-NEXT: ret i64 ptrtoaddr (ptr inttoptr (i128 -1 to ptr) to i64) +; CHECK-NEXT: ret i64 -1 ; ret i64 ptrtoaddr (ptr inttoptr (i128 -1 to ptr) to i64) } + +define i64 @ptrtoaddr_gep_null() { +; CHECK-LABEL: define i64 @ptrtoaddr_gep_null() { +; CHECK-NEXT: ret i64 42 +; + ret i64 ptrtoaddr (ptr getelementptr (i8, ptr null, i64 42) to i64) +} + +define i32 @ptrtoaddr_gep_null_addrsize() { +; CHECK-LABEL: define i32 @ptrtoaddr_gep_null_addrsize() { +; CHECK-NEXT: ret i32 42 +; + ret i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) null, i32 42) to i32) +} + +define i64 @ptrtoaddr_gep_sub() { +; CHECK-LABEL: define i64 @ptrtoaddr_gep_sub() { +; CHECK-NEXT: ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64)) +; + ret i64 ptrtoaddr (ptr getelementptr (i8, ptr @g, i64 sub (i64 0, i64 ptrtoaddr (ptr @g2 to i64))) to i64) +} + +define i32 @ptrtoaddr_gep_sub_addrsize() { +; CHECK-LABEL: define i32 @ptrtoaddr_gep_sub_addrsize() { +; CHECK-NEXT: ret i32 sub (i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32), i32 ptrtoaddr (ptr addrspace(1) @g2.as1 to i32)) +; + ret i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g.as1, i32 sub (i32 0, i32 ptrtoaddr (ptr addrspace(1) @g2.as1 to i32))) to i32) +} + +; Don't fold inttoptr of ptrtoaddr away. inttoptr will pick a previously +; exposed provenance, which is not necessarily that of @g (especially as +; ptrtoaddr does not expose the provenance.) 
+define ptr @inttoptr_of_ptrtoaddr() { +; CHECK-LABEL: define ptr @inttoptr_of_ptrtoaddr() { +; CHECK-NEXT: ret ptr inttoptr (i64 ptrtoaddr (ptr @g to i64) to ptr) +; + ret ptr inttoptr (i64 ptrtoaddr (ptr @g to i64) to ptr) +} diff --git a/llvm/test/Transforms/LICM/vector-intrinsics.ll b/llvm/test/Transforms/LICM/vector-intrinsics.ll new file mode 100644 index 0000000..351773e --- /dev/null +++ b/llvm/test/Transforms/LICM/vector-intrinsics.ll @@ -0,0 +1,176 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes='loop-mssa(licm)' -verify-memoryssa %s | FileCheck %s + +define i32 @reduce_umax(<2 x i32> %inv, i1 %c) { +; CHECK-LABEL: define i32 @reduce_umax( +; CHECK-SAME: <2 x i32> [[INV:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[REDUCE_UMAX:%.*]] = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> [[INV]]) +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[BACKEDGE_COND:%.*]] = icmp ult i32 [[IV]], [[REDUCE_UMAX]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[C]], i1 [[BACKEDGE_COND]], i1 false +; CHECK-NEXT: br i1 [[OR_COND]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[IV_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %cond.true ] + %iv.next = add i32 %iv, 1 + br i1 %c, label %cond.true, label %exit + +cond.true: + %reduce.umax = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %inv) + %backedge.cond = icmp ult i32 %iv, %reduce.umax + br i1 %backedge.cond, label %loop, label %exit + +exit: + ret i32 %iv +} + +define i32 @vp_umax(<2 x i32> %inv.l, <2 x i32> %inv.r, i1 %c) { +; CHECK-LABEL: define i32 @vp_umax( +; CHECK-SAME: <2 x i32> [[INV_L:%.*]], <2 x i32> [[INV_R:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[VP_UMAX:%.*]] = call <2 x i32> @llvm.vp.umax.v2i32(<2 x i32> [[INV_L]], <2 x i32> [[INV_R]], <2 x i1> splat (i1 true), i32 2) +; CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x i32> [[VP_UMAX]], i32 0 +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[BACKEDGE_COND:%.*]] = icmp ult i32 [[IV]], [[EXTRACT]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[C]], i1 [[BACKEDGE_COND]], i1 false +; CHECK-NEXT: br i1 [[OR_COND]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[IV_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %cond.true ] + %iv.next = add i32 %iv, 1 + br i1 %c, label %cond.true, label %exit + +cond.true: + %vp.umax = call <2 x i32> @llvm.vp.umax.v2i32(<2 x i32> %inv.l, <2 x i32> %inv.r, <2 x i1> splat (i1 1), i32 2) + %extract = extractelement <2 x i32> %vp.umax, i32 0 + %backedge.cond = icmp ult i32 %iv, %extract + br i1 %backedge.cond, label %loop, label %exit + +exit: + ret i32 %iv +} + +define i32 @vp_udiv(<2 x i32> %inv.q, <2 x i32> %inv.d, i1 %c) { +; CHECK-LABEL: define i32 @vp_udiv( +; CHECK-SAME: <2 x i32> [[INV_Q:%.*]], <2 x i32> [[INV_D:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: 
[[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[COND_TRUE:.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[C]], label %[[COND_TRUE]], label %[[EXIT:.*]] +; CHECK: [[COND_TRUE]]: +; CHECK-NEXT: [[VP_UDIV:%.*]] = call <2 x i32> @llvm.vp.udiv.v2i32(<2 x i32> [[INV_Q]], <2 x i32> [[INV_D]], <2 x i1> splat (i1 true), i32 2) +; CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x i32> [[VP_UDIV]], i32 0 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV]], [[EXTRACT]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], %[[COND_TRUE]] ], [ [[IV]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[IV_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %cond.true ] + %iv.next = add i32 %iv, 1 + br i1 %c, label %cond.true, label %exit + +cond.true: + %vp.udiv = call <2 x i32> @llvm.vp.udiv.v2i32(<2 x i32> %inv.q, <2 x i32> %inv.d, <2 x i1> splat (i1 1), i32 2) + %extract = extractelement <2 x i32> %vp.udiv, i32 0 + %backedge.cond = icmp ult i32 %iv, %extract + br i1 %backedge.cond, label %loop, label %exit + +exit: + ret i32 %iv +} + +define i32 @vp_load(ptr %inv, i1 %c) { +; CHECK-LABEL: define i32 @vp_load( +; CHECK-SAME: ptr [[INV:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[COND_TRUE:.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[C]], label %[[COND_TRUE]], label %[[EXIT:.*]] +; CHECK: [[COND_TRUE]]: +; CHECK-NEXT: [[VP_LOAD:%.*]] = call <2 x i32> @llvm.vp.load.v2i32.p0(ptr [[INV]], <2 x i1> splat (i1 true), i32 2) +; CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x i32> [[VP_LOAD]], i32 0 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV]], [[EXTRACT]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], %[[COND_TRUE]] ], [ [[IV]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[IV_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %cond.true ] + %iv.next = add i32 %iv, 1 + br i1 %c, label %cond.true, label %exit + +cond.true: + %vp.load = call <2 x i32> @llvm.vp.load.v2i32(ptr %inv, <2 x i1> splat (i1 1), i32 2) + %extract = extractelement <2 x i32> %vp.load, i32 0 + %backedge.cond = icmp ult i32 %iv, %extract + br i1 %backedge.cond, label %loop, label %exit + +exit: + ret i32 %iv +} + +define i32 @vp_store(<2 x i32> %inv.v, ptr %inv.p, i1 %c) { +; CHECK-LABEL: define i32 @vp_store( +; CHECK-SAME: <2 x i32> [[INV_V:%.*]], ptr [[INV_P:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[COND_TRUE:.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[C]], label %[[COND_TRUE]], label %[[EXIT:.*]] +; CHECK: [[COND_TRUE]]: +; CHECK-NEXT: call void @llvm.vp.store.v2i32.p0(<2 x i32> [[INV_V]], ptr [[INV_P]], <2 x i1> splat (i1 true), i32 2) +; CHECK-NEXT: [[BACKEDGE_COND:%.*]] = icmp ult i32 [[IV]], 10 +; CHECK-NEXT: br i1 [[BACKEDGE_COND]], label %[[LOOP]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], %[[COND_TRUE]] ], [ [[IV]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[IV_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %cond.true ] + 
%iv.next = add i32 %iv, 1 + br i1 %c, label %cond.true, label %exit + +cond.true: + call void @llvm.vp.store.v2i32(<2 x i32> %inv.v, ptr %inv.p, <2 x i1> splat (i1 1), i32 2) + %backedge.cond = icmp ult i32 %iv, 10 + br i1 %backedge.cond, label %loop, label %exit + +exit: + ret i32 %iv +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/veclib-function-calls.ll b/llvm/test/Transforms/LoopVectorize/RISCV/veclib-function-calls.ll index d73900d..83b494a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/veclib-function-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/veclib-function-calls.ll @@ -2288,7 +2288,7 @@ define void @tgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { } ;. ; CHECK: attributes #[[ATTR0]] = { "target-features"="+v" } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR2]] = { "vector-function-abi-variant"="_ZGVrNxv_acos(Sleef_acosdx_u10rvvm2)" } ; CHECK: attributes #[[ATTR3]] = { "vector-function-abi-variant"="_ZGVrNxv_acosf(Sleef_acosfx_u10rvvm2)" } ; CHECK: attributes #[[ATTR4]] = { "vector-function-abi-variant"="_ZGVrNxv_acosh(Sleef_acoshdx_u10rvvm2)" } diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll index 3500c5c..4fd8d17 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll @@ -546,19 +546,50 @@ define i64 @loop_guards_needed_to_prove_deref_multiple(i32 %x, i1 %c, ptr derefe ; CHECK-NEXT: call void @llvm.assume(i1 [[PRE_2]]) ; CHECK-NEXT: [[N:%.*]] = add i32 [[SEL]], -1 ; CHECK-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SEL]], -2 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; CHECK-NEXT: [[IV_NEXT:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]] +; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_LATCH:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[IV_NEXT]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) +; 
CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: br label [[EXIT_LOOPEXIT]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[LOOP_LATCH]] ], [ 0, [[PH]] ] +; CHECK-NEXT: br label [[LOOP_HEADER1:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[PH]] ] -; CHECK-NEXT: [[GEP_SRC_I:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT1:%.*]], [[LOOP_LATCH1:%.*]] ], [ [[IV]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP_SRC_I:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV1]] ; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I]], align 1 ; CHECK-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], 0 -; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP_LATCH]] +; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_LOOPEXIT]], label [[LOOP_LATCH1]] ; CHECK: loop.latch: -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N_EXT]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP_HEADER]] +; CHECK-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV1]], [[N_EXT]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP_HEADER1]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: exit.loopexit: -; CHECK-NEXT: [[RES_PH:%.*]] = phi i64 [ [[IV]], [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ] +; CHECK-NEXT: [[RES_PH:%.*]] = phi i64 [ [[IV1]], [[LOOP_HEADER1]] ], [ 0, [[LOOP_LATCH1]] ], [ 0, [[LOOP_LATCH]] ], [ [[TMP10]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: [[RES:%.*]] = phi i64 [ -1, [[ENTRY:%.*]] ], [ -2, [[THEN]] ], [ [[RES_PH]], [[EXIT_LOOPEXIT]] ] @@ -609,4 +640,6 @@ exit: ; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]} ; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} ; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll index 9acc6d6..09f583f 100644 --- a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll @@ -39,5 +39,4 @@ declare <4 x float> @llvm.exp.v4f32(<4 x float>) #0 declare <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float>) #0 ; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK-NEXT: attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/tools/llvm-exegesis/AArch64/no-aliasing-ld-str.s b/llvm/test/tools/llvm-exegesis/AArch64/no-aliasing-ld-str.s index da83c54..5325177 100644 --- a/llvm/test/tools/llvm-exegesis/AArch64/no-aliasing-ld-str.s +++ b/llvm/test/tools/llvm-exegesis/AArch64/no-aliasing-ld-str.s @@ -1,10 +1,10 @@ REQUIRES: aarch64-registered-target -// Flakey on SVE buildbots, disabled pending invesgitation. 
-UNSUPPORTED: target={{.*}}
 RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%t.obj --opcode-name=FMOVWSr --benchmark-phase=assemble-measured-code 2>&1
 RUN: llvm-objdump -d %t.obj > %t.s
 RUN: FileCheck %s < %t.s
+// Start matching after the printed file path, as that may contain something that looks like a mnemonic.
+CHECK: Disassembly of section .text:
 CHECK-NOT: ld{{[1-4]}}
 CHECK-NOT: st{{[1-4]}}
diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp
index 988e307..7340f56 100644
--- a/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp
@@ -480,18 +480,21 @@ TEST(LegalizerInfoTest, MMOAlignment) {
 
   LegacyInfo.computeTables();
 
-  EXPECT_ACTION(Legal, 0, LLT(),
-                LegalityQuery(G_LOAD, {s32, p0},
-                              LegalityQuery::MemDesc{
-                                  s32, 32, AtomicOrdering::NotAtomic}));
-  EXPECT_ACTION(Unsupported, 0, LLT(),
-                LegalityQuery(G_LOAD, {s32, p0},
-                              LegalityQuery::MemDesc{
-                                  s32, 16, AtomicOrdering::NotAtomic }));
-  EXPECT_ACTION(Unsupported, 0, LLT(),
-                LegalityQuery(G_LOAD, {s32, p0},
-                              LegalityQuery::MemDesc{
-                                  s32, 8, AtomicOrdering::NotAtomic}));
+  EXPECT_ACTION(
+      Legal, 0, LLT(),
+      LegalityQuery(G_LOAD, {s32, p0},
+                    LegalityQuery::MemDesc{s32, 32, AtomicOrdering::NotAtomic,
+                                           AtomicOrdering::NotAtomic}));
+  EXPECT_ACTION(
+      Unsupported, 0, LLT(),
+      LegalityQuery(G_LOAD, {s32, p0},
+                    LegalityQuery::MemDesc{s32, 16, AtomicOrdering::NotAtomic,
+                                           AtomicOrdering::NotAtomic}));
+  EXPECT_ACTION(
+      Unsupported, 0, LLT(),
+      LegalityQuery(G_LOAD, {s32, p0},
+                    LegalityQuery::MemDesc{s32, 8, AtomicOrdering::NotAtomic,
+                                           AtomicOrdering::NotAtomic}));
 }
 
 // Test that the maximum supported alignment value isn't truncated
@@ -506,14 +509,17 @@ TEST(LegalizerInfoTest, MMOAlignment) {
 
   LegacyInfo.computeTables();
 
-  EXPECT_ACTION(Legal, 0, LLT(),
-                LegalityQuery(G_LOAD, {s32, p0},
-                              LegalityQuery::MemDesc{s32,
-                                  MaxAlignInBits, AtomicOrdering::NotAtomic}));
-  EXPECT_ACTION(Unsupported, 0, LLT(),
-                LegalityQuery(G_LOAD, {s32, p0},
-                              LegalityQuery::MemDesc{
-                                  s32, 8, AtomicOrdering::NotAtomic }));
+  EXPECT_ACTION(
+      Legal, 0, LLT(),
+      LegalityQuery(G_LOAD, {s32, p0},
+                    LegalityQuery::MemDesc{s32, MaxAlignInBits,
+                                           AtomicOrdering::NotAtomic,
+                                           AtomicOrdering::NotAtomic}));
+  EXPECT_ACTION(
+      Unsupported, 0, LLT(),
+      LegalityQuery(G_LOAD, {s32, p0},
+                    LegalityQuery::MemDesc{s32, 8, AtomicOrdering::NotAtomic,
+                                           AtomicOrdering::NotAtomic}));
   }
 }
 
diff --git a/llvm/unittests/CodeGen/MIR2VecTest.cpp b/llvm/unittests/CodeGen/MIR2VecTest.cpp
index d243d82..11222b4 100644
--- a/llvm/unittests/CodeGen/MIR2VecTest.cpp
+++ b/llvm/unittests/CodeGen/MIR2VecTest.cpp
@@ -17,6 +17,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/TargetParser/Triple.h"
@@ -52,7 +53,7 @@ protected:
   std::unique_ptr<LLVMContext> Ctx;
   std::unique_ptr<Module> M;
   std::unique_ptr<TargetMachine> TM;
-  const TargetInstrInfo *TII;
+  const TargetInstrInfo *TII = nullptr;
 
   static void SetUpTestCase() {
     InitializeAllTargets();
@@ -93,6 +94,8 @@ protected:
       return;
     }
   }
+
+  void TearDown() override { TII = nullptr; }
 };
 
 // Function to find an opcode by name
@@ -118,7 +121,11 @@ TEST_F(MIR2VecVocabTestFixture, CanonicalOpcodeMappingTest) {
   VocabMap VMap;
   Embedding Val = Embedding(64, 1.0f);
   VMap["ADD"] = Val;
-  MIRVocabulary TestVocab(std::move(VMap), TII);
+  auto TestVocabOrErr = MIRVocabulary::create(std::move(VMap), *TII);
+  ASSERT_TRUE(static_cast<bool>(TestVocabOrErr))
+      << "Failed to create vocabulary: "
+      << toString(TestVocabOrErr.takeError());
+  auto &TestVocab = *TestVocabOrErr;
 
   unsigned Index1 = TestVocab.getCanonicalIndexForBaseName(BaseName1);
   unsigned Index2 = TestVocab.getCanonicalIndexForBaseName(BaseName2);
@@ -173,7 +180,11 @@ TEST_F(MIR2VecVocabTestFixture, DeterministicMapping) {
   // Use a minimal MIRVocabulary to trigger canonical mapping construction
   VocabMap VMap;
   VMap["ADD"] = Embedding(64, 1.0f);
-  MIRVocabulary TestVocab(std::move(VMap), TII);
+  auto TestVocabOrErr = MIRVocabulary::create(std::move(VMap), *TII);
+  ASSERT_TRUE(static_cast<bool>(TestVocabOrErr))
+      << "Failed to create vocabulary: "
+      << toString(TestVocabOrErr.takeError());
+  auto &TestVocab = *TestVocabOrErr;
 
   unsigned Index1 = TestVocab.getCanonicalIndexForBaseName(BaseName);
   unsigned Index2 = TestVocab.getCanonicalIndexForBaseName(BaseName);
@@ -195,8 +206,10 @@ TEST_F(MIR2VecVocabTestFixture, VocabularyConstruction) {
   VMap["ADD"] = Embedding(128, 1.0f); // Dimension 128, all values 1.0
   VMap["SUB"] = Embedding(128, 2.0f); // Dimension 128, all values 2.0
 
-  MIRVocabulary Vocab(std::move(VMap), TII);
-  EXPECT_TRUE(Vocab.isValid());
+  auto VocabOrErr = MIRVocabulary::create(std::move(VMap), *TII);
+  ASSERT_TRUE(static_cast<bool>(VocabOrErr))
+      << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
+  auto &Vocab = *VocabOrErr;
   EXPECT_EQ(Vocab.getDimension(), 128u);
 
   // Test iterator - iterates over individual embeddings
@@ -214,4 +227,20 @@ TEST_F(MIR2VecVocabTestFixture, VocabularyConstruction) {
   EXPECT_GT(Count, 0u);
 }
 
-} // namespace
\ No newline at end of file
+// Test factory method with empty vocabulary
+TEST_F(MIR2VecVocabTestFixture, EmptyVocabularyCreation) {
+  VocabMap EmptyVMap;
+
+  auto VocabOrErr = MIRVocabulary::create(std::move(EmptyVMap), *TII);
+  EXPECT_FALSE(static_cast<bool>(VocabOrErr))
+      << "Factory method should fail with empty vocabulary";
+
+  // Consume the error
+  if (!VocabOrErr) {
+    auto Err = VocabOrErr.takeError();
+    std::string ErrorMsg = toString(std::move(Err));
+    EXPECT_FALSE(ErrorMsg.empty());
+  }
+}
+
+} // namespace
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index 2208ae5..c89e335 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -202,6 +202,7 @@ static_library("LLVMAMDGPUCodeGen") {
     "AMDGPUTargetMachine.cpp",
     "AMDGPUTargetObjectFile.cpp",
     "AMDGPUTargetTransformInfo.cpp",
+    "AMDGPUUniformIntrinsicCombine.cpp",
     "AMDGPUUnifyDivergentExitNodes.cpp",
     "AMDGPUWaitSGPRHazards.cpp",
     "GCNCreateVOPD.cpp",
diff --git a/llvm/utils/lit/tests/xunit-output-report-failures-only.py b/llvm/utils/lit/tests/xunit-output-report-failures-only.py
index e15fd6a..c331578 100644
--- a/llvm/utils/lit/tests/xunit-output-report-failures-only.py
+++ b/llvm/utils/lit/tests/xunit-output-report-failures-only.py
@@ -5,7 +5,7 @@
 # CHECK: <?xml version="1.0" encoding="UTF-8"?>
 # CHECK-NEXT: <testsuites time="{{[0-9.]+}}">
 # CHECK-NEXT: <testsuite name="test-data" tests="1" failures="1" skipped="0" time="{{[0-9.]+}}">
-# CHECK-NEXT: <testcase classname="test-data.test-data" name="bad&name.ini" time="{{[0-1]\.[0-9]+}}">
+# CHECK-NEXT: <testcase classname="test-data.test-data" name="bad&name.ini" time="{{[0-9.]+}}">
 # CHECK-NEXT: <failure><![CDATA[& < > ]]]]><![CDATA[> &"]]></failure>
 # CHECK-NEXT: </testcase>
 # CHECK-NEXT: </testsuite>
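Usage note (not part of the patch): a minimal sketch of how a caller might adopt the Expected-returning MIRVocabulary::create factory, following the error-handling pattern exercised by the updated MIR2VecTest.cpp above. The "ADD" entry, the 64-dimensional embedding, the function name, and the way a TargetInstrInfo is obtained are illustrative assumptions; the using-directives mirror those of the unit test rather than a guaranteed public typedef layout.

// Sketch only: assumes a valid TargetInstrInfo is already available (e.g. from
// a TargetMachine) and uses a made-up single-entry vocabulary.
#include "llvm/CodeGen/MIR2Vec.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::mir2vec;

static void buildVocabulary(const TargetInstrInfo &TII) {
  VocabMap VMap;
  VMap["ADD"] = ir2vec::Embedding(64, 1.0f); // hypothetical 64-d entry

  // create() reports failure (for example the empty-map case covered by the
  // EmptyVocabularyCreation test) through Expected, so callers must check it.
  auto VocabOrErr = MIRVocabulary::create(std::move(VMap), TII);
  if (!VocabOrErr) {
    errs() << "MIR2Vec vocabulary creation failed: "
           << toString(VocabOrErr.takeError()) << "\n";
    return;
  }
  auto &Vocab = *VocabOrErr;
  errs() << "vocabulary dimension: " << Vocab.getDimension() << "\n";
}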