Diffstat (limited to 'llvm/lib')
220 files changed, 4926 insertions, 2069 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 9c1c2c6..e71ba5e 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1801,6 +1801,44 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { case Intrinsic::nvvm_d2ull_rn: case Intrinsic::nvvm_d2ull_rp: case Intrinsic::nvvm_d2ull_rz: + + // NVVM math intrinsics: + case Intrinsic::nvvm_ceil_d: + case Intrinsic::nvvm_ceil_f: + case Intrinsic::nvvm_ceil_ftz_f: + + case Intrinsic::nvvm_fabs: + case Intrinsic::nvvm_fabs_ftz: + + case Intrinsic::nvvm_floor_d: + case Intrinsic::nvvm_floor_f: + case Intrinsic::nvvm_floor_ftz_f: + + case Intrinsic::nvvm_rcp_rm_d: + case Intrinsic::nvvm_rcp_rm_f: + case Intrinsic::nvvm_rcp_rm_ftz_f: + case Intrinsic::nvvm_rcp_rn_d: + case Intrinsic::nvvm_rcp_rn_f: + case Intrinsic::nvvm_rcp_rn_ftz_f: + case Intrinsic::nvvm_rcp_rp_d: + case Intrinsic::nvvm_rcp_rp_f: + case Intrinsic::nvvm_rcp_rp_ftz_f: + case Intrinsic::nvvm_rcp_rz_d: + case Intrinsic::nvvm_rcp_rz_f: + case Intrinsic::nvvm_rcp_rz_ftz_f: + + case Intrinsic::nvvm_round_d: + case Intrinsic::nvvm_round_f: + case Intrinsic::nvvm_round_ftz_f: + + case Intrinsic::nvvm_saturate_d: + case Intrinsic::nvvm_saturate_f: + case Intrinsic::nvvm_saturate_ftz_f: + + case Intrinsic::nvvm_sqrt_f: + case Intrinsic::nvvm_sqrt_rn_d: + case Intrinsic::nvvm_sqrt_rn_f: + case Intrinsic::nvvm_sqrt_rn_ftz_f: return !Call->isStrictFP(); // Sign operations are actually bitwise operations, they do not raise @@ -1818,6 +1856,7 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { case Intrinsic::nearbyint: case Intrinsic::rint: case Intrinsic::canonicalize: + // Constrained intrinsics can be folded if FP environment is known // to compiler. 
case Intrinsic::experimental_constrained_fma: @@ -1965,22 +2004,56 @@ inline bool llvm_fenv_testexcept() { return false; } -static APFloat FTZPreserveSign(const APFloat &V) { +static const APFloat FTZPreserveSign(const APFloat &V) { if (V.isDenormal()) return APFloat::getZero(V.getSemantics(), V.isNegative()); return V; } -Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V, - Type *Ty) { +static const APFloat FlushToPositiveZero(const APFloat &V) { + if (V.isDenormal()) + return APFloat::getZero(V.getSemantics(), false); + return V; +} + +static const APFloat +FlushWithDenormKind(const APFloat &V, + DenormalMode::DenormalModeKind DenormKind) { + assert(DenormKind != DenormalMode::DenormalModeKind::Invalid && + DenormKind != DenormalMode::DenormalModeKind::Dynamic); + switch (DenormKind) { + case DenormalMode::DenormalModeKind::IEEE: + return V; + case DenormalMode::DenormalModeKind::PreserveSign: + return FTZPreserveSign(V); + case DenormalMode::DenormalModeKind::PositiveZero: + return FlushToPositiveZero(V); + default: + llvm_unreachable("Invalid denormal mode!"); + } +} + +Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V, Type *Ty, + DenormalMode DenormMode = DenormalMode::getIEEE()) { + if (!DenormMode.isValid() || + DenormMode.Input == DenormalMode::DenormalModeKind::Dynamic || + DenormMode.Output == DenormalMode::DenormalModeKind::Dynamic) + return nullptr; + llvm_fenv_clearexcept(); - double Result = NativeFP(V.convertToDouble()); + auto Input = FlushWithDenormKind(V, DenormMode.Input); + double Result = NativeFP(Input.convertToDouble()); if (llvm_fenv_testexcept()) { llvm_fenv_clearexcept(); return nullptr; } - return GetConstantFoldFPValue(Result, Ty); + Constant *Output = GetConstantFoldFPValue(Result, Ty); + if (DenormMode.Output == DenormalMode::DenormalModeKind::IEEE) + return Output; + const auto *CFP = static_cast<ConstantFP *>(Output); + const auto Res = FlushWithDenormKind(CFP->getValueAPF(), DenormMode.Output); + return ConstantFP::get(Ty->getContext(), Res); } #if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128) @@ -2550,6 +2623,94 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, return ConstantFoldFP(atan, APF, Ty); case Intrinsic::sqrt: return ConstantFoldFP(sqrt, APF, Ty); + + // NVVM Intrinsics: + case Intrinsic::nvvm_ceil_ftz_f: + case Intrinsic::nvvm_ceil_f: + case Intrinsic::nvvm_ceil_d: + return ConstantFoldFP( + ceil, APF, Ty, + nvvm::GetNVVMDenromMode( + nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))); + + case Intrinsic::nvvm_fabs_ftz: + case Intrinsic::nvvm_fabs: + return ConstantFoldFP( + fabs, APF, Ty, + nvvm::GetNVVMDenromMode( + nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))); + + case Intrinsic::nvvm_floor_ftz_f: + case Intrinsic::nvvm_floor_f: + case Intrinsic::nvvm_floor_d: + return ConstantFoldFP( + floor, APF, Ty, + nvvm::GetNVVMDenromMode( + nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))); + + case Intrinsic::nvvm_rcp_rm_ftz_f: + case Intrinsic::nvvm_rcp_rn_ftz_f: + case Intrinsic::nvvm_rcp_rp_ftz_f: + case Intrinsic::nvvm_rcp_rz_ftz_f: + case Intrinsic::nvvm_rcp_rm_d: + case Intrinsic::nvvm_rcp_rm_f: + case Intrinsic::nvvm_rcp_rn_d: + case Intrinsic::nvvm_rcp_rn_f: + case Intrinsic::nvvm_rcp_rp_d: + case Intrinsic::nvvm_rcp_rp_f: + case Intrinsic::nvvm_rcp_rz_d: + case Intrinsic::nvvm_rcp_rz_f: { + APFloat::roundingMode RoundMode = nvvm::GetRCPRoundingMode(IntrinsicID); + bool IsFTZ = nvvm::RCPShouldFTZ(IntrinsicID); + + auto Denominator = IsFTZ ? 
FTZPreserveSign(APF) : APF; + APFloat Res = APFloat::getOne(APF.getSemantics()); + APFloat::opStatus Status = Res.divide(Denominator, RoundMode); + + if (Status == APFloat::opOK || Status == APFloat::opInexact) { + if (IsFTZ) + Res = FTZPreserveSign(Res); + return ConstantFP::get(Ty->getContext(), Res); + } + return nullptr; + } + + case Intrinsic::nvvm_round_ftz_f: + case Intrinsic::nvvm_round_f: + case Intrinsic::nvvm_round_d: { + // Use APFloat implementation instead of native libm call, as some + // implementations (e.g. on PPC) do not preserve the sign of negative 0. + bool IsFTZ = nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID); + auto V = IsFTZ ? FTZPreserveSign(APF) : APF; + V.roundToIntegral(APFloat::rmNearestTiesToAway); + return ConstantFP::get(Ty->getContext(), V); + } + + case Intrinsic::nvvm_saturate_ftz_f: + case Intrinsic::nvvm_saturate_d: + case Intrinsic::nvvm_saturate_f: { + bool IsFTZ = nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID); + auto V = IsFTZ ? FTZPreserveSign(APF) : APF; + if (V.isNegative() || V.isZero() || V.isNaN()) + return ConstantFP::getZero(Ty); + APFloat One = APFloat::getOne(APF.getSemantics()); + if (V > One) + return ConstantFP::get(Ty->getContext(), One); + return ConstantFP::get(Ty->getContext(), APF); + } + + case Intrinsic::nvvm_sqrt_rn_ftz_f: + case Intrinsic::nvvm_sqrt_f: + case Intrinsic::nvvm_sqrt_rn_d: + case Intrinsic::nvvm_sqrt_rn_f: + if (APF.isNegative()) + return nullptr; + return ConstantFoldFP( + sqrt, APF, Ty, + nvvm::GetNVVMDenromMode( + nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))); + + // AMDGCN Intrinsics: case Intrinsic::amdgcn_cos: case Intrinsic::amdgcn_sin: { double V = getValueAsDouble(Op); diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 39f74be..8be5de3 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -941,10 +941,30 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( m_Intrinsic<Intrinsic::minimumnum>(m_Value(), m_Value())) || match(I, m_Intrinsic<Intrinsic::maximumnum>(m_Value(), m_Value())); }; - if (isIntMinMaxRecurrenceKind(Kind) || - (HasRequiredFMF() && isFPMinMaxRecurrenceKind(Kind))) + if (isIntMinMaxRecurrenceKind(Kind)) return isMinMaxPattern(I, Kind, Prev); - else if (isFMulAddIntrinsic(I)) + if (isFPMinMaxRecurrenceKind(Kind)) { + InstDesc Res = isMinMaxPattern(I, Kind, Prev); + if (!Res.isRecurrence()) + return InstDesc(false, I); + if (HasRequiredFMF()) + return Res; + // We may be able to vectorize FMax/FMin reductions using maxnum/minnum + // intrinsics with extra checks ensuring the vector loop handles only + // non-NaN inputs. + if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) { + assert(Kind == RecurKind::FMax && + "unexpected recurrence kind for maxnum"); + return InstDesc(I, RecurKind::FMaxNum); + } + if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) { + assert(Kind == RecurKind::FMin && + "unexpected recurrence kind for minnum"); + return InstDesc(I, RecurKind::FMinNum); + } + return InstDesc(false, I); + } + if (isFMulAddIntrinsic(I)) return InstDesc(Kind == RecurKind::FMulAdd, I, I->hasAllowReassoc() ? nullptr : I); return InstDesc(false, I); diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index f3a32d3..14be385 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -589,11 +589,11 @@ void RuntimePointerChecking::groupChecks( // dependence. 
Not grouping the checks for a[i] and a[i + 9000] allows // us to perform an accurate check in this case. // - // The above case requires that we have an UnknownDependence between - // accesses to the same underlying object. This cannot happen unless - // FoundNonConstantDistanceDependence is set, and therefore UseDependencies - // is also false. In this case we will use the fallback path and create - // separate checking groups for all pointers. + // In the above case, we have a non-constant distance and an Unknown + // dependence between accesses to the same underlying object, and could retry + // with runtime checks. Therefore UseDependencies is false. In this case we + // will use the fallback path and create separate checking groups for all + // pointers. // If we don't have the dependency partitions, construct a new // checking pointer group for each pointer. This is also required @@ -819,7 +819,7 @@ public: /// perform dependency checking. /// /// Note that this can later be cleared if we retry memcheck analysis without - /// dependency checking (i.e. FoundNonConstantDistanceDependence). + /// dependency checking (i.e. ShouldRetryWithRuntimeChecks). bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); } /// We decided that no dependence analysis would be used. Reset the state. @@ -896,7 +896,7 @@ private: /// /// Note that, this is different from isDependencyCheckNeeded. When we retry /// memcheck analysis without dependency checking - /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is + /// (i.e. ShouldRetryWithRuntimeChecks), isDependencyCheckNeeded is /// cleared while this remains set if we have potentially dependent accesses. bool IsRTCheckAnalysisNeeded = false; @@ -2079,11 +2079,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( if (StrideAScaled == StrideBScaled) CommonStride = StrideAScaled; - // TODO: FoundNonConstantDistanceDependence is used as a necessary condition - // to consider retrying with runtime checks. Historically, we did not set it - // when (unscaled) strides were different but there is no inherent reason to. + // TODO: Historically, we didn't retry with runtime checks when (unscaled) + // strides were different but there is no inherent reason to. if (!isa<SCEVConstant>(Dist)) - FoundNonConstantDistanceDependence |= StrideAPtrInt == StrideBPtrInt; + ShouldRetryWithRuntimeChecks |= StrideAPtrInt == StrideBPtrInt; // If distance is a SCEVCouldNotCompute, return Unknown immediately. if (isa<SCEVCouldNotCompute>(Dist)) { @@ -2712,7 +2711,7 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, DepsAreSafe = DepChecker->areDepsSafe(DepCands, Accesses.getDependenciesToCheck()); - if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeCheck()) { + if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeChecks()) { LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n"); // Clear the dependency checks. We assume they are not needed. diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index c08024a..b3c8a7d 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -157,6 +157,8 @@ void CallStackTrie::addCallStack( } void CallStackTrie::addCallStack(MDNode *MIB) { + // Note that we are building this from existing MD_memprof metadata. 
+ BuiltFromExistingMetadata = true; MDNode *StackMD = getMIBStackNode(MIB); assert(StackMD); std::vector<uint64_t> CallStack; @@ -187,8 +189,9 @@ void CallStackTrie::addCallStack(MDNode *MIB) { static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack, AllocationType AllocType, ArrayRef<ContextTotalSize> ContextSizeInfo, - const uint64_t MaxColdSize, uint64_t &TotalBytes, - uint64_t &ColdBytes) { + const uint64_t MaxColdSize, + bool BuiltFromExistingMetadata, + uint64_t &TotalBytes, uint64_t &ColdBytes) { SmallVector<Metadata *> MIBPayload( {buildCallstackMetadata(MIBCallStack, Ctx)}); MIBPayload.push_back( @@ -197,8 +200,9 @@ static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack, if (ContextSizeInfo.empty()) { // The profile matcher should have provided context size info if there was a // MinCallsiteColdBytePercent < 100. Here we check >=100 to gracefully - // handle a user-provided percent larger than 100. - assert(MinCallsiteColdBytePercent >= 100); + // handle a user-provided percent larger than 100. However, we may not have + // this information if we built the Trie from existing MD_memprof metadata. + assert(BuiltFromExistingMetadata || MinCallsiteColdBytePercent >= 100); return MDNode::get(Ctx, MIBPayload); } @@ -252,9 +256,19 @@ void CallStackTrie::convertHotToNotCold(CallStackTrieNode *Node) { static void saveFilteredNewMIBNodes(std::vector<Metadata *> &NewMIBNodes, std::vector<Metadata *> &SavedMIBNodes, unsigned CallerContextLength, - uint64_t TotalBytes, uint64_t ColdBytes) { + uint64_t TotalBytes, uint64_t ColdBytes, + bool BuiltFromExistingMetadata) { const bool MostlyCold = - MinCallsiteColdBytePercent < 100 && + // If we have built the Trie from existing MD_memprof metadata, we may or + // may not have context size information (in which case ColdBytes and + // TotalBytes are 0, which is not also guarded against below). Even if we + // do have some context size information from the the metadata, we have + // already gone through a round of discarding of small non-cold contexts + // during matching, and it would be overly aggressive to do it again, and + // we also want to maintain the same behavior with and without reporting + // of hinted bytes enabled. + !BuiltFromExistingMetadata && MinCallsiteColdBytePercent < 100 && + ColdBytes > 0 && ColdBytes * 100 >= MinCallsiteColdBytePercent * TotalBytes; // In the simplest case, with pruning disabled, keep all the new MIB nodes. @@ -386,9 +400,9 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, if (hasSingleAllocType(Node->AllocTypes)) { std::vector<ContextTotalSize> ContextSizeInfo; collectContextSizeInfo(Node, ContextSizeInfo); - MIBNodes.push_back( - createMIBNode(Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, - ContextSizeInfo, MaxColdSize, TotalBytes, ColdBytes)); + MIBNodes.push_back(createMIBNode( + Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, ContextSizeInfo, + MaxColdSize, BuiltFromExistingMetadata, TotalBytes, ColdBytes)); return true; } @@ -416,7 +430,8 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, // Pass in the stack length of the MIB nodes added for the immediate caller, // which is the current stack length plus 1. 
saveFilteredNewMIBNodes(NewMIBNodes, MIBNodes, MIBCallStack.size() + 1, - CallerTotalBytes, CallerColdBytes); + CallerTotalBytes, CallerColdBytes, + BuiltFromExistingMetadata); TotalBytes += CallerTotalBytes; ColdBytes += CallerColdBytes; @@ -441,9 +456,9 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, return false; std::vector<ContextTotalSize> ContextSizeInfo; collectContextSizeInfo(Node, ContextSizeInfo); - MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack, AllocationType::NotCold, - ContextSizeInfo, MaxColdSize, TotalBytes, - ColdBytes)); + MIBNodes.push_back(createMIBNode( + Ctx, MIBCallStack, AllocationType::NotCold, ContextSizeInfo, MaxColdSize, + BuiltFromExistingMetadata, TotalBytes, ColdBytes)); return true; } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 61a322b..af85ce4 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7912,6 +7912,8 @@ bool llvm::intrinsicPropagatesPoison(Intrinsic::ID IID) { case Intrinsic::ushl_sat: case Intrinsic::smul_fix: case Intrinsic::smul_fix_sat: + case Intrinsic::umul_fix: + case Intrinsic::umul_fix_sat: case Intrinsic::pow: case Intrinsic::powi: case Intrinsic::sin: @@ -7928,6 +7930,22 @@ bool llvm::intrinsicPropagatesPoison(Intrinsic::ID IID) { case Intrinsic::atan2: case Intrinsic::canonicalize: case Intrinsic::sqrt: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::exp10: + case Intrinsic::log: + case Intrinsic::log2: + case Intrinsic::log10: + case Intrinsic::modf: + case Intrinsic::floor: + case Intrinsic::ceil: + case Intrinsic::trunc: + case Intrinsic::rint: + case Intrinsic::nearbyint: + case Intrinsic::round: + case Intrinsic::roundeven: + case Intrinsic::lrint: + case Intrinsic::llrint: return true; default: return false; diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index ce813e1..520c6a0 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -679,6 +679,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(amdgpu_cs_chain_preserve); KEYWORD(amdgpu_kernel); KEYWORD(amdgpu_gfx); + KEYWORD(amdgpu_gfx_whole_wave); KEYWORD(tailcc); KEYWORD(m68k_rtdcc); KEYWORD(graalcc); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index b7f6950..13bef1f 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -2272,6 +2272,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) { CC = CallingConv::AMDGPU_CS_ChainPreserve; break; case lltok::kw_amdgpu_kernel: CC = CallingConv::AMDGPU_KERNEL; break; + case lltok::kw_amdgpu_gfx_whole_wave: + CC = CallingConv::AMDGPU_Gfx_WholeWave; + break; case lltok::kw_tailcc: CC = CallingConv::Tail; break; case lltok::kw_m68k_rtdcc: CC = CallingConv::M68k_RTD; break; case lltok::kw_graalcc: CC = CallingConv::GRAAL; break; @@ -4783,9 +4786,13 @@ struct MDField : public MDFieldImpl<Metadata *> { }; struct MDStringField : public MDFieldImpl<MDString *> { - bool AllowEmpty; - MDStringField(bool AllowEmpty = true) - : ImplTy(nullptr), AllowEmpty(AllowEmpty) {} + enum class EmptyIs { + Null, //< Allow empty input string, map to nullptr + Empty, //< Allow empty input string, map to an empty MDString + Error, //< Disallow empty string, map to an error + } EmptyIs; + MDStringField(enum EmptyIs EmptyIs = EmptyIs::Null) + : ImplTy(nullptr), EmptyIs(EmptyIs) {} }; struct MDFieldList : public MDFieldImpl<SmallVector<Metadata *, 4>> { @@ -5257,10 +5264,19 @@ bool 
LLParser::parseMDField(LocTy Loc, StringRef Name, MDStringField &Result) { if (parseStringConstant(S)) return true; - if (!Result.AllowEmpty && S.empty()) - return error(ValueLoc, "'" + Name + "' cannot be empty"); + if (S.empty()) { + switch (Result.EmptyIs) { + case MDStringField::EmptyIs::Null: + Result.assign(nullptr); + return false; + case MDStringField::EmptyIs::Empty: + break; + case MDStringField::EmptyIs::Error: + return error(ValueLoc, "'" + Name + "' cannot be empty"); + } + } - Result.assign(S.empty() ? nullptr : MDString::get(Context, S)); + Result.assign(MDString::get(Context, S)); return false; } @@ -5778,7 +5794,7 @@ bool LLParser::parseDIFile(MDNode *&Result, bool IsDistinct) { REQUIRED(directory, MDStringField, ); \ OPTIONAL(checksumkind, ChecksumKindField, (DIFile::CSK_MD5)); \ OPTIONAL(checksum, MDStringField, ); \ - OPTIONAL(source, MDStringField, ); + OPTIONAL(source, MDStringField, (MDStringField::EmptyIs::Empty)); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS @@ -6062,7 +6078,7 @@ bool LLParser::parseDITemplateValueParameter(MDNode *&Result, bool IsDistinct) { /// declaration: !4, align: 8) bool LLParser::parseDIGlobalVariable(MDNode *&Result, bool IsDistinct) { #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ - OPTIONAL(name, MDStringField, (/* AllowEmpty */ false)); \ + OPTIONAL(name, MDStringField, (MDStringField::EmptyIs::Error)); \ OPTIONAL(scope, MDField, ); \ OPTIONAL(linkageName, MDStringField, ); \ OPTIONAL(file, MDField, ); \ diff --git a/llvm/lib/BinaryFormat/CMakeLists.txt b/llvm/lib/BinaryFormat/CMakeLists.txt index 38ba2d9..4b2debb 100644 --- a/llvm/lib/BinaryFormat/CMakeLists.txt +++ b/llvm/lib/BinaryFormat/CMakeLists.txt @@ -11,6 +11,7 @@ add_llvm_component_library(LLVMBinaryFormat MsgPackDocumentYAML.cpp MsgPackReader.cpp MsgPackWriter.cpp + SFrame.cpp Wasm.cpp XCOFF.cpp diff --git a/llvm/lib/BinaryFormat/SFrame.cpp b/llvm/lib/BinaryFormat/SFrame.cpp new file mode 100644 index 0000000..3b436af --- /dev/null +++ b/llvm/lib/BinaryFormat/SFrame.cpp @@ -0,0 +1,37 @@ +//===-- SFrame.cpp -----------------------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/BinaryFormat/SFrame.h" +#include "llvm/Support/ScopedPrinter.h" + +using namespace llvm; + +ArrayRef<EnumEntry<sframe::Version>> sframe::getVersions() { + static constexpr EnumEntry<Version> Versions[] = { +#define HANDLE_SFRAME_VERSION(CODE, NAME) {#NAME, sframe::Version::NAME}, +#include "llvm/BinaryFormat/SFrameConstants.def" + }; + + return ArrayRef(Versions); +} + +ArrayRef<EnumEntry<sframe::Flags>> sframe::getFlags() { + static constexpr EnumEntry<sframe::Flags> Flags[] = { +#define HANDLE_SFRAME_FLAG(CODE, NAME) {#NAME, sframe::Flags::NAME}, +#include "llvm/BinaryFormat/SFrameConstants.def" + }; + return ArrayRef(Flags); +} + +ArrayRef<EnumEntry<sframe::ABI>> sframe::getABIs() { + static constexpr EnumEntry<sframe::ABI> ABIs[] = { +#define HANDLE_SFRAME_ABI(CODE, NAME) {#NAME, sframe::ABI::NAME}, +#include "llvm/BinaryFormat/SFrameConstants.def" + }; + return ArrayRef(ABIs); +} diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 66ecc69..f763683 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -293,10 +293,18 @@ static Expected<bool> hasObjCCategoryInModule(BitstreamCursor &Stream) { std::string S; if (convertToString(Record, 0, S)) return error("Invalid section name record"); + // Check for the i386 and other (x86_64, ARM) conventions - if (S.find("__DATA,__objc_catlist") != std::string::npos || - S.find("__OBJC,__category") != std::string::npos || - S.find("__TEXT,__swift") != std::string::npos) + + auto [Segment, Section] = StringRef(S).split(","); + Segment = Segment.trim(); + Section = Section.trim(); + + if (Segment == "__DATA" && Section.starts_with("__objc_catlist")) + return true; + if (Segment == "__OBJC" && Section.starts_with("__category")) + return true; + if (Segment == "__TEXT" && Section.starts_with("__swift")) return true; break; } @@ -7116,9 +7124,11 @@ Error BitcodeReader::materializeModule() { if (CallInst *CI = dyn_cast<CallInst>(U)) UpgradeIntrinsicCall(CI, I.second); } - if (!I.first->use_empty()) - I.first->replaceAllUsesWith(I.second); - I.first->eraseFromParent(); + if (I.first != I.second) { + if (!I.first->use_empty()) + I.first->replaceAllUsesWith(I.second); + I.first->eraseFromParent(); + } } UpgradedIntrinsics.clear(); diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 76a1d8c..f1d3e96 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -809,7 +809,7 @@ void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { // If we have a bss global going to a section that supports the // zerofill directive, do so here. - if (GVKind.isBSS() && MAI->isMachO() && TheSection->isVirtualSection()) { + if (GVKind.isBSS() && MAI->isMachO() && TheSection->isBssSection()) { if (Size == 0) Size = 1; // zerofill of 0 bytes is undefined. emitLinkage(GV, GVSym); @@ -1868,6 +1868,7 @@ void AsmPrinter::emitFunctionBody() { OutStreamer->emitLabel(MI.getOperand(0).getMCSymbol()); break; case TargetOpcode::EH_LABEL: + OutStreamer->AddComment("EH_LABEL"); OutStreamer->emitLabel(MI.getOperand(0).getMCSymbol()); // For AsynchEH, insert a Nop if followed by a trap inst // Or the exception won't be caught. 
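A side note on the BitcodeReader.cpp hunk above: hasObjCCategoryInModule() now splits the Mach-O section name at the comma and trims both halves before comparing, so a substring hit inside an unrelated name no longer triggers the check. The following is a minimal standalone sketch of just that matching logic; the helper name looksLikeObjCCategorySection is hypothetical and not part of the patch.

#include "llvm/ADT/StringRef.h"

// Classify a Mach-O "segment,section" name the way the updated
// hasObjCCategoryInModule() does: split on the comma, trim whitespace,
// then require an exact segment match plus a section-name prefix match.
static bool looksLikeObjCCategorySection(llvm::StringRef S) {
  auto [Segment, Section] = S.split(",");
  Segment = Segment.trim();
  Section = Section.trim();
  if (Segment == "__DATA" && Section.starts_with("__objc_catlist"))
    return true;
  if (Segment == "__OBJC" && Section.starts_with("__category"))
    return true;
  return Segment == "__TEXT" && Section.starts_with("__swift");
}

For example, "__DATA,__objc_catlist" and "__DATA , __objc_catlist2" both match, while a name such as "x__DATA,__objc_catlist", which the old find()-based substring search would have accepted, is now rejected because the segment comparison is exact.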
diff --git a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp index 618deef..4bf3bdf 100644 --- a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp @@ -18,6 +18,11 @@ #include "llvm/MC/MCPseudoProbe.h" #include "llvm/MC/MCStreamer.h" +#ifndef NDEBUG +#include "llvm/IR/Module.h" +#include "llvm/Support/WithColor.h" +#endif + using namespace llvm; void PseudoProbeHandler::emitPseudoProbe(uint64_t Guid, uint64_t Index, @@ -35,6 +40,9 @@ void PseudoProbeHandler::emitPseudoProbe(uint64_t Guid, uint64_t Index, uint64_t &CallerGuid = NameGuidMap[Name]; if (!CallerGuid) CallerGuid = Function::getGUIDAssumingExternalLinkage(Name); +#ifndef NDEBUG + verifyGuidExistenceInDesc(CallerGuid, Name); +#endif uint64_t CallerProbeId = PseudoProbeDwarfDiscriminator::extractProbeIndex( InlinedAt->getDiscriminator()); ReversedInlineStack.emplace_back(CallerGuid, CallerProbeId); @@ -51,4 +59,28 @@ void PseudoProbeHandler::emitPseudoProbe(uint64_t Guid, uint64_t Index, SmallVector<InlineSite, 8> InlineStack(llvm::reverse(ReversedInlineStack)); Asm->OutStreamer->emitPseudoProbe(Guid, Index, Type, Attr, Discriminator, InlineStack, Asm->CurrentFnSym); +#ifndef NDEBUG + verifyGuidExistenceInDesc( + Guid, DebugLoc ? DebugLoc->getSubprogramLinkageName() : ""); +#endif +} + +#ifndef NDEBUG +void PseudoProbeHandler::verifyGuidExistenceInDesc(uint64_t Guid, + StringRef FuncName) { + NamedMDNode *Desc = Asm->MF->getFunction().getParent()->getNamedMetadata( + PseudoProbeDescMetadataName); + assert(Desc && "pseudo probe does not exist"); + + // Keep DescGuidSet up to date. + for (size_t I = DescGuidSet.size(), E = Desc->getNumOperands(); I != E; ++I) { + const auto *MD = cast<MDNode>(Desc->getOperand(I)); + auto *ID = mdconst::extract<ConstantInt>(MD->getOperand(0)); + DescGuidSet.insert(ID->getZExtValue()); + } + + if (!DescGuidSet.contains(Guid)) + WithColor::warning() << "Guid:" << Guid << " Name:" << FuncName + << " does not exist in pseudo probe desc\n"; } +#endif diff --git a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h index f11b552..e950b23 100644 --- a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h +++ b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h @@ -15,6 +15,10 @@ #include "llvm/ADT/DenseMap.h" +#ifndef NDEBUG +#include "llvm/ADT/DenseSet.h" +#endif + namespace llvm { class AsmPrinter; @@ -26,6 +30,13 @@ class PseudoProbeHandler { // Name to GUID map, used as caching/memoization for speed. DenseMap<StringRef, uint64_t> NameGuidMap; +#ifndef NDEBUG + // All GUID in llvm.pseudo_probe_desc. 
+ DenseSet<uint64_t> DescGuidSet; + + void verifyGuidExistenceInDesc(uint64_t Guid, StringRef FuncName); +#endif + public: PseudoProbeHandler(AsmPrinter *A) : Asm(A) {}; diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp index dccd71f..13fd270 100644 --- a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp @@ -323,12 +323,6 @@ const MCExpr *WinException::getLabel(const MCSymbol *Label) { Asm->OutContext); } -const MCExpr *WinException::getLabelPlusOne(const MCSymbol *Label) { - return MCBinaryExpr::createAdd(getLabel(Label), - MCConstantExpr::create(1, Asm->OutContext), - Asm->OutContext); -} - const MCExpr *WinException::getOffset(const MCSymbol *OffsetOf, const MCSymbol *OffsetFrom) { return MCBinaryExpr::createSub( @@ -655,7 +649,7 @@ void WinException::emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo, AddComment("LabelStart"); OS.emitValue(getLabel(BeginLabel), 4); AddComment("LabelEnd"); - OS.emitValue(getLabelPlusOne(EndLabel), 4); + OS.emitValue(getLabel(EndLabel), 4); AddComment(UME.IsFinally ? "FinallyFunclet" : UME.Filter ? "FilterFunction" : "CatchAll"); OS.emitValue(FilterOrFinally, 4); @@ -950,13 +944,7 @@ void WinException::computeIP2StateTable( if (!ChangeLabel) ChangeLabel = StateChange.PreviousEndLabel; // Emit an entry indicating that PCs after 'Label' have this EH state. - // NOTE: On ARM architectures, the StateFromIp automatically takes into - // account that the return address is after the call instruction (whose EH - // state we should be using), but on other platforms we need to +1 to the - // label so that we are using the correct EH state. - const MCExpr *LabelExpression = (isAArch64 || isThumb) - ? getLabel(ChangeLabel) - : getLabelPlusOne(ChangeLabel); + const MCExpr *LabelExpression = getLabel(ChangeLabel); IPToStateTable.push_back( std::make_pair(LabelExpression, StateChange.NewState)); // FIXME: assert that NewState is between CatchLow and CatchHigh. diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.h b/llvm/lib/CodeGen/AsmPrinter/WinException.h index 638589a..47dd30c 100644 --- a/llvm/lib/CodeGen/AsmPrinter/WinException.h +++ b/llvm/lib/CodeGen/AsmPrinter/WinException.h @@ -80,7 +80,6 @@ class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer { const MCExpr *create32bitRef(const MCSymbol *Value); const MCExpr *create32bitRef(const GlobalValue *GV); const MCExpr *getLabel(const MCSymbol *Label); - const MCExpr *getLabelPlusOne(const MCSymbol *Label); const MCExpr *getOffset(const MCSymbol *OffsetOf, const MCSymbol *OffsetFrom); const MCExpr *getOffsetPlusOne(const MCSymbol *OffsetOf, const MCSymbol *OffsetFrom); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index dc81843..c21058c 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -3571,9 +3571,7 @@ class TypePromotionTransaction { } // Record the debug uses separately. They are not in the instruction's // use list, but they are replaced by RAUW. - SmallVector<DbgValueInst *> DbgValues; - findDbgValues(DbgValues, Inst, &DbgVariableRecords); - assert(DbgValues.empty()); + findDbgValues(Inst, DbgVariableRecords); // Now, we can replace the uses. 
Inst->replaceAllUsesWith(New); diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index 1286af8..974fc40 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -1884,6 +1884,14 @@ unsigned GISelValueTracking::computeNumSignBits(Register R, } break; } + case TargetOpcode::G_ASHR: { + Register Src1 = MI.getOperand(1).getReg(); + Register Src2 = MI.getOperand(2).getReg(); + FirstAnswer = computeNumSignBits(Src1, DemandedElts, Depth + 1); + if (auto C = getValidMinimumShiftAmount(Src2, DemandedElts, Depth + 1)) + FirstAnswer = std::min<uint64_t>(FirstAnswer + *C, TyBits); + break; + } case TargetOpcode::G_TRUNC: { Register Src = MI.getOperand(1).getReg(); LLT SrcTy = MRI.getType(Src); @@ -2053,6 +2061,64 @@ unsigned GISelValueTracking::computeNumSignBits(Register R, unsigned Depth) { return computeNumSignBits(R, DemandedElts, Depth); } +std::optional<ConstantRange> GISelValueTracking::getValidShiftAmountRange( + Register R, const APInt &DemandedElts, unsigned Depth) { + // Shifting more than the bitwidth is not valid. + MachineInstr &MI = *MRI.getVRegDef(R); + unsigned Opcode = MI.getOpcode(); + + LLT Ty = MRI.getType(R); + unsigned BitWidth = Ty.getScalarSizeInBits(); + + if (Opcode == TargetOpcode::G_CONSTANT) { + const APInt &ShAmt = MI.getOperand(1).getCImm()->getValue(); + if (ShAmt.uge(BitWidth)) + return std::nullopt; + return ConstantRange(ShAmt); + } + + if (Opcode == TargetOpcode::G_BUILD_VECTOR) { + const APInt *MinAmt = nullptr, *MaxAmt = nullptr; + for (unsigned I = 0, E = MI.getNumOperands() - 1; I != E; ++I) { + if (!DemandedElts[I]) + continue; + MachineInstr *Op = MRI.getVRegDef(MI.getOperand(I + 1).getReg()); + if (Op->getOpcode() != TargetOpcode::G_CONSTANT) { + MinAmt = MaxAmt = nullptr; + break; + } + + const APInt &ShAmt = Op->getOperand(1).getCImm()->getValue(); + if (ShAmt.uge(BitWidth)) + return std::nullopt; + if (!MinAmt || MinAmt->ugt(ShAmt)) + MinAmt = &ShAmt; + if (!MaxAmt || MaxAmt->ult(ShAmt)) + MaxAmt = &ShAmt; + } + assert(((!MinAmt && !MaxAmt) || (MinAmt && MaxAmt)) && + "Failed to find matching min/max shift amounts"); + if (MinAmt && MaxAmt) + return ConstantRange(*MinAmt, *MaxAmt + 1); + } + + // Use computeKnownBits to find a hidden constant/knownbits (usually type + // legalized). e.g. Hidden behind multiple bitcasts/build_vector/casts etc. + KnownBits KnownAmt = getKnownBits(R, DemandedElts, Depth); + if (KnownAmt.getMaxValue().ult(BitWidth)) + return ConstantRange::fromKnownBits(KnownAmt, /*IsSigned=*/false); + + return std::nullopt; +} + +std::optional<uint64_t> GISelValueTracking::getValidMinimumShiftAmount( + Register R, const APInt &DemandedElts, unsigned Depth) { + if (std::optional<ConstantRange> AmtRange = + getValidShiftAmountRange(R, DemandedElts, Depth)) + return AmtRange->getUnsignedMin().getZExtValue(); + return std::nullopt; +} + void GISelValueTrackingAnalysisLegacy::getAnalysisUsage( AnalysisUsage &AU) const { AU.setPreservesAll(); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index d7280ea..dc5dfab 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2189,23 +2189,11 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, unsigned Op = ID == Intrinsic::lifetime_start ? 
TargetOpcode::LIFETIME_START : TargetOpcode::LIFETIME_END; - // Get the underlying objects for the location passed on the lifetime - // marker. - SmallVector<const Value *, 4> Allocas; - getUnderlyingObjects(CI.getArgOperand(1), Allocas); - - // Iterate over each underlying object, creating lifetime markers for each - // static alloca. Quit if we find a non-static alloca. - for (const Value *V : Allocas) { - const AllocaInst *AI = dyn_cast<AllocaInst>(V); - if (!AI) - continue; - - if (!AI->isStaticAlloca()) - return true; + const AllocaInst *AI = cast<AllocaInst>(CI.getArgOperand(1)); + if (!AI->isStaticAlloca()) + return true; - MIRBuilder.buildInstr(Op).addFrameIndex(getOrCreateFrameIndex(*AI)); - } + MIRBuilder.buildInstr(Op).addFrameIndex(getOrCreateFrameIndex(*AI)); return true; } case Intrinsic::fake_use: { diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 11b3ac8..ed7b07f 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -10120,14 +10120,10 @@ LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) { return Legalized; } - bool IsVolatile = MemOp->isVolatile(); - // Don't try to optimize volatile. - if (IsVolatile) - return UnableToLegalize; - if (MaxLen && KnownLen > MaxLen) return UnableToLegalize; + bool IsVolatile = MemOp->isVolatile(); if (Opc == TargetOpcode::G_MEMCPY) { auto &MF = *MI.getParent()->getParent(); const auto &TLI = *MF.getSubtarget().getTargetLowering(); diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index d2b2edf..df162fc 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -600,87 +600,113 @@ static Value *getMask(Value *WideMask, unsigned Factor, bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) { - Value *LoadedVal = DI->getOperand(0); - if (!LoadedVal->hasOneUse() || !isa<LoadInst, VPIntrinsic>(LoadedVal)) + Instruction *LoadedVal = dyn_cast<Instruction>(DI->getOperand(0)); + if (!LoadedVal || !LoadedVal->hasOneUse()) + return false; + + auto *LI = dyn_cast<LoadInst>(LoadedVal); + auto *II = dyn_cast<IntrinsicInst>(LoadedVal); + if (!LI && !II) return false; const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); assert(Factor && "unexpected deinterleave intrinsic"); Value *Mask = nullptr; - if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) { - if (VPLoad->getIntrinsicID() != Intrinsic::vp_load) + if (LI) { + if (!LI->isSimple()) return false; + + LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI + << " and factor = " << Factor << "\n"); + } else { + assert(II); + // Check mask operand. Handle both all-true/false and interleaved mask. 
- Value *WideMask = VPLoad->getOperand(1); - Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI)); - if (!Mask) + Value *WideMask; + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::vp_load: + WideMask = II->getOperand(1); + break; + case Intrinsic::masked_load: + WideMask = II->getOperand(2); + break; + } - LLVM_DEBUG(dbgs() << "IA: Found a vp.load with deinterleave intrinsic " - << *DI << " and factor = " << Factor << "\n"); - } else { - auto *LI = cast<LoadInst>(LoadedVal); - if (!LI->isSimple()) + Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI)); + if (!Mask) return false; - LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI - << " and factor = " << Factor << "\n"); + LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave" + << " intrinsic " << *DI << " and factor = " + << Factor << "\n"); } // Try and match this with target specific intrinsics. - if (!TLI->lowerDeinterleaveIntrinsicToLoad(cast<Instruction>(LoadedVal), Mask, - DI)) + if (!TLI->lowerDeinterleaveIntrinsicToLoad(LoadedVal, Mask, DI)) return false; DeadInsts.insert(DI); // We now have a target-specific load, so delete the old one. - DeadInsts.insert(cast<Instruction>(LoadedVal)); + DeadInsts.insert(LoadedVal); return true; } bool InterleavedAccessImpl::lowerInterleaveIntrinsic( - IntrinsicInst *II, SmallSetVector<Instruction *, 32> &DeadInsts) { - if (!II->hasOneUse()) + IntrinsicInst *IntII, SmallSetVector<Instruction *, 32> &DeadInsts) { + if (!IntII->hasOneUse()) return false; - Value *StoredBy = II->user_back(); - if (!isa<StoreInst, VPIntrinsic>(StoredBy)) + Instruction *StoredBy = dyn_cast<Instruction>(IntII->user_back()); + if (!StoredBy) + return false; + auto *SI = dyn_cast<StoreInst>(StoredBy); + auto *II = dyn_cast<IntrinsicInst>(StoredBy); + if (!SI && !II) return false; - SmallVector<Value *, 8> InterleaveValues(II->args()); - const unsigned Factor = getInterleaveIntrinsicFactor(II->getIntrinsicID()); + SmallVector<Value *, 8> InterleaveValues(IntII->args()); + const unsigned Factor = getInterleaveIntrinsicFactor(IntII->getIntrinsicID()); assert(Factor && "unexpected interleave intrinsic"); Value *Mask = nullptr; - if (auto *VPStore = dyn_cast<VPIntrinsic>(StoredBy)) { - if (VPStore->getIntrinsicID() != Intrinsic::vp_store) + if (II) { + // Check mask operand. Handle both all-true/false and interleaved mask. + Value *WideMask; + switch (II->getIntrinsicID()) { + default: return false; - - Value *WideMask = VPStore->getOperand(2); + case Intrinsic::vp_store: + WideMask = II->getOperand(2); + break; + case Intrinsic::masked_store: + WideMask = II->getOperand(3); + break; + } Mask = getMask(WideMask, Factor, cast<VectorType>(InterleaveValues[0]->getType())); if (!Mask) return false; - LLVM_DEBUG(dbgs() << "IA: Found a vp.store with interleave intrinsic " - << *II << " and factor = " << Factor << "\n"); + LLVM_DEBUG(dbgs() << "IA: Found a vp.store or masked.store with interleave" + << " intrinsic " << *IntII << " and factor = " + << Factor << "\n"); } else { - auto *SI = cast<StoreInst>(StoredBy); if (!SI->isSimple()) return false; - LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic " << *II - << " and factor = " << Factor << "\n"); + LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic " + << *IntII << " and factor = " << Factor << "\n"); } // Try and match this with target specific intrinsics. 
- if (!TLI->lowerInterleaveIntrinsicToStore(cast<Instruction>(StoredBy), Mask, - InterleaveValues)) + if (!TLI->lowerInterleaveIntrinsicToStore(StoredBy, Mask, InterleaveValues)) return false; // We now have a target-specific store, so delete the old one. - DeadInsts.insert(cast<Instruction>(StoredBy)); - DeadInsts.insert(II); + DeadInsts.insert(StoredBy); + DeadInsts.insert(IntII); return true; } diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 38ad582..429a17a 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -211,9 +211,8 @@ void MachineFunction::init() { ConstantPool = new (Allocator) MachineConstantPool(getDataLayout()); Alignment = STI->getTargetLowering()->getMinFunctionAlignment(); - // FIXME: Shouldn't use pref alignment if explicit alignment is set on F. // FIXME: Use Function::hasOptSize(). - if (!F.hasFnAttribute(Attribute::OptimizeForSize)) + if (!F.getAlign() && !F.hasFnAttribute(Attribute::OptimizeForSize)) Alignment = std::max(Alignment, STI->getTargetLowering()->getPrefFunctionAlignment()); diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index b38a4d1c..90005bd 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -4279,8 +4279,8 @@ void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits, !TII->isGlobalMemoryObject(FromMI) && !TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) { SDep Pred = Dep; - Pred.setSUnit(Src); - Dst->addPred(Pred); + Pred.setSUnit(From); + To->addPred(Pred); } } } diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 76cba29..9d5c39c 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -771,24 +771,6 @@ static bool isSchedBoundary(MachineBasicBlock::iterator MI, MI->isFakeUse(); } -/// A region of an MBB for scheduling. -namespace { -struct SchedRegion { - /// RegionBegin is the first instruction in the scheduling region, and - /// RegionEnd is either MBB->end() or the scheduling boundary after the - /// last instruction in the scheduling region. These iterators cannot refer - /// to instructions outside of the identified scheduling region because - /// those may be reordered before scheduling this region. - MachineBasicBlock::iterator RegionBegin; - MachineBasicBlock::iterator RegionEnd; - unsigned NumRegionInstrs; - - SchedRegion(MachineBasicBlock::iterator B, MachineBasicBlock::iterator E, - unsigned N) : - RegionBegin(B), RegionEnd(E), NumRegionInstrs(N) {} -}; -} // end anonymous namespace - using MBBRegionsVector = SmallVector<SchedRegion, 16>; static void @@ -3725,7 +3707,8 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, RegionPolicy.OnlyBottomUp = true; // Allow the subtarget to override default policy. - MF.getSubtarget().overrideSchedPolicy(RegionPolicy, NumRegionInstrs); + SchedRegion Region(Begin, End, NumRegionInstrs); + MF.getSubtarget().overrideSchedPolicy(RegionPolicy, Region); // After subtarget overrides, apply command line options. if (!EnableRegPressure) { @@ -4338,7 +4321,8 @@ void PostGenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, RegionPolicy.OnlyBottomUp = false; // Allow the subtarget to override default policy. 
- MF.getSubtarget().overridePostRASchedPolicy(RegionPolicy, NumRegionInstrs); + SchedRegion Region(Begin, End, NumRegionInstrs); + MF.getSubtarget().overridePostRASchedPolicy(RegionPolicy, Region); // After subtarget overrides, apply command line options. if (PostRADirection == MISched::TopDown) { diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp index 381249e..0b2a73b 100644 --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -5,35 +5,31 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// This file defines the RABasic function pass, which provides a minimal -// implementation of the basic register allocator. -// +/// +/// \file +/// This file defines the RABasic function pass, which provides a minimal +/// implementation of the basic register allocator. +/// //===----------------------------------------------------------------------===// +#include "RegAllocBasic.h" #include "AllocationOrder.h" -#include "RegAllocBase.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveDebugVariables.h" #include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" -#include "llvm/CodeGen/Spiller.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include <queue> using namespace llvm; @@ -42,89 +38,8 @@ using namespace llvm; static RegisterRegAlloc basicRegAlloc("basic", "basic register allocator", createBasicRegisterAllocator); -namespace { - struct CompSpillWeight { - bool operator()(const LiveInterval *A, const LiveInterval *B) const { - return A->weight() < B->weight(); - } - }; -} - -namespace { -/// RABasic provides a minimal implementation of the basic register allocation -/// algorithm. It prioritizes live virtual registers by spill weight and spills -/// whenever a register is unavailable. This is not practical in production but -/// provides a useful baseline both for measuring other allocators and comparing -/// the speed of the basic algorithm against other styles of allocators. -class RABasic : public MachineFunctionPass, - public RegAllocBase, - private LiveRangeEdit::Delegate { - // context - MachineFunction *MF = nullptr; - - // state - std::unique_ptr<Spiller> SpillerInstance; - std::priority_queue<const LiveInterval *, std::vector<const LiveInterval *>, - CompSpillWeight> - Queue; - - // Scratch space. Allocated here to avoid repeated malloc calls in - // selectOrSplit(). - BitVector UsableRegs; - - bool LRE_CanEraseVirtReg(Register) override; - void LRE_WillShrinkVirtReg(Register) override; - -public: - RABasic(const RegAllocFilterFunc F = nullptr); - - /// Return the pass name. - StringRef getPassName() const override { return "Basic Register Allocator"; } - - /// RABasic analysis usage. 
- void getAnalysisUsage(AnalysisUsage &AU) const override; - - void releaseMemory() override; - - Spiller &spiller() override { return *SpillerInstance; } - - void enqueueImpl(const LiveInterval *LI) override { Queue.push(LI); } - - const LiveInterval *dequeue() override { - if (Queue.empty()) - return nullptr; - const LiveInterval *LI = Queue.top(); - Queue.pop(); - return LI; - } - - MCRegister selectOrSplit(const LiveInterval &VirtReg, - SmallVectorImpl<Register> &SplitVRegs) override; - - /// Perform register allocation. - bool runOnMachineFunction(MachineFunction &mf) override; - - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().setNoPHIs(); - } - - MachineFunctionProperties getClearedProperties() const override { - return MachineFunctionProperties().setIsSSA(); - } - - // Helper for spilling all live virtual registers currently unified under preg - // that interfere with the most recently queried lvr. Return true if spilling - // was successful, and append any new spilled/split intervals to splitLVRs. - bool spillInterferences(const LiveInterval &VirtReg, MCRegister PhysReg, - SmallVectorImpl<Register> &SplitVRegs); - - static char ID; -}; - char RABasic::ID = 0; -} // end anonymous namespace - char &llvm::RABasicID = RABasic::ID; INITIALIZE_PASS_BEGIN(RABasic, "regallocbasic", "Basic Register Allocator", diff --git a/llvm/lib/CodeGen/RegAllocBasic.h b/llvm/lib/CodeGen/RegAllocBasic.h new file mode 100644 index 0000000..004bc1a --- /dev/null +++ b/llvm/lib/CodeGen/RegAllocBasic.h @@ -0,0 +1,104 @@ +//===-- RegAllocBasic.h - Basic Register Allocator Header -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares the RABasic class, which provides a minimal +/// implementation of the basic register allocator. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_REGALLOCBASIC_H +#define LLVM_CODEGEN_REGALLOCBASIC_H + +#include "RegAllocBase.h" +#include "llvm/CodeGen/LiveRangeEdit.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Spiller.h" +#include <queue> + +namespace llvm { + +struct CompSpillWeight { + bool operator()(const LiveInterval *A, const LiveInterval *B) const { + return A->weight() < B->weight(); + } +}; + +/// RABasic provides a minimal implementation of the basic register allocation +/// algorithm. It prioritizes live virtual registers by spill weight and spills +/// whenever a register is unavailable. This is not practical in production but +/// provides a useful baseline both for measuring other allocators and comparing +/// the speed of the basic algorithm against other styles of allocators. +class LLVM_LIBRARY_VISIBILITY RABasic : public MachineFunctionPass, + public RegAllocBase, + private LiveRangeEdit::Delegate { + // context + MachineFunction *MF = nullptr; + + // state + std::unique_ptr<Spiller> SpillerInstance; + std::priority_queue<const LiveInterval *, std::vector<const LiveInterval *>, + CompSpillWeight> + Queue; + + // Scratch space. Allocated here to avoid repeated malloc calls in + // selectOrSplit(). 
+ BitVector UsableRegs; + + bool LRE_CanEraseVirtReg(Register) override; + void LRE_WillShrinkVirtReg(Register) override; + +public: + RABasic(const RegAllocFilterFunc F = nullptr); + + /// Return the pass name. + StringRef getPassName() const override { return "Basic Register Allocator"; } + + /// RABasic analysis usage. + void getAnalysisUsage(AnalysisUsage &AU) const override; + + void releaseMemory() override; + + Spiller &spiller() override { return *SpillerInstance; } + + void enqueueImpl(const LiveInterval *LI) override { Queue.push(LI); } + + const LiveInterval *dequeue() override { + if (Queue.empty()) + return nullptr; + const LiveInterval *LI = Queue.top(); + Queue.pop(); + return LI; + } + + MCRegister selectOrSplit(const LiveInterval &VirtReg, + SmallVectorImpl<Register> &SplitVRegs) override; + + /// Perform register allocation. + bool runOnMachineFunction(MachineFunction &mf) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoPHIs); + } + + MachineFunctionProperties getClearedProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } + + // Helper for spilling all live virtual registers currently unified under preg + // that interfere with the most recently queried lvr. Return true if spilling + // was successful, and append any new spilled/split intervals to splitLVRs. + bool spillInterferences(const LiveInterval &VirtReg, MCRegister PhysReg, + SmallVectorImpl<Register> &SplitVRegs); + + static char ID; +}; +} // namespace llvm +#endif diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp index 9962070..908ed96 100644 --- a/llvm/lib/CodeGen/SafeStack.cpp +++ b/llvm/lib/CodeGen/SafeStack.cpp @@ -614,6 +614,13 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( Use &U = *AI->use_begin(); Instruction *User = cast<Instruction>(U.getUser()); + // Drop lifetime markers now that this is no longer an alloca. + // SafeStack has already performed its own stack coloring. + if (User->isLifetimeStartOrEnd()) { + User->eraseFromParent(); + continue; + } + Instruction *InsertBefore; if (auto *PHI = dyn_cast<PHINode>(User)) InsertBefore = PHI->getIncomingBlock(U)->getTerminator(); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index fed5e72..d3df434 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12375,11 +12375,8 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))) { // Any flags available in a select/setcc fold will be on the setcc as they // migrated from fcmp - Flags = N0->getFlags(); - SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1, - N2, N0.getOperand(2)); - SelectNode->setFlags(Flags); - return SelectNode; + return DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1, N2, + N0.getOperand(2), N0->getFlags()); } if (SDValue ABD = foldSelectToABD(Cond0, Cond1, N1, N2, CC, DL)) @@ -16738,7 +16735,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { // Fold freeze(op(x, ...)) -> op(freeze(x), ...). // Try to push freeze through instructions that propagate but don't produce // poison as far as possible. 
If an operand of freeze follows three - // conditions 1) one-use, and 2) does not produce poison then push + // conditions 1) one-use, 2) does not produce poison, and 3) has all but one + // guaranteed-non-poison operands (or is a BUILD_VECTOR or similar) then push // the freeze through to the operands that are not guaranteed non-poison. // NOTE: we will strip poison-generating flags, so ignore them here. if (DAG.canCreateUndefOrPoison(N0, /*PoisonOnly*/ false, @@ -16746,6 +16744,18 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { N0->getNumValues() != 1 || !N0->hasOneUse()) return SDValue(); + // TOOD: we should always allow multiple operands, however this increases the + // likelihood of infinite loops due to the ReplaceAllUsesOfValueWith call + // below causing later nodes that share frozen operands to fold again and no + // longer being able to confirm other operands are not poison due to recursion + // depth limits on isGuaranteedNotToBeUndefOrPoison. + bool AllowMultipleMaybePoisonOperands = + N0.getOpcode() == ISD::SELECT_CC || N0.getOpcode() == ISD::SETCC || + N0.getOpcode() == ISD::BUILD_VECTOR || + N0.getOpcode() == ISD::BUILD_PAIR || + N0.getOpcode() == ISD::VECTOR_SHUFFLE || + N0.getOpcode() == ISD::CONCAT_VECTORS || N0.getOpcode() == ISD::FMUL; + // Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all // ones" or "constant" into something that depends on FrozenUndef. We can // instead pick undef values to keep those properties, while at the same time @@ -16772,8 +16782,16 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false, /*Depth*/ 1)) continue; - if (MaybePoisonOperands.insert(Op).second) + bool HadMaybePoisonOperands = !MaybePoisonOperands.empty(); + bool IsNewMaybePoisonOperand = MaybePoisonOperands.insert(Op).second; + if (IsNewMaybePoisonOperand) MaybePoisonOperandNumbers.push_back(OpNo); + if (!HadMaybePoisonOperands) + continue; + if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) { + // Multiple maybe-poison ops when not allowed - bail out. + return SDValue(); + } } // NOTE: the whole op may be not guaranteed to not be undef or poison because // it could create undef or poison due to it's poison-generating flags. @@ -22727,11 +22745,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) { const auto *LifetimeEnd = cast<LifetimeSDNode>(N); - if (!LifetimeEnd->hasOffset()) - return SDValue(); - - const BaseIndexOffset LifetimeEndBase(N->getOperand(1), SDValue(), - LifetimeEnd->getOffset(), false); + const BaseIndexOffset LifetimeEndBase(N->getOperand(1), SDValue(), 0, false); // We walk up the chains to find stores. SmallVector<SDValue, 8> Chains = {N->getOperand(0)}; @@ -29418,9 +29432,8 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { return {false /*isVolatile*/, /*isAtomic*/ false, LN->getOperand(1), - (LN->hasOffset()) ? LN->getOffset() : 0, - (LN->hasOffset()) ? LocationSize::precise(LN->getSize()) - : LocationSize::beforeOrAfterPointer(), + 0, + LocationSize::precise(LN->getSize()), (MachineMemOperand *)nullptr}; // Default. 
return {false /*isvolatile*/, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 7266940..74172b2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2785,19 +2785,17 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(SDNode *Node, // In strict mode, we must avoid spurious exceptions, and therefore // must make sure to only emit a single STRICT_SINT_TO_FP. SDValue InCvt = DAG.getSelect(dl, SrcVT, SignBitTest, Or, Op0); - Fast = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, { DestVT, MVT::Other }, - { Node->getOperand(0), InCvt }); - Slow = DAG.getNode(ISD::STRICT_FADD, dl, { DestVT, MVT::Other }, - { Fast.getValue(1), Fast, Fast }); - Chain = Slow.getValue(1); // The STRICT_SINT_TO_FP inherits the exception mode from the // incoming STRICT_UINT_TO_FP node; the STRICT_FADD node can // never raise any exception. SDNodeFlags Flags; Flags.setNoFPExcept(Node->getFlags().hasNoFPExcept()); - Fast->setFlags(Flags); + Fast = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DestVT, MVT::Other}, + {Node->getOperand(0), InCvt}, Flags); Flags.setNoFPExcept(true); - Slow->setFlags(Flags); + Slow = DAG.getNode(ISD::STRICT_FADD, dl, {DestVT, MVT::Other}, + {Fast.getValue(1), Fast, Fast}, Flags); + Chain = Slow.getValue(1); } else { SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Or); Slow = DAG.getNode(ISD::FADD, dl, DestVT, SignCvt, SignCvt); @@ -3407,14 +3405,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { EVT VT = Operand.getValueType(); SDValue One = DAG.getConstantFP(1.0, dl, VT); SDValue Chain = DAG.getEntryNode(); - SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other}, - {Chain, Operand, One}); - // Propagate existing flags on canonicalize, and additionally set // NoFPExcept. 
SDNodeFlags CanonicalizeFlags = Node->getFlags(); CanonicalizeFlags.setNoFPExcept(true); - Mul->setFlags(CanonicalizeFlags); + SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other}, + {Chain, Operand, One}, CanonicalizeFlags); Results.push_back(Mul); break; @@ -4150,15 +4146,14 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Tmp2 = Node->getOperand(1); Tmp3 = Node->getOperand(2); if (Tmp1.getOpcode() == ISD::SETCC) { - Tmp1 = DAG.getSelectCC(dl, Tmp1.getOperand(0), Tmp1.getOperand(1), - Tmp2, Tmp3, - cast<CondCodeSDNode>(Tmp1.getOperand(2))->get()); + Tmp1 = DAG.getSelectCC( + dl, Tmp1.getOperand(0), Tmp1.getOperand(1), Tmp2, Tmp3, + cast<CondCodeSDNode>(Tmp1.getOperand(2))->get(), Node->getFlags()); } else { - Tmp1 = DAG.getSelectCC(dl, Tmp1, - DAG.getConstant(0, dl, Tmp1.getValueType()), - Tmp2, Tmp3, ISD::SETNE); + Tmp1 = + DAG.getSelectCC(dl, Tmp1, DAG.getConstant(0, dl, Tmp1.getValueType()), + Tmp2, Tmp3, ISD::SETNE, Node->getFlags()); } - Tmp1->setFlags(Node->getFlags()); Results.push_back(Tmp1); break; case ISD::BR_JT: { @@ -4296,8 +4291,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { EVT Tmp1VT = Tmp1.getValueType(); Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, VT, Tmp1, Tmp2, DAG.getBoolConstant(true, dl, VT, Tmp1VT), - DAG.getBoolConstant(false, dl, VT, Tmp1VT), Tmp3); - Tmp1->setFlags(Node->getFlags()); + DAG.getBoolConstant(false, dl, VT, Tmp1VT), Tmp3, + Node->getFlags()); Results.push_back(Tmp1); break; } @@ -4335,8 +4330,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { if (TLI.isCondCodeLegalOrCustom(InvCC, Tmp1.getSimpleValueType())) { // Use the new condition code and swap true and false Legalized = true; - Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp4, Tmp3, InvCC); - Tmp1->setFlags(Node->getFlags()); + Tmp1 = + DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp4, Tmp3, InvCC, Node->getFlags()); } else { // If The inverse is not legal, then try to swap the arguments using // the inverse condition code. @@ -4345,8 +4340,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { // The swapped inverse condition is legal, so swap true and false, // lhs and rhs. Legalized = true; - Tmp1 = DAG.getSelectCC(dl, Tmp2, Tmp1, Tmp4, Tmp3, SwapInvCC); - Tmp1->setFlags(Node->getFlags()); + Tmp1 = DAG.getSelectCC(dl, Tmp2, Tmp1, Tmp4, Tmp3, SwapInvCC, + Node->getFlags()); } } @@ -4365,15 +4360,14 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { // If we expanded the SETCC by swapping LHS and RHS, or by inverting the // condition code, create a new SELECT_CC node. if (CC.getNode()) { - Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), - Tmp1, Tmp2, Tmp3, Tmp4, CC); + Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1, + Tmp2, Tmp3, Tmp4, CC, Node->getFlags()); } else { Tmp2 = DAG.getConstant(0, dl, Tmp1.getValueType()); CC = DAG.getCondCode(ISD::SETNE); Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1, - Tmp2, Tmp3, Tmp4, CC); + Tmp2, Tmp3, Tmp4, CC, Node->getFlags()); } - Tmp1->setFlags(Node->getFlags()); } Results.push_back(Tmp1); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index f908a66..d2ecc133 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -2087,11 +2087,10 @@ void VectorLegalizer::ExpandSETCC(SDNode *Node, // Otherwise, SETCC for the given comparison type must be completely // illegal; expand it into a SELECT_CC. 
EVT VT = Node->getValueType(0); - LHS = - DAG.getNode(ISD::SELECT_CC, dl, VT, LHS, RHS, - DAG.getBoolConstant(true, dl, VT, LHS.getValueType()), - DAG.getBoolConstant(false, dl, VT, LHS.getValueType()), CC); - LHS->setFlags(Node->getFlags()); + LHS = DAG.getNode(ISD::SELECT_CC, dl, VT, LHS, RHS, + DAG.getBoolConstant(true, dl, VT, LHS.getValueType()), + DAG.getBoolConstant(false, dl, VT, LHS.getValueType()), + CC, Node->getFlags()); } Results.push_back(LHS); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 32c5961..1661814 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -372,9 +372,9 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_OverflowOp(SDNode *N, SDVTList ScalarVTs = DAG.getVTList( ResVT.getVectorElementType(), OvVT.getVectorElementType()); - SDNode *ScalarNode = DAG.getNode( - N->getOpcode(), DL, ScalarVTs, ScalarLHS, ScalarRHS).getNode(); - ScalarNode->setFlags(N->getFlags()); + SDNode *ScalarNode = DAG.getNode(N->getOpcode(), DL, ScalarVTs, + {ScalarLHS, ScalarRHS}, N->getFlags()) + .getNode(); // Replace the other vector result not being explicitly scalarized here. unsigned OtherNo = 1 - ResNo; @@ -1898,7 +1898,7 @@ SDValue DAGTypeLegalizer::UnrollVectorOp_StrictFP(SDNode *N, unsigned ResNE) { NE = ResNE; //The results of each unrolled operation, including the chain. - EVT ChainVTs[] = {EltVT, MVT::Other}; + SDVTList ChainVTs = DAG.getVTList(EltVT, MVT::Other); SmallVector<SDValue, 8> Chains; unsigned i; @@ -1914,8 +1914,8 @@ SDValue DAGTypeLegalizer::UnrollVectorOp_StrictFP(SDNode *N, unsigned ResNE) { Operands[j] = Operand; } } - SDValue Scalar = DAG.getNode(N->getOpcode(), dl, ChainVTs, Operands); - Scalar.getNode()->setFlags(N->getFlags()); + SDValue Scalar = + DAG.getNode(N->getOpcode(), dl, ChainVTs, Operands, N->getFlags()); //Add in the scalar as well as its chain value to the //result vectors. 
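Note: the SelectionDAG hunks above and below all apply the same mechanical rewrite: rather than creating a node and then mutating it with setFlags, the SDNodeFlags are handed straight to the getNode/getSelectCC overloads that accept them. A minimal sketch of the call-site pattern under that change; the wrapper names legacyForm/updatedForm and their parameter lists are illustrative only and assume the usual SelectionDAG declarations are in scope:

// Before: create the node first, then patch its flags afterwards.
static SDValue legacyForm(SelectionDAG &DAG, const SDLoc &DL, unsigned Opcode,
                          SDVTList VTs, ArrayRef<SDValue> Ops, SDNode *N) {
  SDValue Res = DAG.getNode(Opcode, DL, VTs, Ops);
  Res->setFlags(N->getFlags());
  return Res;
}

// After: the flags are supplied at creation time, so CSE and later combines
// always see the node with its final flags and no separate mutation is needed.
static SDValue updatedForm(SelectionDAG &DAG, const SDLoc &DL, unsigned Opcode,
                           SDVTList VTs, ArrayRef<SDValue> Ops, SDNode *N) {
  return DAG.getNode(Opcode, DL, VTs, Ops, N->getFlags());
}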
@@ -1956,10 +1956,10 @@ void DAGTypeLegalizer::SplitVecRes_OverflowOp(SDNode *N, unsigned ResNo, unsigned Opcode = N->getOpcode(); SDVTList LoVTs = DAG.getVTList(LoResVT, LoOvVT); SDVTList HiVTs = DAG.getVTList(HiResVT, HiOvVT); - SDNode *LoNode = DAG.getNode(Opcode, dl, LoVTs, LoLHS, LoRHS).getNode(); - SDNode *HiNode = DAG.getNode(Opcode, dl, HiVTs, HiLHS, HiRHS).getNode(); - LoNode->setFlags(N->getFlags()); - HiNode->setFlags(N->getFlags()); + SDNode *LoNode = + DAG.getNode(Opcode, dl, LoVTs, {LoLHS, LoRHS}, N->getFlags()).getNode(); + SDNode *HiNode = + DAG.getNode(Opcode, dl, HiVTs, {HiLHS, HiRHS}, N->getFlags()).getNode(); Lo = SDValue(LoNode, ResNo); Hi = SDValue(HiNode, ResNo); @@ -2669,10 +2669,8 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOpWithTwoResults(SDNode *N, else std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); - Lo = DAG.getNode(N->getOpcode(), dl, {LoVT, LoVT1}, Lo); - Hi = DAG.getNode(N->getOpcode(), dl, {HiVT, HiVT1}, Hi); - Lo->setFlags(N->getFlags()); - Hi->setFlags(N->getFlags()); + Lo = DAG.getNode(N->getOpcode(), dl, {LoVT, LoVT1}, Lo, N->getFlags()); + Hi = DAG.getNode(N->getOpcode(), dl, {HiVT, HiVT1}, Hi, N->getFlags()); SDNode *HiNode = Hi.getNode(); SDNode *LoNode = Lo.getNode(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 2458115..773ff48 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -786,10 +786,7 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { break; case ISD::LIFETIME_START: case ISD::LIFETIME_END: - if (cast<LifetimeSDNode>(N)->hasOffset()) { - ID.AddInteger(cast<LifetimeSDNode>(N)->getSize()); - ID.AddInteger(cast<LifetimeSDNode>(N)->getOffset()); - } + ID.AddInteger(cast<LifetimeSDNode>(N)->getSize()); break; case ISD::PSEUDO_PROBE: ID.AddInteger(cast<PseudoProbeSDNode>(N)->getGuid()); @@ -3036,7 +3033,7 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, return TLI->isSplatValueForTargetNode(V, DemandedElts, UndefElts, *this, Depth); break; -} + } // We don't support other cases than those above for scalable vectors at // the moment. @@ -9364,7 +9361,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl, SDValue Chain, int FrameIndex, - int64_t Size, int64_t Offset) { + int64_t Size) { const unsigned Opcode = IsStart ? 
ISD::LIFETIME_START : ISD::LIFETIME_END; const auto VTs = getVTList(MVT::Other); SDValue Ops[2] = { @@ -9377,13 +9374,12 @@ SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl, AddNodeIDNode(ID, Opcode, VTs, Ops); ID.AddInteger(FrameIndex); ID.AddInteger(Size); - ID.AddInteger(Offset); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) return SDValue(E, 0); - LifetimeSDNode *N = newSDNode<LifetimeSDNode>( - Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs, Size, Offset); + LifetimeSDNode *N = newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(), + dl.getDebugLoc(), VTs, Size); createOperands(N, Ops); CSEMap.InsertNode(N, IP); InsertNode(N); @@ -10563,7 +10559,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef<SDUse> Ops) { switch (Ops.size()) { case 0: return getNode(Opcode, DL, VT); - case 1: return getNode(Opcode, DL, VT, static_cast<const SDValue>(Ops[0])); + case 1: return getNode(Opcode, DL, VT, Ops[0].get()); case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]); case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]); default: break; @@ -10699,7 +10695,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops) { - return getNode(Opcode, DL, getVTList(ResultTys), Ops); + SDNodeFlags Flags; + if (Inserter) + Flags = Inserter->getFlags(); + return getNode(Opcode, DL, getVTList(ResultTys), Ops, Flags); +} + +SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, + ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops, + const SDNodeFlags Flags) { + return getNode(Opcode, DL, getVTList(ResultTys), Ops, Flags); } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, @@ -10855,26 +10860,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, (Ops[2]->getAsZExtVal() == 0 || Ops[2]->getAsZExtVal() == 1) && "Invalid STRICT_FP_ROUND!"); break; -#if 0 - // FIXME: figure out how to safely handle things like - // int foo(int x) { return 1 << (x & 255); } - // int bar() { return foo(256); } - case ISD::SRA_PARTS: - case ISD::SRL_PARTS: - case ISD::SHL_PARTS: - if (N3.getOpcode() == ISD::SIGN_EXTEND_INREG && - cast<VTSDNode>(N3.getOperand(1))->getVT() != MVT::i1) - return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0)); - else if (N3.getOpcode() == ISD::AND) - if (ConstantSDNode *AndRHS = dyn_cast<ConstantSDNode>(N3.getOperand(1))) { - // If the and is only masking out bits that cannot effect the shift, - // eliminate the and. - unsigned NumBits = VT.getScalarSizeInBits()*2; - if ((AndRHS->getValue() & (NumBits-1)) == NumBits-1) - return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0)); - } - break; -#endif } // Memoize the node unless it returns a glue result. 
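Note: the getLifetimeNode change above, together with the BaseIndexOffset and SelectionDAGBuilder hunks that follow, drops the per-node byte offset: lifetime markers are now required to point directly at an alloca (see the AutoUpgrade and Verifier changes later in this diff), so the offset is always zero and a lifetime node is identified by its frame index and size alone. A minimal sketch of the updated call; emitLifetime and its parameters are illustrative and assume the usual SelectionDAG context:

// After this change there is no offset operand to compute or thread through;
// the node is keyed only by frame index and object size.
static SDValue emitLifetime(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
                            bool IsStart, int FrameIndex, int64_t ObjectSize) {
  return DAG.getLifetimeNode(IsStart, DL, Chain, FrameIndex, ObjectSize);
}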
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp index da92aaa..8f08046 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -303,10 +303,7 @@ BaseIndexOffset BaseIndexOffset::match(const SDNode *N, if (const auto *LS0 = dyn_cast<LSBaseSDNode>(N)) return matchLSNode(LS0, DAG); if (const auto *LN = dyn_cast<LifetimeSDNode>(N)) { - if (LN->hasOffset()) - return BaseIndexOffset(LN->getOperand(1), SDValue(), LN->getOffset(), - false); - return BaseIndexOffset(LN->getOperand(1), SDValue(), false); + return BaseIndexOffset(LN->getOperand(1), SDValue(), 0, false); } return BaseIndexOffset(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 01e5312..1636465 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7596,32 +7596,17 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, const int64_t ObjectSize = cast<ConstantInt>(I.getArgOperand(0))->getSExtValue(); - Value *const ObjectPtr = I.getArgOperand(1); - SmallVector<const Value *, 4> Allocas; - getUnderlyingObjects(ObjectPtr, Allocas); + const AllocaInst *LifetimeObject = cast<AllocaInst>(I.getArgOperand(1)); - for (const Value *Alloca : Allocas) { - const AllocaInst *LifetimeObject = dyn_cast_or_null<AllocaInst>(Alloca); - - // Could not find an Alloca. - if (!LifetimeObject) - continue; - - // First check that the Alloca is static, otherwise it won't have a - // valid frame index. - auto SI = FuncInfo.StaticAllocaMap.find(LifetimeObject); - if (SI == FuncInfo.StaticAllocaMap.end()) - return; + // First check that the Alloca is static, otherwise it won't have a + // valid frame index. + auto SI = FuncInfo.StaticAllocaMap.find(LifetimeObject); + if (SI == FuncInfo.StaticAllocaMap.end()) + return; - const int FrameIndex = SI->second; - int64_t Offset; - if (GetPointerBaseWithConstantOffset( - ObjectPtr, Offset, DAG.getDataLayout()) != LifetimeObject) - Offset = -1; // Cannot determine offset from alloca to lifetime object. 
- Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex, ObjectSize, - Offset); - DAG.setRoot(Res); - } + const int FrameIndex = SI->second; + Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex, ObjectSize); + DAG.setRoot(Res); return; } case Intrinsic::pseudoprobe: { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 7fc1558..9474587 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -947,8 +947,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { << ASC->getDestAddressSpace() << ']'; } else if (const LifetimeSDNode *LN = dyn_cast<LifetimeSDNode>(this)) { - if (LN->hasOffset()) - OS << "<" << LN->getOffset() << " to " << LN->getOffset() + LN->getSize() << ">"; + OS << "<0 to " << LN->getSize() << ">"; } else if (const auto *AA = dyn_cast<AssertAlignSDNode>(this)) { OS << '<' << AA->getAlign().value() << '>'; } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index e059798..1764910 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -778,7 +778,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( case ISD::FREEZE: { SDValue N0 = Op.getOperand(0); if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts, - /*PoisonOnly=*/false)) + /*PoisonOnly=*/false, Depth + 1)) return N0; break; } @@ -3369,7 +3369,8 @@ bool TargetLowering::SimplifyDemandedVectorElts( case ISD::FREEZE: { SDValue N0 = Op.getOperand(0); if (TLO.DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts, - /*PoisonOnly=*/false)) + /*PoisonOnly=*/false, + Depth + 1)) return TLO.CombineTo(Op, N0); // TODO: Replace this with the general fold from DAGCombiner::visitFREEZE @@ -8128,7 +8129,7 @@ static bool isNonZeroModBitWidthOrUndef(SDValue Z, unsigned BW) { return ISD::matchUnaryPredicate( Z, [=](ConstantSDNode *C) { return !C || C->getAPIntValue().urem(BW) != 0; }, - /*AllowUndef=*/true, /*AllowTruncation=*/true); + /*AllowUndefs=*/true, /*AllowTruncation=*/true); } static SDValue expandVPFunnelShift(SDNode *Node, SelectionDAG &DAG) { @@ -8633,9 +8634,8 @@ TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node, return SDValue(); SDValue Op1 = Node->getOperand(0); SDValue Op2 = Node->getOperand(1); - SDValue SelCC = DAG.getSelectCC(SDLoc(Node), Op1, Op2, Op1, Op2, Pred); - SelCC->setFlags(Node->getFlags()); - return SelCC; + return DAG.getSelectCC(SDLoc(Node), Op1, Op2, Op1, Op2, Pred, + Node->getFlags()); } return SDValue(); @@ -11994,8 +11994,7 @@ SDValue TargetLowering::expandVECTOR_COMPRESS(SDNode *Node, // Get the mask value and add it to the current output position. This // either increments by 1 if MaskI is true or adds 0 otherwise. // Freeze in case we have poison/undef mask entries. 
- SDValue MaskI = - DAG.getFreeze(DAG.getExtractVectorElt(DL, MaskScalarVT, Mask, I)); + SDValue MaskI = DAG.getExtractVectorElt(DL, MaskScalarVT, Mask, I); MaskI = DAG.getFreeze(MaskI); MaskI = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, MaskI); MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, PositionVT, MaskI); diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp index 93a567e..64f1bfc0 100644 --- a/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp @@ -263,7 +263,7 @@ bool LVScope::removeElement(LVElement *Element) { return Item == Element; }; auto RemoveElement = [Element, Predicate](auto &Container) -> bool { - auto Iter = std::remove_if(Container->begin(), Container->end(), Predicate); + auto Iter = llvm::remove_if(*Container, Predicate); if (Iter != Container->end()) { Container->erase(Iter, Container->end()); Element->resetParent(); diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index cca9959..ffc7696 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -738,6 +738,32 @@ static inline uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo) { return Hi == 63 ? Val >> Lo : (Val & (((1ULL << (Hi + 1)) - 1))) >> Lo; } +// Calculate the adjusted page delta between dest and PC. The code is copied +// from lld and see comments there for more details. +static uint64_t getLoongArchPageDelta(uint64_t dest, uint64_t pc, + uint32_t type) { + uint64_t pcalau12i_pc; + switch (type) { + case ELF::R_LARCH_PCALA64_LO20: + case ELF::R_LARCH_GOT64_PC_LO20: + pcalau12i_pc = pc - 8; + break; + case ELF::R_LARCH_PCALA64_HI12: + case ELF::R_LARCH_GOT64_PC_HI12: + pcalau12i_pc = pc - 12; + break; + default: + pcalau12i_pc = pc; + break; + } + uint64_t result = (dest & ~0xfffULL) - (pcalau12i_pc & ~0xfffULL); + if (dest & 0x800) + result += 0x1000 - 0x1'0000'0000; + if (result & 0x8000'0000) + result += 0x1'0000'0000; + return result; +} + void RuntimeDyldELF::resolveLoongArch64Relocation(const SectionEntry &Section, uint64_t Offset, uint64_t Value, uint32_t Type, @@ -789,10 +815,7 @@ void RuntimeDyldELF::resolveLoongArch64Relocation(const SectionEntry &Section, case ELF::R_LARCH_GOT_PC_HI20: case ELF::R_LARCH_PCALA_HI20: { uint64_t Target = Value + Addend; - uint64_t TargetPage = - (Target + (Target & 0x800)) & ~static_cast<uint64_t>(0xfff); - uint64_t PCPage = FinalAddress & ~static_cast<uint64_t>(0xfff); - int64_t PageDelta = TargetPage - PCPage; + int64_t PageDelta = getLoongArchPageDelta(Target, FinalAddress, Type); auto Instr = support::ulittle32_t::ref(TargetPtr); uint32_t Imm31_12 = extractBits(PageDelta, /*Hi=*/31, /*Lo=*/12) << 5; Instr = (Instr & 0xfe00001f) | Imm31_12; @@ -806,6 +829,24 @@ void RuntimeDyldELF::resolveLoongArch64Relocation(const SectionEntry &Section, Instr = (Instr & 0xffc003ff) | Imm11_0; break; } + case ELF::R_LARCH_GOT64_PC_LO20: + case ELF::R_LARCH_PCALA64_LO20: { + uint64_t Target = Value + Addend; + int64_t PageDelta = getLoongArchPageDelta(Target, FinalAddress, Type); + auto Instr = support::ulittle32_t::ref(TargetPtr); + uint32_t Imm51_32 = extractBits(PageDelta, /*Hi=*/51, /*Lo=*/32) << 5; + Instr = (Instr & 0xfe00001f) | Imm51_32; + break; + } + case ELF::R_LARCH_GOT64_PC_HI12: + case ELF::R_LARCH_PCALA64_HI12: { + uint64_t Target = Value + Addend; + int64_t PageDelta = getLoongArchPageDelta(Target, 
FinalAddress, Type); + auto Instr = support::ulittle32_t::ref(TargetPtr); + uint32_t Imm63_52 = extractBits(PageDelta, /*Hi=*/63, /*Lo=*/52) << 10; + Instr = (Instr & 0xffc003ff) | Imm63_52; + break; + } case ELF::R_LARCH_ABS_HI20: { uint64_t Target = Value + Addend; auto Instr = support::ulittle32_t::ref(TargetPtr); @@ -1758,7 +1799,9 @@ RuntimeDyldELF::processRelocationRef( MemMgr.allowStubAllocation()) { resolveLoongArch64Branch(SectionID, Value, RelI, Stubs); } else if (RelType == ELF::R_LARCH_GOT_PC_HI20 || - RelType == ELF::R_LARCH_GOT_PC_LO12) { + RelType == ELF::R_LARCH_GOT_PC_LO12 || + RelType == ELF::R_LARCH_GOT64_PC_HI12 || + RelType == ELF::R_LARCH_GOT64_PC_LO20) { uint64_t GOTOffset = findOrAllocGOTEntry(Value, ELF::R_LARCH_64); resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend, RelType); @@ -2936,7 +2979,9 @@ bool RuntimeDyldELF::relocationNeedsGot(const RelocationRef &R) const { if (Arch == Triple::loongarch64) return RelTy == ELF::R_LARCH_GOT_PC_HI20 || - RelTy == ELF::R_LARCH_GOT_PC_LO12; + RelTy == ELF::R_LARCH_GOT_PC_LO12 || + RelTy == ELF::R_LARCH_GOT64_PC_HI12 || + RelTy == ELF::R_LARCH_GOT64_PC_LO20; if (Arch == Triple::x86_64) return RelTy == ELF::R_X86_64_GOTPCREL || diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 145ef10..e5a4e1e 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -404,6 +404,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { break; case CallingConv::AMDGPU_KERNEL: Out << "amdgpu_kernel"; break; case CallingConv::AMDGPU_Gfx: Out << "amdgpu_gfx"; break; + case CallingConv::AMDGPU_Gfx_WholeWave: + Out << "amdgpu_gfx_whole_wave"; + break; case CallingConv::M68k_RTD: Out << "m68k_rtdcc"; break; case CallingConv::RISCV_VectorCall: Out << "riscv_vector_cc"; @@ -2398,8 +2401,9 @@ static void writeDIFile(raw_ostream &Out, const DIFile *N, AsmWriterContext &) { // Print all values for checksum together, or not at all. if (N->getChecksum()) Printer.printChecksum(*N->getChecksum()); - Printer.printString("source", N->getSource().value_or(StringRef()), - /* ShouldSkipEmpty */ true); + if (N->getSource()) + Printer.printString("source", *N->getSource(), + /* ShouldSkipEmpty */ false); Out << ")"; } diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 86285a0..28ed1e5 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1310,6 +1310,18 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, return true; } break; + case 'l': + if (Name.starts_with("lifetime.start") || + Name.starts_with("lifetime.end")) { + // Unless remangling is required, do not upgrade the function declaration, + // but do upgrade the calls. + if (auto Result = llvm::Intrinsic::remangleIntrinsicFunction(F)) + NewFn = *Result; + else + NewFn = F; + return true; + } + break; case 'm': { // Updating the memory intrinsics (memcpy/memmove/memset) that have an // alignment parameter to embedding the alignment as an attribute of @@ -1629,7 +1641,6 @@ bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn, NewFn = nullptr; bool Upgraded = upgradeIntrinsicFunction1(F, NewFn, CanUpgradeDebugIntrinsicsToRecords); - assert(F != NewFn && "Intrinsic function upgraded to the same function"); // Upgrade intrinsic attributes. This does not change the function. 
if (NewFn) @@ -4570,6 +4581,9 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } const auto &DefaultCase = [&]() -> void { + if (F == NewFn) + return; + if (CI->getFunctionType() == NewFn->getFunctionType()) { // Handle generic mangling change. assert( @@ -5109,6 +5123,31 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { MTI->setSourceAlignment(Align->getMaybeAlignValue()); break; } + + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: { + Value *Size = CI->getArgOperand(0); + Value *Ptr = CI->getArgOperand(1); + if (isa<AllocaInst>(Ptr)) { + DefaultCase(); + return; + } + + // Try to strip pointer casts, such that the lifetime works on an alloca. + Ptr = Ptr->stripPointerCasts(); + if (isa<AllocaInst>(Ptr)) { + // Don't use NewFn, as we might have looked through an addrspacecast. + if (NewFn->getIntrinsicID() == Intrinsic::lifetime_start) + NewCall = Builder.CreateLifetimeStart(Ptr, cast<ConstantInt>(Size)); + else + NewCall = Builder.CreateLifetimeEnd(Ptr, cast<ConstantInt>(Size)); + break; + } + + // Otherwise remove the lifetime marker. + CI->eraseFromParent(); + return; + } } assert(NewCall && "Should have either set this variable or returned through " "the default case"); @@ -5131,7 +5170,8 @@ void llvm::UpgradeCallsToIntrinsic(Function *F) { UpgradeIntrinsicCall(CB, NewFn); // Remove old function, no longer used, from the module. - F->eraseFromParent(); + if (F != NewFn) + F->eraseFromParent(); } } diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 8fb33c3..ab8ecee 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -45,25 +45,6 @@ using namespace llvm; using namespace llvm::at; using namespace llvm::dwarf; -TinyPtrVector<DbgDeclareInst *> llvm::findDbgDeclares(Value *V) { - // This function is hot. Check whether the value has any metadata to avoid a - // DenseMap lookup. This check is a bitfield datamember lookup. - if (!V->isUsedByMetadata()) - return {}; - auto *L = ValueAsMetadata::getIfExists(V); - if (!L) - return {}; - auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L); - if (!MDV) - return {}; - - TinyPtrVector<DbgDeclareInst *> Declares; - for (User *U : MDV->users()) - if (auto *DDI = dyn_cast<DbgDeclareInst>(U)) - Declares.push_back(DDI); - - return Declares; -} TinyPtrVector<DbgVariableRecord *> llvm::findDVRDeclares(Value *V) { // This function is hot. Check whether the value has any metadata to avoid a // DenseMap lookup. This check is a bitfield datamember lookup. @@ -98,42 +79,31 @@ TinyPtrVector<DbgVariableRecord *> llvm::findDVRValues(Value *V) { return Values; } -template <typename IntrinsicT, bool DbgAssignAndValuesOnly> +template <bool DbgAssignAndValuesOnly> static void -findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V, - SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) { +findDbgIntrinsics(Value *V, + SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) { // This function is hot. Check whether the value has any metadata to avoid a // DenseMap lookup. if (!V->isUsedByMetadata()) return; - LLVMContext &Ctx = V->getContext(); // TODO: If this value appears multiple times in a DIArgList, we should still - // only add the owning DbgValueInst once; use this set to track ArgListUsers. + // only add the owning dbg.value once; use this set to track ArgListUsers. // This behaviour can be removed when we can automatically remove duplicates. 
// V will also appear twice in a dbg.assign if it's used in both the value + // and address components. - SmallPtrSet<IntrinsicT *, 4> EncounteredIntrinsics; SmallPtrSet<DbgVariableRecord *, 4> EncounteredDbgVariableRecords; - /// Append IntrinsicT users of MetadataAsValue(MD). - auto AppendUsers = [&Ctx, &EncounteredIntrinsics, - &EncounteredDbgVariableRecords, &Result, - DbgVariableRecords](Metadata *MD) { - if (auto *MDV = MetadataAsValue::getIfExists(Ctx, MD)) { - for (User *U : MDV->users()) - if (IntrinsicT *DVI = dyn_cast<IntrinsicT>(U)) - if (EncounteredIntrinsics.insert(DVI).second) - Result.push_back(DVI); - } - if (!DbgVariableRecords) - return; + /// Append users of MetadataAsValue(MD). + auto AppendUsers = [&EncounteredDbgVariableRecords, + &DbgVariableRecords](Metadata *MD) { // Get DbgVariableRecords that use this as a single value. if (LocalAsMetadata *L = dyn_cast<LocalAsMetadata>(MD)) { for (DbgVariableRecord *DVR : L->getAllDbgVariableRecordUsers()) { if (!DbgAssignAndValuesOnly || DVR->isDbgValue() || DVR->isDbgAssign()) if (EncounteredDbgVariableRecords.insert(DVR).second) - DbgVariableRecords->push_back(DVR); + DbgVariableRecords.push_back(DVR); } } }; @@ -142,29 +112,23 @@ findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V, AppendUsers(L); for (Metadata *AL : L->getAllArgListUsers()) { AppendUsers(AL); - if (!DbgVariableRecords) - continue; DIArgList *DI = cast<DIArgList>(AL); for (DbgVariableRecord *DVR : DI->getAllDbgVariableRecordUsers()) if (!DbgAssignAndValuesOnly || DVR->isDbgValue() || DVR->isDbgAssign()) if (EncounteredDbgVariableRecords.insert(DVR).second) - DbgVariableRecords->push_back(DVR); + DbgVariableRecords.push_back(DVR); } } } void llvm::findDbgValues( - SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V, - SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) { - findDbgIntrinsics<DbgValueInst, /*DbgAssignAndValuesOnly=*/true>( - DbgValues, V, DbgVariableRecords); + Value *V, SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) { + findDbgIntrinsics</*DbgAssignAndValuesOnly=*/true>(V, DbgVariableRecords); } void llvm::findDbgUsers( - SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers, Value *V, - SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) { - findDbgIntrinsics<DbgVariableIntrinsic, /*DbgAssignAndValuesOnly=*/false>( - DbgUsers, V, DbgVariableRecords); + Value *V, SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) { + findDbgIntrinsics</*DbgAssignAndValuesOnly=*/false>(V, DbgVariableRecords); } DISubprogram *llvm::getDISubprogram(const MDNode *Scope) { @@ -173,18 +137,6 @@ DISubprogram *llvm::getDISubprogram(const MDNode *Scope) { return nullptr; } -DebugLoc llvm::getDebugValueLoc(DbgVariableIntrinsic *DII) { - // Original dbg.declare must have a location. - const DebugLoc &DeclareLoc = DII->getDebugLoc(); - MDNode *Scope = DeclareLoc.getScope(); - DILocation *InlinedAt = DeclareLoc.getInlinedAt(); - // Because no machine insts can come from debug intrinsics, only the scope - // and inlinedAt is significant. Zero line numbers are used in case this - // DebugLoc leaks into any adjacent instructions. Produce an unknown location - // with the correct scope / inlinedAt fields. - return DILocation::get(DII->getContext(), 0, 0, Scope, InlinedAt); -} - DebugLoc llvm::getDebugValueLoc(DbgVariableRecord *DVR) { // Original dbg.declare must have a location.
const DebugLoc &DeclareLoc = DVR->getDebugLoc(); @@ -852,19 +804,6 @@ void DebugTypeInfoRemoval::traverse(MDNode *N) { bool llvm::stripNonLineTableDebugInfo(Module &M) { bool Changed = false; - // First off, delete the debug intrinsics. - auto RemoveUses = [&](StringRef Name) { - if (auto *DbgVal = M.getFunction(Name)) { - while (!DbgVal->use_empty()) - cast<Instruction>(DbgVal->user_back())->eraseFromParent(); - DbgVal->eraseFromParent(); - Changed = true; - } - }; - RemoveUses("llvm.dbg.declare"); - RemoveUses("llvm.dbg.label"); - RemoveUses("llvm.dbg.value"); - // Delete non-CU debug info named metadata nodes. for (auto NMI = M.named_metadata_begin(), NME = M.named_metadata_end(); NMI != NME;) { diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 7a03663..fc06745 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -1232,6 +1232,7 @@ bool llvm::CallingConv::supportsNonVoidReturnType(CallingConv::ID CC) { case CallingConv::AArch64_SVE_VectorCall: case CallingConv::WASM_EmscriptenInvoke: case CallingConv::AMDGPU_Gfx: + case CallingConv::AMDGPU_Gfx_WholeWave: case CallingConv::M68k_INTR: case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0: case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2: diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index f0448b0..0dbd07f 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -1303,6 +1303,24 @@ static void addRange(SmallVectorImpl<ConstantInt *> &EndPoints, EndPoints.push_back(High); } +MDNode *MDNode::getMergedCalleeTypeMetadata(const MDNode *A, const MDNode *B) { + // Drop the callee_type metadata if either of the call instructions do not + // have it. + if (!A || !B) + return nullptr; + SmallVector<Metadata *, 8> AB; + SmallPtrSet<Metadata *, 8> MergedCallees; + auto AddUniqueCallees = [&AB, &MergedCallees](const MDNode *N) { + for (Metadata *MD : N->operands()) { + if (MergedCallees.insert(MD).second) + AB.push_back(MD); + } + }; + AddUniqueCallees(A); + AddUniqueCallees(B); + return MDNode::get(A->getContext(), AB); +} + MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) { // Given two ranges, we want to compute the union of the ranges. 
This // is slightly complicated by having to combine the intervals and merge diff --git a/llvm/lib/IR/PassInstrumentation.cpp b/llvm/lib/IR/PassInstrumentation.cpp index 94ad124..70bbe8f 100644 --- a/llvm/lib/IR/PassInstrumentation.cpp +++ b/llvm/lib/IR/PassInstrumentation.cpp @@ -23,6 +23,7 @@ template struct LLVM_EXPORT_TEMPLATE Any::TypeId<const Loop *>; void PassInstrumentationCallbacks::addClassToPassName(StringRef ClassName, StringRef PassName) { + assert(!PassName.empty() && "PassName can't be empty!"); ClassToPassName.try_emplace(ClassName, PassName.str()); } @@ -33,7 +34,10 @@ PassInstrumentationCallbacks::getPassNameForClassName(StringRef ClassName) { Fn(); ClassToPassNameCallbacks.clear(); } - return ClassToPassName[ClassName]; + auto PassNameIter = ClassToPassName.find(ClassName); + if (PassNameIter != ClassToPassName.end()) + return PassNameIter->second; + return {}; } AnalysisKey PassInstrumentationAnalysis::Key; diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 02c16e2..5928c89 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -582,16 +582,11 @@ void Value::replaceUsesWithIf(Value *New, } } -/// Replace llvm.dbg.* uses of MetadataAsValue(ValueAsMetadata(V)) outside BB +/// Replace debug record uses of MetadataAsValue(ValueAsMetadata(V)) outside BB /// with New. static void replaceDbgUsesOutsideBlock(Value *V, Value *New, BasicBlock *BB) { - SmallVector<DbgVariableIntrinsic *> DbgUsers; SmallVector<DbgVariableRecord *> DPUsers; - findDbgUsers(DbgUsers, V, &DPUsers); - for (auto *DVI : DbgUsers) { - if (DVI->getParent() != BB) - DVI->replaceVariableLocationOp(V, New); - } + findDbgUsers(V, DPUsers); for (auto *DVR : DPUsers) { DbgMarker *Marker = DVR->getMarker(); if (Marker->getParent() != BB) diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 8c8ed3c..3ff9895 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -531,6 +531,7 @@ private: void visitCallStackMetadata(MDNode *MD); void visitMemProfMetadata(Instruction &I, MDNode *MD); void visitCallsiteMetadata(Instruction &I, MDNode *MD); + void visitCalleeTypeMetadata(Instruction &I, MDNode *MD); void visitDIAssignIDMetadata(Instruction &I, MDNode *MD); void visitMMRAMetadata(Instruction &I, MDNode *MD); void visitAnnotationMetadata(MDNode *Annotation); @@ -2978,6 +2979,16 @@ void Verifier::visitFunction(const Function &F) { "perfect forwarding!", &F); break; + case CallingConv::AMDGPU_Gfx_WholeWave: + Check(!F.arg_empty() && F.arg_begin()->getType()->isIntegerTy(1), + "Calling convention requires first argument to be i1", &F); + Check(!F.arg_begin()->hasInRegAttr(), + "Calling convention requires first argument to not be inreg", &F); + Check(!F.isVarArg(), + "Calling convention does not support varargs or " + "perfect forwarding!", + &F); + break; } // Check that the argument values match the function type for this function... 
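Note: the PassInstrumentationCallbacks::getPassNameForClassName hunk above replaces a map subscript with an explicit find. Subscripting inserts a default-constructed entry for a missing key, so a pure lookup would silently grow and mutate the class-to-pass-name table; find leaves it untouched and lets the caller return an empty name. A small self-contained illustration of the difference, using plain std::map rather than the actual container type:

#include <iostream>
#include <map>
#include <string>

int main() {
  std::map<std::string, std::string> ClassToPassName{{"FooPass", "foo"}};

  // operator[] on a missing key inserts an empty mapping as a side effect.
  std::string ViaSubscript = ClassToPassName["UnknownPass"];
  std::cout << "operator[] returned \"" << ViaSubscript << "\", table now has "
            << ClassToPassName.size() << " entries\n";

  // find() is a pure lookup; the caller decides what to return when absent.
  auto It = ClassToPassName.find("AnotherUnknownPass");
  std::string ViaFind = It != ClassToPassName.end() ? It->second : std::string();
  std::cout << "find() returned \"" << ViaFind << "\", table still has "
            << ClassToPassName.size() << " entries\n";
  return 0;
}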
@@ -5193,6 +5204,33 @@ void Verifier::visitCallsiteMetadata(Instruction &I, MDNode *MD) { visitCallStackMetadata(MD); } +static inline bool isConstantIntMetadataOperand(const Metadata *MD) { + if (auto *VAL = dyn_cast<ValueAsMetadata>(MD)) + return isa<ConstantInt>(VAL->getValue()); + return false; +} + +void Verifier::visitCalleeTypeMetadata(Instruction &I, MDNode *MD) { + Check(isa<CallBase>(I), "!callee_type metadata should only exist on calls", + &I); + for (Metadata *Op : MD->operands()) { + Check(isa<MDNode>(Op), + "The callee_type metadata must be a list of type metadata nodes", Op); + auto *TypeMD = cast<MDNode>(Op); + Check(TypeMD->getNumOperands() == 2, + "Well-formed generalized type metadata must contain exactly two " + "operands", + Op); + Check(isConstantIntMetadataOperand(TypeMD->getOperand(0)) && + mdconst::extract<ConstantInt>(TypeMD->getOperand(0))->isZero(), + "The first operand of type metadata for functions must be zero", Op); + Check(TypeMD->hasGeneralizedMDString(), + "Only generalized type metadata can be part of the callee_type " + "metadata list", + Op); + } +} + void Verifier::visitAnnotationMetadata(MDNode *Annotation) { Check(isa<MDTuple>(Annotation), "annotation must be a tuple"); Check(Annotation->getNumOperands() >= 1, @@ -5470,6 +5508,9 @@ void Verifier::visitInstruction(Instruction &I) { if (MDNode *MD = I.getMetadata(LLVMContext::MD_callsite)) visitCallsiteMetadata(I, MD); + if (MDNode *MD = I.getMetadata(LLVMContext::MD_callee_type)) + visitCalleeTypeMetadata(I, MD); + if (MDNode *MD = I.getMetadata(LLVMContext::MD_DIAssignID)) visitDIAssignIDMetadata(I, MD); @@ -6627,6 +6668,54 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { "invalid vector type for format", &Call, Src1, Call.getArgOperand(5)); break; } + case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: { + Value *Src0 = Call.getArgOperand(1); + Value *Src1 = Call.getArgOperand(3); + + unsigned FmtA = cast<ConstantInt>(Call.getArgOperand(0))->getZExtValue(); + unsigned FmtB = cast<ConstantInt>(Call.getArgOperand(2))->getZExtValue(); + Check(FmtA <= 4, "invalid value for matrix format", Call, + Call.getArgOperand(0)); + Check(FmtB <= 4, "invalid value for matrix format", Call, + Call.getArgOperand(2)); + + // AMDGPU::MatrixFMT values + auto getFormatNumRegs = [](unsigned FormatVal) { + switch (FormatVal) { + case 0: + case 1: + return 16u; + case 2: + case 3: + return 12u; + case 4: + return 8u; + default: + llvm_unreachable("invalid format value"); + } + }; + + auto isValidSrcASrcBVector = [](FixedVectorType *Ty) { + if (!Ty || !Ty->getElementType()->isIntegerTy(32)) + return false; + unsigned NumElts = Ty->getNumElements(); + return NumElts == 16 || NumElts == 12 || NumElts == 8; + }; + + auto *Src0Ty = dyn_cast<FixedVectorType>(Src0->getType()); + auto *Src1Ty = dyn_cast<FixedVectorType>(Src1->getType()); + Check(isValidSrcASrcBVector(Src0Ty), + "operand 1 must be 8, 12 or 16 element i32 vector", &Call, Src0); + Check(isValidSrcASrcBVector(Src1Ty), + "operand 3 must be 8, 12 or 16 element i32 vector", &Call, Src1); + + // Permit excess registers for the format. 
+ Check(Src0Ty->getNumElements() >= getFormatNumRegs(FmtA), + "invalid vector type for format", &Call, Src0, Call.getArgOperand(0)); + Check(Src1Ty->getNumElements() >= getFormatNumRegs(FmtB), + "invalid vector type for format", &Call, Src1, Call.getArgOperand(2)); + break; + } case Intrinsic::nvvm_setmaxnreg_inc_sync_aligned_u32: case Intrinsic::nvvm_setmaxnreg_dec_sync_aligned_u32: { Value *V = Call.getArgOperand(0); @@ -6679,6 +6768,11 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { "llvm.threadlocal.address operand isThreadLocal() must be true"); break; } + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + Check(isa<AllocaInst>(Call.getArgOperand(1)), + "llvm.lifetime.start/end can only be used on alloca", &Call); + break; }; // Verify that there aren't any unmediated control transfers between funclets. diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 3e96bdf..2b56e2a 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -196,12 +196,12 @@ uint64_t MCAssembler::computeFragmentSize(const MCFragment &F) const { switch (F.getKind()) { case MCFragment::FT_Data: case MCFragment::FT_Relaxable: + case MCFragment::FT_Align: case MCFragment::FT_LEB: case MCFragment::FT_Dwarf: case MCFragment::FT_DwarfFrame: case MCFragment::FT_CVInlineLines: case MCFragment::FT_CVDefRange: - case MCFragment::FT_PseudoProbe: return F.getSize(); case MCFragment::FT_Fill: { auto &FF = cast<MCFillFragment>(F); @@ -227,28 +227,6 @@ uint64_t MCAssembler::computeFragmentSize(const MCFragment &F) const { case MCFragment::FT_SymbolId: return 4; - case MCFragment::FT_Align: { - const MCAlignFragment &AF = cast<MCAlignFragment>(F); - unsigned Offset = getFragmentOffset(AF); - unsigned Size = offsetToAlignment(Offset, AF.getAlignment()); - - // Insert extra Nops for code alignment if the target define - // shouldInsertExtraNopBytesForCodeAlign target hook. - if (AF.getParent()->useCodeAlign() && AF.hasEmitNops() && - getBackend().shouldInsertExtraNopBytesForCodeAlign(AF, Size)) - return Size; - - // If we are padding with nops, force the padding to be larger than the - // minimum nop size. - if (Size > 0 && AF.hasEmitNops()) { - while (Size % getBackend().getMinimumNopSize()) - Size += AF.getAlignment().value(); - } - if (Size > AF.getMaxBytesToEmit()) - return 0; - return Size; - } - case MCFragment::FT_Org: { const MCOrgFragment &OF = cast<MCOrgFragment>(F); MCValue Value; @@ -384,7 +362,7 @@ uint64_t MCAssembler::getSectionAddressSize(const MCSection &Sec) const { uint64_t MCAssembler::getSectionFileSize(const MCSection &Sec) const { // Virtual sections have no file size. 
- if (Sec.isVirtualSection()) + if (Sec.isBssSection()) return 0; return getSectionAddressSize(Sec); } @@ -424,8 +402,7 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm, case MCFragment::FT_Dwarf: case MCFragment::FT_DwarfFrame: case MCFragment::FT_CVInlineLines: - case MCFragment::FT_CVDefRange: - case MCFragment::FT_PseudoProbe: { + case MCFragment::FT_CVDefRange: { if (F.getKind() == MCFragment::FT_Data) ++stats::EmittedDataFragments; else if (F.getKind() == MCFragment::FT_Relaxable) @@ -433,48 +410,45 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm, const auto &EF = cast<MCFragment>(F); OS << StringRef(EF.getContents().data(), EF.getContents().size()); OS << StringRef(EF.getVarContents().data(), EF.getVarContents().size()); - break; - } + } break; + case MCFragment::FT_Align: { ++stats::EmittedAlignFragments; - const MCAlignFragment &AF = cast<MCAlignFragment>(F); - assert(AF.getFillLen() && "Invalid virtual align in concrete fragment!"); + OS << StringRef(F.getContents().data(), F.getContents().size()); + assert(F.getAlignFillLen() && + "Invalid virtual align in concrete fragment!"); - uint64_t Count = FragmentSize / AF.getFillLen(); - assert(FragmentSize % AF.getFillLen() == 0 && + uint64_t Count = (FragmentSize - F.getFixedSize()) / F.getAlignFillLen(); + assert((FragmentSize - F.getFixedSize()) % F.getAlignFillLen() == 0 && "computeFragmentSize computed size is incorrect"); - // See if we are aligning with nops, and if so do that first to try to fill - // the Count bytes. Then if that did not fill any bytes or there are any - // bytes left to fill use the Value and ValueSize to fill the rest. - // If we are aligning with nops, ask that target to emit the right data. - if (AF.hasEmitNops()) { - if (!Asm.getBackend().writeNopData(OS, Count, AF.getSubtargetInfo())) - report_fatal_error("unable to write nop sequence of " + - Twine(Count) + " bytes"); - break; - } - - // Otherwise, write out in multiples of the value size. - for (uint64_t i = 0; i != Count; ++i) { - switch (AF.getFillLen()) { - default: llvm_unreachable("Invalid size!"); - case 1: - OS << char(AF.getFill()); - break; - case 2: - support::endian::write<uint16_t>(OS, AF.getFill(), Endian); - break; - case 4: - support::endian::write<uint32_t>(OS, AF.getFill(), Endian); - break; - case 8: - support::endian::write<uint64_t>(OS, AF.getFill(), Endian); - break; + // In the nops mode, call the backend hook to write `Count` nops. + if (F.hasAlignEmitNops()) { + if (!Asm.getBackend().writeNopData(OS, Count, F.getSubtargetInfo())) + reportFatalInternalError("unable to write nop sequence of " + + Twine(Count) + " bytes"); + } else { + // Otherwise, write out in multiples of the value size. + for (uint64_t i = 0; i != Count; ++i) { + switch (F.getAlignFillLen()) { + default: + llvm_unreachable("Invalid size!"); + case 1: + OS << char(F.getAlignFill()); + break; + case 2: + support::endian::write<uint16_t>(OS, F.getAlignFill(), Endian); + break; + case 4: + support::endian::write<uint32_t>(OS, F.getAlignFill(), Endian); + break; + case 8: + support::endian::write<uint64_t>(OS, F.getAlignFill(), Endian); + break; + } } } - break; - } + } break; case MCFragment::FT_Fill: { ++stats::EmittedFillFragments; @@ -585,42 +559,45 @@ void MCAssembler::writeSectionData(raw_ostream &OS, const MCSection *Sec) const { assert(getBackendPtr() && "Expected assembler backend"); - // Ignore virtual sections. 
- if (Sec->isVirtualSection()) { + if (Sec->isBssSection()) { assert(getSectionFileSize(*Sec) == 0 && "Invalid size for section!"); - // Check that contents are only things legal inside a virtual section. + // Ensure no fixups or non-zero bytes are written to BSS sections, catching + // errors in both input assembly code and MCStreamer API usage. Location is + // not tracked for efficiency. + auto Fn = [](char c) { return c != 0; }; for (const MCFragment &F : *Sec) { + bool HasNonZero = false; switch (F.getKind()) { - default: llvm_unreachable("Invalid fragment in virtual section!"); - case MCFragment::FT_Data: { - // Check that we aren't trying to write a non-zero contents (or fixups) - // into a virtual section. This is to support clients which use standard - // directives to fill the contents of virtual sections. - if (F.getFixups().size() || F.getVarFixups().size()) - reportError(SMLoc(), Sec->getVirtualSectionKind() + " section '" + - Sec->getName() + "' cannot have fixups"); - for (char C : F.getContents()) - if (C) { - reportError(SMLoc(), Sec->getVirtualSectionKind() + " section '" + - Sec->getName() + - "' cannot have non-zero initializers"); - break; - } + default: + reportFatalInternalError("BSS section '" + Sec->getName() + + "' contains invalid fragment"); + break; + case MCFragment::FT_Data: + case MCFragment::FT_Relaxable: + HasNonZero = + any_of(F.getContents(), Fn) || any_of(F.getVarContents(), Fn); break; - } case MCFragment::FT_Align: - // Check that we aren't trying to write a non-zero value into a virtual - // section. - assert((cast<MCAlignFragment>(F).getFillLen() == 0 || - cast<MCAlignFragment>(F).getFill() == 0) && - "Invalid align in virtual section!"); + // Disallowed for API usage. AsmParser changes non-zero fill values to + // 0. + assert(F.getAlignFill() == 0 && "Invalid align in virtual section!"); break; case MCFragment::FT_Fill: - assert((cast<MCFillFragment>(F).getValue() == 0) && - "Invalid fill in virtual section!"); + HasNonZero = cast<MCFillFragment>(F).getValue() != 0; break; case MCFragment::FT_Org: + HasNonZero = cast<MCOrgFragment>(F).getValue() != 0; + break; + } + if (HasNonZero) { + reportError(SMLoc(), "BSS section '" + Sec->getName() + + "' cannot have non-zero bytes"); + break; + } + if (F.getFixups().size() || F.getVarFixups().size()) { + reportError(SMLoc(), + "BSS section '" + Sec->getName() + "' cannot have fixups"); break; } } @@ -724,34 +701,25 @@ void MCAssembler::layout() { for (MCSection &Sec : *this) { for (MCFragment &F : Sec) { // Process fragments with fixups here. - if (F.isEncoded()) { - auto Contents = F.getContents(); - for (MCFixup &Fixup : F.getFixups()) { + auto Contents = F.getContents(); + for (MCFixup &Fixup : F.getFixups()) { + uint64_t FixedValue; + MCValue Target; + evaluateFixup(F, Fixup, Target, FixedValue, + /*RecordReloc=*/true, Contents); + } + if (F.getVarFixups().size()) { + // In the variable part, fixup offsets are relative to the fixed part's + // start. Extend the variable contents to the left to account for the + // fixed part size. + Contents = MutableArrayRef(F.getParent()->ContentStorage) + .slice(F.VarContentStart - Contents.size(), F.getSize()); + for (MCFixup &Fixup : F.getVarFixups()) { uint64_t FixedValue; MCValue Target; evaluateFixup(F, Fixup, Target, FixedValue, /*RecordReloc=*/true, Contents); } - // In the variable part, fixup offsets are relative to the fixed part's - // start. Extend the variable contents to the left to account for the - // fixed part size. 
- auto VarFixups = F.getVarFixups(); - if (VarFixups.size()) { - Contents = - MutableArrayRef(F.getParent()->ContentStorage) - .slice(F.VarContentStart - Contents.size(), F.getSize()); - for (MCFixup &Fixup : VarFixups) { - uint64_t FixedValue; - MCValue Target; - evaluateFixup(F, Fixup, Target, FixedValue, - /*RecordReloc=*/true, Contents); - } - } - } else if (auto *AF = dyn_cast<MCAlignFragment>(&F)) { - // For RISC-V linker relaxation, an alignment relocation might be - // needed. - if (AF->hasEmitNops()) - getBackend().shouldInsertFixupForCodeAlign(*this, *AF); } } } @@ -955,15 +923,15 @@ bool MCAssembler::relaxDwarfCallFrameFragment(MCFragment &F) { } bool MCAssembler::relaxCVInlineLineTable(MCCVInlineLineTableFragment &F) { - unsigned OldSize = F.getContents().size(); + unsigned OldSize = F.getVarContents().size(); getContext().getCVContext().encodeInlineLineTable(*this, F); - return OldSize != F.getContents().size(); + return OldSize != F.getVarContents().size(); } bool MCAssembler::relaxCVDefRange(MCCVDefRangeFragment &F) { - unsigned OldSize = F.getContents().size(); + unsigned OldSize = F.getVarContents().size(); getContext().getCVContext().encodeDefRange(*this, F); - return OldSize != F.getContents().size(); + return OldSize != F.getVarContents().size(); } bool MCAssembler::relaxFill(MCFillFragment &F) { @@ -974,22 +942,6 @@ bool MCAssembler::relaxFill(MCFillFragment &F) { return true; } -bool MCAssembler::relaxPseudoProbeAddr(MCPseudoProbeAddrFragment &PF) { - uint64_t OldSize = PF.getContents().size(); - int64_t AddrDelta; - bool Abs = PF.getAddrDelta().evaluateKnownAbsolute(AddrDelta, *this); - assert(Abs && "We created a pseudo probe with an invalid expression"); - (void)Abs; - SmallVector<char, 8> Data; - raw_svector_ostream OSE(Data); - - // AddrDelta is a signed integer - encodeSLEB128(AddrDelta, OSE, OldSize); - PF.setContents(Data); - PF.clearFixups(); - return OldSize != Data.size(); -} - bool MCAssembler::relaxFragment(MCFragment &F) { switch(F.getKind()) { default: @@ -1011,8 +963,6 @@ bool MCAssembler::relaxFragment(MCFragment &F) { return relaxCVDefRange(cast<MCCVDefRangeFragment>(F)); case MCFragment::FT_Fill: return relaxFill(cast<MCFillFragment>(F)); - case MCFragment::FT_PseudoProbe: - return relaxPseudoProbeAddr(cast<MCPseudoProbeAddrFragment>(F)); } } @@ -1020,7 +970,32 @@ void MCAssembler::layoutSection(MCSection &Sec) { uint64_t Offset = 0; for (MCFragment &F : Sec) { F.Offset = Offset; - Offset += computeFragmentSize(F); + if (F.getKind() == MCFragment::FT_Align) { + Offset += F.getFixedSize(); + unsigned Size = offsetToAlignment(Offset, F.getAlignment()); + // In the nops mode, RISC-V style linker relaxation might adjust the size + // and add a fixup, even if `Size` is originally 0. + bool AlignFixup = false; + if (F.hasAlignEmitNops()) { + AlignFixup = getBackend().relaxAlign(F, Size); + // If the backend does not handle the fragment specially, pad with nops, + // but ensure that the padding is larger than the minimum nop size. + if (!AlignFixup) + while (Size % getBackend().getMinimumNopSize()) + Size += F.getAlignment().value(); + } + if (!AlignFixup && Size > F.getAlignMaxBytesToEmit()) + Size = 0; + // Update the variable tail size, offset by FixedSize to prevent ubsan + // pointer-overflow in evaluateFixup. The content is ignored. 
+ F.VarContentStart = F.getFixedSize(); + F.VarContentEnd = F.VarContentStart + Size; + if (F.VarContentEnd > F.getParent()->ContentStorage.size()) + F.getParent()->ContentStorage.resize(F.VarContentEnd); + Offset += Size; + } else { + Offset += computeFragmentSize(F); + } } } diff --git a/llvm/lib/MC/MCCodeView.cpp b/llvm/lib/MC/MCCodeView.cpp index 1f98251..7d528a5 100644 --- a/llvm/lib/MC/MCCodeView.cpp +++ b/llvm/lib/MC/MCCodeView.cpp @@ -26,8 +26,10 @@ using namespace llvm; using namespace llvm::codeview; void CodeViewContext::finish() { - if (StrTabFragment) - StrTabFragment->setContents(StrTab); + if (!StrTabFragment) + return; + assert(StrTabFragment->getKind() == MCFragment::FT_Data); + StrTabFragment->setVarContents(StrTab); } /// This is a valid number for use with .cv_loc if we've already seen a .cv_file @@ -166,8 +168,9 @@ void CodeViewContext::emitStringTable(MCObjectStreamer &OS) { // somewhere else. If somebody wants two string tables in their .s file, one // will just be empty. if (!StrTabFragment) { - StrTabFragment = Ctx.allocFragment<MCFragment>(); - OS.insert(StrTabFragment); + OS.newFragment(); + StrTabFragment = OS.getCurrentFragment(); + OS.newFragment(); } OS.emitValueToAlignment(Align(4), 0); @@ -603,7 +606,7 @@ void CodeViewContext::encodeInlineLineTable(const MCAssembler &Asm, compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeLength, Buffer); compressAnnotation(std::min(EndSymLength, LocAfterLength), Buffer); - Frag.setContents(Buffer); + Frag.setVarContents(Buffer); } void CodeViewContext::encodeDefRange(const MCAssembler &Asm, @@ -691,6 +694,6 @@ void CodeViewContext::encodeDefRange(const MCAssembler &Asm, } } - Frag.setContents(Contents); - Frag.setFixups(Fixups); + Frag.setVarContents(Contents); + Frag.setVarFixups(Fixups); } diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp index b1dced7..e7c0d37 100644 --- a/llvm/lib/MC/MCDwarf.cpp +++ b/llvm/lib/MC/MCDwarf.cpp @@ -447,10 +447,17 @@ static void emitOneV5FileEntry(MCStreamer *MCOS, const MCDwarfFile &DwarfFile, StringRef(reinterpret_cast<const char *>(Cksum.data()), Cksum.size())); } if (HasAnySource) { + // From https://dwarfstd.org/issues/180201.1.html + // * The value is an empty null-terminated string if no source is available + StringRef Source = DwarfFile.Source.value_or(StringRef()); + // * If the source is available but is an empty file then the value is a + // null-terminated single "\n". + if (DwarfFile.Source && DwarfFile.Source->empty()) + Source = "\n"; if (LineStr) - LineStr->emitRef(MCOS, DwarfFile.Source.value_or(StringRef())); + LineStr->emitRef(MCOS, Source); else { - MCOS->emitBytes(DwarfFile.Source.value_or(StringRef())); // Source and... + MCOS->emitBytes(Source); // Source and... MCOS->emitBytes(StringRef("\0", 1)); // its null terminator. 
} } diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp index 49071bd..b8cbaea5 100644 --- a/llvm/lib/MC/MCELFStreamer.cpp +++ b/llvm/lib/MC/MCELFStreamer.cpp @@ -88,7 +88,7 @@ void MCELFStreamer::changeSection(MCSection *Section, uint32_t Subsection) { if (SectionELF->getFlags() & ELF::SHF_GNU_RETAIN) getWriter().markGnuAbi(); - changeSectionImpl(Section, Subsection); + MCObjectStreamer::changeSection(Section, Subsection); Asm.registerSymbol(*Section->getBeginSymbol()); } diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index 22dff49..dbb2fd1 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -370,7 +370,6 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, } int64_t Num; - unsigned Count; if (DF) { Displacement += DF->getContents().size(); } else if (F->getKind() == MCFragment::FT_Relaxable && @@ -379,11 +378,9 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, // After layout, during relocation generation, it can be treated as a // data fragment. Displacement += F->getSize(); - } else if (auto *AF = dyn_cast<MCAlignFragment>(F); - AF && Layout && AF->hasEmitNops() && - !Asm->getBackend().shouldInsertExtraNopBytesForCodeAlign( - *AF, Count)) { - Displacement += Asm->computeFragmentSize(*AF); + } else if (F->getKind() == MCFragment::FT_Align && Layout && + F->isLinkerRelaxable()) { + Displacement += Asm->computeFragmentSize(*F); } else if (auto *FF = dyn_cast<MCFillFragment>(F); FF && FF->getNumValues().evaluateAsAbsolute(Num)) { Displacement += Num * FF->getValueSize(); diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp index bfe045a..3c395e5 100644 --- a/llvm/lib/MC/MCFragment.cpp +++ b/llvm/lib/MC/MCFragment.cpp @@ -58,7 +58,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { case MCFragment::FT_SymbolId: OS << "SymbolId"; break; case MCFragment::FT_CVInlineLines: OS << "CVInlineLineTable"; break; case MCFragment::FT_CVDefRange: OS << "CVDefRangeTable"; break; - case MCFragment::FT_PseudoProbe: OS << "PseudoProbe"; break; // clang-format on } @@ -73,17 +72,9 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { }; switch (getKind()) { - case MCFragment::FT_Align: { - const auto *AF = cast<MCAlignFragment>(this); - OS << " Align:" << AF->getAlignment().value() << " Fill:" << AF->getFill() - << " FillLen:" << unsigned(AF->getFillLen()) - << " MaxBytesToEmit:" << AF->getMaxBytesToEmit(); - if (AF->hasEmitNops()) - OS << " Nops"; - break; - } case MCFragment::FT_Data: case MCFragment::FT_Relaxable: + case MCFragment::FT_Align: case MCFragment::FT_LEB: case MCFragment::FT_Dwarf: case MCFragment::FT_DwarfFrame: { @@ -92,8 +83,13 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { auto Fixed = getContents(); auto Var = getVarContents(); OS << " Size:" << Fixed.size(); - if (getKind() != MCFragment::FT_Data) + if (getKind() != MCFragment::FT_Data) { OS << '+' << Var.size(); + // FT_Align uses getVarContents to track the size, but the content is + // ignored and not useful. 
+ if (getKind() == MCFragment::FT_Align) + Var = {}; + } OS << " ["; for (unsigned i = 0, e = Fixed.size(); i != e; ++i) { if (i) OS << ","; @@ -112,6 +108,13 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { OS << ' '; getInst().dump_pretty(OS); break; + case MCFragment::FT_Align: + OS << "\n Align:" << getAlignment().value() << " Fill:" << getAlignFill() + << " FillLen:" << unsigned(getAlignFillLen()) + << " MaxBytesToEmit:" << getAlignMaxBytesToEmit(); + if (hasAlignEmitNops()) + OS << " Nops"; + break; case MCFragment::FT_LEB: { OS << " Value:"; getLEBValue().print(OS, nullptr); @@ -182,12 +185,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { } break; } - case MCFragment::FT_PseudoProbe: { - const auto *OF = cast<MCPseudoProbeAddrFragment>(this); - OS << " AddrDelta:"; - OF->getAddrDelta().print(OS, nullptr); - break; - } } } #endif diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp index 43598ef..7560399 100644 --- a/llvm/lib/MC/MCMachOStreamer.cpp +++ b/llvm/lib/MC/MCMachOStreamer.cpp @@ -132,8 +132,7 @@ public: } // end anonymous namespace. void MCMachOStreamer::changeSection(MCSection *Section, uint32_t Subsection) { - // Change the section normally. - changeSectionImpl(Section, Subsection); + MCObjectStreamer::changeSection(Section, Subsection); // Output a linker-local symbol so we don't need section-relative local // relocations. The linker hates us when we do that. @@ -161,7 +160,7 @@ void MCMachOStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { // We have to create a new fragment if this is an atom defining symbol, // fragments cannot span atoms. if (cast<MCSymbolMachO>(Symbol)->isSymbolLinkerVisible()) - insert(getContext().allocFragment<MCFragment>()); + newFragment(); MCObjectStreamer::emitLabel(Symbol, Loc); @@ -393,7 +392,7 @@ void MCMachOStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol, // On darwin all virtual sections have zerofill type. Disallow the usage of // .zerofill in non-virtual functions. If something similar is needed, use // .space or .zero. - if (!Section->isVirtualSection()) { + if (!Section->isBssSection()) { getContext().reportError( Loc, "The usage of .zerofill is restricted to sections of " "ZEROFILL type. 
Use .zero or .space instead."); diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index d5b8f22..42f4cf4 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -33,6 +33,7 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context, Context, std::move(TAB), std::move(Emitter), std::move(OW))), EmitEHFrame(true), EmitDebugFrame(false) { assert(Assembler->getBackendPtr() && Assembler->getEmitterPtr()); + IsObj = true; setAllowAutoPadding(Assembler->getBackend().allowAutoPadding()); if (Context.getTargetOptions() && Context.getTargetOptions()->MCRelaxAll) Assembler->setRelaxAll(true); @@ -46,6 +47,25 @@ MCAssembler *MCObjectStreamer::getAssemblerPtr() { return nullptr; } +void MCObjectStreamer::newFragment() { + addFragment(getContext().allocFragment<MCFragment>()); +} + +void MCObjectStreamer::insert(MCFragment *F) { + assert(F->getKind() != MCFragment::FT_Data && + "F should have a variable-size tail"); + addFragment(F); + newFragment(); +} + +void MCObjectStreamer::appendContents(size_t Num, char Elt) { + CurFrag->appendContents(Num, Elt); +} + +void MCObjectStreamer::addFixup(const MCExpr *Value, MCFixupKind Kind) { + CurFrag->addFixup(MCFixup::create(CurFrag->getFixedSize(), Value, Kind)); +} + // As a compile-time optimization, avoid allocating and evaluating an MCExpr // tree for (Hi - Lo) when Hi and Lo are offsets into the same fragment's fixed // part. @@ -106,18 +126,6 @@ void MCObjectStreamer::emitFrames(MCAsmBackend *MAB) { MCDwarfFrameEmitter::Emit(*this, MAB, false); } -MCFragment *MCObjectStreamer::getOrCreateDataFragment() { - // TODO: Start a new fragment whenever finalizing the variable-size tail of a - // previous one, so that all getOrCreateDataFragment calls can be replaced - // with getCurrentFragment - auto *F = getCurrentFragment(); - if (F->getKind() != MCFragment::FT_Data) { - F = getContext().allocFragment<MCFragment>(); - insert(F); - } - return F; -} - void MCObjectStreamer::visitUsedSymbol(const MCSymbol &Sym) { Assembler->registerSymbol(Sym); } @@ -131,7 +139,7 @@ void MCObjectStreamer::emitCFISections(bool EH, bool Debug) { void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) { MCStreamer::emitValueImpl(Value, Size, Loc); - MCFragment *DF = getOrCreateDataFragment(); + MCFragment *DF = getCurrentFragment(); MCDwarfLineEntry::make(this, getCurrentSectionOnly()); @@ -180,7 +188,7 @@ void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { // If there is a current fragment, mark the symbol as pointing into it. // Otherwise queue the label and set its fragment pointer when we emit the // next fragment. 
- MCFragment *F = getOrCreateDataFragment(); + MCFragment *F = getCurrentFragment(); Symbol->setFragment(F); Symbol->setOffset(F->getContents().size()); @@ -214,10 +222,9 @@ void MCObjectStreamer::emitULEB128Value(const MCExpr *Value) { emitULEB128IntValue(IntValue); return; } - auto *F = getOrCreateDataFragment(); - F->Kind = MCFragment::FT_LEB; - F->setLEBSigned(false); - F->setLEBValue(Value); + auto *F = getCurrentFragment(); + F->makeLEB(false, Value); + newFragment(); } void MCObjectStreamer::emitSLEB128Value(const MCExpr *Value) { @@ -226,10 +233,9 @@ void MCObjectStreamer::emitSLEB128Value(const MCExpr *Value) { emitSLEB128IntValue(IntValue); return; } - auto *F = getOrCreateDataFragment(); - F->Kind = MCFragment::FT_LEB; - F->setLEBSigned(true); - F->setLEBValue(Value); + auto *F = getCurrentFragment(); + F->makeLEB(true, Value); + newFragment(); } void MCObjectStreamer::emitWeakReference(MCSymbol *Alias, @@ -238,11 +244,6 @@ void MCObjectStreamer::emitWeakReference(MCSymbol *Alias, } void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) { - changeSectionImpl(Section, Subsection); -} - -bool MCObjectStreamer::changeSectionImpl(MCSection *Section, - uint32_t Subsection) { assert(Section && "Cannot switch to a null section!"); getContext().clearDwarfLocSeen(); @@ -261,7 +262,7 @@ bool MCObjectStreamer::changeSectionImpl(MCSection *Section, Section->CurFragList = &Subsections[I].second; CurFrag = Section->CurFragList->Tail; - return getAssembler().registerSection(*Section); + getAssembler().registerSection(*Section); } void MCObjectStreamer::switchSectionNoPrint(MCSection *Section) { @@ -293,18 +294,6 @@ bool MCObjectStreamer::mayHaveInstructions(MCSection &Sec) const { void MCObjectStreamer::emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) { - const MCSection &Sec = *getCurrentSectionOnly(); - if (Sec.isVirtualSection()) { - getContext().reportError(Inst.getLoc(), Twine(Sec.getVirtualSectionKind()) + - " section '" + Sec.getName() + - "' cannot have instructions"); - return; - } - emitInstructionImpl(Inst, STI); -} - -void MCObjectStreamer::emitInstructionImpl(const MCInst &Inst, - const MCSubtargetInfo &STI) { MCStreamer::emitInstruction(Inst, STI); MCSection *Sec = getCurrentSectionOnly(); @@ -338,7 +327,7 @@ void MCObjectStreamer::emitInstructionImpl(const MCInst &Inst, void MCObjectStreamer::emitInstToData(const MCInst &Inst, const MCSubtargetInfo &STI) { - MCFragment *F = getOrCreateDataFragment(); + MCFragment *F = getCurrentFragment(); // Append the instruction to the data fragment. 
size_t FixupStartIndex = F->getFixups().size(); @@ -370,7 +359,7 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst, void MCObjectStreamer::emitInstToFragment(const MCInst &Inst, const MCSubtargetInfo &STI) { - auto *F = getOrCreateDataFragment(); + auto *F = getCurrentFragment(); SmallVector<char, 16> Data; SmallVector<MCFixup, 1> Fixups; getAssembler().getEmitter().encodeInstruction(Inst, Data, Fixups, STI); @@ -381,6 +370,7 @@ void MCObjectStreamer::emitInstToFragment(const MCInst &Inst, F->setVarContents(Data); F->setVarFixups(Fixups); F->setInst(Inst); + newFragment(); } void MCObjectStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line, @@ -442,10 +432,11 @@ void MCObjectStreamer::emitDwarfAdvanceLineAddr(int64_t LineDelta, return; } - auto *F = getOrCreateDataFragment(); + auto *F = getCurrentFragment(); F->Kind = MCFragment::FT_Dwarf; F->setDwarfAddrDelta(buildSymbolDiff(*this, Label, LastLabel, SMLoc())); F->setDwarfLineDelta(LineDelta); + newFragment(); } void MCObjectStreamer::emitDwarfLineEndEntry(MCSection *Section, @@ -473,9 +464,10 @@ void MCObjectStreamer::emitDwarfLineEndEntry(MCSection *Section, void MCObjectStreamer::emitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, const MCSymbol *Label, SMLoc Loc) { - auto *F = getOrCreateDataFragment(); + auto *F = getCurrentFragment(); F->Kind = MCFragment::FT_DwarfFrame; F->setDwarfAddrDelta(buildSymbolDiff(*this, Label, LastLabel, Loc)); + newFragment(); } void MCObjectStreamer::emitCVLocDirective(unsigned FunctionId, unsigned FileNo, @@ -534,7 +526,7 @@ void MCObjectStreamer::emitCVFileChecksumOffsetDirective(unsigned FileNo) { void MCObjectStreamer::emitBytes(StringRef Data) { MCDwarfLineEntry::make(this, getCurrentSectionOnly()); - MCFragment *DF = getOrCreateDataFragment(); + MCFragment *DF = getCurrentFragment(); DF->appendContents(ArrayRef(Data.data(), Data.size())); } @@ -543,28 +535,21 @@ void MCObjectStreamer::emitValueToAlignment(Align Alignment, int64_t Fill, unsigned MaxBytesToEmit) { if (MaxBytesToEmit == 0) MaxBytesToEmit = Alignment.value(); - insert(getContext().allocFragment<MCAlignFragment>(Alignment, Fill, FillLen, - MaxBytesToEmit)); + MCFragment *F = getCurrentFragment(); + F->makeAlign(Alignment, Fill, FillLen, MaxBytesToEmit); + newFragment(); // Update the maximum alignment on the current section if necessary. - MCSection *CurSec = getCurrentSectionOnly(); - CurSec->ensureMinAlignment(Alignment); + F->getParent()->ensureMinAlignment(Alignment); } void MCObjectStreamer::emitCodeAlignment(Align Alignment, const MCSubtargetInfo *STI, unsigned MaxBytesToEmit) { + auto *F = getCurrentFragment(); emitValueToAlignment(Alignment, 0, 1, MaxBytesToEmit); - auto *F = cast<MCAlignFragment>(getCurrentFragment()); - F->setEmitNops(true, STI); - // With RISC-V style linker relaxation, mark the section as linker-relaxable - // if the alignment is larger than the minimum NOP size. 
- unsigned Size; - if (getAssembler().getBackend().shouldInsertExtraNopBytesForCodeAlign(*F, - Size)) { - getCurrentSectionOnly()->setLinkerRelaxable(); - newFragment(); - } + F->u.align.EmitNops = true; + F->STI = STI; } void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset, diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 77bf843..d0b6ea4 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -3404,11 +3404,10 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, uint8_t ValueSize) { const MCSection *Section = getStreamer().getCurrentSectionOnly(); assert(Section && "must have section to emit alignment"); - if (HasFillExpr && FillExpr != 0 && Section->isVirtualSection()) { + if (HasFillExpr && FillExpr != 0 && Section->isBssSection()) { ReturnVal |= - Warning(FillExprLoc, "ignoring non-zero fill value in " + - Section->getVirtualSectionKind() + - " section '" + Section->getName() + "'"); + Warning(FillExprLoc, "ignoring non-zero fill value in BSS section '" + + Section->getName() + "'"); FillExpr = 0; } diff --git a/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp b/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp index 7f09349..d7b0546 100644 --- a/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp +++ b/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp @@ -8,8 +8,8 @@ #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCRegister.h" -#include "llvm/MC/MCStreamer.h" using namespace llvm; @@ -25,8 +25,9 @@ MCSubtargetInfo &MCTargetAsmParser::copySTI() { STI = &STICopy; // The returned STI will likely be modified. Create a new fragment to prevent // mixing STI values within a fragment. - if (getStreamer().getCurrentFragment()) - getStreamer().newFragment(); + auto &S = getStreamer(); + if (S.isObj() && S.getCurrentFragment()) + static_cast<MCObjectStreamer &>(S).newFragment(); return STICopy; } diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp index f87d27f..b493337 100644 --- a/llvm/lib/MC/MCPseudoProbe.cpp +++ b/llvm/lib/MC/MCPseudoProbe.cpp @@ -81,8 +81,9 @@ void MCPseudoProbe::emit(MCObjectStreamer *MCOS, if (AddrDelta->evaluateAsAbsolute(Delta, MCOS->getAssemblerPtr())) { MCOS->emitSLEB128IntValue(Delta); } else { - MCOS->insert(MCOS->getContext().allocFragment<MCPseudoProbeAddrFragment>( - AddrDelta)); + auto *F = MCOS->getCurrentFragment(); + F->makeLEB(true, AddrDelta); + MCOS->newFragment(); } } else { // Emit the GUID of the split function that the sentinel probe represents. diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp index 9367145..023f7f2 100644 --- a/llvm/lib/MC/MCSection.cpp +++ b/llvm/lib/MC/MCSection.cpp @@ -18,10 +18,10 @@ using namespace llvm; -MCSection::MCSection(SectionVariant V, StringRef Name, bool IsText, - bool IsVirtual, MCSymbol *Begin) +MCSection::MCSection(SectionVariant V, StringRef Name, bool IsText, bool IsBss, + MCSymbol *Begin) : Begin(Begin), HasInstructions(false), IsRegistered(false), IsText(IsText), - IsVirtual(IsVirtual), LinkerRelaxable(false), Name(Name), Variant(V) { + IsBss(IsBss), LinkerRelaxable(false), Name(Name), Variant(V) { // The initial subsection number is 0. Create a fragment list. 
CurFragList = &Subsections.emplace_back(0u, FragList{}).second; } @@ -34,8 +34,6 @@ MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) { bool MCSection::hasEnded() const { return End && End->isInSection(); } -StringRef MCSection::getVirtualSectionKind() const { return "virtual"; } - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void MCSection::dump( DenseMap<const MCFragment *, SmallVector<const MCSymbol *, 0>> *FragToSyms) @@ -60,16 +58,6 @@ LLVM_DUMP_METHOD void MCSection::dump( } #endif -void MCFragment::setContents(ArrayRef<char> Contents) { - auto &S = getParent()->ContentStorage; - if (ContentStart + Contents.size() > ContentEnd) { - ContentStart = S.size(); - S.resize_for_overwrite(S.size() + Contents.size()); - } - ContentEnd = ContentStart + Contents.size(); - llvm::copy(Contents, S.begin() + ContentStart); -} - void MCFragment::setVarContents(ArrayRef<char> Contents) { auto &S = getParent()->ContentStorage; if (VarContentStart + Contents.size() > VarContentEnd) { @@ -96,16 +84,6 @@ void MCFragment::appendFixups(ArrayRef<MCFixup> Fixups) { FixupEnd = S.size(); } -void MCFragment::setFixups(ArrayRef<MCFixup> Fixups) { - auto &S = getParent()->FixupStorage; - if (FixupStart + Fixups.size() > FixupEnd) { - FixupStart = S.size(); - S.resize_for_overwrite(S.size() + Fixups.size()); - } - FixupEnd = FixupStart + Fixups.size(); - llvm::copy(Fixups, S.begin() + FixupStart); -} - void MCFragment::setVarFixups(ArrayRef<MCFixup> Fixups) { auto &S = getParent()->FixupStorage; if (VarFixupStart + Fixups.size() > VarFixupEnd) { diff --git a/llvm/lib/MC/MCSectionCOFF.cpp b/llvm/lib/MC/MCSectionCOFF.cpp index 94e29ce..5bf1473 100644 --- a/llvm/lib/MC/MCSectionCOFF.cpp +++ b/llvm/lib/MC/MCSectionCOFF.cpp @@ -115,7 +115,3 @@ void MCSectionCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, } bool MCSectionCOFF::useCodeAlign() const { return isText(); } - -StringRef MCSectionCOFF::getVirtualSectionKind() const { - return "IMAGE_SCN_CNT_UNINITIALIZED_DATA"; -} diff --git a/llvm/lib/MC/MCSectionELF.cpp b/llvm/lib/MC/MCSectionELF.cpp index 299fe40..ef33f9c 100644 --- a/llvm/lib/MC/MCSectionELF.cpp +++ b/llvm/lib/MC/MCSectionELF.cpp @@ -215,5 +215,3 @@ void MCSectionELF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, bool MCSectionELF::useCodeAlign() const { return getFlags() & ELF::SHF_EXECINSTR; } - -StringRef MCSectionELF::getVirtualSectionKind() const { return "SHT_NOBITS"; } diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index c3ecf8f..30198c9 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -1404,7 +1404,7 @@ MCSymbol *MCStreamer::endSection(MCSection *Section) { return Sym; } -void MCStreamer::insert(MCFragment *F) { +void MCStreamer::addFragment(MCFragment *F) { auto *Sec = CurFrag->getParent(); F->setParent(Sec); F->setLayoutOrder(CurFrag->getLayoutOrder() + 1); @@ -1413,10 +1413,6 @@ void MCStreamer::insert(MCFragment *F) { Sec->curFragList()->Tail = F; } -void MCStreamer::newFragment() { - insert(getContext().allocFragment<MCFragment>()); -} - static VersionTuple targetVersionOrMinimumSupportedOSVersion(const Triple &Target, VersionTuple TargetVersion) { diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp index e8b26bf..72a8dd7 100644 --- a/llvm/lib/MC/MCWin64EH.cpp +++ b/llvm/lib/MC/MCWin64EH.cpp @@ -318,15 +318,13 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) { // Emit the epilog instructions. 
if (EnableUnwindV2) { - MCFragment *DF = OS->getOrCreateDataFragment(); - bool IsLast = true; for (const auto &Epilog : llvm::reverse(info->EpilogMap)) { if (IsLast) { IsLast = false; uint8_t Flags = LastEpilogIsAtEnd ? 0x01 : 0; - streamer.emitInt8(EpilogSize); - streamer.emitInt8((Flags << 4) | Win64EH::UOP_Epilog); + OS->emitInt8(EpilogSize); + OS->emitInt8((Flags << 4) | Win64EH::UOP_Epilog); if (LastEpilogIsAtEnd) continue; @@ -337,9 +335,8 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) { // layout has been completed. auto *MCE = MCUnwindV2EpilogTargetExpr::create(*info, Epilog.second, EpilogSize, context); - MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_2); - DF->addFixup(Fixup); - DF->appendContents(2, 0); + OS->addFixup(MCE, FK_Data_2); + OS->appendContents(2, 0); } } if (AddPaddingEpilogCode) diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp index 3398775..9369bea 100644 --- a/llvm/lib/MC/MCWinCOFFStreamer.cpp +++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp @@ -153,7 +153,7 @@ void MCWinCOFFStreamer::initSections(bool NoExecStack, } void MCWinCOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) { - changeSectionImpl(Section, Subsection); + MCObjectStreamer::changeSection(Section, Subsection); // Ensure that the first and the second symbols relative to the section are // the section symbol and the COMDAT symbol. getAssembler().registerSymbol(*Section->getBeginSymbol()); @@ -278,35 +278,28 @@ void MCWinCOFFStreamer::emitCOFFSymbolIndex(MCSymbol const *Symbol) { void MCWinCOFFStreamer::emitCOFFSectionIndex(const MCSymbol *Symbol) { visitUsedSymbol(*Symbol); - MCFragment *DF = getOrCreateDataFragment(); const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext()); - MCFixup Fixup = MCFixup::create(DF->getContents().size(), SRE, FK_SecRel_2); - DF->addFixup(Fixup); - DF->appendContents(2, 0); + addFixup(SRE, FK_SecRel_2); + appendContents(2, 0); } void MCWinCOFFStreamer::emitCOFFSecRel32(const MCSymbol *Symbol, uint64_t Offset) { visitUsedSymbol(*Symbol); - MCFragment *DF = getOrCreateDataFragment(); // Create Symbol A for the relocation relative reference. const MCExpr *MCE = MCSymbolRefExpr::create(Symbol, getContext()); // Add the constant offset, if given. if (Offset) MCE = MCBinaryExpr::createAdd( MCE, MCConstantExpr::create(Offset, getContext()), getContext()); - // Build the secrel32 relocation. - MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_SecRel_4); - // Record the relocation. - DF->addFixup(Fixup); + addFixup(MCE, FK_SecRel_4); // Emit 4 bytes (zeros) to the object file. - DF->appendContents(4, 0); + appendContents(4, 0); } void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol, int64_t Offset) { visitUsedSymbol(*Symbol); - MCFragment *DF = getOrCreateDataFragment(); // Create Symbol A for the relocation relative reference. const MCExpr *MCE = MCSymbolRefExpr::create( Symbol, MCSymbolRefExpr::VK_COFF_IMGREL32, getContext()); @@ -314,40 +307,29 @@ void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol, if (Offset) MCE = MCBinaryExpr::createAdd( MCE, MCConstantExpr::create(Offset, getContext()), getContext()); - // Build the imgrel relocation. - MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4); - // Record the relocation. - DF->addFixup(Fixup); + addFixup(MCE, FK_Data_4); // Emit 4 bytes (zeros) to the object file. 
- DF->appendContents(4, 0); + appendContents(4, 0); } void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) { visitUsedSymbol(*Symbol); - MCFragment *DF = getOrCreateDataFragment(); // Create Symbol for section number. const MCExpr *MCE = MCCOFFSectionNumberTargetExpr::create( *Symbol, this->getWriter(), getContext()); - // Build the relocation. - MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4); - // Record the relocation. - DF->addFixup(Fixup); + addFixup(MCE, FK_Data_4); // Emit 4 bytes (zeros) to the object file. - DF->appendContents(4, 0); + appendContents(4, 0); } void MCWinCOFFStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) { visitUsedSymbol(*Symbol); - MCFragment *DF = getOrCreateDataFragment(); // Create Symbol for section offset. const MCExpr *MCE = MCCOFFSectionOffsetTargetExpr::create(*Symbol, getContext()); - // Build the relocation. - MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4); - // Record the relocation. - DF->addFixup(Fixup); + addFixup(MCE, FK_Data_4); // Emit 4 bytes (zeros) to the object file. - DF->appendContents(4, 0); + appendContents(4, 0); } void MCWinCOFFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size, diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp index 4d45296..63381b4 100644 --- a/llvm/lib/MC/MCXCOFFStreamer.cpp +++ b/llvm/lib/MC/MCXCOFFStreamer.cpp @@ -89,7 +89,7 @@ void MCXCOFFStreamer::emitXCOFFSymbolLinkageWithVisibility( void MCXCOFFStreamer::emitXCOFFRefDirective(const MCSymbol *Symbol) { // Add a Fixup here to later record a relocation of type R_REF to prevent the // ref symbol from being garbage collected (by the binder). - MCFragment *DF = getOrCreateDataFragment(); + MCFragment *DF = getCurrentFragment(); const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext()); std::optional<MCFixupKind> MaybeKind = getAssembler().getBackend().getFixupKind("R_REF"); diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 3291dd7..48d2fc6 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -131,7 +131,7 @@ uint64_t MachObjectWriter::getPaddingSize(const MCAssembler &Asm, return 0; const MCSection &NextSec = *SectionOrder[Next]; - if (NextSec.isVirtualSection()) + if (NextSec.isBssSection()) return 0; return offsetToAlignment(EndAddr, NextSec.getAlign()); } @@ -267,7 +267,7 @@ void MachObjectWriter::writeSection(const MCAssembler &Asm, const MCSectionMachO &Section = cast<MCSectionMachO>(Sec); // The offset is unused for virtual sections. - if (Section.isVirtualSection()) { + if (Section.isBssSection()) { assert(Asm.getSectionFileSize(Sec) == 0 && "Invalid file size!"); FileOffset = 0; } @@ -682,13 +682,13 @@ void MachObjectWriter::computeSectionAddresses(const MCAssembler &Asm) { unsigned i = 0; // Compute the section layout order. Virtual sections must go last. 
for (MCSection &Sec : Asm) { - if (!Sec.isVirtualSection()) { + if (!Sec.isBssSection()) { SectionOrder.push_back(&Sec); cast<MCSectionMachO>(Sec).setLayoutOrder(i++); } } for (MCSection &Sec : Asm) { - if (Sec.isVirtualSection()) { + if (Sec.isBssSection()) { SectionOrder.push_back(&Sec); cast<MCSectionMachO>(Sec).setLayoutOrder(i++); } @@ -797,11 +797,8 @@ uint64_t MachObjectWriter::writeObject() { UndefinedSymbolData); if (!CGProfile.empty()) { - MCSection *CGProfileSection = getContext().getMachOSection( - "__LLVM", "__cg_profile", 0, SectionKind::getMetadata()); - auto &Frag = *CGProfileSection->begin(); - Frag.clearContents(); - raw_svector_ostream OS(Frag.getContentsForAppending()); + SmallString<0> Content; + raw_svector_ostream OS(Content); for (const MCObjectWriter::CGProfileEntry &CGPE : CGProfile) { uint32_t FromIndex = CGPE.From->getSymbol().getIndex(); uint32_t ToIndex = CGPE.To->getSymbol().getIndex(); @@ -809,7 +806,9 @@ uint64_t MachObjectWriter::writeObject() { support::endian::write(OS, ToIndex, W.Endian); support::endian::write(OS, CGPE.Count, W.Endian); } - Frag.doneAppending(); + MCSection *Sec = getContext().getMachOSection("__LLVM", "__cg_profile", 0, + SectionKind::getMetadata()); + llvm::copy(OS.str(), Sec->curFragList()->Head->getContents().data()); } unsigned NumSections = Asm.end() - Asm.begin(); @@ -883,7 +882,7 @@ uint64_t MachObjectWriter::writeObject() { VMSize = std::max(VMSize, Address + Size); - if (Sec.isVirtualSection()) + if (Sec.isBssSection()) continue; SectionDataSize = std::max(SectionDataSize, Address + Size); @@ -915,7 +914,7 @@ uint64_t MachObjectWriter::writeObject() { unsigned Flags = Sec.getTypeAndAttributes(); if (Sec.hasInstructions()) Flags |= MachO::S_ATTR_SOME_INSTRUCTIONS; - if (!cast<MCSectionMachO>(Sec).isVirtualSection() && + if (!cast<MCSectionMachO>(Sec).isBssSection() && !isUInt<32>(SectionStart)) { getContext().reportError( SMLoc(), "cannot encode offset of section; object file too large"); diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp index 7af240a..3b99af4 100644 --- a/llvm/lib/MC/WasmObjectWriter.cpp +++ b/llvm/lib/MC/WasmObjectWriter.cpp @@ -696,14 +696,15 @@ static void addData(SmallVectorImpl<char> &DataBytes, if (Frag.hasInstructions()) report_fatal_error("only data supported in data sections"); - if (auto *Align = dyn_cast<MCAlignFragment>(&Frag)) { - if (Align->getFillLen() != 1) + llvm::append_range(DataBytes, Frag.getContents()); + if (Frag.getKind() == MCFragment::FT_Align) { + if (Frag.getAlignFillLen() != 1) report_fatal_error("only byte values supported for alignment"); // If nops are requested, use zeros, as this is the data section. - uint8_t Value = Align->hasEmitNops() ? 0 : Align->getFill(); + uint8_t Value = Frag.hasAlignEmitNops() ? 
0 : Frag.getAlignFill(); uint64_t Size = - std::min<uint64_t>(alignTo(DataBytes.size(), Align->getAlignment()), - DataBytes.size() + Align->getMaxBytesToEmit()); + std::min<uint64_t>(alignTo(DataBytes.size(), Frag.getAlignment()), + DataBytes.size() + Frag.getAlignMaxBytesToEmit()); DataBytes.resize(Size, Value); } else if (auto *Fill = dyn_cast<MCFillFragment>(&Frag)) { int64_t NumValues; @@ -711,12 +712,10 @@ static void addData(SmallVectorImpl<char> &DataBytes, llvm_unreachable("The fill should be an assembler constant"); DataBytes.insert(DataBytes.end(), Fill->getValueSize() * NumValues, Fill->getValue()); + } else if (Frag.getKind() == MCFragment::FT_LEB) { + llvm::append_range(DataBytes, Frag.getVarContents()); } else { - llvm::append_range(DataBytes, Frag.getContents()); - if (Frag.getKind() == MCFragment::FT_LEB) - llvm::append_range(DataBytes, Frag.getVarContents()); - else - assert(Frag.getKind() == MCFragment::FT_Data); + assert(Frag.getKind() == MCFragment::FT_Data); } } @@ -1858,23 +1857,9 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm, auto IT = WS.begin(); if (IT == WS.end()) continue; - const MCFragment &EmptyFrag = *IT; - if (EmptyFrag.getKind() != MCFragment::FT_Data) - report_fatal_error(".init_array section should be aligned"); - - const MCFragment *nextFrag = EmptyFrag.getNext(); - while (nextFrag != nullptr) { - const MCFragment &AlignFrag = *nextFrag; - if (AlignFrag.getKind() != MCFragment::FT_Align) - report_fatal_error(".init_array section should be aligned"); - if (cast<MCAlignFragment>(AlignFrag).getAlignment() != - Align(is64Bit() ? 8 : 4)) - report_fatal_error( - ".init_array section should be aligned for pointers"); - - const MCFragment &Frag = *AlignFrag.getNext(); - nextFrag = Frag.getNext(); - if (Frag.hasInstructions() || Frag.getKind() != MCFragment::FT_Data) + for (auto *Frag = &*IT; Frag; Frag = Frag->getNext()) { + if (Frag->hasInstructions() || (Frag->getKind() != MCFragment::FT_Align && + Frag->getKind() != MCFragment::FT_Data)) report_fatal_error("only data supported in .init_array section"); uint16_t Priority = UINT16_MAX; @@ -1886,9 +1871,8 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm, if (WS.getName().substr(PrefixLength + 1).getAsInteger(10, Priority)) report_fatal_error("invalid .init_array section priority"); } - const auto &DataFrag = Frag; - assert(llvm::all_of(DataFrag.getContents(), [](char C) { return !C; })); - for (const MCFixup &Fixup : DataFrag.getFixups()) { + assert(llvm::all_of(Frag->getContents(), [](char C) { return !C; })); + for (const MCFixup &Fixup : Frag->getFixups()) { assert(Fixup.getKind() == MCFixup::getDataKindForSize(is64Bit() ? 8 : 4)); const MCExpr *Expr = Fixup.getValue(); diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index ee4d957..6ad4334 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -179,7 +179,7 @@ private: void SetSymbolName(COFFSymbol &S); void SetSectionName(COFFSection &S); - bool IsPhysicalSection(COFFSection *S); + bool isUninitializedData(const COFFSection &S); // Entity writing methods. 
void WriteFileHeader(const COFF::header &Header); @@ -453,8 +453,8 @@ void WinCOFFWriter::SetSymbolName(COFFSymbol &S) { std::memcpy(S.Data.Name, S.Name.c_str(), S.Name.size()); } -bool WinCOFFWriter::IsPhysicalSection(COFFSection *S) { - return (S->Header.Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) == +bool WinCOFFWriter::isUninitializedData(const COFFSection &S) { + return (S.Header.Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) != 0; } @@ -606,6 +606,9 @@ void WinCOFFWriter::writeSection(const COFFSection &Sec) { assert(AuxSyms.size() == 1 && AuxSyms[0].AuxType == ATSectionDefinition); AuxSymbol &SecDef = AuxSyms[0]; SecDef.Aux.SectionDefinition.CheckSum = CRC; + } else if (isUninitializedData(Sec)) { + // Error if fixups or non-zero bytes are present. + writeSectionContents(*Sec.MCSection); } // Write relocations for this section. @@ -745,7 +748,7 @@ void WinCOFFWriter::assignFileOffsets() { Sec->Header.SizeOfRawData = Asm->getSectionAddressSize(Section); - if (IsPhysicalSection(Sec)) { + if (!isUninitializedData(*Sec)) { Sec->Header.PointerToRawData = Offset; Offset += Sec->Header.SizeOfRawData; } @@ -1067,10 +1070,8 @@ uint64_t WinCOFFWriter::writeObject() { // Create the contents of the .llvm_addrsig section. if (Mode != DwoOnly && OWriter.getEmitAddrsigSection()) { - auto *Sec = getContext().getCOFFSection(".llvm_addrsig", - COFF::IMAGE_SCN_LNK_REMOVE); - auto *Frag = Sec->curFragList()->Head; - raw_svector_ostream OS(Frag->getContentsForAppending()); + SmallString<0> Content; + raw_svector_ostream OS(Content); for (const MCSymbol *S : OWriter.AddrsigSyms) { if (!S->isRegistered()) continue; @@ -1085,15 +1086,15 @@ uint64_t WinCOFFWriter::writeObject() { "executePostLayoutBinding!"); encodeULEB128(SectionMap[TargetSection]->Symbol->getIndex(), OS); } - Frag->doneAppending(); + auto *Sec = getContext().getCOFFSection(".llvm_addrsig", + COFF::IMAGE_SCN_LNK_REMOVE); + Sec->curFragList()->Tail->setVarContents(OS.str()); } // Create the contents of the .llvm.call-graph-profile section. 
if (Mode != DwoOnly && !OWriter.getCGProfile().empty()) { - auto *Sec = getContext().getCOFFSection(".llvm.call-graph-profile", - COFF::IMAGE_SCN_LNK_REMOVE); - auto *Frag = Sec->curFragList()->Head; - raw_svector_ostream OS(Frag->getContentsForAppending()); + SmallString<0> Content; + raw_svector_ostream OS(Content); for (const auto &CGPE : OWriter.getCGProfile()) { uint32_t FromIndex = CGPE.From->getSymbol().getIndex(); uint32_t ToIndex = CGPE.To->getSymbol().getIndex(); @@ -1101,7 +1102,9 @@ uint64_t WinCOFFWriter::writeObject() { support::endian::write(OS, ToIndex, W.Endian); support::endian::write(OS, CGPE.Count, W.Endian); } - Frag->doneAppending(); + auto *Sec = getContext().getCOFFSection(".llvm.call-graph-profile", + COFF::IMAGE_SCN_LNK_REMOVE); + Sec->curFragList()->Tail->setVarContents(OS.str()); } assignFileOffsets(); diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.h b/llvm/lib/ObjCopy/MachO/MachOObject.h index 8f9444f..86c6b12 100644 --- a/llvm/lib/ObjCopy/MachO/MachOObject.h +++ b/llvm/lib/ObjCopy/MachO/MachOObject.h @@ -64,14 +64,14 @@ struct Section { return static_cast<MachO::SectionType>(Flags & MachO::SECTION_TYPE); } - bool isVirtualSection() const { + bool isBssSection() const { return (getType() == MachO::S_ZEROFILL || getType() == MachO::S_GB_ZEROFILL || getType() == MachO::S_THREAD_LOCAL_ZEROFILL); } bool hasValidOffset() const { - return !(isVirtualSection() || OriginalOffset == 0); + return !(isBssSection() || OriginalOffset == 0); } }; diff --git a/llvm/lib/ObjCopy/MachO/MachOWriter.cpp b/llvm/lib/ObjCopy/MachO/MachOWriter.cpp index 7c24d12..89c1df8 100644 --- a/llvm/lib/ObjCopy/MachO/MachOWriter.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOWriter.cpp @@ -112,7 +112,7 @@ size_t MachOWriter::totalSize() const { for (const std::unique_ptr<Section> &S : LC.Sections) { if (!S->hasValidOffset()) { assert((S->Offset == 0) && "Skipped section's offset must be zero"); - assert((S->isVirtualSection() || S->Size == 0) && + assert((S->isBssSection() || S->Size == 0) && "Non-zero-fill sections with zero offset must have zero size"); continue; } @@ -240,7 +240,7 @@ void MachOWriter::writeSections() { for (const std::unique_ptr<Section> &Sec : LC.Sections) { if (!Sec->hasValidOffset()) { assert((Sec->Offset == 0) && "Skipped section's offset must be zero"); - assert((Sec->isVirtualSection() || Sec->Size == 0) && + assert((Sec->isBssSection() || Sec->Size == 0) && "Non-zero-fill sections with zero offset must have zero size"); continue; } diff --git a/llvm/lib/Object/CMakeLists.txt b/llvm/lib/Object/CMakeLists.txt index 870169a..0f6d2f7 100644 --- a/llvm/lib/Object/CMakeLists.txt +++ b/llvm/lib/Object/CMakeLists.txt @@ -25,6 +25,7 @@ add_llvm_component_library(LLVMObject OffloadBundle.cpp RecordStreamer.cpp RelocationResolver.cpp + SFrameParser.cpp SymbolicFile.cpp SymbolSize.cpp TapiFile.cpp diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index 5597d7d..0919c6a 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -620,7 +620,9 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const { StringRef ELFObjectFileBase::getNVPTXCPUName() const { assert(getEMachine() == ELF::EM_CUDA); - unsigned SM = getPlatformFlags() & ELF::EF_CUDA_SM; + unsigned SM = getEIdentABIVersion() == ELF::ELFABIVERSION_CUDA_V1 + ? getPlatformFlags() & ELF::EF_CUDA_SM + : getPlatformFlags() & ELF::EF_CUDA_SM_MASK; switch (SM) { // Fermi architecture. 
@@ -679,7 +681,18 @@ StringRef ELFObjectFileBase::getNVPTXCPUName() const { // Hopper architecture. case ELF::EF_CUDA_SM90: - return getPlatformFlags() & ELF::EF_CUDA_ACCELERATORS ? "sm_90a" : "sm_90"; + return getPlatformFlags() & ELF::EF_CUDA_ACCELERATORS_V1 ? "sm_90a" + : "sm_90"; + + // Blackwell architecture. + case ELF::EF_CUDA_SM100: + return getPlatformFlags() & ELF::EF_CUDA_ACCELERATORS ? "sm_100a" + : "sm_100"; + + // Rubin architecture. + case ELF::EF_CUDA_SM120: + return getPlatformFlags() & ELF::EF_CUDA_ACCELERATORS ? "sm_120a" + : "sm_120"; default: llvm_unreachable("Unknown EF_CUDA_SM value"); } diff --git a/llvm/lib/Object/SFrameParser.cpp b/llvm/lib/Object/SFrameParser.cpp new file mode 100644 index 0000000..2d74d1d --- /dev/null +++ b/llvm/lib/Object/SFrameParser.cpp @@ -0,0 +1,55 @@ +//===- SFrameParser.cpp ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Object/SFrameParser.h" +#include "llvm/BinaryFormat/SFrame.h" +#include "llvm/Object/Error.h" +#include "llvm/Support/FormatVariadic.h" + +using namespace llvm; +using namespace llvm::object; + +template <typename T> +static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data, + uint64_t Offset) { + static_assert(std::is_trivial_v<T>); + if (Data.size() < Offset + sizeof(T)) { + return createStringError( + formatv("unexpected end of data at offset {0:x} while reading [{1:x}, " + "{2:x})", + Data.size(), Offset, Offset + sizeof(T)) + .str(), + object_error::unexpected_eof); + } + return *reinterpret_cast<const T *>(Data.data() + Offset); +} + +template <endianness E> +Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents) { + Expected<const sframe::Preamble<E> &> Preamble = + getDataSliceAs<sframe::Preamble<E>>(Contents, 0); + if (!Preamble) + return Preamble.takeError(); + + if (Preamble->Magic != sframe::Magic) + return createError( + formatv("invalid magic number ({0:x+4})", Preamble->Magic.value())); + if (Preamble->Version != sframe::Version::V2) + return createError( + formatv("invalid/unsupported version number ({0})", + static_cast<unsigned>(Preamble->Version.value()))); + + Expected<const sframe::Header<E> &> Header = + getDataSliceAs<sframe::Header<E>>(Contents, 0); + if (!Header) + return Header.takeError(); + return SFrameParser(Contents, *Header); +} + +template class llvm::object::SFrameParser<endianness::big>; +template class llvm::object::SFrameParser<endianness::little>; diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 80fb52f..e15570c 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -1189,9 +1189,13 @@ Expected<GVNOptions> parseGVNOptions(StringRef Params) { } else if (ParamName == "split-backedge-load-pre") { Result.setLoadPRESplitBackedge(Enable); } else if (ParamName == "memdep") { + // MemDep and MemorySSA are mutually exclusive. Result.setMemDep(Enable); + Result.setMemorySSA(!Enable); } else if (ParamName == "memoryssa") { + // MemDep and MemorySSA are mutually exclusive. 
Result.setMemorySSA(Enable); + Result.setMemDep(!Enable); } else { return make_error<StringError>( formatv("invalid GVN pass parameter '{}'", ParamName).str(), diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index a579eaf..10b6101 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -380,7 +380,7 @@ if(LLVM_WITH_Z3) ) endif() -target_include_directories(LLVMSupport SYSTEM +target_include_directories(LLVMSupport PRIVATE ${LLVM_THIRD_PARTY_DIR}/siphash/include - ) +) diff --git a/llvm/lib/Support/StringMap.cpp b/llvm/lib/Support/StringMap.cpp index 432e1fc..3432dc1 100644 --- a/llvm/lib/Support/StringMap.cpp +++ b/llvm/lib/Support/StringMap.cpp @@ -45,23 +45,15 @@ static inline unsigned *getHashTable(StringMapEntryBase **TheTable, uint32_t StringMapImpl::hash(StringRef Key) { return xxh3_64bits(Key); } -StringMapImpl::StringMapImpl(unsigned InitSize, unsigned itemSize) { - ItemSize = itemSize; - +StringMapImpl::StringMapImpl(unsigned InitSize, unsigned itemSize) + : ItemSize(itemSize) { // If a size is specified, initialize the table with that many buckets. if (InitSize) { // The table will grow when the number of entries reach 3/4 of the number of // buckets. To guarantee that "InitSize" number of entries can be inserted // in the table without growing, we allocate just what is needed here. init(getMinBucketToReserveForEntries(InitSize)); - return; } - - // Otherwise, initialize it with zero buckets to avoid the allocation. - TheTable = nullptr; - NumBuckets = 0; - NumItems = 0; - NumTombstones = 0; } void StringMapImpl::init(unsigned InitSize) { diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 12fc976..201bfe0 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1205,32 +1205,36 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, Register DstReg = MI.getOperand(0).getReg(); if (DstReg == MI.getOperand(3).getReg()) { // Expand to BIT - BuildMI(MBB, MBBI, MI.getDebugLoc(), - TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8 - : AArch64::BITv16i8)) - .add(MI.getOperand(0)) - .add(MI.getOperand(3)) - .add(MI.getOperand(2)) - .add(MI.getOperand(1)); + auto I = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8 + : AArch64::BITv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(3)) + .add(MI.getOperand(2)) + .add(MI.getOperand(1)); + transferImpOps(MI, I, I); } else if (DstReg == MI.getOperand(2).getReg()) { // Expand to BIF - BuildMI(MBB, MBBI, MI.getDebugLoc(), - TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8 - : AArch64::BIFv16i8)) - .add(MI.getOperand(0)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .add(MI.getOperand(1)); + auto I = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8 + : AArch64::BIFv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(1)); + transferImpOps(MI, I, I); } else { // Expand to BSL, use additional move if required if (DstReg == MI.getOperand(1).getReg()) { - BuildMI(MBB, MBBI, MI.getDebugLoc(), - TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 - : AArch64::BSLv16i8)) - .add(MI.getOperand(0)) - .add(MI.getOperand(1)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)); + auto I = + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? 
AArch64::BSLv8i8 + : AArch64::BSLv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + transferImpOps(MI, I, I); } else { BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::ORRv8i8 @@ -1240,15 +1244,17 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, getRenamableRegState(MI.getOperand(0).isRenamable())) .add(MI.getOperand(1)) .add(MI.getOperand(1)); - BuildMI(MBB, MBBI, MI.getDebugLoc(), - TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 - : AArch64::BSLv16i8)) - .add(MI.getOperand(0)) - .addReg(DstReg, - RegState::Kill | - getRenamableRegState(MI.getOperand(0).isRenamable())) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)); + auto I2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 + : AArch64::BSLv16i8)) + .add(MI.getOperand(0)) + .addReg(DstReg, + RegState::Kill | getRenamableRegState( + MI.getOperand(0).isRenamable())) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + transferImpOps(MI, I2, I2); } } MI.eraseFromParent(); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 6c46b18..9f8a257 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1053,13 +1053,6 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; def AArch64uaddlv : SDNode<"AArch64ISD::UADDLV", SDT_AArch64uaddlp>; def AArch64saddlv : SDNode<"AArch64ISD::SADDLV", SDT_AArch64uaddlp>; -def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs), - [(abdu node:$lhs, node:$rhs), - (int_aarch64_neon_uabd node:$lhs, node:$rhs)]>; -def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs), - [(abds node:$lhs, node:$rhs), - (int_aarch64_neon_sabd node:$lhs, node:$rhs)]>; - // Add Pairwise of two vectors def AArch64addp_n : SDNode<"AArch64ISD::ADDP", SDT_AArch64Zip>; // Add Long Pairwise @@ -5667,8 +5660,7 @@ let Predicates = [HasFullFP16] in { // Advanced SIMD two vector instructions. //===----------------------------------------------------------------------===// -defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", - AArch64uabd>; +defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", abdu>; // Match UABDL in log2-shuffle patterns. 
def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)), (zext (v8i8 V64:$opB))))), @@ -6018,8 +6010,8 @@ defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>; defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", - TriOpFrag<(add node:$LHS, (AArch64sabd node:$MHS, node:$RHS))> >; -defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", AArch64sabd>; + TriOpFrag<(add node:$LHS, (abds node:$MHS, node:$RHS))> >; +defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", abds>; defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", avgfloors>; defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; @@ -6037,8 +6029,8 @@ defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>; defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>; defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba", - TriOpFrag<(add node:$LHS, (AArch64uabd node:$MHS, node:$RHS))> >; -defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", AArch64uabd>; + TriOpFrag<(add node:$LHS, (abdu node:$MHS, node:$RHS))> >; +defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", abdu>; defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", avgflooru>; defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; @@ -6759,10 +6751,8 @@ defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn> defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>; defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>; defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>; -defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", - AArch64sabd>; -defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", - AArch64sabd>; +defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", abds>; +defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", abds>; defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", @@ -6780,8 +6770,7 @@ defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl", BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>; defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; -defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", - AArch64uabd>; +defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", abdu>; defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", BinOpFrag<(add (zanyext node:$LHS), (zanyext node:$RHS))>>; defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 0ddd17c..abcd550 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -8,8 +8,8 @@ // // This pass performs below peephole optimizations on MIR level. // -// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri -// MOVi64imm + ANDXrr ==> ANDXri + ANDXri +// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri +// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri // // 2. 
MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi // MOVi64imm + ADDXrr ==> ANDXri + ANDXri @@ -126,7 +126,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI); template <typename T> - bool visitAND(unsigned Opc, MachineInstr &MI); + bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0); bool visitORR(MachineInstr &MI); bool visitCSEL(MachineInstr &MI); bool visitINSERT(MachineInstr &MI); @@ -194,12 +194,12 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { } template <typename T> -bool AArch64MIPeepholeOpt::visitAND( - unsigned Opc, MachineInstr &MI) { +bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI, + unsigned OtherOpc) { // Try below transformation. // - // MOVi32imm + ANDWrr ==> ANDWri + ANDWri - // MOVi64imm + ANDXrr ==> ANDXri + ANDXri + // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri + // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri // // The mov pseudo instruction could be expanded to multiple mov instructions // later. Let's try to split the constant operand of mov instruction into two @@ -208,10 +208,10 @@ bool AArch64MIPeepholeOpt::visitAND( return splitTwoPartImm<T>( MI, - [Opc](T Imm, unsigned RegSize, T &Imm0, - T &Imm1) -> std::optional<OpcodePair> { + [Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0, + T &Imm1) -> std::optional<OpcodePair> { if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1)) - return std::make_pair(Opc, Opc); + return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc); return std::nullopt; }, [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, @@ -864,6 +864,12 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { case AArch64::ANDXrr: Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI); break; + case AArch64::ANDSWrr: + Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri); + break; + case AArch64::ANDSXrr: + Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri); + break; case AArch64::ORRWrs: Changed |= visitORR(MI); break; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 2409cc8..0f4f012 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -534,7 +534,7 @@ unsigned AArch64Subtarget::classifyGlobalFunctionReference( } void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const { + const SchedRegion &Region) const { // LNT run (at least on Cyclone) showed reasonably significant gains for // bi-directional scheduling. 253.perlbmk. 
Policy.OnlyTopDown = false; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 154db3c..061ed61 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -343,7 +343,8 @@ public: } void overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const override; + const SchedRegion &Region) const override; + void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const override; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 473ba5e..bb0f667b 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -287,6 +287,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .moreElementsToNextPow2(0) .lower(); + getActionDefinitionsBuilder({G_ABDS, G_ABDU}) + .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) + .lower(); + getActionDefinitionsBuilder( {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO}) .legalFor({{s32, s32}, {s64, s32}}) @@ -1794,6 +1798,10 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return LowerBinOp(AArch64::G_SMULL); case Intrinsic::aarch64_neon_umull: return LowerBinOp(AArch64::G_UMULL); + case Intrinsic::aarch64_neon_sabd: + return LowerBinOp(TargetOpcode::G_ABDS); + case Intrinsic::aarch64_neon_uabd: + return LowerBinOp(TargetOpcode::G_ABDU); case Intrinsic::aarch64_neon_abs: { // Lower the intrinsic to G_ABS. MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 0e0e83b..6076ac4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1848,7 +1848,8 @@ def FeatureISAVersion11_Common : FeatureSet< FeatureImageInsts, FeaturePackedTID, FeatureVcmpxPermlaneHazard, - FeatureMemoryAtomicFAddF32DenormalSupport]>; + FeatureMemoryAtomicFAddF32DenormalSupport, + FeatureRealTrue16Insts]>; // There are few workarounds that need to be // added to all targets. 
This pessimizes codegen @@ -1868,8 +1869,7 @@ def FeatureISAVersion11_0_Common : FeatureSet< [FeatureMSAALoadDstSelBug, FeatureVALUTransUseHazard, FeatureMADIntraFwdBug, - FeaturePrivEnabledTrap2NopBug, - FeatureRealTrue16Insts])>; + FeaturePrivEnabledTrap2NopBug])>; def FeatureISAVersion11_0_0 : FeatureSet< !listconcat(FeatureISAVersion11_0_Common.Features, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 749b9ef..4b3dc37 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1415,6 +1415,7 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode); MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered); + MD->setHwStage(CC, ".forward_progress", (bool)CurrentProgramInfo.FwdProgress); if (AMDGPU::isCompute(CC)) { MD->setHwStage(CC, ".trap_present", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 14101e5..3d8d274 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -374,8 +374,10 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val, return true; } - unsigned ReturnOpc = - IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN; + const bool IsWholeWave = MFI->isWholeWaveFunction(); + unsigned ReturnOpc = IsWholeWave ? AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN + : IsShader ? AMDGPU::SI_RETURN_TO_EPILOG + : AMDGPU::SI_RETURN; auto Ret = B.buildInstrNoInsert(ReturnOpc); if (!FLI.CanLowerReturn) @@ -383,6 +385,9 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val, else if (!lowerReturnVal(B, Val, VRegs, Ret)) return false; + if (IsWholeWave) + addOriginalExecToReturn(B.getMF(), Ret); + // TODO: Handle CalleeSavedRegsViaCopy. B.insertInstr(Ret); @@ -632,6 +637,17 @@ bool AMDGPUCallLowering::lowerFormalArguments( if (DL.getTypeStoreSize(Arg.getType()) == 0) continue; + if (Info->isWholeWaveFunction() && Idx == 0) { + assert(VRegs[Idx].size() == 1 && "Expected only one register"); + + // The first argument for whole wave functions is the original EXEC value. + B.buildInstr(AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP) + .addDef(VRegs[Idx][0]); + + ++Idx; + continue; + } + const bool InReg = Arg.hasAttribute(Attribute::InReg); if (Arg.hasAttribute(Attribute::SwiftSelf) || @@ -1347,6 +1363,7 @@ bool AMDGPUCallLowering::lowerTailCall( SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs; if (Info.CallConv != CallingConv::AMDGPU_Gfx && + Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave && !AMDGPU::isChainCC(Info.CallConv)) { // With a fixed ABI, allocate fixed registers before user arguments. if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) @@ -1524,7 +1541,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // after the ordinary user argument registers. SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs; - if (Info.CallConv != CallingConv::AMDGPU_Gfx) { + if (Info.CallConv != CallingConv::AMDGPU_Gfx && + Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave) { // With a fixed ABI, allocate fixed registers before user arguments. 
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) return false; @@ -1592,3 +1610,11 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return true; } + +void AMDGPUCallLowering::addOriginalExecToReturn( + MachineFunction &MF, MachineInstrBuilder &Ret) const { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const MachineInstr *Setup = TII->getWholeWaveFunctionSetup(MF); + Ret.addReg(Setup->getOperand(0).getReg()); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h index a6e801f..e0033d5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -37,6 +37,9 @@ class AMDGPUCallLowering final : public CallLowering { bool lowerReturnVal(MachineIRBuilder &B, const Value *Val, ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const; + void addOriginalExecToReturn(MachineFunction &MF, + MachineInstrBuilder &Ret) const; + public: AMDGPUCallLowering(const AMDGPUTargetLowering &TLI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 7b5d4077e..891d362 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -137,6 +137,9 @@ def gi_global_offset : def gi_global_saddr : GIComplexOperandMatcher<s64, "selectGlobalSAddr">, GIComplexPatternEquiv<GlobalSAddr>; +def gi_global_saddr_glc : + GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">, + GIComplexPatternEquiv<GlobalSAddrGLC>; def gi_mubuf_scratch_offset : GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">, @@ -312,6 +315,10 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>; +def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>; +// G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return, +// so we don't mark it as equivalent. + class GISelSop2Pat < SDPatternOperator node, Instruction inst, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 25672a5..3412bb5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1872,6 +1872,23 @@ static SDValue matchZExtFromI32(SDValue Op) { return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue(); } +// If this matches *_extend i32:x, return x +// Otherwise if the value is I32 returns x. +static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned, + const SelectionDAG *DAG) { + if (Op.getValueType() == MVT::i32) + return Op; + + if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) && + Op.getOpcode() != ISD::ANY_EXTEND && + !(DAG->SignBitIsZero(Op) && + Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND))) + return SDValue(); + + SDValue ExtSrc = Op.getOperand(0); + return (ExtSrc.getValueType() == MVT::i32) ? 
ExtSrc : SDValue(); +} + // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, @@ -1968,6 +1985,29 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, return true; } +bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, + SDValue &SAddr, SDValue &VOffset, + SDValue &Offset, + SDValue &CPol) const { + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset)) + return false; + + CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, + SDValue &SAddr, SDValue &VOffset, + SDValue &Offset, + SDValue &CPol) const { + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset)) + return false; + + unsigned CPolVal = AMDGPU::CPol::GLC; + CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32); + return true; +} + static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) { if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) { SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); @@ -2136,17 +2176,59 @@ bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset, return true; } +// Given \p Offset and load node \p N check if an \p Offset is a multiple of +// the load byte size. If it is update \p Offset to a pre-scaled value and +// return true. +bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset, + bool IsSigned) const { + bool ScaleOffset = false; + if (!Subtarget->hasScaleOffset() || !Offset) + return false; + + unsigned Size = + (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8; + + SDValue Off = Offset; + if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG)) + Off = Ext; + + if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) { + if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1))) + ScaleOffset = C->getZExtValue() == Log2_32(Size); + } else if (Offset.getOpcode() == ISD::MUL || + (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) || + Offset.getOpcode() == AMDGPUISD::MUL_U24 || + (Offset.isMachineOpcode() && + Offset.getMachineOpcode() == + (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO + : AMDGPU::S_MUL_U64_U32_PSEUDO))) { + if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1))) + ScaleOffset = C->getZExtValue() == Size; + } + + if (ScaleOffset) + Offset = Off.getOperand(0); + + return ScaleOffset; +} + // Match an immediate (if Offset is not null) or an SGPR (if SOffset is // not null) offset. If Imm32Only is true, match only 32-bit immediate // offsets available on CI. -bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, +bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode, SDValue *SOffset, SDValue *Offset, bool Imm32Only, bool IsBuffer, - bool HasSOffset, - int64_t ImmOffset) const { + bool HasSOffset, int64_t ImmOffset, + bool *ScaleOffset) const { assert((!SOffset || !Offset) && "Cannot match both soffset and offset at the same time!"); + if (ScaleOffset) { + assert(N && SOffset); + + *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */); + } + ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); if (!C) { if (!SOffset) @@ -2231,24 +2313,25 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { // Match a base and an immediate (if Offset is not null) or an SGPR (if // SOffset is not null) or an immediate+SGPR offset. If Imm32Only is // true, match only 32-bit immediate offsets available on CI. 
-bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, - SDValue *SOffset, SDValue *Offset, - bool Imm32Only, bool IsBuffer, - bool HasSOffset, - int64_t ImmOffset) const { +bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr, + SDValue &SBase, SDValue *SOffset, + SDValue *Offset, bool Imm32Only, + bool IsBuffer, bool HasSOffset, + int64_t ImmOffset, + bool *ScaleOffset) const { if (SOffset && Offset) { assert(!Imm32Only && !IsBuffer); SDValue B; - if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true)) + if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true)) return false; int64_t ImmOff = 0; if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset)) ImmOff = C->getSExtValue(); - return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true, - ImmOff); + return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false, + true, ImmOff, ScaleOffset); } // A 32-bit (address + offset) should not cause unsigned 32-bit integer @@ -2268,23 +2351,25 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, if (!N0 || !N1) return false; - if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, - ImmOffset)) { + if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, + ImmOffset, ScaleOffset)) { SBase = N0; return true; } - if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, - ImmOffset)) { + if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, + ImmOffset, ScaleOffset)) { SBase = N1; return true; } return false; } -bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, +bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase, SDValue *SOffset, SDValue *Offset, - bool Imm32Only) const { - if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) { + bool Imm32Only, bool *ScaleOffset) const { + if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only, + /* IsBuffer */ false, /* HasSOffset */ false, + /* ImmOffset */ 0, ScaleOffset)) { SBase = Expand32BitAddress(SBase); return true; } @@ -2300,36 +2385,51 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const { - return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset); + return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr, + &Offset); } bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const { assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); - return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset, - /* Imm32Only */ true); + return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr, + &Offset, /* Imm32Only */ true); } -bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase, - SDValue &SOffset) const { - return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr); +bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase, + SDValue &SOffset, SDValue &CPol) const { + bool ScaleOffset; + if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr, + /* Imm32Only */ false, &ScaleOffset)) + return false; + + CPol = CurDAG->getTargetConstant(ScaleOffset ? 
AMDGPU::CPol::SCAL : 0, + SDLoc(N), MVT::i32); + return true; } -bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, - SDValue &SOffset, - SDValue &Offset) const { - return SelectSMRD(Addr, SBase, &SOffset, &Offset); +bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr, + SDValue &SBase, SDValue &SOffset, + SDValue &Offset, + SDValue &CPol) const { + bool ScaleOffset; + if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset)) + return false; + + CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0, + SDLoc(N), MVT::i32); + return true; } bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const { - return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset, + return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset, /* Imm32Only */ false, /* IsBuffer */ true); } bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const { assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); - return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset, + return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset, /* Imm32Only */ true, /* IsBuffer */ true); } @@ -2338,9 +2438,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset, // Match the (soffset + offset) pair as a 32-bit register base and // an immediate offset. return N.getValueType() == MVT::i32 && - SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr, - &Offset, /* Imm32Only */ false, - /* IsBuffer */ true); + SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset, + /* SOffset*/ nullptr, &Offset, + /* Imm32Only */ false, /* IsBuffer */ true); } bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 9967f46..f7c7b3e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -163,6 +163,12 @@ private: SDValue &Offset) const; bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset) const; + bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &Offset, + SDValue &CPol) const; + bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &Offset, + SDValue &CPol) const; bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &Offset) const; bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, @@ -170,22 +176,28 @@ private: bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &SAddr, SDValue &Offset) const; - bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset, + bool SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode, SDValue *SOffset, SDValue *Offset, bool Imm32Only = false, bool IsBuffer = false, bool HasSOffset = false, - int64_t ImmOffset = 0) const; + int64_t ImmOffset = 0, + bool *ScaleOffset = nullptr) const; SDValue Expand32BitAddress(SDValue Addr) const; - bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset, - SDValue *Offset, bool Imm32Only = false, - bool IsBuffer = false, bool HasSOffset = false, - int64_t ImmOffset = 0) const; - bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset, - SDValue *Offset, bool Imm32Only = false) const; + bool SelectSMRDBaseOffset(SDNode *N, SDValue Addr, SDValue &SBase, + SDValue *SOffset, SDValue *Offset, + bool Imm32Only = false, bool 
IsBuffer = false, + bool HasSOffset = false, int64_t ImmOffset = 0, + bool *ScaleOffset = nullptr) const; + bool SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase, SDValue *SOffset, + SDValue *Offset, bool Imm32Only = false, + bool *ScaleOffset = nullptr) const; bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const; - bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &SOffset) const; - bool SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, SDValue &SOffset, - SDValue &Offset) const; + bool SelectScaleOffset(SDNode *N, SDValue &Offset, bool IsSigned) const; + bool SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase, SDValue &SOffset, + SDValue &CPol) const; + bool SelectSMRDSgprImm(SDNode *N, SDValue Addr, SDValue &SBase, + SDValue &SOffset, SDValue &Offset, + SDValue &CPol) const; bool SelectSMRDBufferImm(SDValue N, SDValue &Offset) const; bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const; bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3d040fb..e3ca09e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -375,7 +375,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand); setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand); setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand); - setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand); setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand); setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand); setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand); @@ -1143,6 +1142,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::Cold: return CC_AMDGPU_Func; case CallingConv::AMDGPU_Gfx: + case CallingConv::AMDGPU_Gfx_WholeWave: return CC_SI_Gfx; case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: @@ -1168,6 +1168,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, case CallingConv::AMDGPU_LS: return RetCC_SI_Shader; case CallingConv::AMDGPU_Gfx: + case CallingConv::AMDGPU_Gfx_WholeWave: return RetCC_SI_Gfx; case CallingConv::C: case CallingConv::Fast: @@ -5875,6 +5876,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32) + NODE_NAME_CASE(WHOLE_WAVE_SETUP) + NODE_NAME_CASE(WHOLE_WAVE_RETURN) } return nullptr; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 4e8c6c7..39bb0ad 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -608,6 +608,12 @@ enum NodeType : unsigned { BUFFER_ATOMIC_FMAX, BUFFER_ATOMIC_COND_SUB_U32, LAST_MEMORY_OPCODE = BUFFER_ATOMIC_COND_SUB_U32, + + // Set up a whole wave function. + WHOLE_WAVE_SETUP, + + // Return from a whole wave function. 
+ WHOLE_WAVE_RETURN, }; } // End namespace AMDGPUISD diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index e2c2e89..f2207ff 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1694,6 +1694,47 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { NewII->takeName(&II); return IC.replaceInstUsesWith(II, NewII); } + case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: { + Value *Src0 = II.getArgOperand(1); + Value *Src1 = II.getArgOperand(3); + unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); + uint64_t FmtB = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue(); + auto *Src0Ty = cast<FixedVectorType>(Src0->getType()); + auto *Src1Ty = cast<FixedVectorType>(Src1->getType()); + + bool MadeChange = false; + unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtA); + unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtB); + + // Depending on the used format, fewer registers are required so shrink the + // vector type. + if (Src0Ty->getNumElements() > Src0NumElts) { + Src0 = IC.Builder.CreateExtractVector( + FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0, + IC.Builder.getInt64(0)); + MadeChange = true; + } + + if (Src1Ty->getNumElements() > Src1NumElts) { + Src1 = IC.Builder.CreateExtractVector( + FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1, + IC.Builder.getInt64(0)); + MadeChange = true; + } + + if (!MadeChange) + return std::nullopt; + + SmallVector<Value *, 13> Args(II.args()); + Args[1] = Src0; + Args[3] = Src1; + + CallInst *NewII = IC.Builder.CreateIntrinsic( + IID, {II.getArgOperand(5)->getType(), Src0->getType(), Src1->getType()}, + Args, &II); + NewII->takeName(&II); + return IC.replaceInstUsesWith(II, NewII); + } } if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index ce58e93..e305f08 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -348,6 +348,17 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2", def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; +// Marks the entry into a whole wave function. +def AMDGPUwhole_wave_setup : SDNode< + "AMDGPUISD::WHOLE_WAVE_SETUP", SDTypeProfile<1, 0, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPSideEffect]>; + +// Marks the return from a whole wave function. +def AMDGPUwhole_wave_return : SDNode< + "AMDGPUISD::WHOLE_WAVE_RETURN", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] +>; + // SI+ export def AMDGPUExportOp : SDTypeProfile<0, 8, [ SDTCisInt<0>, // i8 tgt diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 1a63c48..d2e718c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3494,25 +3494,74 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { } /// Match a zero extend from a 32-bit value to 64-bits. 
-static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { +Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const { Register ZExtSrc; - if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc)))) - return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register(); + if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc)))) + return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register(); // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) - const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); + const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) return Register(); assert(Def->getNumOperands() == 3 && - MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); - if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) { + MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); + if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) { return Def->getOperand(1).getReg(); } return Register(); } +/// Match a sign extend from a 32-bit value to 64-bits. +Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const { + Register SExtSrc; + if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc)))) + return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register(); + + // Match legalized form %sext = G_MERGE_VALUES (s32 %x), G_ASHR((S32 %x, 31)) + const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); + if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) + return Register(); + + assert(Def->getNumOperands() == 3 && + MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); + if (mi_match(Def->getOperand(2).getReg(), *MRI, + m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()), + m_SpecificICst(31)))) + return Def->getOperand(1).getReg(); + + if (VT->signBitIsZero(Reg)) + return matchZeroExtendFromS32(Reg); + + return Register(); +} + +/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it +/// is 32-bit. +Register +AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const { + return MRI->getType(Reg) == LLT::scalar(32) ? Reg + : matchZeroExtendFromS32(Reg); +} + +/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it +/// is 32-bit. +Register +AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const { + return MRI->getType(Reg) == LLT::scalar(32) ? 
Reg + : matchSignExtendFromS32(Reg); +} + +Register +AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg, + bool IsSigned) const { + if (IsSigned) + return matchSignExtendFromS32OrS32(Reg); + + return matchZeroExtendFromS32OrS32(Reg); +} + Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const { Register AnyExtSrc; if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc)))) @@ -3581,7 +3630,7 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); if (isSGPR(SAddr)) { Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); - if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { + if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) { Addr = SAddr; VOffset = Off; } @@ -4160,6 +4209,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { return true; case AMDGPU::G_AMDGPU_WAVE_ADDRESS: return selectWaveAddress(I); + case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: { + I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN)); + return true; + } case AMDGPU::G_STACKRESTORE: return selectStackRestore(I); case AMDGPU::G_PHI: @@ -5219,7 +5272,7 @@ AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const { getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); unsigned Key = 0; - Register S32 = matchZeroExtendFromS32(*MRI, Src); + Register S32 = matchZeroExtendFromS32(Src); if (!S32) S32 = matchAnyExtendFromS32(Src); @@ -5292,10 +5345,68 @@ AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const { }}; } +// Given \p Offset and load specified by the \p Root operand check if \p Offset +// is a multiple of the load byte size. If it is update \p Offset to a +// pre-scaled value and return true. +bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root, + Register &Offset, + bool IsSigned) const { + if (!Subtarget->hasScaleOffset()) + return false; + + const MachineInstr &MI = *Root.getParent(); + MachineMemOperand *MMO = *MI.memoperands_begin(); + + if (!MMO->getSize().hasValue()) + return false; + + uint64_t Size = MMO->getSize().getValue(); + + Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned); + if (!OffsetReg) + OffsetReg = Offset; + + if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI)) + OffsetReg = Def->Reg; + + Register Op0; + MachineInstr *Mul; + bool ScaleOffset = + (isPowerOf2_64(Size) && + mi_match(OffsetReg, *MRI, + m_GShl(m_Reg(Op0), + m_any_of(m_SpecificICst(Log2_64(Size)), + m_Copy(m_SpecificICst(Log2_64(Size))))))) || + mi_match(OffsetReg, *MRI, + m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size), + m_Copy(m_SpecificICst(Size))))) || + mi_match( + OffsetReg, *MRI, + m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64, + m_Reg(Op0), m_SpecificICst(Size))) || + // Match G_AMDGPU_MAD_U64_U32 offset, c, 0 + (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) && + (Mul->getOpcode() == (IsSigned ? 
AMDGPU::G_AMDGPU_MAD_I64_I32 + : AMDGPU::G_AMDGPU_MAD_U64_U32) || + (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 && + VT->signBitIsZero(Mul->getOperand(2).getReg()))) && + mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) && + mi_match(Mul->getOperand(3).getReg(), *MRI, + m_GTrunc(m_any_of(m_SpecificICst(Size), + m_Copy(m_SpecificICst(Size))))) && + mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0))); + + if (ScaleOffset) + Offset = Op0; + + return ScaleOffset; +} + bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset, - int64_t *Offset) const { + int64_t *Offset, + bool *ScaleOffset) const { MachineInstr *MI = Root.getParent(); MachineBasicBlock *MBB = MI->getParent(); @@ -5310,6 +5421,9 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, const GEPInfo &GEPI = AddrInfo[0]; std::optional<int64_t> EncodedImm; + if (ScaleOffset) + *ScaleOffset = false; + if (SOffset && Offset) { EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false, /*HasSOffset=*/true); @@ -5317,8 +5431,12 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, AddrInfo.size() > 1) { const GEPInfo &GEPI2 = AddrInfo[1]; if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) { - if (Register OffsetReg = - matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) { + Register OffsetReg = GEPI2.SgprParts[1]; + if (ScaleOffset) + *ScaleOffset = + selectScaleOffset(Root, OffsetReg, false /* IsSigned */); + OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg); + if (OffsetReg) { Base = GEPI2.SgprParts[0]; *SOffset = OffsetReg; *Offset = *EncodedImm; @@ -5363,7 +5481,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, } if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) { - if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) { + Register OffsetReg = GEPI.SgprParts[1]; + if (ScaleOffset) + *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */); + OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg); + if (OffsetReg) { Base = GEPI.SgprParts[0]; *SOffset = OffsetReg; return true; @@ -5377,7 +5499,8 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { Register Base; int64_t Offset; - if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset)) + if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset, + /* ScaleOffset */ nullptr)) return std::nullopt; return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, @@ -5408,23 +5531,30 @@ AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { Register Base, SOffset; - if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr)) + bool ScaleOffset; + if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr, + &ScaleOffset)) return std::nullopt; + unsigned CPol = ScaleOffset ? 
AMDGPU::CPol::SCAL : 0; return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, - [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}}; + [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}}; } InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const { Register Base, SOffset; int64_t Offset; - if (!selectSmrdOffset(Root, Base, &SOffset, &Offset)) + bool ScaleOffset; + if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset)) return std::nullopt; + unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0; return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}}; + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}}; } std::pair<Register, int> @@ -5485,7 +5615,8 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const { // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { +AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, + unsigned CPolBits) const { Register Addr = Root.getReg(); Register PtrBase; int64_t ConstOffset; @@ -5529,6 +5660,7 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { MIB.addReg(HighBits); }, // voffset [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); }, }}; } } @@ -5559,7 +5691,7 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { // It's possible voffset is an SGPR here, but the copy to VGPR will be // inserted later. 
- if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { + if (Register VOffset = matchZeroExtendFromS32(PtrBaseOffset)) { return {{[=](MachineInstrBuilder &MIB) { // saddr MIB.addReg(SAddr); }, @@ -5568,6 +5700,9 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { }, [=](MachineInstrBuilder &MIB) { // offset MIB.addImm(ImmOffset); + }, + [=](MachineInstrBuilder &MIB) { // cpol + MIB.addImm(CPolBits); }}}; } } @@ -5591,11 +5726,22 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset - [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol }}; } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { + return selectGlobalSAddr(Root, 0); +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const { + return selectGlobalSAddr(Root, AMDGPU::CPol::GLC); +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { Register Addr = Root.getReg(); Register PtrBase; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 2cb7904..e58fbb4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -232,8 +232,10 @@ private: InstructionSelector::ComplexRendererFns selectVINTERPModsHi(MachineOperand &Root) const; + bool selectScaleOffset(MachineOperand &Root, Register &Offset, + bool IsSigned) const; bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset, - int64_t *Offset) const; + int64_t *Offset, bool *ScaleOffset) const; InstructionSelector::ComplexRendererFns selectSmrdImm(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns @@ -254,7 +256,11 @@ private: selectScratchOffset(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits) const; + InstructionSelector::ComplexRendererFns selectGlobalSAddr(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectGlobalSAddrGLC(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectScratchSAddr(MachineOperand &Root) const; @@ -417,6 +423,19 @@ private: // shift amount operand's `ShAmtBits` bits is unneeded. bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const; + /// Match a zero extend from a 32-bit value to 64-bits. + Register matchZeroExtendFromS32(Register Reg) const; + /// Match a sign extend from a 32-bit value to 64-bits. + Register matchSignExtendFromS32(Register Reg) const; + /// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it + /// is 32-bit. + Register matchZeroExtendFromS32OrS32(Register Reg) const; + /// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it + /// is 32-bit. + Register matchSignExtendFromS32OrS32(Register Reg) const; + /// Match either sign or zero extend depending on the \p IsSigned from a + /// 32-bit value to 64-bits, or \p Reg itself if it is 32-bit. 
+ Register matchExtendFromS32OrS32(Register Reg, bool IsSigned) const; /// Match an any extend from a 32-bit value to 64-bit. Register matchAnyExtendFromS32(Register Reg) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index fa8af68..304e91e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -1583,15 +1583,13 @@ void SplitPtrStructs::killAndReplaceSplitInstructions( if (!SplitUsers.contains(I)) continue; - SmallVector<DbgValueInst *> Dbgs; - findDbgValues(Dbgs, I); - for (auto *Dbg : Dbgs) { - IRB.SetInsertPoint(Dbg); + SmallVector<DbgVariableRecord *> Dbgs; + findDbgValues(I, Dbgs); + for (DbgVariableRecord *Dbg : Dbgs) { auto &DL = I->getDataLayout(); assert(isSplitFatPtr(I->getType()) && "We should've RAUW'd away loads, stores, etc. at this point"); - auto *OffDbg = cast<DbgValueInst>(Dbg->clone()); - copyMetadata(OffDbg, Dbg); + DbgVariableRecord *OffDbg = Dbg->clone(); auto [Rsrc, Off] = getPtrParts(I); int64_t RsrcSz = DL.getTypeSizeInBits(Rsrc->getType()); @@ -1606,9 +1604,9 @@ void SplitPtrStructs::killAndReplaceSplitInstructions( if (OffExpr) { OffDbg->setExpression(*OffExpr); OffDbg->replaceVariableLocationOp(I, Off); - IRB.Insert(OffDbg); + OffDbg->insertBefore(Dbg); } else { - OffDbg->deleteValue(); + OffDbg->eraseFromParent(); } if (RsrcExpr) { Dbg->setExpression(*RsrcExpr); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index bf2f37b..f1caf24 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4714,6 +4714,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8: case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8: case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: case Intrinsic::amdgcn_wmma_f32_32x16x128_f4: case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: @@ -5540,6 +5541,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_PREFETCH: OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); break; + case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP: + case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + break; } return getInstructionMapping(/*ID*/1, /*Cost*/1, diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 43d4e8db..421fc42 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -176,6 +176,8 @@ public: ImmTyWaitVAVDst, ImmTyWaitVMVSrc, ImmTyBitOp3, + ImmTyMatrixAFMT, + ImmTyMatrixBFMT, ImmTyMatrixAReuse, ImmTyMatrixBReuse, ImmTyByteSel, @@ -423,6 +425,8 @@ public: bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); } bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); } bool isIndexKey32bit() const { return isImmTy(ImmTyIndexKey32bit); } + bool isMatrixAFMT() const { return isImmTy(ImmTyMatrixAFMT); } + bool isMatrixBFMT() const { return isImmTy(ImmTyMatrixBFMT); } bool isMatrixAReuse() const { return isImmTy(ImmTyMatrixAReuse); } bool isMatrixBReuse() const { return isImmTy(ImmTyMatrixBReuse); } bool isTFE() const { 
return isImmTy(ImmTyTFE); } @@ -1174,6 +1178,8 @@ public: case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break; case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break; case ImmTyBitOp3: OS << "BitOp3"; break; + case ImmTyMatrixAFMT: OS << "ImmTyMatrixAFMT"; break; + case ImmTyMatrixBFMT: OS << "ImmTyMatrixBFMT"; break; case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break; case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break; case ImmTyByteSel: OS << "ByteSel" ; break; @@ -1714,6 +1720,10 @@ public: ParseStatus parseIndexKey8bit(OperandVector &Operands); ParseStatus parseIndexKey16bit(OperandVector &Operands); ParseStatus parseIndexKey32bit(OperandVector &Operands); + ParseStatus tryParseMatrixFMT(OperandVector &Operands, StringRef Name, + AMDGPUOperand::ImmTy Type); + ParseStatus parseMatrixAFMT(OperandVector &Operands); + ParseStatus parseMatrixBFMT(OperandVector &Operands); ParseStatus parseDfmtNfmt(int64_t &Format); ParseStatus parseUfmt(int64_t &Format); @@ -1849,6 +1859,7 @@ private: const unsigned CPol); bool validateTFE(const MCInst &Inst, const OperandVector &Operands); std::optional<StringRef> validateLdsDirect(const MCInst &Inst); + bool validateWMMA(const MCInst &Inst, const OperandVector &Operands); unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; @@ -5128,13 +5139,45 @@ bool AMDGPUAsmParser::validateAGPRLdSt(const MCInst &Inst) const { bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const { auto FB = getFeatureBits(); + if (!FB[AMDGPU::FeatureGFX90AInsts] && !FB[AMDGPU::FeatureGFX1250Insts]) + return true; + unsigned Opc = Inst.getOpcode(); + const MCRegisterInfo *MRI = getMRI(); // DS_READ_B96_TR_B6 is the only DS instruction in GFX950, that allows // unaligned VGPR. All others only allow even aligned VGPRs. - if (!(FB[AMDGPU::FeatureGFX90AInsts]) || Opc == AMDGPU::DS_READ_B96_TR_B6_vi) + if (FB[AMDGPU::FeatureGFX90AInsts] && Opc == AMDGPU::DS_READ_B96_TR_B6_vi) return true; - const MCRegisterInfo *MRI = getMRI(); + if (FB[AMDGPU::FeatureGFX1250Insts]) { + switch (Opc) { + default: + break; + case AMDGPU::DS_LOAD_TR6_B96: + case AMDGPU::DS_LOAD_TR6_B96_gfx12: + // DS_LOAD_TR6_B96 is the only DS instruction in GFX1250, that + // allows unaligned VGPR. All others only allow even aligned VGPRs. + return true; + case AMDGPU::GLOBAL_LOAD_TR6_B96: + case AMDGPU::GLOBAL_LOAD_TR6_B96_gfx1250: { + // GLOBAL_LOAD_TR6_B96 is the only GLOBAL instruction in GFX1250, that + // allows unaligned VGPR for vdst, but other operands still only allow + // even aligned VGPRs. 
+ int VAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); + if (VAddrIdx != -1) { + const MCOperand &Op = Inst.getOperand(VAddrIdx); + MCRegister Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0); + if ((Sub - AMDGPU::VGPR0) & 1) + return false; + } + return true; + } + case AMDGPU::GLOBAL_LOAD_TR6_B96_SADDR: + case AMDGPU::GLOBAL_LOAD_TR6_B96_SADDR_gfx1250: + return true; + } + } + const MCRegisterClass &VGPR32 = MRI->getRegClass(AMDGPU::VGPR_32RegClassID); const MCRegisterClass &AGPR32 = MRI->getRegClass(AMDGPU::AGPR_32RegClassID); for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { @@ -5280,6 +5323,28 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, unsigned CPol = Inst.getOperand(CPolPos).getImm(); + if (!isGFX1250()) { + if (CPol & CPol::SCAL) { + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); + StringRef CStr(S.getPointer()); + S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]); + Error(S, "scale_offset is not supported on this GPU"); + } + if (CPol & CPol::NV) { + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); + StringRef CStr(S.getPointer()); + S = SMLoc::getFromPointer(&CStr.data()[CStr.find("nv")]); + Error(S, "nv is not supported on this GPU"); + } + } + + if ((CPol & CPol::SCAL) && !supportsScaleOffset(MII, Inst.getOpcode())) { + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); + StringRef CStr(S.getPointer()); + S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]); + Error(S, "scale_offset is not supported for this instruction"); + } + if (isGFX12Plus()) return validateTHAndScopeBits(Inst, Operands, CPol); @@ -5400,6 +5465,37 @@ bool AMDGPUAsmParser::validateTFE(const MCInst &Inst, return true; } +bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst, + const OperandVector &Operands) { + unsigned Opc = Inst.getOpcode(); + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + const MCInstrDesc &Desc = MII.get(Opc); + + auto validateFmt = [&](AMDGPU::OpName FmtOp, AMDGPU::OpName SrcOp) -> bool { + int FmtIdx = AMDGPU::getNamedOperandIdx(Opc, FmtOp); + if (FmtIdx == -1) + return true; + unsigned Fmt = Inst.getOperand(FmtIdx).getImm(); + int SrcIdx = AMDGPU::getNamedOperandIdx(Opc, SrcOp); + unsigned RegSize = + TRI->getRegClass(Desc.operands()[SrcIdx].RegClass).getSizeInBits(); + + if (RegSize == AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt) * 32) + return true; + + static const char *FmtNames[] = {"MATRIX_FMT_FP8", "MATRIX_FMT_BF8", + "MATRIX_FMT_FP6", "MATRIX_FMT_BF6", + "MATRIX_FMT_FP4"}; + + Error(getRegLoc(mc2PseudoReg(Inst.getOperand(SrcIdx).getReg()), Operands), + "wrong register tuple size for " + Twine(FmtNames[Fmt])); + return false; + }; + + return validateFmt(AMDGPU::OpName::matrix_a_fmt, AMDGPU::OpName::src0) && + validateFmt(AMDGPU::OpName::matrix_b_fmt, AMDGPU::OpName::src1); +} + bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands) { @@ -5533,6 +5629,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, if (!validateTFE(Inst, Operands)) { return false; } + if (!validateWMMA(Inst, Operands)) { + return false; + } return true; } @@ -6916,6 +7015,8 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) { int64_t CPolVal = 0; ParseStatus ResTH = ParseStatus::NoMatch; ParseStatus ResScope = ParseStatus::NoMatch; + ParseStatus ResNV = ParseStatus::NoMatch; + ParseStatus ResScal = ParseStatus::NoMatch; for (;;) { if (ResTH.isNoMatch()) { @@ -6940,10 +7041,36 @@ ParseStatus 
AMDGPUAsmParser::parseCPol(OperandVector &Operands) { } } + // NV bit exists on GFX12+, but does something starting from GFX1250. + // Allow parsing on all GFX12 and fail on validation for better + // diagnostics. + if (ResNV.isNoMatch()) { + if (trySkipId("nv")) { + ResNV = ParseStatus::Success; + CPolVal |= CPol::NV; + continue; + } else if (trySkipId("no", "nv")) { + ResNV = ParseStatus::Success; + continue; + } + } + + if (ResScal.isNoMatch()) { + if (trySkipId("scale_offset")) { + ResScal = ParseStatus::Success; + CPolVal |= CPol::SCAL; + continue; + } else if (trySkipId("no", "scale_offset")) { + ResScal = ParseStatus::Success; + continue; + } + } + break; } - if (ResTH.isNoMatch() && ResScope.isNoMatch()) + if (ResTH.isNoMatch() && ResScope.isNoMatch() && ResNV.isNoMatch() && + ResScal.isNoMatch()) return ParseStatus::NoMatch; Operands.push_back(AMDGPUOperand::CreateImm(this, CPolVal, StringLoc, @@ -7191,6 +7318,26 @@ ParseStatus AMDGPUAsmParser::parseIndexKey32bit(OperandVector &Operands) { return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey32bit); } +ParseStatus AMDGPUAsmParser::tryParseMatrixFMT(OperandVector &Operands, + StringRef Name, + AMDGPUOperand::ImmTy Type) { + return parseStringOrIntWithPrefix(Operands, Name, + {"MATRIX_FMT_FP8", "MATRIX_FMT_BF8", + "MATRIX_FMT_FP6", "MATRIX_FMT_BF6", + "MATRIX_FMT_FP4"}, + Type); +} + +ParseStatus AMDGPUAsmParser::parseMatrixAFMT(OperandVector &Operands) { + return tryParseMatrixFMT(Operands, "matrix_a_fmt", + AMDGPUOperand::ImmTyMatrixAFMT); +} + +ParseStatus AMDGPUAsmParser::parseMatrixBFMT(OperandVector &Operands) { + return tryParseMatrixFMT(Operands, "matrix_b_fmt", + AMDGPUOperand::ImmTyMatrixBFMT); +} + // dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their // values to live in a joint format operand in the MCInst encoding. 
ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { @@ -9292,6 +9439,20 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, DefaultVal); } + int MatrixAFMTIdx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_a_fmt); + if (MatrixAFMTIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixAFMT, 0); + } + + int MatrixBFMTIdx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_b_fmt); + if (MatrixBFMTIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixBFMT, 0); + } + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_a_reuse)) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyMatrixAReuse, 0); diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 0caabe4..f99e716 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1488,7 +1488,6 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3f32, "BUFFER_STORE_FORMAT_XYZ">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3i32, "BUFFER_STORE_FORMAT_XYZ">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">; @@ -2451,6 +2450,7 @@ class VBUFFER_Real <bits<8> op, BUF_Pseudo ps, string real_name> : let Inst{62} = ps.offen; let Inst{63} = ps.idxen; + let Inst{7} = cpol{5}; // nv let Inst{54-53} = cpol{2-1}; // th{2-1} let Inst{52} = !if(ps.IsAtomicRet, 1, cpol{0}); // th{0} let Inst{51-50} = cpol{4-3}; // scope diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index e219fe0..319cc9d 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -886,7 +886,6 @@ defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">; defm : DSReadPat_mc <DS_READ_U8, i32, "extloadi8_local">; defm : DSReadPat_mc <DS_READ_U8, i32, "zextloadi8_local">; defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">; -defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">; defm : DSReadPat_mc <DS_READ_U16, i32, "extloadi16_local">; defm : DSReadPat_mc <DS_READ_U16, i32, "zextloadi16_local">; defm : DSReadPat_t16 <DS_READ_I8, i16, "sextloadi8_local">; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 98f7e17..5c1989b 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -877,6 +877,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsMAI) convertMAIInst(MI); + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsWMMA) + convertWMMAInst(MI); + int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); if (VDstIn_Idx != -1) { @@ -974,10 +977,23 @@ static void adjustMFMA_F8F6F4OpRegClass(const MCRegisterInfo &MRI, return MO.setReg( MRI.getSubReg(MO.getReg(), 
AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5)); case 8: + if (MCRegister NewReg = MRI.getSubReg( + MO.getReg(), AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7)) { + MO.setReg(NewReg); + } + return; + case 12: { + // There is no 384-bit subreg index defined. + MCRegister BaseReg = MRI.getSubReg(MO.getReg(), AMDGPU::sub0); + MCRegister NewReg = MRI.getMatchingSuperReg( + BaseReg, AMDGPU::sub0, &MRI.getRegClass(AMDGPU::VReg_384RegClassID)); + return MO.setReg(NewReg); + } + case 16: // No-op in cases where one operand is still f8/bf8. return; default: - llvm_unreachable("Unexpected size for mfma f8f6f4 operand"); + llvm_unreachable("Unexpected size for mfma/wmma f8f6f4 operand"); } } @@ -1015,6 +1031,35 @@ void AMDGPUDisassembler::convertMAIInst(MCInst &MI) const { AdjustedRegClassOpcode->NumRegsSrcB); } +void AMDGPUDisassembler::convertWMMAInst(MCInst &MI) const { + int FmtAIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::matrix_a_fmt); + if (FmtAIdx == -1) + return; + + int FmtBIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::matrix_b_fmt); + + unsigned FmtA = MI.getOperand(FmtAIdx).getImm(); + unsigned FmtB = MI.getOperand(FmtBIdx).getImm(); + + const AMDGPU::MFMA_F8F6F4_Info *AdjustedRegClassOpcode = + AMDGPU::getWMMA_F8F6F4_WithFormatArgs(FmtA, FmtB, MI.getOpcode()); + if (!AdjustedRegClassOpcode || + AdjustedRegClassOpcode->Opcode == MI.getOpcode()) + return; + + MI.setOpcode(AdjustedRegClassOpcode->Opcode); + int Src0Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + int Src1Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1); + adjustMFMA_F8F6F4OpRegClass(MRI, MI.getOperand(Src0Idx), + AdjustedRegClassOpcode->NumRegsSrcA); + adjustMFMA_F8F6F4OpRegClass(MRI, MI.getOperand(Src1Idx), + AdjustedRegClassOpcode->NumRegsSrcB); +} + struct VOPModifiers { unsigned OpSel = 0; unsigned OpSelHi = 0; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 8404100..f4d164b 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -161,6 +161,7 @@ public: void convertFMAanyK(MCInst &MI) const; void convertSDWAInst(MCInst &MI) const; void convertMAIInst(MCInst &MI) const; + void convertWMMAInst(MCInst &MI) const; void convertDPP8Inst(MCInst &MI) const; void convertMIMGInst(MCInst &MI) const; void convertVOP3DPPInst(MCInst &MI) const; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index c8a4e22..1cc717b 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -11,7 +11,8 @@ let WantsRoot = true in { def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [], -10>; def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>; - def GlobalSAddr : ComplexPattern<iPTR, 3, "SelectGlobalSAddr", [], [], -10>; + def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>; + def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>; def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>; def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [], -10>; } @@ -182,7 +183,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : bits<7> saddr; bits<8> vdst; - bits<6> cpol; + bits<12> cpol; bits<8> vdata; // vsrc bits<8> vaddr; bits<24> offset; @@ 
-192,6 +193,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : let Inst{31-26} = 0x3b; let Inst{39-32} = !if(ps.has_vdst, vdst, ?); let Inst{49} = ps.sve; + let Inst{7} = cpol{5}; // nv let Inst{54-53} = cpol{2-1}; // th{2-1} let Inst{52} = !if(ps.IsAtomicRet, 1, cpol{0}); // th{0} let Inst{51-50} = cpol{4-3}; // scope @@ -1252,13 +1254,13 @@ class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueTyp >; class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)), - (inst $saddr, $voffset, $offset, (i32 0), $in) + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)), + (inst $saddr, $voffset, $offset, $cpol, $in) >; class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), - (inst $saddr, $voffset, $offset, (i32 0)) + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (inst $saddr, $voffset, $offset, $cpol) >; class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < @@ -1272,26 +1274,26 @@ class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> >; class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), - (inst $saddr, $voffset, $offset, 0) + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (inst $saddr, $voffset, $offset, $cpol) >; class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset)), - (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset) + (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol)), + (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset, $cpol) >; -class GlobalAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, - ValueType vt, ValueType data_vt = vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), data_vt:$data)), - (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset) +class FlatAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ComplexPattern pat, + ValueType vt, ValueType data_vt = vt> : GCNPat < + (vt (node (pat (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), data_vt:$data)), + (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset, $cpol) >; class GlobalAtomicNoRtnSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$data), - (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset) + (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$data), + (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset, $cpol) >; class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < @@ -1320,6 +1322,12 @@ multiclass FlatAtomicNoRtnPatBase <string inst, string node, ValueType vt, let AddedComplexity = 1 in def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, 
i32:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; + + def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node), + GlobalSAddr, vt, data_vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } } multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addrSpaceSuffix, @@ -1338,6 +1346,11 @@ multiclass FlatAtomicRtnPatBase <string inst, string node, ValueType vt, def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; + + def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, GlobalSAddrGLC, vt, data_vt> { + let AddedComplexity = 8; + let SubtargetPredicate = HasFlatGVSMode; + } } multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix, @@ -1507,7 +1520,8 @@ multiclass GlobalFLATAtomicPatsNoRtnBase<string inst, string node, ValueType vt, def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), !cast<SDPatternOperator>(node), vt, data_vt>; let AddedComplexity = 13 in - def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node), vt, data_vt>; + def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node), + GlobalSAddr, vt, data_vt>; } multiclass GlobalFLATAtomicPatsRtnBase<string inst, string node, ValueType vt, @@ -1518,7 +1532,7 @@ multiclass GlobalFLATAtomicPatsRtnBase<string inst, string node, ValueType vt, def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst#"_RTN"), rtnNode, vt, data_vt>; let AddedComplexity = 12 in - def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>; + def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, GlobalSAddrGLC, vt, data_vt>; } multiclass GlobalFLATAtomicPatsNoRtn<string inst, string node, ValueType vt, @@ -1797,12 +1811,13 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>; defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>; defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>; -let SubtargetPredicate = isGFX12Plus in { - defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >; +} // End OtherPredicates = [HasFlatAddressSpace] - let OtherPredicates = [HasAtomicCSubNoRtnInsts] in - defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; -} +let OtherPredicates = [isGFX12Plus] in +defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; + +let OtherPredicates = [isGFX12Plus, HasAtomicCSubNoRtnInsts] in +defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; let OtherPredicates = [HasD16LoadStore] in { defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; @@ -1826,8 +1841,6 @@ defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>; defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>; } -} // End OtherPredicates = [HasFlatAddressSpace] - let OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i32>; @@ -2928,6 +2941,7 @@ multiclass VFLAT_Real_gfx12 <bits<8> op, string name = get_FLAT_ps<NAME>.Mnemoni 
let DecoderNamespace = "GFX12"; let Inst{25-24} = {ps.is_flat_global, ps.is_flat_scratch}; + let Inst{48} = cpol{CPolBit.SCAL}; // scale offset } } @@ -3157,6 +3171,7 @@ multiclass VFLAT_Real_gfx1250<bits<8> op, let DecoderNamespace = "GFX1250"; let Inst{25-24} = {ps.is_flat_global, ps.is_flat_scratch}; + let Inst{48} = cpol{CPolBit.SCAL}; // scale offset } } diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index bbed828..94886b0 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -520,8 +520,8 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineInstr *MI, IsExpiredFn IsExpired) { DenseSet<const MachineBasicBlock *> Visited; return getWaitStatesSince(IsHazard, MI->getParent(), - std::next(MI->getReverseIterator()), - 0, IsExpired, Visited); + std::next(MI->getReverseIterator()), 0, IsExpired, + Visited, SIInstrInfo::getNumWaitStates); } int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { @@ -1190,7 +1190,8 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixVALUPartialForwardingHazard(MI); fixVALUTransUseHazard(MI); fixVALUTransCoexecutionHazards(MI); - fixWMMAHazards(MI); + fixWMMAHazards(MI); // fall-through if co-execution is enabled. + fixWMMACoexecutionHazards(MI); fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); fixRequiredExportPriority(MI); @@ -1909,6 +1910,182 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { return true; } +static bool isCoexecutableVALUInst(const MachineInstr &MI) { + return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isTRANS(MI) && + !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI); // What else? +} + +static bool IsWMMAHazardInstInCategory(const MachineInstr &MI, + const SIInstrInfo *TII, unsigned Latency, + unsigned Category) { + assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) && + "Handle me if the xdl wmma instruction latency changes"); + + switch (Category) { + case 0: // Dense WMMA Instructions: + // WMMA_*F16, WMMA_*BF16 + // WMMA_*FP8FP8 + // WMMA_*FP8BF8 + // WMMA_*BF8FP8 + // WMMA_*BF8BF8 + // WMMA_*F8F6F4 if SRCA & SRCB != F8 + return Latency == 8 && SIInstrInfo::isWMMA(MI); + + case 1: // Dense WMMA Instructions: + // WMMA_IU8 + // WMMA_IU4 + // WMMA_*F8F6F4 if SRCA OR SRCB == F8 + return Latency == 16 && SIInstrInfo::isWMMA(MI); + + case 2: // Dense SWMMAC Instructions + // SWMMAC_*F16, SWMMAC_*BF16, + // SWMMAC_*FP8FP8 + // SWMMAC_*BF8FP8 + // SWMMAC_*FP8BF8 + // SWMMAC_*BF8BF8 + return Latency == 8 && SIInstrInfo::isSWMMAC(MI); + + case 3: // Sparse WMMA Instructions: + // SWMMAC_IU8 + // SWMMAC_IU4 + return Latency == 16 && SIInstrInfo::isSWMMAC(MI); + default: + break; + } // end switch. + + return false; +} + +bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) { + if (!AMDGPU::isGFX1250(ST)) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI)) + return false; + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + // WaitStates here is the number of V_NOPs or unrelated VALU instructions must + // be in between the first WMMA and the second instruction to cover the hazard + // (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second + // is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data hazards" for + // numbers, which depends on the category of the first WMMA. 
+ const int WMMAWaitStates[] = {5, 9, 3, 5}; + const int VALUWaitStates[] = {4, 8, 2, 4}; + unsigned Category = 0; + + auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) { + if (!TII->isXDLWMMA(I)) + return false; + + unsigned Latency = TSchedModel.computeInstrLatency(&I); + if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category)) + return false; + + Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); + Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg(); + Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg(); + + // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1). + if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1)) + return true; + + if (SIInstrInfo::isSWMMAC(*MI)) { + Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); + if (TRI->regsOverlap(D0, Idx1)) + return true; + } + + return false; + }; + + auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) { + if (!TII->isXDLWMMA(I)) + return false; + + unsigned Latency = TSchedModel.computeInstrLatency(&I); + if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category)) + return false; + + // WMMA writes, VALU reads. + Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); + for (const MachineOperand &ValuUse : MI->explicit_uses()) { + if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg())) + return true; + } + + auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst); + if (!ValuDst || !ValuDst->isReg()) + return false; + Register D1 = ValuDst->getReg(); + + // WMMA writes, VALU writes. + if (TRI->regsOverlap(D0, D1)) + return true; + + // WMMA reads, VALU writes. + Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg(); + Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg(); + if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1)) + return true; + + if (SIInstrInfo::isSWMMAC(I)) { + Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg(); + if (TRI->regsOverlap(D1, Idx0)) + return true; + } + + return false; + }; + + int Limit = 0; + auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) { + return WaitStates >= Limit; + }; + + auto GetWaitStatesFn = [](const MachineInstr &I) { + return SIInstrInfo::isVALU(I) ? 1 : 0; + }; + + int WaitStatesNeeded = -1; + if (TII->isXDLWMMA(*MI)) { + for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) { + Limit = WMMAWaitStates[Category]; // for IsExpiredFn. + DenseSet<const MachineBasicBlock *> Visited; + // '::getWaitStatesSince' returns the number of VALUs in between if a hazard + // exists, and INT_MAX if there is no hazard. As a result, a negative + // WaitStatesNeeded here means no hazard, and we will continue to search + // for other categories. + WaitStatesNeeded = + Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(), + std::next(MI->getReverseIterator()), 0, + IsExpiredFn, Visited, GetWaitStatesFn); + } + } else { // Must be a co-executable VALU. + for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) { + Limit = VALUWaitStates[Category]; // for IsExpiredFn. + DenseSet<const MachineBasicBlock *> Visited; + // '::getWaitStatesSince' returns the number of VALUs in between if a hazard + // exists, and INT_MAX if there is no hazard. As a result, a negative + // WaitStatesNeeded here means no hazard, and we will continue to search + // for other categories.
+ WaitStatesNeeded = + Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(), + std::next(MI->getReverseIterator()), 0, + IsExpiredFn, Visited, GetWaitStatesFn); + } + } + + // WaitStatesNeeded now is the number of V_NOPs we need to insert, negative + // means not needed. + for (int i = 0; i < WaitStatesNeeded; i++) + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_NOP_e32)); + + return true; +} + bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { if (!ST.hasShift64HighRegBug()) return false; @@ -3206,7 +3383,7 @@ bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) { // Check entry priority at each export (as there will only be a few). // Note: amdgpu_gfx can only be a callee, so defer to caller setprio. bool Changed = false; - if (CC != CallingConv::AMDGPU_Gfx) + if (CC != CallingConv::AMDGPU_Gfx && CC != CallingConv::AMDGPU_Gfx_WholeWave) Changed = ensureEntrySetPrio(MF, NormalPriority, TII); auto NextMI = std::next(It); diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index ef6ddd8..f796eeae 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -106,6 +106,7 @@ private: bool fixVALUTransUseHazard(MachineInstr *MI); bool fixVALUTransCoexecutionHazards(MachineInstr *MI); bool fixWMMAHazards(MachineInstr *MI); + bool fixWMMACoexecutionHazards(MachineInstr *MI); bool fixShift64HighRegBug(MachineInstr *MI); bool fixVALUMaskWriteHazard(MachineInstr *MI); bool fixRequiredExportPriority(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 7b8f0f4..9a2bab1 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -324,7 +324,7 @@ bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const { } void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const { + const SchedRegion &Region) const { // Track register pressure so the scheduler can try to decrease // pressure once register usage is above the threshold defined by // SIRegisterInfo::getRegPressureSetLimit() diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 268162b..407d79a 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1022,7 +1022,7 @@ public: } void overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const override; + const SchedRegion &Region) const override; void mirFileLoaded(MachineFunction &MF) const override; @@ -1162,6 +1162,9 @@ public: bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; } + // Scalar and global loads support scale_offset bit. 
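Returning to the co-execution fix just above: GetWaitStatesFn counts every VALU between the producer and MI as one wait state, so Limit minus the value returned by ::getWaitStatesSince is the number of filler instructions still missing, and anything negative (including the INT_MAX "not found" case) means no hazard for that category. A minimal sketch of the arithmetic, with illustrative names:

#include <algorithm>
#include <climits>

// 'Limit' is WMMAWaitStates[Category] or VALUWaitStates[Category];
// 'ValusInBetween' is what ::getWaitStatesSince reports, i.e. the number of
// co-executable VALUs already separating the two instructions, or INT_MAX
// when no hazardous WMMA was found within the search limit.
int vNopsToInsert(int Limit, int ValusInBetween) {
  int Needed = Limit - ValusInBetween; // <= 0 means the gap is already wide enough
  return std::max(Needed, 0);          // number of V_NOPs to emit before MI
}
// Example: a category 0 WMMA followed by a dependent WMMA with two unrelated
// VALUs already in between needs 5 - 2 = 3 V_NOPs.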
+ bool hasScaleOffset() const { return GFX1250Insts; } + bool hasFlatGVSMode() const { return FlatGVSMode; } bool enableSIScheduler() const { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index ec9248b..11b072e 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -157,9 +157,15 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo, const int64_t TH = Imm & CPol::TH; const int64_t Scope = Imm & CPol::SCOPE; + if (Imm & CPol::SCAL) + O << " scale_offset"; + printTH(MI, TH, Scope, O); printScope(Scope, O); + if (Imm & CPol::NV) + O << " nv"; + return; } @@ -1342,6 +1348,48 @@ void AMDGPUInstPrinter::printIndexKey32bit(const MCInst *MI, unsigned OpNo, O << " index_key:" << Imm; } +void AMDGPUInstPrinter::printMatrixFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O, char AorB) { + auto Imm = MI->getOperand(OpNo).getImm() & 0x7; + if (Imm == 0) + return; + + O << " matrix_" << AorB << "_fmt:"; + switch (Imm) { + default: + O << Imm; + break; + case WMMA::MatrixFMT::MATRIX_FMT_FP8: + O << "MATRIX_FMT_FP8"; + break; + case WMMA::MatrixFMT::MATRIX_FMT_BF8: + O << "MATRIX_FMT_BF8"; + break; + case WMMA::MatrixFMT::MATRIX_FMT_FP6: + O << "MATRIX_FMT_FP6"; + break; + case WMMA::MatrixFMT::MATRIX_FMT_BF6: + O << "MATRIX_FMT_BF6"; + break; + case WMMA::MatrixFMT::MATRIX_FMT_FP4: + O << "MATRIX_FMT_FP4"; + break; + } +} + +void AMDGPUInstPrinter::printMatrixAFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printMatrixFMT(MI, OpNo, STI, O, 'a'); +} + +void AMDGPUInstPrinter::printMatrixBFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printMatrixFMT(MI, OpNo, STI, O, 'b'); +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index e3299a6..e0b7aa5 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -134,6 +134,12 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printIndexKey32bit(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printMatrixFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O, char AorB); + void printMatrixAFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMatrixBFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printInterpSlot(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printInterpAttr(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index f48739f..c49ad79 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -384,6 +384,8 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, if (((Desc.TSFlags & SIInstrFlags::VOP3P) || Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi || Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) && + // Matrix B format operand reuses op_sel_hi. 
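As a small illustration of the printer changes above: the new cache-policy modifiers are only emitted when their bits are set, and matrix_a_fmt/matrix_b_fmt is skipped entirely when it is 0 (MATRIX_FMT_FP8, the default). The bit values below are the ones added to SIDefines.h in a later hunk; the free function is only a sketch of the behaviour, not the printer's real interface.

#include <cstdint>
#include <string>

// CPol bits as defined further down in SIDefines.h.
constexpr int64_t CPOL_NV = 1 << 5;    // non-volatile
constexpr int64_t CPOL_SCAL = 1 << 11; // scale offset

std::string cpolModifiers(int64_t Imm) {
  std::string S;
  if (Imm & CPOL_SCAL)
    S += " scale_offset"; // printed before th/scope, as in printCPol
  if (Imm & CPOL_NV)
    S += " nv";           // printed after scope
  return S;
}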
+ !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_fmt) && // Matrix B reuse operand reuses op_sel_hi. !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) { Encoding |= getImplicitOpSelHiEncoding(Opcode); diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index a864997..3902d4c 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -398,8 +398,12 @@ enum CPol { SCOPE_DEV = 2 << 3, SCOPE_SYS = 3 << 3, + NV = 1 << 5, // Non-volatile bit + SWZ = 1 << 6, // Swizzle bit + SCAL = 1 << 11, // Scale offset bit + ALL = TH | SCOPE, // Helper bits @@ -1003,6 +1007,16 @@ enum Target : unsigned { } // namespace Exp +namespace WMMA { +enum MatrixFMT : unsigned { + MATRIX_FMT_FP8 = 0, + MATRIX_FMT_BF8 = 1, + MATRIX_FMT_FP6 = 2, + MATRIX_FMT_BF6 = 3, + MATRIX_FMT_FP4 = 4 +}; +} // namespace WMMA + namespace VOP3PEncoding { enum OpSel : uint64_t { diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index e172c0b..e5d1eaa 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1209,18 +1209,24 @@ void SIFoldOperandsImpl::foldOperand( return; } - // A frame index will resolve to a positive constant, so it should always be - // safe to fold the addressing mode, even pre-GFX9. - UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI()); - const unsigned Opc = UseMI->getOpcode(); if (TII->isFLATScratch(*UseMI) && AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) && !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) { unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc); + unsigned CPol = + TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm(); + if ((CPol & AMDGPU::CPol::SCAL) && + !AMDGPU::supportsScaleOffset(*TII, NewOpc)) + return; + UseMI->setDesc(TII->get(NewOpc)); } + // A frame index will resolve to a positive constant, so it should always be + // safe to fold the addressing mode, even pre-GFX9. + UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI()); + return; } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 6a38679..11552b3 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -946,8 +946,18 @@ static Register buildScratchExecCopy(LiveRegUnits &LiveUnits, initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog); - ScratchExecCopy = findScratchNonCalleeSaveRegister( - MRI, LiveUnits, *TRI.getWaveMaskRegClass()); + if (FuncInfo->isWholeWaveFunction()) { + // Whole wave functions already have a copy of the original EXEC mask that + // we can use. + assert(IsProlog && "Epilog should look at return, not setup"); + ScratchExecCopy = + TII->getWholeWaveFunctionSetup(MF)->getOperand(0).getReg(); + assert(ScratchExecCopy && "Couldn't find copy of EXEC"); + } else { + ScratchExecCopy = findScratchNonCalleeSaveRegister( + MRI, LiveUnits, *TRI.getWaveMaskRegClass()); + } + if (!ScratchExecCopy) report_fatal_error("failed to find free scratch register"); @@ -996,10 +1006,15 @@ void SIFrameLowering::emitCSRSpillStores( }; StoreWWMRegisters(WWMScratchRegs); + + auto EnableAllLanes = [&]() { + unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); + }; + if (!WWMCalleeSavedRegs.empty()) { if (ScratchExecCopy) { - unsigned MovOpc = ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); + EnableAllLanes(); } else { ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true, @@ -1008,7 +1023,18 @@ void SIFrameLowering::emitCSRSpillStores( } StoreWWMRegisters(WWMCalleeSavedRegs); - if (ScratchExecCopy) { + if (FuncInfo->isWholeWaveFunction()) { + // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose, so we can remove + // it now. If we have already saved some WWM CSR registers, then the EXEC is + // already -1 and we don't need to do anything else. Otherwise, set EXEC to + // -1 here. + if (!ScratchExecCopy) + buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true, + /*EnableInactiveLanes*/ true); + else if (WWMCalleeSavedRegs.empty()) + EnableAllLanes(); + TII->getWholeWaveFunctionSetup(MF)->eraseFromParent(); + } else if (ScratchExecCopy) { // FIXME: Split block and make terminator. unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) @@ -1083,11 +1109,6 @@ void SIFrameLowering::emitCSRSpillRestores( Register ScratchExecCopy; SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs; FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); - if (!WWMScratchRegs.empty()) - ScratchExecCopy = - buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, - /*IsProlog*/ false, /*EnableInactiveLanes*/ true); - auto RestoreWWMRegisters = [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) { for (const auto &Reg : WWMRegs) { @@ -1098,6 +1119,36 @@ void SIFrameLowering::emitCSRSpillRestores( } }; + if (FuncInfo->isWholeWaveFunction()) { + // For whole wave functions, the EXEC is already -1 at this point. + // Therefore, we can restore the CSR WWM registers right away. + RestoreWWMRegisters(WWMCalleeSavedRegs); + + // The original EXEC is the first operand of the return instruction. + const MachineInstr &Return = MBB.instr_back(); + assert(Return.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN && + "Unexpected return inst"); + Register OrigExec = Return.getOperand(0).getReg(); + + if (!WWMScratchRegs.empty()) { + unsigned XorOpc = ST.isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64; + BuildMI(MBB, MBBI, DL, TII->get(XorOpc), TRI.getExec()) + .addReg(OrigExec) + .addImm(-1); + RestoreWWMRegisters(WWMScratchRegs); + } + + // Restore original EXEC. + unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec); + return; + } + + if (!WWMScratchRegs.empty()) { + ScratchExecCopy = + buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, + /*IsProlog=*/false, /*EnableInactiveLanes=*/true); + } RestoreWWMRegisters(WWMScratchRegs); if (!WWMCalleeSavedRegs.empty()) { if (ScratchExecCopy) { @@ -1634,6 +1685,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, NeedExecCopyReservedReg = true; else if (MI.getOpcode() == AMDGPU::SI_RETURN || MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || + MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || (MFI->isChainFunction() && TII->isChainCallOpcode(MI.getOpcode()))) { // We expect all return to be the same size. @@ -1662,6 +1714,21 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, if (MFI->isEntryFunction()) return; + if (MFI->isWholeWaveFunction()) { + // In practice, all the VGPRs are WWM registers, and we will need to save at + // least their inactive lanes. 
Add them to WWMReservedRegs. + assert(!NeedExecCopyReservedReg && + "Whole wave functions can use the reg mapped for their i1 argument"); + + // FIXME: Be more efficient! + for (MCRegister Reg : AMDGPU::VGPR_32RegClass) + if (MF.getRegInfo().isPhysRegModified(Reg)) { + MFI->reserveWWMRegister(Reg); + MF.begin()->addLiveIn(Reg); + } + MF.begin()->sortUniqueLiveIns(); + } + // Remove any VGPRs used in the return value because these do not need to be saved. // This prevents CSR restore from clobbering return VGPRs. if (ReturnMI) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0c76ff2..bc0fd8d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -618,6 +618,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::FSIN, ISD::FROUND}, MVT::f16, Custom); + // BF16 - VOP1 Actions. + if (Subtarget->hasBF16TransInsts()) + setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom); + setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote); setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote); @@ -2260,7 +2264,8 @@ SDValue SITargetLowering::getPreloadedValue( const ArgDescriptor WorkGroupIDZ = ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); if (Subtarget->hasArchitectedSGPRs() && - (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) { + (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx || + CC == CallingConv::AMDGPU_Gfx_WholeWave)) { switch (PVID) { case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: Reg = &WorkGroupIDX; @@ -2942,12 +2947,15 @@ SDValue SITargetLowering::LowerFormalArguments( if (!Subtarget->enableFlatScratch()) assert(!UserSGPRInfo.hasFlatScratchInit()); if ((CallConv != CallingConv::AMDGPU_CS && - CallConv != CallingConv::AMDGPU_Gfx) || + CallConv != CallingConv::AMDGPU_Gfx && + CallConv != CallingConv::AMDGPU_Gfx_WholeWave) || !Subtarget->hasArchitectedSGPRs()) assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ()); } + bool IsWholeWaveFunc = Info->isWholeWaveFunction(); + if (CallConv == CallingConv::AMDGPU_PS) { processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info); @@ -2988,7 +2996,8 @@ SDValue SITargetLowering::LowerFormalArguments( } else if (IsKernel) { assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); } else { - Splits.append(Ins.begin(), Ins.end()); + Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(), + Ins.end()); } if (IsKernel) @@ -3019,6 +3028,13 @@ SDValue SITargetLowering::LowerFormalArguments( SmallVector<SDValue, 16> Chains; + if (IsWholeWaveFunc) { + SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL, + {MVT::i1, MVT::Other}, Chain); + InVals.push_back(Setup.getValue(0)); + Chains.push_back(Setup.getValue(1)); + } + // FIXME: This is the minimum kernel argument alignment. We should improve // this to the maximum alignment of the arguments. // @@ -3026,7 +3042,8 @@ SDValue SITargetLowering::LowerFormalArguments( // kern arg offset. const Align KernelArgBaseAlign = Align(16); - for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { + for (unsigned i = IsWholeWaveFunc ? 
1 : 0, e = Ins.size(), ArgIdx = 0; i != e; + ++i) { const ISD::InputArg &Arg = Ins[i]; if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) { InVals.push_back(DAG.getPOISON(Arg.VT)); @@ -3374,7 +3391,9 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, unsigned Opc = AMDGPUISD::ENDPGM; if (!IsWaveEnd) - Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE; + Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN + : IsShader ? AMDGPUISD::RETURN_TO_EPILOG + : AMDGPUISD::RET_GLUE; return DAG.getNode(Opc, DL, MVT::Other, RetOps); } @@ -3876,7 +3895,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); - if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) { + if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) && + CallConv != CallingConv::AMDGPU_Gfx_WholeWave) { // With a fixed ABI, allocate fixed registers before user arguments. passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); } @@ -5890,6 +5910,18 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); return SplitBB; } + case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: { + assert(MFI->isWholeWaveFunction()); + + // During ISel, it's difficult to propagate the original EXEC mask to use as + // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead. + MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent()); + Register OriginalExec = Setup->getOperand(0).getReg(); + assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC"); + MF->getRegInfo().clearKillFlags(OriginalExec); + MI.getOperand(0).setReg(OriginalExec); + return BB; + } default: if (TII->isImage(MI) || TII->isMUBUF(MI)) { if (!MI.mayStore()) @@ -11172,7 +11204,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, // Without !fpmath accuracy information, we can't do more because we don't // know exactly whether rcp is accurate enough to meet !fpmath requirement. // f16 is always accurate enough - if (!AllowInaccurateRcp && VT != MVT::f16) + if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16) return SDValue(); if (CLHS->isExactlyValue(1.0)) { @@ -11199,9 +11231,10 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, } } - // For f16 require afn or arcp. + // For f16 and bf16 require afn or arcp. // For f32 require afn. - if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal())) + if (!AllowInaccurateRcp && + ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal())) return SDValue(); // Turn into multiply by the reciprocal. @@ -11592,7 +11625,7 @@ SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::f64) return LowerFDIV64(Op, DAG); - if (VT == MVT::f16) + if (VT == MVT::f16 || VT == MVT::bf16) return LowerFDIV16(Op, DAG); llvm_unreachable("Unexpected type for fdiv"); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 2af0a57..9faf497 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1812,6 +1812,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // with knowledge of the called routines. 
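The whole-wave prolog/epilog code above boils down to simple mask algebra: SI_WHOLE_WAVE_FUNC_SETUP saves the entry EXEC and enables every lane, the epilog XORs that saved mask with -1 so that only the lanes that were inactive at entry run while their WWM spills are restored, and SI_WHOLE_WAVE_FUNC_RETURN puts the original mask back. A self-contained sketch for a wave64 mask, with plain integers standing in for the EXEC register:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Exec = 0x00000000FFFF00FFull; // example: lanes active at entry
  // Prolog (SI_WHOLE_WAVE_FUNC_SETUP): remember the entry mask, run all lanes.
  uint64_t OrigExec = Exec;
  Exec = ~0ull;

  // Epilog: s_xor_b64 exec, orig_exec, -1 enables exactly the lanes that
  // were inactive at entry, so their saved values can be restored.
  Exec = OrigExec ^ ~0ull;
  assert(Exec == ~OrigExec);
  assert((Exec & OrigExec) == 0 && (Exec | OrigExec) == ~0ull);

  // Return (SI_WHOLE_WAVE_FUNC_RETURN): hand the caller's mask back.
  Exec = OrigExec;
  assert(Exec == 0x00000000FFFF00FFull);
  return 0;
}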
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || MI.getOpcode() == AMDGPU::SI_RETURN || + MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || MI.getOpcode() == AMDGPU::S_SETPC_B64_return || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index a368bc5..89d9b0d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -317,6 +317,8 @@ def CPolBit { int SLC = 1; int DLC = 2; int SCC = 4; + int NV = 5; + int SCAL = 11; } class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index c8935f0..571f3ef 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2472,6 +2472,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); break; } + case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: case AMDGPU::SI_RETURN: { const MachineFunction *MF = MBB.getParent(); const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); @@ -5481,6 +5482,19 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) { + if (CPol->getImm() & AMDGPU::CPol::SCAL) { + if (!ST.hasScaleOffset()) { + ErrInfo = "Subtarget does not support offset scaling"; + return false; + } + if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) { + ErrInfo = "Instruction does not support offset scaling"; + return false; + } + } + } + return true; } @@ -5757,6 +5771,19 @@ void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, Indexes->insertMachineInstrInMaps(*ExecRestoreMI); } +MachineInstr * +SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const { + assert(MF.getInfo<SIMachineFunctionInfo>()->isWholeWaveFunction() && + "Not a whole wave func"); + MachineBasicBlock &MBB = *MF.begin(); + for (MachineInstr &MI : MBB) + if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP || + MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP) + return &MI; + + llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction"); +} + static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 5e92921..800ea9a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1215,6 +1215,8 @@ public: MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes = nullptr) const; + MachineInstr *getWholeWaveFunctionSetup(MachineFunction &MF) const; + /// Return the correct register class for \p OpNo. For target-specific /// instructions, this will return the register class that has been defined /// in tablegen. 
For generic instructions, like REG_SEQUENCE it will return diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 9e1951e..bd4995b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1307,6 +1307,9 @@ let PrintMethod = "printBitOp3" in def BitOp3 : NamedIntOperand<"bitop3">; def bitop3_0 : DefaultOperand<BitOp3, 0>; +def MatrixAFMT : CustomOperand<i32, 1, "MatrixAFMT">; +def MatrixBFMT : CustomOperand<i32, 1, "MatrixBFMT">; + def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">; def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">; @@ -1882,6 +1885,7 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> { !eq(VT, v4bf16) : AVSrc_64, !eq(VT.Size, 1024) : VRegSrc_1024, !eq(VT.Size, 512) : VRegSrc_512, + !eq(VT.Size, 384) : VRegSrc_384, !eq(VT.Size, 256) : VRegSrc_256, !eq(VT.Size, 192) : VRegSrc_192, !eq(VT.Size, 128) : VRegSrc_128, @@ -1894,6 +1898,7 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> { class getVOP3VRegSrcForVT<ValueType VT> { RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VRegSrc_1024, !eq(VT.Size, 512) : VRegSrc_512, + !eq(VT.Size, 384) : VRegSrc_384, !eq(VT.Size, 256) : VRegSrc_256, !eq(VT.Size, 192) : VRegSrc_192, !eq(VT.Size, 128) : VRegSrc_128, @@ -2666,6 +2671,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { HasOMod); field bit HasNeg = HasModifiers; field bit HasMatrixReuse = 0; + field bit HasMatrixFMT = 0; field bit HasSrc0Mods = HasModifiers; field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 991d9f8..d05be8f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -644,6 +644,32 @@ def SI_INIT_WHOLE_WAVE : SPseudoInstSI < let isConvergent = 1; } +// Sets EXEC to all lanes and returns the previous EXEC. +def SI_WHOLE_WAVE_FUNC_SETUP : SPseudoInstSI < + (outs SReg_1:$dst), (ins), [(set i1:$dst, (AMDGPUwhole_wave_setup))]> { + let Defs = [EXEC]; + let Uses = [EXEC]; + + let isConvergent = 1; +} + +// Restores the previous EXEC and otherwise behaves entirely like a SI_RETURN. +def SI_WHOLE_WAVE_FUNC_RETURN : SPseudoInstSI < + (outs), (ins SReg_1:$orig_exec)> { + let isTerminator = 1; + let isBarrier = 1; + let isReturn = 1; + let SchedRW = [WriteBranch]; + + // We're going to use custom handling to set the $orig_exec to the correct value. + let usesCustomInserter = 1; +} + +// Generate a SI_WHOLE_WAVE_FUNC_RETURN pseudo with a placeholder for its +// argument. It will be filled in by the custom inserter. +def : GCNPat< + (AMDGPUwhole_wave_return), (SI_WHOLE_WAVE_FUNC_RETURN (i1 (IMPLICIT_DEF)))>; + // Return for returning shaders to a shader variant epilog. 
def SI_RETURN_TO_EPILOG : SPseudoInstSI < (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> { @@ -2473,6 +2499,7 @@ def : AMDGPUPat < >; let True16Predicate = NotHasTrue16BitInsts in { +let SubtargetPredicate = isNotGFX9Plus in { def : ROTRPattern <V_ALIGNBIT_B32_e64>; def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), @@ -2482,6 +2509,35 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; +} // isNotGFX9Plus + +let SubtargetPredicate = isGFX9GFX10 in { +def : GCNPat < + (rotr i32:$src0, i32:$src1), + (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, + /* src1_modifiers */ 0, $src0, + /* src2_modifiers */ 0, + $src1, /* clamp */ 0, /* op_sel */ 0) +>; + +foreach pat = [(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), + (i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in +def : GCNPat<pat, + (V_ALIGNBIT_B32_opsel_e64 0, /* src0_modifiers */ + (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), + 0, /* src1_modifiers */ + (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), + 0, /* src2_modifiers */ + $src1, /* clamp */ 0, /* op_sel */ 0) +>; + +def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), + (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, + /* src1_modifiers */ 0, $src1, + /* src2_modifiers */ 0, + $src2, /* clamp */ 0, /* op_sel */ 0) +>; +} // isGFX9GFX10 } // end True16Predicate = NotHasTrue16BitInsts let True16Predicate = UseRealTrue16Insts in { @@ -3082,6 +3138,8 @@ def : GCNPat < (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) >; +// This pattern for bswap is used for pre-GFX8. For GFX8+, bswap is mapped +// to V_PERM_B32. let True16Predicate = NotHasTrue16BitInsts in def : GCNPat < (i32 (bswap i32:$a)), @@ -3559,15 +3617,20 @@ def : GCNPat < // Take the upper 16 bits from V[0] and the lower 16 bits from V[1] // Special case, can use V_ALIGNBIT (always uses encoded literal) -let True16Predicate = NotHasTrue16BitInsts in -def : GCNPat < +let True16Predicate = NotHasTrue16BitInsts in { +defvar BuildVectorToAlignBitPat = (vecTy (DivergentBinFrag<build_vector> (Ty !if(!eq(Ty, i16), (Ty (trunc (srl VGPR_32:$a, (i32 16)))), (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))), - (Ty VGPR_32:$b))), - (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16)) ->; + (Ty VGPR_32:$b))); + +let SubtargetPredicate = isNotGFX9Plus in +def : GCNPat<BuildVectorToAlignBitPat, (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))>; + +let SubtargetPredicate = isGFX9GFX10 in +def : GCNPat<BuildVectorToAlignBitPat, (V_ALIGNBIT_B32_opsel_e64 0, VGPR_32:$b, 0, VGPR_32:$a, 0, (i32 16), 0, 0)>; +} //True16Predicate = NotHasTrue16BitInsts let True16Predicate = UseFakeTrue16Insts in def : GCNPat < @@ -4300,6 +4363,20 @@ def G_AMDGPU_S_MUL_I64_I32 : AMDGPUGenericInstruction { let hasSideEffects = 0; } +def G_AMDGPU_WHOLE_WAVE_FUNC_SETUP : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$origExec); + let InOperandList = (ins); + let isConvergent = 1; +} + +def G_AMDGPU_WHOLE_WAVE_FUNC_RETURN : AMDGPUGenericInstruction { + let OutOperandList = (outs); + let InOperandList = (ins type0:$origExec); + let isTerminator = 1; + let isBarrier = 1; + let isReturn = 1; +} + // This is equivalent to the G_INTRINSIC*, but the operands may have // been legalized depending on the subtarget requirements. 
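The V_ALIGNBIT_B32 selections above all lean on one identity: the instruction returns the low 32 bits of the 64-bit concatenation {src0, src1} shifted right by the low five bits of src2, so an fshr maps onto it directly and rotr(x, n) is simply fshr(x, x, n); the trunc(srl i64, n) patterns feed the high dword as src0 and the low dword as src1. A small reference model, written to match the patterns rather than to quote the ISA manual:

#include <cassert>
#include <cstdint>

// Reference model of the selected operation: ({s0,s1} >> (s2 & 31)) & 0xffffffff.
uint32_t alignbit(uint32_t s0, uint32_t s1, uint32_t s2) {
  uint64_t Packed = (uint64_t(s0) << 32) | s1;
  return uint32_t(Packed >> (s2 & 31));
}

uint32_t rotr32(uint32_t x, uint32_t n) {
  n &= 31;
  return n ? (x >> n) | (x << (32 - n)) : x;
}

int main() {
  for (uint32_t n = 0; n < 32; ++n)
    assert(alignbit(0xDEADBEEF, 0xDEADBEEF, n) == rotr32(0xDEADBEEF, n)); // rotr as fshr(x, x, n)
  uint64_t x = 0x0123456789ABCDEFull;
  assert(alignbit(uint32_t(x >> 32), uint32_t(x), 12) == uint32_t(x >> 12)); // trunc(srl i64 x, 12)
  return 0;
}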
def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction { diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 5097ac03..b49c5a9 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -61,6 +61,7 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/InitializePasses.h" @@ -1078,7 +1079,9 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, if (EltOffset0 + CI.Width != EltOffset1 && EltOffset1 + Paired.Width != EltOffset0) return false; - if (CI.CPol != Paired.CPol) + // Instructions with scale_offset modifier cannot be combined unless we + // also generate a code to scale the offset and reset that bit. + if (CI.CPol != Paired.CPol || (CI.CPol & AMDGPU::CPol::SCAL)) return false; if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 75ce67c..f0be204 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -29,6 +29,16 @@ enum { MAX_LANES = 64 }; using namespace llvm; +// TODO -- delete this flag once we have more robust mechanisms to allocate the +// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases +// where it is better to produce the VGPR form (e.g. if there are VGPR users +// of the MFMA result). +cl::opt<bool> MFMAVGPRForm( + "amdgpu-mfma-vgpr-form", cl::Hidden, + cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If " + "unspecified, default to compiler heuristics"), + cl::init(false)); + const GCNTargetMachine &getTM(const GCNSubtarget *STI) { const SITargetLowering *TLI = STI->getTargetLowering(); return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine()); @@ -41,7 +51,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false), PrivateSegmentWaveByteOffset(false), WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false), - GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) { + GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0), + IsWholeWaveFunction(F.getCallingConv() == + CallingConv::AMDGPU_Gfx_WholeWave) { const GCNSubtarget &ST = *STI; FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); @@ -69,8 +81,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, PSInputAddr = AMDGPU::getInitialPSInputAddr(F); } - MayNeedAGPRs = ST.hasMAIInsts(); - if (ST.hasGFX90AInsts() && + MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm; + if (!MFMAVGPRForm && ST.hasGFX90AInsts() && ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() && !mayUseAGPRs(F)) MayNeedAGPRs = false; // We will select all MAI with VGPR operands. 
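On the SILoadStoreOptimizer change above: once scale_offset is set, the encoded offsets are interpreted with an extra hardware scaling, so pairing two such accesses would require recomputing both offsets and clearing the bit. The patch simply declines the merge instead; a minimal sketch of that guard, with the scaling detail left abstract:

// Illustrative guard mirroring the offsetsCanBeCombined check above:
// both accesses must agree on the cache policy, and neither may carry a
// scaled offset, because the merged access would need its offset rewritten
// and the bit cleared. 'ScalBit' stands in for AMDGPU::CPol::SCAL.
bool cachePoliciesAllowMerge(unsigned CPol0, unsigned CPol1, unsigned ScalBit) {
  return CPol0 == CPol1 && !(CPol0 & ScalBit);
}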
@@ -89,7 +101,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, ImplicitArgPtr = false; } else if (!isEntryFunction()) { - if (CC != CallingConv::AMDGPU_Gfx) + if (CC != CallingConv::AMDGPU_Gfx && + CC != CallingConv::AMDGPU_Gfx_WholeWave) ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; FrameOffsetReg = AMDGPU::SGPR33; @@ -722,6 +735,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()), MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()), Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()), + IsWholeWaveFunction(MFI.isWholeWaveFunction()), DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()), ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) { for (Register Reg : MFI.getSGPRSpillPhysVGPRs()) @@ -768,6 +782,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs; BytesInStackArgArea = YamlMFI.BytesInStackArgArea; ReturnsVoid = YamlMFI.ReturnsVoid; + IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction; if (YamlMFI.ScavengeFI) { auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo()); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 274a60ad..08b0206 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -298,6 +298,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { StringValue LongBranchReservedReg; bool HasInitWholeWave = false; + bool IsWholeWaveFunction = false; unsigned DynamicVGPRBlockSize = 0; unsigned ScratchReservedForDynamicVGPRs = 0; @@ -356,6 +357,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false); YamlIO.mapOptional("scratchReservedForDynamicVGPRs", MFI.ScratchReservedForDynamicVGPRs, 0); + YamlIO.mapOptional("isWholeWaveFunction", MFI.IsWholeWaveFunction, false); } }; @@ -565,6 +567,8 @@ private: // the serialization easier. ReservedRegSet WWMReservedRegs; + bool IsWholeWaveFunction = false; + using PrologEpilogSGPRSpill = std::pair<Register, PrologEpilogSGPRSaveRestoreInfo>; // To track the SGPR spill method used for a CSR SGPR register during @@ -670,6 +674,8 @@ public: return WWMReservedRegs.contains(Reg); } + bool isWholeWaveFunction() const { return IsWholeWaveFunction; } + ArrayRef<PrologEpilogSGPRSpill> getPrologEpilogSGPRSpills() const { assert(is_sorted(PrologEpilogSGPRSpills, llvm::less_first())); return PrologEpilogSGPRSpills; diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index 7093fe6..5940f45 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -85,7 +85,8 @@ static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo, S_00B848_PRIV(ProgInfo.Priv) | S_00B848_DEBUG_MODE(ProgInfo.DebugMode) | S_00B848_WGP_MODE(ProgInfo.WgpMode) | - S_00B848_MEM_ORDERED(ProgInfo.MemOrdered); + S_00B848_MEM_ORDERED(ProgInfo.MemOrdered) | + S_00B848_FWD_PROGRESS(ProgInfo.FwdProgress); if (ST.hasDX10ClampMode()) Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp); @@ -93,10 +94,6 @@ static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo, if (ST.hasIEEEMode()) Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode); - // TODO: in the long run we will want to enable this unconditionally. 
- if (ST.getTargetTriple().getOS() == Triple::OSType::AMDHSA) - Reg |= S_00B848_FWD_PROGRESS(ProgInfo.FwdProgress); - if (ST.hasRrWGMode()) Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index fa2b8db..84cfa87 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -407,6 +407,7 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList : CSR_AMDGPU_SaveList; case CallingConv::AMDGPU_Gfx: + case CallingConv::AMDGPU_Gfx_WholeWave: return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList : CSR_AMDGPU_SI_Gfx_SaveList; case CallingConv::AMDGPU_CS_ChainPreserve: @@ -433,6 +434,7 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask : CSR_AMDGPU_RegMask; case CallingConv::AMDGPU_Gfx: + case CallingConv::AMDGPU_Gfx_WholeWave: return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask : CSR_AMDGPU_SI_Gfx_RegMask; case CallingConv::AMDGPU_CS_Chain: diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index c194e5c..0039d2f 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1207,6 +1207,7 @@ def VRegSrc_96 : SrcReg9<VReg_96>; def VRegSrc_128: SrcReg9<VReg_128>; def VRegSrc_192: SrcReg9<VReg_192>; def VRegSrc_256: SrcReg9<VReg_256>; +def VRegSrc_384: SrcReg9<VReg_384>; def VRegSrc_512: SrcReg9<VReg_512>; def VRegSrc_1024: SrcReg9<VReg_1024>; def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32>; diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index ef8faff..8eecb1c 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -464,6 +464,20 @@ def : InstRW<[WriteCopy], (instrs COPY)>; } // End SchedModel = GFX12SpeedModel +// Check if any matrix inputs are interpreted as f8 in an f8f6f4 +// wmma instruction. +def PredIsF8_WMMA_SCALE : SchedPredicate<[{ + TII->getNamedOperand(*MI, AMDGPU::OpName::matrix_a_fmt)->getImm() <= AMDGPU::WMMA::MATRIX_FMT_BF8 || + TII->getNamedOperand(*MI, AMDGPU::OpName::matrix_b_fmt)->getImm() <= AMDGPU::WMMA::MATRIX_FMT_BF8 +}]>; + +// If either matrix format is f8, the instruction takes 2x as many +// cycles. TODO: This isn't reflected in MCA. 
+def WriteWMMAScale_16X16X128_F8F6F4 : SchedWriteVariant<[ + SchedVar<PredIsF8_WMMA_SCALE, [WriteXDL4PassWMMA]>, + SchedVar<NoSchedPred, [WriteXDL2PassWMMA]> +]>; + multiclass GFX125xCommonWriteRes { let ReleaseAtCycles = [8] in @@ -495,6 +509,7 @@ def : InstRW<[WriteCopy], (instrs COPY)>; def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(FP8|BF8|BF16|F16)_w32")>; def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(IU8|IU4)_w32")>; +def : InstRW<[WriteWMMAScale_16X16X128_F8F6F4], (instregex "^V_WMMA_.*_16X16X128_F8F6F4.*_w32")>; def : InstRW<[Write4PassWMMA], (instregex "^V_WMMA_F32_16X16X4_F32_w32")>; def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_WMMA.*_F32_32X16X128_F4")>; } // End GFX125xCommonWriteRes diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 37dcc100..38cc51b 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -87,7 +87,7 @@ class SM_Real <SM_Pseudo ps, string opName = ps.Mnemonic> bits<7> sdst; bits<32> offset; bits<8> soffset; - bits<5> cpol; + bits<12> cpol; } class OffsetMode<bit hasOffset, bit hasSOffset, string variant, @@ -864,8 +864,10 @@ def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">; def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">; -def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">; -def SMRDSgprImm : ComplexPattern<iPTR, 3, "SelectSMRDSgprImm">; +let WantsRoot = true in { + def SMRDSgpr : ComplexPattern<iPTR, 3, "SelectSMRDSgpr", [], [], -3>; + def SMRDSgprImm : ComplexPattern<iPTR, 4, "SelectSMRDSgprImm", [], []>; +} def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">; def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">; def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">; @@ -906,15 +908,15 @@ multiclass SMRD_Patterns <string Instr, ValueType vt, PatFrag frag, let SubtargetPredicate = isNotGFX9Plus; } def : GCNPat < - (frag (SMRDSgpr i64:$sbase, i32:$soffset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, 0))> { + (frag (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, $cpol))> { let SubtargetPredicate = isGFX9Plus; } // 4. SGPR+IMM offset def : GCNPat < - (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, 0))> { + (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, $cpol))> { let SubtargetPredicate = isGFX9Plus; } @@ -989,15 +991,15 @@ multiclass ScalarLoadWithExtensionPat <string Instr, SDPatternOperator node, Val // 2. SGPR offset def : GCNPat < - (node (SMRDSgpr i64:$sbase, i32:$soffset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))>{ + (node (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, $cpol))>{ let SubtargetPredicate = isGFX12Plus; } // 3. 
SGPR+IMM offset def : GCNPat < - (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))>{ + (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, $cpol))>{ let SubtargetPredicate = isGFX12Plus; } @@ -1485,8 +1487,10 @@ class SMEM_Real_Load_gfx12<bits<6> op, string ps, string opName, OffsetMode offs RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass; let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol)); + let Inst{20} = cpol{CPolBit.NV}; // non-volatile let Inst{22-21} = cpol{4-3}; // scope let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported + let Inst{56} = cpol{CPolBit.SCAL}; // scale offset } multiclass SM_Real_Loads_gfx12<bits<6> op, string ps = NAME> { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 7725881..b5b3cc9 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -598,6 +598,29 @@ const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ, return getMFMA_F8F6F4_InstWithNumRegs(SrcANumRegs, SrcBNumRegs, F8F8Opcode); } +uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt) { + switch (Fmt) { + case WMMA::MATRIX_FMT_FP8: + case WMMA::MATRIX_FMT_BF8: + return 16; + case WMMA::MATRIX_FMT_FP6: + case WMMA::MATRIX_FMT_BF6: + return 12; + case WMMA::MATRIX_FMT_FP4: + return 8; + } + + llvm_unreachable("covered switch over wmma scale formats"); +} + +const MFMA_F8F6F4_Info *getWMMA_F8F6F4_WithFormatArgs(unsigned FmtA, + unsigned FmtB, + unsigned F8F8Opcode) { + uint8_t SrcANumRegs = wmmaScaleF8F6F4FormatToNumRegs(FmtA); + uint8_t SrcBNumRegs = wmmaScaleF8F6F4FormatToNumRegs(FmtB); + return getMFMA_F8F6F4_InstWithNumRegs(SrcANumRegs, SrcBNumRegs, F8F8Opcode); +} + unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST) { if (ST.hasFeature(AMDGPU::FeatureGFX1250Insts)) return SIEncodingFamily::GFX1250; @@ -3205,6 +3228,25 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, : getGfx9BufferFormatInfo(Format); } +bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode) { + uint64_t TSFlags = MII.get(Opcode).TSFlags; + + if (TSFlags & SIInstrFlags::SMRD) + return !getSMEMIsBuffer(Opcode); + if (!(TSFlags & SIInstrFlags::FLAT)) + return false; + + // Only SV and SVS modes are supported. + if (TSFlags & SIInstrFlags::FlatScratch) + return hasNamedOperand(Opcode, OpName::vaddr); + + // Only GVS mode is supported. 
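The mapping above is a simple proportionality: an f8f6f4 source operand needs 16, 12 or 8 VGPRs depending on whether its elements are 8, 6 or 4 bits wide, and the scheduling predicate earlier in the patch (PredIsF8_WMMA_SCALE) singles out the two 8-bit formats, the lowest enum values, as the slower 4-pass case. A compact restatement for illustration:

#include <cassert>

// Mirrors WMMA::MatrixFMT from SIDefines.h.
enum MatrixFMT { FMT_FP8 = 0, FMT_BF8 = 1, FMT_FP6 = 2, FMT_BF6 = 3, FMT_FP4 = 4 };

// Same table as wmmaScaleF8F6F4FormatToNumRegs: the register count scales
// with the element width (8/6/4 bits -> 16/12/8 VGPRs).
unsigned numRegs(MatrixFMT Fmt) {
  switch (Fmt) {
  case FMT_FP8: case FMT_BF8: return 16;
  case FMT_FP6: case FMT_BF6: return 12;
  case FMT_FP4:               return 8;
  }
  return 0;
}

// PredIsF8_WMMA_SCALE: either source being an 8-bit format doubles the
// pass count (WriteXDL4PassWMMA instead of WriteXDL2PassWMMA).
bool takesFourPasses(MatrixFMT FmtA, MatrixFMT FmtB) {
  return FmtA <= FMT_BF8 || FmtB <= FMT_BF8;
}

int main() {
  assert(numRegs(FMT_BF6) == 12 && !takesFourPasses(FMT_FP6, FMT_FP4));
  assert(takesFourPasses(FMT_BF8, FMT_FP4) && numRegs(FMT_FP4) == 8);
  return 0;
}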
+ return hasNamedOperand(Opcode, OpName::vaddr) && + hasNamedOperand(Opcode, OpName::saddr); + + return false; +} + bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) { for (auto OpName : {OpName::vdst, OpName::src0, OpName::src1, OpName::src2}) { int Idx = getNamedOperandIdx(OpDesc.getOpcode(), OpName); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index c9d2c28..c09a9d6 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -627,6 +627,14 @@ const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ, unsigned BLGP, unsigned F8F8Opcode); +LLVM_READNONE +uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt); + +LLVM_READONLY +const MFMA_F8F6F4_Info *getWMMA_F8F6F4_WithFormatArgs(unsigned FmtA, + unsigned FmtB, + unsigned F8F8Opcode); + LLVM_READONLY const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, @@ -1423,7 +1431,8 @@ constexpr bool isShader(CallingConv::ID CC) { LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC) { - return isShader(CC) || CC == CallingConv::AMDGPU_Gfx; + return isShader(CC) || CC == CallingConv::AMDGPU_Gfx || + CC == CallingConv::AMDGPU_Gfx_WholeWave; } LLVM_READNONE @@ -1748,6 +1757,9 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID); /// \returns true if the intrinsic is uniform bool isIntrinsicAlwaysUniform(unsigned IntrID); +/// \returns true if a memory instruction supports scale_offset modifier. +bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode); + /// \returns lds block size in terms of dwords. \p /// This is used to calculate the lds size encoded for PAL metadata 3.0+ which /// must be defined in terms of bytes. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp index e464470..fd6253d 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -44,6 +44,7 @@ static const char *getStageName(CallingConv::ID CC) { case CallingConv::AMDGPU_LS: return ".ls"; case CallingConv::AMDGPU_Gfx: + case CallingConv::AMDGPU_Gfx_WholeWave: llvm_unreachable("Callable shader has no hardware stage"); default: return ".cs"; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 2e7f25b..aee2f2c 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -224,6 +224,12 @@ defm V_ALIGNBIT_B32 : VOP3Inst_t16_with_profiles <"v_alignbit_b32", fshr, null_frag>; defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>; + +// In gfx9 and 10, opsel is allowed for V_ALIGNBIT_B32 and V_ALIGNBYTE_B32. +// Hardware uses opsel[1:0] to byte-select src2. Other opsel bits are ignored. 
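For reference, the decision tree in supportsScaleOffset above reduces to three legal shapes: scalar (SMEM) loads other than buffer loads, scratch accesses that have a vector address (the SV/SVS forms), and global accesses that have both a vector and a scalar address (the GVS form). A sketch of that predicate with the operand queries abstracted away; the enum is illustrative, the real helper reads TSFlags and named operands.

enum class MemKind { SMEMBuffer, SMEMOther, FlatScratch, GlobalFlat, Other };

// Which instruction shapes may carry the scale_offset cache-policy bit.
bool scaleOffsetAllowed(MemKind Kind, bool HasVAddr, bool HasSAddr) {
  switch (Kind) {
  case MemKind::SMEMOther:   return true;                 // scalar loads, not buffers
  case MemKind::SMEMBuffer:  return false;
  case MemKind::FlatScratch: return HasVAddr;             // SV / SVS forms only
  case MemKind::GlobalFlat:  return HasVAddr && HasSAddr; // GVS form only
  case MemKind::Other:       return false;
  }
  return false;
}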
+defm V_ALIGNBIT_B32_opsel : VOP3Inst <"v_alignbit_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>; +defm V_ALIGNBYTE_B32_opsel : VOP3Inst <"v_alignbyte_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>; + let True16Predicate = UseRealTrue16Insts in defm V_ALIGNBYTE_B32_t16 : VOP3Inst <"v_alignbyte_b32_t16", VOP3_Profile_True16<VOP_I32_I32_I32_I16, VOP3_OPSEL>>; let True16Predicate = UseFakeTrue16Insts in @@ -265,6 +271,16 @@ let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in { } // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1 } // End isReMaterializable = 1 +let SubtargetPredicate = isGFX9GFX10 in +def : GCNPat < +(i32 (int_amdgcn_alignbyte (i32 (VOP3OpSelMods i32:$src0, i32:$src0_modifiers)), + (i32 (VOP3OpSelMods i32:$src1, i32:$src1_modifiers)), + (i32 (VOP3OpSelMods i32:$src2, i32:$src2_modifiers)))), +(V_ALIGNBYTE_B32_opsel_e64 i32:$src0_modifiers, VSrc_b32:$src0, + i32:$src1_modifiers, VSrc_b32:$src1, + i32:$src2_modifiers, VGPR_32:$src2) +>; + let True16Predicate = UseFakeTrue16Insts in def : GCNPat < (i32 (int_amdgcn_alignbyte (i32 (VOP3OpSelMods i32:$src0, i32:$src0_modifiers)), @@ -1954,6 +1970,9 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" +defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14e, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">; +defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14f, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">; + defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { @@ -2104,8 +2123,8 @@ defm V_BFI_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14a>; defm V_FMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x14b>; defm V_FMA_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x14c>; defm V_LERP_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x14d>; -defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14e>; -defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14f>; +defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7<0x14e>; +defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7<0x14f>; defm V_MULLIT_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x150>; defm V_MIN3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x151>; defm V_MIN3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x152>; @@ -2248,6 +2267,17 @@ multiclass VOP3_Real_BITOP3_gfx9<bits<10> op, string AsmName, bit isSingle = 0> } } +// Instructions such as v_alignbyte_b32 allows op_sel in gfx9, but not in vi. +// The following is created to support that. 
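v_alignbyte_b32, given an op_sel-capable encoding here, is the byte-granular sibling of v_alignbit_b32: it shifts the concatenation {src0, src1} right by whole bytes, with (assuming the usual definition) only the low two bits of src2 supplying the byte count, which is why the comment above notes that op_sel merely byte-selects src2. A hedged reference model under that assumption:

#include <cassert>
#include <cstdint>

// Assumed semantics: ({s0,s1} >> (8 * s2[1:0])) & 0xffffffff.
uint32_t alignbyte(uint32_t s0, uint32_t s1, uint32_t s2) {
  uint64_t Packed = (uint64_t(s0) << 32) | s1;
  return uint32_t(Packed >> (8 * (s2 & 3)));
}

int main() {
  // With a byte count of 2 the result takes the low two bytes of src0 and
  // the high two bytes of src1.
  assert(alignbyte(0xAABBCCDD, 0x11223344, 2) == 0xCCDD1122);
  return 0;
}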
+multiclass VOP3OpSel_Real_gfx9_with_name<bits<10> op, string opName, string AsmName> { + defvar psName = opName#"_e64"; + def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(psName), SIEncodingFamily.VI>, // note: encoding family is VI + VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(psName).Pfl> { + VOP3_Pseudo ps = !cast<VOP3_Pseudo>(psName); + let AsmString = AsmName # ps.AsmOperands; + } +} + } // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>; @@ -2267,8 +2297,10 @@ defm V_BFI_B32 : VOP3_Real_vi <0x1ca>; defm V_FMA_F32 : VOP3_Real_vi <0x1cb>; defm V_FMA_F64 : VOP3_Real_vi <0x1cc>; defm V_LERP_U8 : VOP3_Real_vi <0x1cd>; +let SubtargetPredicate = isGFX8Only in { defm V_ALIGNBIT_B32 : VOP3_Real_vi <0x1ce>; defm V_ALIGNBYTE_B32 : VOP3_Real_vi <0x1cf>; +} defm V_MIN3_F32 : VOP3_Real_vi <0x1d0>; defm V_MIN3_I32 : VOP3_Real_vi <0x1d1>; defm V_MIN3_U32 : VOP3_Real_vi <0x1d2>; @@ -2313,6 +2345,9 @@ defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16" defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">; defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">; +defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1ce, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">; +defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1cf, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">; + defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">; defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">; defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index e51e957..9feea36 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1318,13 +1318,15 @@ let WaveSizePredicate = isWave64 in { class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0, - bit _HasMatrixReuse = 0, bit _IsF4 = 0> + bit _HasMatrixFMT = 0, bit _HasMatrixReuse = 0, + bit _IsF4 = 0> : VOP3P_Profile<VOPProfile<ArgTy>> { bit IsIU = _IsIU; bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B bit IsXF32 = !and(_IsFP8BF8XF32, !eq(ArgTy[1], v8f32)); int IndexType = _IndexType; + let HasMatrixFMT = _HasMatrixFMT; let HasMatrixReuse = _HasMatrixReuse; bit HasIModOp = _Has_ImodOp; @@ -1422,7 +1424,8 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, !eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit), !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit), !eq(IndexType, 32): (ins IndexKey32bit:$index_key_32bit)); - + dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt), + (ins)); dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins)); dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins)); dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), @@ -1436,7 +1439,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, (ins VRegSrc_64:$src2), (ins VRegSrc_32:$src2)), IndexKey)), - MatrixReuse, Clamp, Neg); + MatrixFMT, MatrixReuse, Clamp, Neg); // asm @@ -1444,13 +1447,14 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, !eq(IndexType, 8) : "$index_key_8bit", !eq(IndexType, 16) : "$index_key_16bit", !eq(IndexType, 32) : 
"$index_key_32bit"); + string MatrxFMTAsm = !if(HasMatrixFMT, "$matrix_a_fmt$matrix_b_fmt", ""); string MatrixReuseAsm = !if(HasMatrixReuse, "$matrix_a_reuse$matrix_b_reuse", ""); string ClampAsm = !if(HasClamp, "$clamp", ""); string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi", !and(NegLoAny, !not(NegHiAny)) : "$neg_lo", !and(!not(NegLoAny), !not(NegHiAny)) : ""); - let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrixReuseAsm#NegAsm#ClampAsm; + let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrxFMTAsm#MatrixReuseAsm#NegAsm#ClampAsm; // isel patterns bit IsAB_BF16_IMod0 = !and(IsAB_BF16, !not(HasIModOp)); @@ -1462,6 +1466,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, IsAB_F16_IMod0 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))), IsAB_BF16_IMod0 : (ins Src0VT:$src0), IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), + HasMatrixFMT : (ins timm:$matrix_a_fmt, Src0VT:$src0), NoABMods : (ins Src0VT:$src0)); dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0), IsAB_F16BF16_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0), @@ -1474,6 +1479,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, IsAB_F16_IMod0 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))), IsAB_BF16_IMod0 : (ins Src1VT:$src1), IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), + HasMatrixFMT : (ins timm:$matrix_b_fmt, Src1VT:$src1), NoABMods : (ins Src1VT:$src1)); dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1), IsAB_F16BF16_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1), @@ -1499,7 +1505,6 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, IsIUXF32 : (ins Src2VT:$src2), IsSWMMAC : (ins)); dag ClampPat = !if(HasClamp, (ins i1:$clamp), (ins)); - dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), !eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))), !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit))), @@ -1508,6 +1513,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, !eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit), !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit), !eq(IndexType, 32): (ins i64:$src2, i32:$index_key_32bit)); + dag MatrixFMTOutPat = !if(HasMatrixFMT, (ins i32:$matrix_a_fmt, i32:$matrix_b_fmt), (ins)); dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins (VOP3PModsNegAbs i32:$src2_modifiers)), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2)))); dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins i32:$src2_modifiers), (ins (i32 8)))), (ins Src2VT:$src2)); @@ -1515,7 +1521,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins)); dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixReuseInPat, ClampPat); - dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixReuseOutModPat, ClampPat); + dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat); dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat); dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat); @@ -1523,7 +1529,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit 
_IsSWMMAC, int _IndexType, // wmma pattern where src2 is inline imm uses _threeaddr pseudo, // can't use _twoaddr since it would violate src2 tied to vdst constraint. dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixReuseInPat, ClampPat); - dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixReuseOutModPat, ClampPat); + dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat); } def WMMAInstInfoTable : GenericTable { @@ -1632,26 +1638,45 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, // *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored // for matrix A, index is i16; Matrix B uses all lanes -def F64_F64X4_WMMA_w32 : VOP3PWMMA_Profile<[v8f64, v2f64, v2f64, v8f64], 0, 0, 0, 0, 1>; -def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 1>; -def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>; -def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 1>; -def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 1>; -def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 1>; -def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>; -def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1>; -def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1>; -def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 1>; -def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 1>; -def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 1>; -def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 1>; -def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 1>; -def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 1>; -def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 1>; -def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 1>; -def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 1>; -def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 1>; -def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 1>; +def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 0, 1>; +def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>; +def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 0, 1>; +def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 0, 1>; +def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 0, 1>; +def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>; +def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 0, 1>; +def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 0, 1>; +def 
F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 1>; +def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 1>; +def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 1>; +def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 1>; +def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 1>; +def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 1>; +def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 1>; +def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 0, 1>; +def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 0, 1>; +def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 0, 1>; +def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 0, 1>; + +multiclass WMMA_F8F6F4_Profiles<bit HasMatrixReuse> { + def _f8_f8_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; + def _f8_f6_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; + def _f8_f4_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; + def _f6_f8_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; + def _f6_f6_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; + def _f6_f4_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; + def _f4_f8_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; + def _f4_f6_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; + def _f4_f4_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; +} + +defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0>; + +multiclass WMMAInst_SrcFormats_mc<string OpName, string Profile> { + foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in { + defm _#I#_w32 : WMMAInstGFX12<OpName # "_" # I # "_w32", !cast<VOP3PWMMA_Profile>(Profile # "_" # I # "_w32"), "_w32">; + } +} let WaveSizePredicate = isWave32 in { let SubtargetPredicate = isGFX125xOnly in { @@ -1697,6 +1722,8 @@ defm V_SWMMAC_I32_16X16X128_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x12 defm V_SWMMAC_F32_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64_f16", F32_F16X64_SWMMAC_w32, "_w32">; defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64_f16", F16_F16X64_SWMMAC_w32, "_w32">; +defm V_WMMA_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4">; + } // End is_wmma_xdl = 1. 
} // End SubtargetPredicate = isGFX125xOnly @@ -1854,6 +1881,10 @@ let SubtargetPredicate = isGFX125xOnly in { defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>; defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>; + foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in { + defm : WMMAPat<"V_WMMA_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_" # I # "_w32")>; + } + def : SWMMACPat<V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>; def : SWMMACPat<V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16_16x16x64_bf16, BF16_BF16X64_SWMMAC_w32>; def : SWMMACPat<V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>; @@ -1912,17 +1943,22 @@ multiclass VOP3P_Real_Base<GFXGen Gen, bits<8> op, string backing_ps_name = NAME class VOP3PeWmma<bits<8> op, VOPProfile P, VOP3PWMMA_Profile WMMAP> : VOP3Pe_gfx11_gfx12<op, P>{ + // opsel - let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0, + let Inst{11} = !cond(WMMAP.HasMatrixFMT : matrix_a_fmt{0}, + !eq(WMMAP.IndexType, 0) : 0, !eq(WMMAP.IndexType, 8) : index_key_8bit{0}, !eq(WMMAP.IndexType, 16) : index_key_16bit{0}, !eq(WMMAP.IndexType, 32) : index_key_32bit{0}); - let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0); - let Inst{13} = !if(WMMAP.HasMatrixReuse, matrix_a_reuse, 0); + let Inst{12} = !if(WMMAP.HasMatrixFMT, matrix_a_fmt{1}, + !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0)); + let Inst{13} = !if (WMMAP.HasMatrixFMT, matrix_a_fmt{2}, + !if(WMMAP.HasMatrixReuse, matrix_a_reuse, 0)); // opsel_hi - let Inst{59} = 1; - let Inst{60} = 1; - let Inst{14} = !if(WMMAP.HasMatrixReuse, matrix_b_reuse, 1); + let Inst{59} = !if (WMMAP.HasMatrixFMT, matrix_b_fmt{0}, 1); + let Inst{60} = !if (WMMAP.HasMatrixFMT, matrix_b_fmt{1}, 1); + let Inst{14} = !if (WMMAP.HasMatrixFMT, matrix_b_fmt{2}, + !if(WMMAP.HasMatrixReuse, matrix_b_reuse, 1)); // neg_lo let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0); let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0); @@ -1961,6 +1997,24 @@ multiclass VOP3P_Real_WMMA_gfx1250 <bits<8> op, VOP3PWMMA_Profile WMMAP> { } } +multiclass VOP3P_Real_WMMA_F8F6F4_gfx1250<bits<8> op, VOP3PWMMA_Profile WMMAP> { + defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr"); + defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32"))); + defvar psName = !substr(NAME, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32"))); + let AsmString = asmName # PS.AsmOperands in + defm NAME : VOP3P_Real_WMMA_gfx1250<op, WMMAP>, + MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_twoaddr_gfx1250">; +} + +multiclass VOP3P_Real_WMMA_gfx1250_SrcFormats<bits<8> op, string WMMAP> { + defm _f8_f8_w32 : VOP3P_Real_WMMA_F8F6F4_gfx1250<op, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>; + foreach I = ["f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in { + let isAsmParserOnly = true in { // Disable ambiguous disassembly. 
+ defm _#I#_w32 : VOP3P_Real_WMMA_F8F6F4_gfx1250<op, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>; + } + } +} + defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>; defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>; defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>; @@ -2035,6 +2089,8 @@ defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x086, F16_FP8B defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8BF8X128_WMMA_w32>; defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>; +defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x033, "F32_16X16X128_F8F6F4">; + defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>; defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>; defm V_SWMMAC_F16_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x067, F16_F16X64_SWMMAC_w32>; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index a25ebdf..c21e2d3 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -453,6 +453,8 @@ class VOP3Pe_Base { bits<2> index_key_8bit; bits<1> index_key_16bit; bits<1> index_key_32bit; + bits<3> matrix_a_fmt; + bits<3> matrix_b_fmt; bits<1> matrix_a_reuse; bits<1> matrix_b_reuse; } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index eaba6fe..a7a9911 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -593,7 +593,7 @@ public: getContext().reportError(Loc, "relocated expression must be 32-bit"); return; } - getOrCreateDataFragment(); + getCurrentFragment(); } emitDataMappingSymbol(); @@ -1207,7 +1207,7 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) { } void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) { - MCFragment *Frag = getOrCreateDataFragment(); + MCFragment *Frag = getCurrentFragment(); Frag->addFixup(MCFixup::create(Frag->getContents().size(), Expr, Kind)); } @@ -1295,7 +1295,7 @@ void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) { MCSymbolRefExpr::create(PersonalitySym, ARM::S_ARM_NONE, getContext()); visitUsedExpr(*PersonalityRef); - MCFragment *DF = getOrCreateDataFragment(); + MCFragment *DF = getCurrentFragment(); DF->addFixup( MCFixup::create(DF->getContents().size(), PersonalityRef, FK_Data_4)); } diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp index db09738..128cc0b 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp @@ -514,19 +514,7 @@ bool AVRAsmBackend::forceRelocation(const MCFragment &F, const MCFixup &Fixup, return false; case AVR::fixup_7_pcrel: - case AVR::fixup_13_pcrel: { - uint64_t Offset = Target.getConstant(); - uint64_t Size = AVRAsmBackend::getFixupKindInfo(Fixup.getKind()).TargetSize; - - // If the jump is too large to encode it, fall back to a relocation. - // - // Note that trying to actually link that relocation *would* fail, but the - // hopes are that the module we're currently compiling won't be actually - // linked to the final binary. 
- return !adjust::adjustRelativeBranch(Size, Fixup, Offset, - getContext().getSubtargetInfo()); - } - + case AVR::fixup_13_pcrel: case AVR::fixup_call: return true; } diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp index 703a9e5..c8866bf 100644 --- a/llvm/lib/Target/DirectX/DXILPrepare.cpp +++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp @@ -24,7 +24,6 @@ #include "llvm/IR/AttributeMask.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -240,11 +239,6 @@ public: for (size_t Idx = 0, End = F.arg_size(); Idx < End; ++Idx) F.removeParamAttrs(Idx, AttrMask); - // Lifetime intrinsics in LLVM 3.7 do not have the memory FnAttr - if (Intrinsic::ID IID = F.getIntrinsicID(); - IID == Intrinsic::lifetime_start || IID == Intrinsic::lifetime_end) - F.removeFnAttr(Attribute::Memory); - for (auto &BB : F) { IRBuilder<> Builder(&BB); for (auto &I : make_early_inc_range(BB)) { @@ -253,7 +247,7 @@ public: // Emtting NoOp bitcast instructions allows the ValueEnumerator to be // unmodified as it reserves instruction IDs during contruction. - if (auto *LI = dyn_cast<LoadInst>(&I)) { + if (auto LI = dyn_cast<LoadInst>(&I)) { if (Value *NoOpBitcast = maybeGenerateBitcast( Builder, PointerTypes, I, LI->getPointerOperand(), LI->getType())) { @@ -263,7 +257,7 @@ public: } continue; } - if (auto *SI = dyn_cast<StoreInst>(&I)) { + if (auto SI = dyn_cast<StoreInst>(&I)) { if (Value *NoOpBitcast = maybeGenerateBitcast( Builder, PointerTypes, I, SI->getPointerOperand(), SI->getValueOperand()->getType())) { @@ -274,7 +268,7 @@ public: } continue; } - if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { + if (auto GEP = dyn_cast<GetElementPtrInst>(&I)) { if (Value *NoOpBitcast = maybeGenerateBitcast( Builder, PointerTypes, I, GEP->getPointerOperand(), GEP->getSourceElementType())) @@ -286,17 +280,6 @@ public: CB->removeRetAttrs(AttrMask); for (size_t Idx = 0, End = CB->arg_size(); Idx < End; ++Idx) CB->removeParamAttrs(Idx, AttrMask); - // LLVM 3.7 Lifetime intrinics require an i8* pointer operand, so we - // insert a bitcast here to ensure that is the case - if (isa<LifetimeIntrinsic>(CB)) { - Value *PtrOperand = CB->getArgOperand(1); - Builder.SetInsertPoint(CB); - PointerType *PtrTy = cast<PointerType>(PtrOperand->getType()); - Value *NoOpBitcast = Builder.Insert( - CastInst::Create(Instruction::BitCast, PtrOperand, - Builder.getPtrTy(PtrTy->getAddressSpace()))); - CB->setArgOperand(1, NoOpBitcast); - } continue; } } diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp index eb4adfe..bd3349d 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp @@ -152,7 +152,7 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF, if (!CSF.Int64Ops) CSF.Int64Ops = I.getType()->isIntegerTy(64); - if (!CSF.Int64Ops && !isa<LifetimeIntrinsic>(&I)) { + if (!CSF.Int64Ops) { for (const Value *Op : I.operands()) { if (Op->getType()->isIntegerTy(64)) { CSF.Int64Ops = true; diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 46d5d71..1d79c30 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -2545,25 +2545,6 @@ void DXILBitcodeWriter::writeInstruction(const Instruction &I, 
unsigned InstID, Vals.clear(); } -// HLSL Change -namespace { -struct ValueNameCreator { - MallocAllocator Allocator; - SmallVector<ValueName *, 2> - ValueNames; // SmallVector N = 2 because we currently only expect this - // to hold ValueNames for Lifetime intrinsics - ~ValueNameCreator() { - for (auto *VN : ValueNames) - VN->Destroy(Allocator); - } - ValueName *create(StringRef Name, Value *V) { - ValueName *VN = ValueName::create(Name, Allocator, V); - ValueNames.push_back(VN); - return VN; - } -}; -} // anonymous namespace - // Emit names for globals/functions etc. void DXILBitcodeWriter::writeFunctionLevelValueSymbolTable( const ValueSymbolTable &VST) { @@ -2578,24 +2559,9 @@ void DXILBitcodeWriter::writeFunctionLevelValueSymbolTable( // to ensure the binary is the same no matter what values ever existed. SmallVector<const ValueName *, 16> SortedTable; - // HLSL Change - ValueNameCreator VNC; for (auto &VI : VST) { - ValueName *VN = VI.second->getValueName(); - // Clang mangles lifetime intrinsic names by appending '.p0' to the end, - // making them invalid lifetime intrinsics in LLVM 3.7. We can't - // demangle in dxil-prepare because it would result in invalid IR. - // Therefore we have to do this in the bitcode writer while writing its - // name to the symbol table. - if (const Function *Fn = dyn_cast<Function>(VI.getValue()); - Fn && Fn->isIntrinsic()) { - Intrinsic::ID IID = Fn->getIntrinsicID(); - if (IID == Intrinsic::lifetime_start || IID == Intrinsic::lifetime_end) - VN = VNC.create(Intrinsic::getBaseName(IID), VI.second); - } - SortedTable.push_back(VN); + SortedTable.push_back(VI.second->getValueName()); } - // The keys are unique, so there shouldn't be stability issues. llvm::sort(SortedTable, [](const ValueName *A, const ValueName *B) { return A->first() < B->first(); diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 2378664..e915a3c4 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2514,8 +2514,9 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op, assert(ResTy.isVector()); unsigned NumElts = ResTy.getVectorNumElements(); - SDValue Vector = DAG.getUNDEF(ResTy); - for (unsigned i = 0; i < NumElts; ++i) { + SDValue Vector = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Node->getOperand(0)); + for (unsigned i = 1; i < NumElts; ++i) { Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector, Node->getOperand(i), DAG.getConstant(i, DL, Subtarget.getGRLenVT())); @@ -4560,6 +4561,80 @@ static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, llvm_unreachable("Unexpected node type for vXi1 sign extension"); } +static SDValue +performSETCC_BITCASTCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const LoongArchSubtarget &Subtarget) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + + if (Src.getOpcode() != ISD::SETCC || !Src.hasOneUse()) + return SDValue(); + + bool UseLASX; + unsigned Opc = ISD::DELETED_NODE; + EVT CmpVT = Src.getOperand(0).getValueType(); + EVT EltVT = CmpVT.getVectorElementType(); + + if (Subtarget.hasExtLSX() && CmpVT.getSizeInBits() == 128) + UseLASX = false; + else if (Subtarget.has32S() && Subtarget.hasExtLASX() && + CmpVT.getSizeInBits() == 256) + UseLASX = true; + else + return SDValue(); + + SDValue SrcN1 = Src.getOperand(1); + switch 
(cast<CondCodeSDNode>(Src.getOperand(2))->get()) { + default: + break; + case ISD::SETEQ: + // x == 0 => not (vmsknez.b x) + if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8) + Opc = UseLASX ? LoongArchISD::XVMSKEQZ : LoongArchISD::VMSKEQZ; + break; + case ISD::SETGT: + // x > -1 => vmskgez.b x + if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) && EltVT == MVT::i8) + Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ; + break; + case ISD::SETGE: + // x >= 0 => vmskgez.b x + if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8) + Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ; + break; + case ISD::SETLT: + // x < 0 => vmskltz.{b,h,w,d} x + if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && + (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 || + EltVT == MVT::i64)) + Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ; + break; + case ISD::SETLE: + // x <= -1 => vmskltz.{b,h,w,d} x + if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) && + (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 || + EltVT == MVT::i64)) + Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ; + break; + case ISD::SETNE: + // x != 0 => vmsknez.b x + if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8) + Opc = UseLASX ? LoongArchISD::XVMSKNEZ : LoongArchISD::VMSKNEZ; + break; + } + + if (Opc == ISD::DELETED_NODE) + return SDValue(); + + SDValue V = DAG.getNode(Opc, DL, MVT::i64, Src.getOperand(0)); + EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements()); + V = DAG.getZExtOrTrunc(V, DL, T); + return DAG.getBitcast(VT, V); +} + static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const LoongArchSubtarget &Subtarget) { @@ -4574,110 +4649,63 @@ static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG, if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1) return SDValue(); - unsigned Opc = ISD::DELETED_NODE; // Combine SETCC and BITCAST into [X]VMSK{LT,GE,NE} when possible + SDValue Res = performSETCC_BITCASTCombine(N, DAG, DCI, Subtarget); + if (Res) + return Res; + + // Generate vXi1 using [X]VMSKLTZ + MVT SExtVT; + unsigned Opc; + bool UseLASX = false; + bool PropagateSExt = false; + if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse()) { - bool UseLASX; EVT CmpVT = Src.getOperand(0).getValueType(); - EVT EltVT = CmpVT.getVectorElementType(); - - if (Subtarget.hasExtLSX() && CmpVT.getSizeInBits() <= 128) - UseLASX = false; - else if (Subtarget.has32S() && Subtarget.hasExtLASX() && - CmpVT.getSizeInBits() <= 256) - UseLASX = true; - else + if (CmpVT.getSizeInBits() > 256) return SDValue(); - - SDValue SrcN1 = Src.getOperand(1); - switch (cast<CondCodeSDNode>(Src.getOperand(2))->get()) { - default: - break; - case ISD::SETEQ: - // x == 0 => not (vmsknez.b x) - if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8) - Opc = UseLASX ? LoongArchISD::XVMSKEQZ : LoongArchISD::VMSKEQZ; - break; - case ISD::SETGT: - // x > -1 => vmskgez.b x - if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) && EltVT == MVT::i8) - Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ; - break; - case ISD::SETGE: - // x >= 0 => vmskgez.b x - if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8) - Opc = UseLASX ? 
LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ; - break; - case ISD::SETLT: - // x < 0 => vmskltz.{b,h,w,d} x - if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && - (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 || - EltVT == MVT::i64)) - Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ; - break; - case ISD::SETLE: - // x <= -1 => vmskltz.{b,h,w,d} x - if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) && - (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 || - EltVT == MVT::i64)) - Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ; - break; - case ISD::SETNE: - // x != 0 => vmsknez.b x - if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8) - Opc = UseLASX ? LoongArchISD::XVMSKNEZ : LoongArchISD::VMSKNEZ; - break; - } } - // Generate vXi1 using [X]VMSKLTZ - if (Opc == ISD::DELETED_NODE) { - MVT SExtVT; - bool UseLASX = false; - bool PropagateSExt = false; - switch (SrcVT.getSimpleVT().SimpleTy) { - default: - return SDValue(); - case MVT::v2i1: - SExtVT = MVT::v2i64; - break; - case MVT::v4i1: - SExtVT = MVT::v4i32; - if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) { - SExtVT = MVT::v4i64; - UseLASX = true; - PropagateSExt = true; - } - break; - case MVT::v8i1: - SExtVT = MVT::v8i16; - if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) { - SExtVT = MVT::v8i32; - UseLASX = true; - PropagateSExt = true; - } - break; - case MVT::v16i1: - SExtVT = MVT::v16i8; - if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) { - SExtVT = MVT::v16i16; - UseLASX = true; - PropagateSExt = true; - } - break; - case MVT::v32i1: - SExtVT = MVT::v32i8; + switch (SrcVT.getSimpleVT().SimpleTy) { + default: + return SDValue(); + case MVT::v2i1: + SExtVT = MVT::v2i64; + break; + case MVT::v4i1: + SExtVT = MVT::v4i32; + if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) { + SExtVT = MVT::v4i64; UseLASX = true; - break; - }; - if (UseLASX && !Subtarget.has32S() && !Subtarget.hasExtLASX()) - return SDValue(); - Src = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL) - : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); - Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ; - } else { - Src = Src.getOperand(0); - } + PropagateSExt = true; + } + break; + case MVT::v8i1: + SExtVT = MVT::v8i16; + if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) { + SExtVT = MVT::v8i32; + UseLASX = true; + PropagateSExt = true; + } + break; + case MVT::v16i1: + SExtVT = MVT::v16i8; + if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) { + SExtVT = MVT::v16i16; + UseLASX = true; + PropagateSExt = true; + } + break; + case MVT::v32i1: + SExtVT = MVT::v32i8; + UseLASX = true; + break; + }; + if (UseLASX && !(Subtarget.has32S() && Subtarget.hasExtLASX())) + return SDValue(); + Src = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL) + : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); + Opc = UseLASX ? 
LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ; SDValue V = DAG.getNode(Opc, DL, MVT::i64, Src); EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements()); diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index a0107e4..5096a8f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1651,18 +1651,20 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm), (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>; def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm), (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>; -def : Pat<(vector_insert v8f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm), - (XVINSGR2VR_W $vd, $rj, uimm3:$imm)>; -def : Pat<(vector_insert v4f64:$vd, (f64 (bitconvert i64:$rj)), uimm2:$imm), - (XVINSGR2VR_D $vd, $rj, uimm2:$imm)>; +def : Pat<(vector_insert v8f32:$xd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm), + (XVINSGR2VR_W $xd, $rj, uimm3:$imm)>; +def : Pat<(vector_insert v4f64:$xd, (f64 (bitconvert i64:$rj)), uimm2:$imm), + (XVINSGR2VR_D $xd, $rj, uimm2:$imm)>; def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2), (XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>; def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2), (XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>; + +// XVINSVE0_{W/D} def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm), - (XVINSGR2VR_W $xd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>; + (XVINSVE0_W $xd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), uimm3:$imm)>; def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm), - (XVINSGR2VR_D $xd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>; + (XVINSVE0_D $xd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), uimm2:$imm)>; // scalar_to_vector def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)), diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 962e7c2..3c9defb 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1842,10 +1842,19 @@ def : Pat<(vector_insert v4f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm2:$ (VINSGR2VR_W $vd, $rj, uimm2:$imm)>; def : Pat<(vector_insert v2f64:$vd, (f64 (bitconvert i64:$rj)), uimm1:$imm), (VINSGR2VR_D $vd, $rj, uimm1:$imm)>; -def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm), - (VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>; -def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm), - (VINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm1:$imm)>; + +// VEXTRINS_{W/D} +foreach imm = 0...3 in { + defvar Imm = !shl(imm, 4); + def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, imm), + (VEXTRINS_W $vd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), Imm)>; +} + +foreach imm = 0...1 in { + defvar Imm = !shl(imm, 4); + def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, imm), + (VEXTRINS_D $vd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), Imm)>; +} // scalar_to_vector def : Pat<(v4f32 (scalar_to_vector FPR32:$fj)), diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index 7b9f115..8fa72bc 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ 
b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -177,74 +177,6 @@ void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, } } -// Linker relaxation may change code size. We have to insert Nops -// for .align directive when linker relaxation enabled. So then Linker -// could satisfy alignment by removing Nops. -// The function returns the total Nops Size we need to insert. -bool LoongArchAsmBackend::shouldInsertExtraNopBytesForCodeAlign( - const MCAlignFragment &AF, unsigned &Size) { - // Calculate Nops Size only when linker relaxation enabled. - if (!AF.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax)) - return false; - - // Ignore alignment if MaxBytesToEmit is less than the minimum Nop size. - const unsigned MinNopLen = 4; - if (AF.getMaxBytesToEmit() < MinNopLen) - return false; - Size = AF.getAlignment().value() - MinNopLen; - return AF.getAlignment() > MinNopLen; -} - -// We need to insert R_LARCH_ALIGN relocation type to indicate the -// position of Nops and the total bytes of the Nops have been inserted -// when linker relaxation enabled. -// The function inserts fixup_loongarch_align fixup which eventually will -// transfer to R_LARCH_ALIGN relocation type. -// The improved R_LARCH_ALIGN requires symbol index. The lowest 8 bits of -// addend represent alignment and the other bits of addend represent the -// maximum number of bytes to emit. The maximum number of bytes is zero -// means ignore the emit limit. -bool LoongArchAsmBackend::shouldInsertFixupForCodeAlign(MCAssembler &Asm, - MCAlignFragment &AF) { - // Insert the fixup only when linker relaxation enabled. - if (!AF.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax)) - return false; - - // Calculate total Nops we need to insert. If there are none to insert - // then simply return. - unsigned InsertedNopBytes; - if (!shouldInsertExtraNopBytesForCodeAlign(AF, InsertedNopBytes)) - return false; - - MCSection *Sec = AF.getParent(); - MCContext &Ctx = getContext(); - const MCExpr *Dummy = MCConstantExpr::create(0, Ctx); - MCFixup Fixup = MCFixup::create(0, Dummy, ELF::R_LARCH_ALIGN); - unsigned MaxBytesToEmit = AF.getMaxBytesToEmit(); - - auto createExtendedValue = [&]() { - const MCSymbolRefExpr *MCSym = getSecToAlignSym()[Sec]; - if (MCSym == nullptr) { - // Define a marker symbol at the section with an offset of 0. - MCSymbol *Sym = Ctx.createNamedTempSymbol("la-relax-align"); - Sym->setFragment(&*Sec->getBeginSymbol()->getFragment()); - Asm.registerSymbol(*Sym); - MCSym = MCSymbolRefExpr::create(Sym, Ctx); - getSecToAlignSym()[Sec] = MCSym; - } - return MCValue::get(&MCSym->getSymbol(), nullptr, - MaxBytesToEmit << 8 | Log2(AF.getAlignment())); - }; - - uint64_t FixedValue = 0; - MCValue Value = MaxBytesToEmit >= InsertedNopBytes - ? MCValue::get(InsertedNopBytes) - : createExtendedValue(); - Asm.getWriter().recordRelocation(AF, Fixup, Value, FixedValue); - - return true; -} - bool LoongArchAsmBackend::shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target) { switch (Fixup.getKind()) { @@ -279,6 +211,53 @@ getRelocPairForSize(unsigned Size) { } } +// Check if an R_LARCH_ALIGN relocation is needed for an alignment directive. +// If conditions are met, compute the padding size and create a fixup encoding +// the padding size in the addend. If MaxBytesToEmit is smaller than the padding +// size, the fixup encodes MaxBytesToEmit in the higher bits and references a +// per-section marker symbol. 
+bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { + // Use default handling unless linker relaxation is enabled and the + // MaxBytesToEmit >= the nop size. + if (!F.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax)) + return false; + const unsigned MinNopLen = 4; + unsigned MaxBytesToEmit = F.getAlignMaxBytesToEmit(); + if (MaxBytesToEmit < MinNopLen) + return false; + + Size = F.getAlignment().value() - MinNopLen; + if (F.getAlignment() <= MinNopLen) + return false; + + MCContext &Ctx = getContext(); + const MCExpr *Expr = nullptr; + if (MaxBytesToEmit >= Size) { + Expr = MCConstantExpr::create(Size, getContext()); + } else { + MCSection *Sec = F.getParent(); + const MCSymbolRefExpr *SymRef = getSecToAlignSym()[Sec]; + if (SymRef == nullptr) { + // Define a marker symbol at the section with an offset of 0. + MCSymbol *Sym = Ctx.createNamedTempSymbol("la-relax-align"); + Sym->setFragment(&*Sec->getBeginSymbol()->getFragment()); + Asm->registerSymbol(*Sym); + SymRef = MCSymbolRefExpr::create(Sym, Ctx); + getSecToAlignSym()[Sec] = SymRef; + } + Expr = MCBinaryExpr::createAdd( + SymRef, + MCConstantExpr::create((MaxBytesToEmit << 8) | Log2(F.getAlignment()), + Ctx), + Ctx); + } + MCFixup Fixup = + MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN); + F.setVarFixups({Fixup}); + F.getParent()->setLinkerRelaxable(); + return true; +} + std::pair<bool, bool> LoongArchAsmBackend::relaxLEB128(MCFragment &F, int64_t &Value) const { const MCExpr &Expr = F.getLEBValue(); @@ -434,7 +413,7 @@ bool LoongArchAsmBackend::isPCRelFixupResolved(const MCSymbol *SymA, // Otherwise, check if the offset between the symbol and fragment is fully // resolved, unaffected by linker-relaxable fragments (e.g. instructions or - // offset-affected MCAlignFragment). Complements the generic + // offset-affected FT_Align fragments). Complements the generic // isSymbolRefDifferenceFullyResolvedImpl. if (!PCRelTemp) PCRelTemp = getContext().createTempSymbol(); diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h index b32ba06..3d929fc 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h @@ -45,20 +45,13 @@ public: MutableArrayRef<char> Data, uint64_t Value, bool IsResolved) override; - // Return Size with extra Nop Bytes for alignment directive in code section. - bool shouldInsertExtraNopBytesForCodeAlign(const MCAlignFragment &AF, - unsigned &Size) override; - - // Insert target specific fixup type for alignment directive in code section. 
- bool shouldInsertFixupForCodeAlign(MCAssembler &Asm, - MCAlignFragment &AF) override; - bool shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target); std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; + bool relaxAlign(MCFragment &F, unsigned &Size) override; bool relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const override; bool relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const override; std::pair<bool, bool> relaxLEB128(MCFragment &F, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp index 03ce004..7cefb3f 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp @@ -52,6 +52,9 @@ static ABI getTripleABI(const Triple &TT) { bool Is64Bit = TT.isArch64Bit(); ABI TripleABI; switch (TT.getEnvironment()) { + case llvm::Triple::EnvironmentType::UnknownEnvironment: + TripleABI = ABI_Unknown; + break; case llvm::Triple::EnvironmentType::GNUSF: case llvm::Triple::EnvironmentType::MuslSF: TripleABI = Is64Bit ? ABI_LP64S : ABI_ILP32S; @@ -96,7 +99,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits, // 1. If the '-target-abi' is valid, use it. if (IsABIValidForFeature(ArgProvidedABI)) { - if (TT.hasEnvironment() && ArgProvidedABI != TripleABI) + if (IsABIValidForFeature(TripleABI) && ArgProvidedABI != TripleABI) errs() << "warning: triple-implied ABI conflicts with provided target-abi '" << ABIName << "', using target-abi\n"; @@ -164,10 +167,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits, return Is64Bit ? ABI_LP64F : ABI_ILP32F; return Is64Bit ? ABI_LP64S : ABI_ILP32S; }; - if (ABIName.empty()) - errs() << "warning: the triple-implied ABI is invalid, ignoring and using " - "feature-implied ABI\n"; - else + if (!ABIName.empty()) errs() << "warning: both target-abi and the triple-implied ABI are " "invalid, ignoring and using feature-implied ABI\n"; return checkABIStandardized(GetFeatureABI()); diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index ad8f5f0..7abe9c9 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -385,11 +385,12 @@ void MipsELFObjectWriter::sortRelocs(std::vector<ELFRelocationEntry> &Relocs) { if (hasRelocationAddend()) return; - // Sort relocations by the address they are applied to. - llvm::sort(Relocs, - [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) { - return A.Offset < B.Offset; - }); + // Sort relocations by r_offset. There might be more than one at an offset + // with composed relocations or .reloc directives. + llvm::stable_sort( + Relocs, [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) { + return A.Offset < B.Offset; + }); // Place relocations in a list for reorder convenience. Hi16 contains the // iterators of high-part relocations. 
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index b89d689..feb4eb3 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -1033,45 +1033,40 @@ MCELFStreamer &MipsTargetELFStreamer::getStreamer() { } void MipsTargetELFStreamer::emitGPRel32Value(const MCExpr *Value) { - MCFragment *DF = getStreamer().getOrCreateDataFragment(); - DF->addFixup(MCFixup::create(DF->getContents().size(), Value, - Mips::fixup_Mips_GPREL32)); - DF->appendContents(4, 0); + auto &S = getStreamer(); + S.addFixup(Value, Mips::fixup_Mips_GPREL32); + S.appendContents(4, 0); } void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) { - MCFragment *DF = getStreamer().getOrCreateDataFragment(); - DF->addFixup(MCFixup::create(DF->getContents().size(), Value, - Mips::fixup_Mips_GPREL32)); - DF->appendContents(8, 0); + auto &S = getStreamer(); + // fixup_Mips_GPREL32 desginates R_MIPS_GPREL32+R_MIPS_64 on MIPS64. + S.addFixup(Value, Mips::fixup_Mips_GPREL32); + S.appendContents(8, 0); } void MipsTargetELFStreamer::emitDTPRel32Value(const MCExpr *Value) { - MCFragment *DF = getStreamer().getOrCreateDataFragment(); - DF->addFixup(MCFixup::create(DF->getContents().size(), Value, - Mips::fixup_Mips_DTPREL32)); - DF->appendContents(4, 0); + auto &S = getStreamer(); + S.addFixup(Value, Mips::fixup_Mips_DTPREL32); + S.appendContents(4, 0); } void MipsTargetELFStreamer::emitDTPRel64Value(const MCExpr *Value) { - MCFragment *DF = getStreamer().getOrCreateDataFragment(); - DF->addFixup(MCFixup::create(DF->getContents().size(), Value, - Mips::fixup_Mips_DTPREL64)); - DF->appendContents(8, 0); + auto &S = getStreamer(); + S.addFixup(Value, Mips::fixup_Mips_DTPREL64); + S.appendContents(8, 0); } void MipsTargetELFStreamer::emitTPRel32Value(const MCExpr *Value) { - MCFragment *DF = getStreamer().getOrCreateDataFragment(); - DF->addFixup(MCFixup::create(DF->getContents().size(), Value, - Mips::fixup_Mips_TPREL32)); - DF->appendContents(4, 0); + auto &S = getStreamer(); + S.addFixup(Value, Mips::fixup_Mips_TPREL32); + S.appendContents(4, 0); } void MipsTargetELFStreamer::emitTPRel64Value(const MCExpr *Value) { - MCFragment *DF = getStreamer().getOrCreateDataFragment(); - DF->addFixup(MCFixup::create(DF->getContents().size(), Value, - Mips::fixup_Mips_TPREL64)); - DF->appendContents(8, 0); + auto &S = getStreamer(); + S.addFixup(Value, Mips::fixup_Mips_TPREL64); + S.appendContents(8, 0); } void MipsTargetELFStreamer::emitDirectiveSetMicroMips() { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 7aa06f9..7883acc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -731,6 +731,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setTruncStoreAction(MVT::f32, MVT::bf16, Expand); setTruncStoreAction(MVT::f64, MVT::bf16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand); // PTX does not support load / store predicate registers setOperationAction(ISD::LOAD, MVT::i1, Custom); @@ -4004,7 +4006,10 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col: case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride: case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row: - case 
Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: { + case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: + case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_b16: + case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_trans_b16: + case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x2_trans_b8: { Info.opc = ISD::INTRINSIC_VOID; Info.memVT = MVT::v2i32; Info.ptrVal = I.getArgOperand(0); @@ -4027,6 +4032,30 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( return true; } + case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_b16: + case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_trans_b16: + case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x1_trans_b8: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = MVT::i32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOStore; + Info.align = Align(4); + return true; + } + + case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_b16: + case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_trans_b16: + case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x4_trans_b8: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = MVT::v4i32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOStore; + Info.align = Align(16); + return true; + } + case Intrinsic::nvvm_atomic_add_gen_f_cta: case Intrinsic::nvvm_atomic_add_gen_f_sys: case Intrinsic::nvvm_atomic_add_gen_i_cta: @@ -5060,12 +5089,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return !U.getUser()->use_empty(); } - // Handle CopyToReg nodes that will become dead after our replacement - if (U.getUser()->getOpcode() == ISD::CopyToReg) { - DeadCopyToRegs.push_back(U.getUser()); - return true; - } - // Otherwise, this use prevents us from splitting a value. return false; })) @@ -5132,10 +5155,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs)) Results.push_back(NewLoad.getValue(NewNumOutputs + I)); - // Remove dead CopyToReg nodes by folding them into the chain they reference - for (SDNode *CTR : DeadCopyToRegs) - DCI.CombineTo(CTR, CTR->getOperand(0)); - return DCI.DAG.getMergeValues(Results, DL); } @@ -6544,4 +6563,4 @@ void NVPTXTargetLowering::computeKnownBitsForTargetNode( default: break; } -}
\ No newline at end of file +} diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index a5bb83d..b5df4c6 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -131,6 +131,7 @@ def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">; +def hasTMACTAGroupSupport : Predicate<"Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()">; def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">; class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 70150bd..0a00220 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -600,12 +600,23 @@ defm CP_ASYNC_BULK_PREFETCH_CH : CP_ASYNC_BULK_PREFETCH_INTR<has_ch = 1>; // TMA Async Bulk Tensor Copy Functions //------------------------------------- -class TMA_DIMS_UTIL<int dim> { +class TMA_DIMS_UTIL<int dim, string mode = ""> { // For example, when 'dim' is 3, this generates: // an ins_dag: B32:$d0, B32:$d1, B32:$d2 // with base_str: $d0, $d1, $d2 dag ins_dag = !dag(ins, !listsplat(B32, dim), !foreach(i, !range(dim), "d" # i)); string base_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", "); + + // Tile::Gather4/scatter4 actually operate on a 2D tensor, + // though they take 5 co-ordinates. + // + // The scatter-gather happens over 4 rows with a fixed + // column-index. The first co-ordinate represents the + // col-index followed by four row-indices. + int num_dims = !cond( + !eq(mode, "tile_scatter4") : 2, + !eq(mode, "tile_gather4") : 2, + true : dim); // for all other modes } class TMA_IM2COL_UTIL<int dim, string mode> { @@ -692,14 +703,138 @@ foreach dim = [1, 2, 3, 4, 5] in { } } +multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> { + defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag; + defvar dims_str = TMA_DIMS_UTIL<dim>.base_str; + defvar asm_str_base = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; + + defvar im2col_dag = TMA_IM2COL_UTIL<dim, mode>.ins_dag; + defvar im2col_str = TMA_IM2COL_UTIL<dim, mode>.base_str; + defvar asm_str = !if(!empty(im2col_str), + asm_str_base, + asm_str_base # ", {{" # im2col_str # "}}"); + + defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims; + defvar inst_name = "cp.async.bulk.tensor" + # "." # dim_val # "d" + # "." # "shared::cluster.global" + # "." # !subst("_", "::", mode) + # "." 
# "mbarrier::complete_tx::bytes"; + defvar intr = !cast<Intrinsic>( + "int_nvvm_cp_async_bulk_tensor_g2s_" # mode # "_" # dim_val # "d"); + + defvar ins_dag = !con( + (ins ADDR:$dst, ADDR:$mbar, B64:$tmap), + dims_dag, im2col_dag, + (ins B16:$mc, B64:$ch, CTAGroupFlags:$cg)); + + defvar intr_dag_base = !con( + (intr addr:$dst, addr:$mbar, B64:$tmap), + !setdagop(dims_dag, intr), + !setdagop(im2col_dag, intr), + (intr B16:$mc, B64:$ch)); + defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, timm:$cg)); + defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, timm:$cg)); + defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, timm:$cg)); + defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, timm:$cg)); + + def "" : NVPTXInst<(outs), ins_dag, + inst_name # asm_str # ";", + [intr_dag_no_hints]>, + Requires<pred>; + def _MC : NVPTXInst<(outs), ins_dag, + inst_name # ".multicast::cluster" # asm_str # ", $mc;", + [intr_dag_with_mc]>, + Requires<pred>; + def _CH : NVPTXInst<(outs), ins_dag, + inst_name # ".L2::cache_hint" # asm_str # ", $ch;", + [intr_dag_with_ch]>, + Requires<pred>; + def _MC_CH : NVPTXInst<(outs), ins_dag, + inst_name # ".multicast::cluster.L2::cache_hint" # asm_str # ", $mc, $ch;", + [intr_dag_with_mc_ch]>, + Requires<pred>; +} +foreach dim = 3...5 in { + foreach mode = ["im2col_w", "im2col_w_128"] in { + defm TMA_G2S_ # !toupper(mode) # "_" # dim # "D" + : TMA_TENSOR_G2S_INTR<dim, mode, [hasTMACTAGroupSupport]>; + } +} +defm TMA_G2S_TILE_GATHER4_2D : TMA_TENSOR_G2S_INTR<5, "tile_gather4", + [hasTMACTAGroupSupport]>; + +multiclass TMA_TENSOR_G2S_CTA_INTR<int dim, string mode, list<Predicate> pred = []> { + defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag; + defvar dims_str = TMA_DIMS_UTIL<dim>.base_str; + defvar asm_str_base = " [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; + + defvar im2col_dag = TMA_IM2COL_UTIL<dim, mode>.ins_dag; + defvar im2col_str = TMA_IM2COL_UTIL<dim, mode>.base_str; + defvar asm_str = !if(!empty(im2col_str), + asm_str_base, + asm_str_base # ", {{" # im2col_str # "}}"); + + defvar ins_dag = !con( + (ins ADDR:$dst, ADDR:$mbar, B64:$tmap), + dims_dag, im2col_dag, + (ins B64:$ch)); + + defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims; + defvar intr = !cast<Intrinsic>( + "int_nvvm_cp_async_bulk_tensor_g2s_cta_" # mode # "_" # dim_val # "d"); + defvar intr_dag = !con( + (intr addr:$dst, addr:$mbar, B64:$tmap), + !setdagop(dims_dag, intr), + !setdagop(im2col_dag, intr), + (intr B64:$ch, 0)); + defvar intr_dag_with_ch = !con( + (intr addr:$dst, addr:$mbar, B64:$tmap), + !setdagop(dims_dag, intr), + !setdagop(im2col_dag, intr), + (intr B64:$ch, -1)); + defvar inst_name = "cp.async.bulk.tensor" + # "." # dim_val # "d" + # "." # "shared::cta.global" + # "." # !subst("_", "::", mode) + # "." 
# "mbarrier::complete_tx::bytes"; + + def "" : NVPTXInst<(outs), ins_dag, + inst_name # asm_str # ";", + [intr_dag]>, + Requires<pred>; + def _CH : NVPTXInst<(outs), ins_dag, + inst_name # ".L2::cache_hint" # asm_str # ", $ch;", + [intr_dag_with_ch]>, + Requires<pred>; +} +foreach dim = 1...5 in { + defm TMA_G2S_CTA_TILE_ # dim # "D" + : TMA_TENSOR_G2S_CTA_INTR<dim, "tile", [hasPTX<86>, hasSM<90>]>; +} +foreach dim = 3...5 in { + defm TMA_G2S_CTA_IM2COL_ # dim # "D" + : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col", [hasPTX<86>, hasSM<90>]>; + + defm TMA_G2S_CTA_IM2COL_W_ # dim # "D" + : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w", [hasPTX<86>, hasSM<100>]>; + + defm TMA_G2S_CTA_IM2COL_W_128_ # dim # "D" + : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128", [hasTMACTAGroupSupport]>; +} +defm TMA_G2S_CTA_TILE_GATHER4_2D : TMA_TENSOR_G2S_CTA_INTR<5, "tile_gather4", + [hasPTX<86>, hasSM<100>]>; + multiclass TMA_TENSOR_S2G_INTR<int dim, string mode, list<Predicate> pred = [hasPTX<80>, hasSM<90>]> { defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag; defvar dims_str = TMA_DIMS_UTIL<dim>.base_str; defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]"; + defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims; defvar intr = !cast<Intrinsic>( - "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim # d); + "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim_val # "d"); + defvar intr_dag = !con((intr addr:$src, B64:$tmap), !setdagop(dims_dag, intr), (intr B64:$ch, 0)); @@ -707,11 +842,13 @@ multiclass TMA_TENSOR_S2G_INTR<int dim, string mode, !setdagop(dims_dag, intr), (intr B64:$ch, -1)); - // For im2col mode, the actual asm_str is "im2col_no_offs" - defvar mode_asm_str = !if(!eq(mode, "im2col"), - "im2col_no_offs", mode); + // Fix-up the asm_str when it is im2col/scatter4. + defvar mode_asm_str = !cond( + !eq(mode, "im2col") : "im2col_no_offs", + !eq(mode, "tile_scatter4") : "tile::scatter4", + true : mode); defvar prefix = "cp.async.bulk.tensor" - # "." # dim # "d" + # "." # dim_val # "d" # ".global.shared::cta" # "." # mode_asm_str # ".bulk_group"; @@ -729,10 +866,12 @@ multiclass TMA_TENSOR_S2G_INTR<int dim, string mode, } foreach dim = 1...5 in { foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in { - defvar suffix = !toupper(mode) # "_" # dim # D; + defvar suffix = !toupper(mode) # "_" # dim # "D"; defm TMA_TENSOR_S2G_ # suffix : TMA_TENSOR_S2G_INTR<dim, mode>; } } +defm TMA_S2G_TILE_SCATTER4_2D : TMA_TENSOR_S2G_INTR<5, "tile_scatter4", + [hasTMACTAGroupSupport]>; def TMAReductionFlags : Operand<i32> { let PrintMethod = "printTmaReductionMode"; @@ -786,13 +925,14 @@ multiclass TMA_TENSOR_PREFETCH_INTR<int dim, string mode, asm_str_base, asm_str_base # ", {{" # im2col_str # "}}"); + defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims; defvar inst_name = "cp.async.bulk.prefetch.tensor" - # "." # dim # "d" + # "." # dim_val # "d" # "." # "L2.global" - # "." # mode; + # "." 
# !subst("_", "::", mode); defvar intr = !cast<Intrinsic>( - "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim # d); + "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim_val # "d"); defvar ins_dag = !con((ins B64:$tmap), dims_dag, @@ -818,10 +958,19 @@ multiclass TMA_TENSOR_PREFETCH_INTR<int dim, string mode, } foreach dim = 1...5 in { foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in { - defvar suffix = !toupper(mode) # "_" # dim # D; + defvar suffix = !toupper(mode) # "_" # dim # "D"; defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode>; } } +foreach dim = 3...5 in { + foreach mode = ["im2col_w", "im2col_w_128"] in { + defvar suffix = !toupper(mode) # "_" # dim # "D"; + defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode, + [hasTMACTAGroupSupport]>; + } +} +defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4", + [hasTMACTAGroupSupport]>; //Prefetch and Prefetchu @@ -4609,7 +4758,14 @@ class WMMA_REGINFO<WMMA_REGS r, string op> !and(!eq(op, "ldmatrix"), !eq(ptx_elt_type, "b8x16.b4x16_p64"), - !eq(geom, "m8n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>]); + !eq(geom, "m8n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>], + + !and(!eq(op, "stmatrix"),!eq(ptx_elt_type, "b16"), + !eq(geom, "m8n8")) : [hasSM<90>, hasPTX<78>], + + !and(!eq(op, "stmatrix"), + !eq(ptx_elt_type, "b8"), + !eq(geom, "m16n8")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>]); // template DAGs for instruction inputs/output. dag Outs = !dag(outs, ptx_regs, reg_names); @@ -4890,6 +5046,42 @@ defset list<WMMA_INSTR> LDMATRIXs = { } // transposed } // defset +// +// stmatrix.sync.aligned.m8n8[|.trans][|.shared].b16 +// +class STMATRIX<WMMA_REGINFO Frag, bit Transposed, string Space> + : WMMA_INSTR<STMATRIX_NAME<Frag, Transposed>.record, [!con((ins ADDR:$dst), Frag.Ins)]>, + Requires<Frag.Predicates> { + // Build PatFrag that only matches particular address space. + dag PFOperands = !con((ops node:$dst), + !dag(ops, !listsplat(node, !size(Frag.regs)), Frag.reg_names)); + PatFrag IntrFrag = PatFrag<PFOperands, + !foreach(tmp, PFOperands, !subst(ops, Intr, tmp)), + !cond(!eq(Space, ".shared"): AS_match.shared, + true: AS_match.generic)>; + // Build AS-constrained pattern. + let IntrinsicPattern = BuildPatternPF<IntrFrag, Args>.ret; + let OutOperandList = (outs); + let InOperandList = !con(Args, (ins MmaCode:$ptx)); + let AsmString = "stmatrix.sync.aligned." + # Frag.geom + # "." # Frag.frag + # !if(Transposed, ".trans", "") + # Space + # "." # Frag.ptx_elt_type + # " [$dst], " # Frag.regstring # ";"; +} + +// Create all stmatrix variants +defset list<WMMA_INSTR> STMATRIXs = { + foreach transposed = [false, true] in {foreach space = [".shared", ""] in { + foreach frag = NVVM_MMA_OPS.all_stmatrix_ops in + if NVVM_STMATRIX_SUPPORTED<frag, transposed>.ret then + def : STMATRIX<WMMA_REGINFO<frag, "stmatrix">, transposed, space>; + } // space + } // transposed +} // defset + // Constructing non-flat DAGs is still a pain. I can't !subst a dag node with a // dag, so the ptx.version must be appended *after* foreach replaces 'ins' with // the instruction record. @@ -4900,7 +5092,7 @@ class MMA_PAT<WMMA_INSTR wi> Requires<wi.Predicates>; // Build intrinsic->instruction patterns for all MMA instructions. 
-foreach mma = !listconcat(MMAs, WMMAs, MMA_LDSTs, LDMATRIXs) in +foreach mma = !listconcat(MMAs, WMMAs, MMA_LDSTs, LDMATRIXs, STMATRIXs) in def : MMA_PAT<mma>; multiclass MAPA<string suffix, Intrinsic Intr> { diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index 1ac91fa..80fac18 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -53,34 +53,30 @@ let Predicates = [IsISAFuture] in { let Predicates = [HasVSX, IsISAFuture] in { let mayLoad = 1 in { - def LXVRL : XX1Form_memOp<31, 525, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB), - "lxvrl $XT, $RA, $RB", IIC_LdStLoad, []>; - - def LXVRLL : XX1Form_memOp<31, 557, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB), - "lxvrll $XT, $RA, $RB", IIC_LdStLoad, []>; - - def LXVPRL : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp), - (ins memr:$RA, g8rc:$RB), - "lxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>; - - def LXVPRLL : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp), - (ins memr:$RA, g8rc:$RB), - "lxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>; + def LXVRL + : XX1Form_memOp<31, 525, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB), + "lxvrl $XT, $RA, $RB", IIC_LdStLoad, []>; + def LXVRLL + : XX1Form_memOp<31, 557, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB), + "lxvrll $XT, $RA, $RB", IIC_LdStLoad, []>; + def LXVPRL + : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp), (ins memr:$RA, g8rc:$RB), + "lxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>; + def LXVPRLL + : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp), (ins memr:$RA, g8rc:$RB), + "lxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>; } let mayStore = 1 in { - def STXVRL : XX1Form_memOp<31, 653, (outs), - (ins vsrc:$XT, memr:$RA, g8rc:$RB), - "stxvrl $XT, $RA, $RB", IIC_LdStLoad, []>; - - def STXVRLL : XX1Form_memOp<31, 685, (outs), - (ins vsrc:$XT, memr:$RA, g8rc:$RB), - "stxvrll $XT, $RA, $RB", IIC_LdStLoad, []>; - + def STXVRL + : XX1Form_memOp<31, 653, (outs), (ins vsrc:$XT, memr:$RA, g8rc:$RB), + "stxvrl $XT, $RA, $RB", IIC_LdStLoad, []>; + def STXVRLL + : XX1Form_memOp<31, 685, (outs), (ins vsrc:$XT, memr:$RA, g8rc:$RB), + "stxvrll $XT, $RA, $RB", IIC_LdStLoad, []>; def STXVPRL : XForm_XTp5_XAB5<31, 717, (outs), (ins vsrprc:$XTp, memr:$RA, g8rc:$RB), "stxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>; - def STXVPRLL : XForm_XTp5_XAB5<31, 749, (outs), (ins vsrprc:$XTp, memr:$RA, g8rc:$RB), "stxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>; diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 75a0272..996b6ef 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -171,7 +171,7 @@ void PPCSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const { } void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const { + const SchedRegion &Region) const { // The GenericScheduler that we use defaults to scheduling bottom up only. // We want to schedule from both the top and the bottom and so we set // OnlyBottomUp to false. 
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index 9a97d1a..3c59a47 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -240,7 +240,8 @@ public: void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override; void overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const override; + const SchedRegion &Region) const override; + bool useAA() const override; bool enableSubRegLiveness() const override; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index f76f8b3..2c37c3b 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -302,6 +302,28 @@ void RISCVAsmBackend::relaxInstruction(MCInst &Inst, Inst = std::move(Res); } +// Check if an R_RISCV_ALIGN relocation is needed for an alignment directive. +// If conditions are met, compute the padding size and create a fixup encoding +// the padding size in the addend. +bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { + // Use default handling unless linker relaxation is enabled and the alignment + // is larger than the nop size. + const MCSubtargetInfo *STI = F.getSubtargetInfo(); + if (!STI->hasFeature(RISCV::FeatureRelax)) + return false; + unsigned MinNopLen = STI->hasFeature(RISCV::FeatureStdExtZca) ? 2 : 4; + if (F.getAlignment() <= MinNopLen) + return false; + + Size = F.getAlignment().value() - MinNopLen; + auto *Expr = MCConstantExpr::create(Size, getContext()); + MCFixup Fixup = + MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_RISCV_ALIGN); + F.setVarFixups({Fixup}); + F.getParent()->setLinkerRelaxable(); + return true; +} + bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const { MCContext &C = getContext(); @@ -637,7 +659,7 @@ bool RISCVAsmBackend::isPCRelFixupResolved(const MCSymbol *SymA, // Otherwise, check if the offset between the symbol and fragment is fully // resolved, unaffected by linker-relaxable fragments (e.g. instructions or - // offset-affected MCAlignFragment). Complements the generic + // offset-affected FT_Align fragments). Complements the generic // isSymbolRefDifferenceFullyResolvedImpl. if (!PCRelTemp) PCRelTemp = getContext().createTempSymbol(); @@ -887,55 +909,6 @@ void RISCVAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, } } -// Linker relaxation may change code size. We have to insert Nops -// for .align directive when linker relaxation enabled. So then Linker -// could satisfy alignment by removing Nops. -// The function return the total Nops Size we need to insert. -bool RISCVAsmBackend::shouldInsertExtraNopBytesForCodeAlign( - const MCAlignFragment &AF, unsigned &Size) { - // Calculate Nops Size only when linker relaxation enabled. - const MCSubtargetInfo *STI = AF.getSubtargetInfo(); - if (!STI->hasFeature(RISCV::FeatureRelax)) - return false; - - unsigned MinNopLen = STI->hasFeature(RISCV::FeatureStdExtZca) ? 2 : 4; - - if (AF.getAlignment() <= MinNopLen) { - return false; - } else { - Size = AF.getAlignment().value() - MinNopLen; - return true; - } -} - -// We need to insert R_RISCV_ALIGN relocation type to indicate the -// position of Nops and the total bytes of the Nops have been inserted -// when linker relaxation enabled. -// The function insert fixup_riscv_align fixup which eventually will -// transfer to R_RISCV_ALIGN relocation type. 
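Editorial aside on the relaxAlign change above: the padding encoded in the R_RISCV_ALIGN addend is simply the requested alignment minus the smallest nop the target can emit (2 bytes with Zca, otherwise 4). A minimal standalone sketch of that computation, in plain C++ with no LLVM MC types and purely illustrative names:

#include <cstdio>

// Returns the number of padding bytes to reserve for an alignment directive
// when linker relaxation is enabled, or 0 if no R_RISCV_ALIGN is needed.
// 'alignment' is the requested alignment in bytes; 'hasZca' selects the
// 2-byte compressed nop.
unsigned riscvAlignPadding(unsigned alignment, bool relaxEnabled, bool hasZca) {
  if (!relaxEnabled)
    return 0;                          // default handling, no relocation
  unsigned minNopLen = hasZca ? 2 : 4; // smallest nop the assembler can emit
  if (alignment <= minNopLen)
    return 0;                          // a single nop already satisfies it
  return alignment - minNopLen;        // worst-case bytes the linker may keep
}

int main() {
  // .p2align 4 with relaxation: reserve 16 - 4 = 12 bytes (or 14 with Zca).
  std::printf("%u\n", riscvAlignPadding(16, true, false)); // 12
  std::printf("%u\n", riscvAlignPadding(16, true, true));  // 14
  std::printf("%u\n", riscvAlignPadding(4, true, false));  // 0
}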
-bool RISCVAsmBackend::shouldInsertFixupForCodeAlign(MCAssembler &Asm, - MCAlignFragment &AF) { - // Insert the fixup only when linker relaxation enabled. - const MCSubtargetInfo *STI = AF.getSubtargetInfo(); - if (!STI->hasFeature(RISCV::FeatureRelax)) - return false; - - // Calculate total Nops we need to insert. If there are none to insert - // then simply return. - unsigned Count; - if (!shouldInsertExtraNopBytesForCodeAlign(AF, Count) || (Count == 0)) - return false; - - MCContext &Ctx = getContext(); - const MCExpr *Dummy = MCConstantExpr::create(0, Ctx); - MCFixup Fixup = MCFixup::create(0, Dummy, ELF::R_RISCV_ALIGN); - - uint64_t FixedValue = 0; - MCValue NopBytes = MCValue::get(Count); - Asm.getWriter().recordRelocation(AF, Fixup, NopBytes, FixedValue); - return true; -} - std::unique_ptr<MCObjectTargetWriter> RISCVAsmBackend::createObjectTargetWriter() const { return createRISCVELFObjectWriter(OSABI, Is64Bit); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h index 8c10fbe..d97d632 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h @@ -38,14 +38,6 @@ public: const MCTargetOptions &Options); ~RISCVAsmBackend() override = default; - // Return Size with extra Nop Bytes for alignment directive in code section. - bool shouldInsertExtraNopBytesForCodeAlign(const MCAlignFragment &AF, - unsigned &Size) override; - - // Insert target specific fixup type for alignment directive in code section. - bool shouldInsertFixupForCodeAlign(MCAssembler &Asm, - MCAlignFragment &AF) override; - std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &, uint64_t &) override; bool addReloc(const MCFragment &, const MCFixup &, const MCValue &, @@ -73,6 +65,7 @@ public: void relaxInstruction(MCInst &Inst, const MCSubtargetInfo &STI) const override; + bool relaxAlign(MCFragment &F, unsigned &Size) override; bool relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const override; bool relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const override; std::pair<bool, bool> relaxLEB128(MCFragment &LF, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp index aeda5ac..5abb546 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp @@ -52,15 +52,6 @@ namespace RISCV { #include "RISCVGenSearchableTables.inc" } // namespace RISCV -// Report an error but don't ask the user to report a bug. -// TODO: Remove these wrappers. -[[noreturn]] static void reportError(const char *Reason) { - reportFatalUsageError(Reason); -} -[[noreturn]] static void reportError(Error Err) { - reportFatalUsageError(std::move(Err)); -} - namespace RISCVABI { ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits, StringRef ABIName) { @@ -97,7 +88,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits, if ((TargetABI == RISCVABI::ABI::ABI_ILP32E || (TargetABI == ABI_Unknown && IsRVE && !IsRV64)) && FeatureBits[RISCV::FeatureStdExtD]) - reportError("ILP32E cannot be used with the D ISA extension"); + reportFatalUsageError("ILP32E cannot be used with the D ISA extension"); if (TargetABI != ABI_Unknown) return TargetABI; @@ -105,7 +96,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits, // If no explicit ABI is given, try to compute the default ABI. 
auto ISAInfo = RISCVFeatures::parseFeatureBits(IsRV64, FeatureBits); if (!ISAInfo) - reportError(ISAInfo.takeError()); + reportFatalUsageError(ISAInfo.takeError()); return getTargetABI((*ISAInfo)->computeDefaultABI()); } @@ -137,12 +128,12 @@ namespace RISCVFeatures { void validate(const Triple &TT, const FeatureBitset &FeatureBits) { if (TT.isArch64Bit() && !FeatureBits[RISCV::Feature64Bit]) - reportError("RV64 target requires an RV64 CPU"); + reportFatalUsageError("RV64 target requires an RV64 CPU"); if (!TT.isArch64Bit() && !FeatureBits[RISCV::Feature32Bit]) - reportError("RV32 target requires an RV32 CPU"); + reportFatalUsageError("RV32 target requires an RV32 CPU"); if (FeatureBits[RISCV::Feature32Bit] && FeatureBits[RISCV::Feature64Bit]) - reportError("RV32 and RV64 can't be combined"); + reportFatalUsageError("RV32 and RV64 can't be combined"); } llvm::Expected<std::unique_ptr<RISCVISAInfo>> diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td index cbf039e..4c303a9 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.td +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td @@ -56,19 +56,21 @@ def CSR_XLEN_F32_Interrupt: CalleeSavedRegs<(add CSR_Interrupt, def CSR_XLEN_F64_Interrupt: CalleeSavedRegs<(add CSR_Interrupt, (sequence "F%u_D", 0, 31))>; +defvar VREGS = (add (sequence "V%u", 0, 31), + (sequence "V%uM2", 0, 31, 2), + (sequence "V%uM4", 0, 31, 4), + (sequence "V%uM8", 0, 31, 8)); + // Same as CSR_Interrupt, but including all vector registers. -def CSR_XLEN_V_Interrupt: CalleeSavedRegs<(add CSR_Interrupt, - (sequence "V%u", 0, 31))>; +def CSR_XLEN_V_Interrupt: CalleeSavedRegs<(add CSR_Interrupt, VREGS)>; // Same as CSR_Interrupt, but including all 32-bit FP registers and all vector // registers. -def CSR_XLEN_F32_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F32_Interrupt, - (sequence "V%u", 0, 31))>; +def CSR_XLEN_F32_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F32_Interrupt, VREGS)>; // Same as CSR_Interrupt, but including all 64-bit FP registers and all vector // registers. -def CSR_XLEN_F64_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F64_Interrupt, - (sequence "V%u", 0, 31))>; +def CSR_XLEN_F64_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F64_Interrupt, VREGS)>; // Same as CSR_Interrupt, but excluding X16-X31. def CSR_Interrupt_RVE : CalleeSavedRegs<(sub CSR_Interrupt, diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 23b4554..b1ab76a 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -1544,10 +1544,53 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return Offset; } +static MCRegister getRVVBaseRegister(const RISCVRegisterInfo &TRI, + const Register &Reg) { + MCRegister BaseReg = TRI.getSubReg(Reg, RISCV::sub_vrm1_0); + // If it's not a grouped vector register, it doesn't have subregister, so + // the base register is just itself. + if (BaseReg == RISCV::NoRegister) + BaseReg = Reg; + return BaseReg; +} + void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const { TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + + // In TargetFrameLowering::determineCalleeSaves, any vector register is marked + // as saved if any of its subregister is clobbered, this is not correct in + // vector registers. We only want the vector register to be marked as saved + // if all of its subregisters are clobbered. 
+ // For example: + // Original behavior: If v24 is marked, v24m2, v24m4, v24m8 are also marked. + // Correct behavior: v24m2 is marked only if v24 and v25 are marked. + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); + const RISCVRegisterInfo &TRI = *STI.getRegisterInfo(); + for (unsigned i = 0; CSRegs[i]; ++i) { + unsigned CSReg = CSRegs[i]; + // Only vector registers need special care. + if (!RISCV::VRRegClass.contains(getRVVBaseRegister(TRI, CSReg))) + continue; + + SavedRegs.reset(CSReg); + + auto SubRegs = TRI.subregs(CSReg); + // Set the register and all its subregisters. + if (!MRI.def_empty(CSReg) || MRI.getUsedPhysRegsMask().test(CSReg)) { + SavedRegs.set(CSReg); + llvm::for_each(SubRegs, [&](unsigned Reg) { return SavedRegs.set(Reg); }); + } + + // Combine to super register if all of its subregisters are marked. + if (!SubRegs.empty() && llvm::all_of(SubRegs, [&](unsigned Reg) { + return SavedRegs.test(Reg); + })) + SavedRegs.set(CSReg); + } + // Unconditionally spill RA and FP only if the function uses a frame // pointer. if (hasFP(MF)) { @@ -2137,16 +2180,6 @@ static unsigned getCalleeSavedRVVNumRegs(const Register &BaseReg) { : 8; } -static MCRegister getRVVBaseRegister(const RISCVRegisterInfo &TRI, - const Register &Reg) { - MCRegister BaseReg = TRI.getSubReg(Reg, RISCV::sub_vrm1_0); - // If it's not a grouped vector register, it doesn't have subregister, so - // the base register is just itself. - if (BaseReg == RISCV::NoRegister) - BaseReg = Reg; - return BaseReg; -} - void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, bool HasFP) const { MachineFunction *MF = MBB.getParent(); diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index cfec46d2..a541c2f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3106,6 +3106,25 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr, return true; } +bool RISCVDAGToDAGISel::SelectAddrRegZextRegScale(SDValue Addr, + unsigned MaxShiftAmount, + unsigned Bits, SDValue &Base, + SDValue &Index, + SDValue &Scale) { + if (!SelectAddrRegRegScale(Addr, MaxShiftAmount, Base, Index, Scale)) + return false; + + if (Index.getOpcode() == ISD::AND) { + auto *C = dyn_cast<ConstantSDNode>(Index.getOperand(1)); + if (C && C->getZExtValue() == maskTrailingOnes<uint64_t>(Bits)) { + Index = Index.getOperand(0); + return true; + } + } + + return false; +} + bool RISCVDAGToDAGISel::SelectAddrRegReg(SDValue Addr, SDValue &Base, SDValue &Offset) { if (Addr.getOpcode() != ISD::ADD) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index 72e2f96..ee3a86e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -59,19 +59,14 @@ public: return SelectAddrRegRegScale(Addr, MaxShift, Base, Index, Scale); } + bool SelectAddrRegZextRegScale(SDValue Addr, unsigned MaxShiftAmount, + unsigned Bits, SDValue &Base, SDValue &Index, + SDValue &Scale); + template <unsigned MaxShift, unsigned Bits> bool SelectAddrRegZextRegScale(SDValue Addr, SDValue &Base, SDValue &Index, SDValue &Scale) { - if (SelectAddrRegRegScale(Addr, MaxShift, Base, Index, Scale)) { - if (Index.getOpcode() == ISD::AND) { - auto *C = dyn_cast<ConstantSDNode>(Index.getOperand(1)); - if (C && C->getZExtValue() == maskTrailingOnes<uint64_t>(Bits)) { - Index = Index.getOperand(0); - 
return true; - } - } - } - return false; + return SelectAddrRegZextRegScale(Addr, MaxShift, Bits, Base, Index, Scale); } bool SelectAddrRegReg(SDValue Addr, SDValue &Base, SDValue &Offset); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 4845a9c..d859db3a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2319,6 +2319,10 @@ bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, if (getLegalZfaFPImm(Imm, VT) >= 0) return true; + // Some constants can be produced by fli+fneg. + if (Imm.isNegative() && getLegalZfaFPImm(-Imm, VT) >= 0) + return true; + // Cannot create a 64 bit floating-point immediate value for rv32. if (Subtarget.getXLen() < VT.getScalarSizeInBits()) { // td can handle +0.0 or -0.0 already. diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index e23001a..d9c6101 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -174,6 +174,7 @@ class EltDeps<bit vl, bit mask> { def EltDepsNone : EltDeps<vl=0, mask=0>; def EltDepsVL : EltDeps<vl=1, mask=0>; +def EltDepsMask : EltDeps<vl=0, mask=1>; def EltDepsVLMask : EltDeps<vl=1, mask=1>; class EEW <bits<2> val> { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index aef410f..17067220 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -44,45 +44,48 @@ def simm10_unsigned : RISCVOp { //===----------------------------------------------------------------------===// let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class RVPUnaryImm10<bits<7> funct7, string opcodestr, - DAGOperand TyImm10 = simm10> - : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins TyImm10:$imm10), - opcodestr, "$rd, $imm10"> { +class RVPLoadImm10<bits<7> funct7, string opcodestr, + DAGOperand TyImm10 = simm10> + : RVInst<(outs GPR:$rd), (ins TyImm10:$imm10), opcodestr, "$rd, $imm10", [], + InstFormatOther> { bits<10> imm10; + bits<5> rd; let Inst{31-25} = funct7; let Inst{24-16} = imm10{8-0}; let Inst{15} = imm10{9}; + let Inst{14-12} = 0b010; + let Inst{11-7} = rd; + let Inst{6-0} = OPC_OP_IMM_32.Value; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class RVPUnaryImm8<bits<8> funct8, string opcodestr> - : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins uimm8:$uimm8), - opcodestr, "$rd, $uimm8"> { +class RVPLoadImm8<bits<8> funct8, string opcodestr> + : RVInst<(outs GPR:$rd), (ins uimm8:$uimm8), opcodestr, "$rd, $uimm8", [], + InstFormatOther> { bits<8> uimm8; + bits<5> rd; let Inst{31-24} = funct8; let Inst{23-16} = uimm8; let Inst{15} = 0b0; + let Inst{14-12} = 0b010; + let Inst{11-7} = rd; + let Inst{6-0} = OPC_OP_IMM_32.Value; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in class RVPUnary<bits<3> f, string opcodestr, dag operands, string argstr> : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), operands, opcodestr, argstr> { - bits<5> imm; - bits<5> rs1; - let Inst{31} = 0b1; let Inst{30-28} = f; let Inst{27} = 0b0; - let Inst{19-15} = rs1; } class RVPUnaryImm5<bits<3> f, string opcodestr> : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm5:$uimm5), "$rd, $rs1, $uimm5"> { bits<5> uimm5; - let imm = uimm5; let Inst{26-25} = 0b01; let Inst{24-20} = uimm5; } @@ -145,11 +148,11 @@ def PSSLAI_W : RVPUnaryImm5<0b101, "psslai.w">; } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in -def PLI_H 
: RVPUnaryImm10<0b1011000, "pli.h">; +def PLI_H : RVPLoadImm10<0b1011000, "pli.h">; let Predicates = [HasStdExtP, IsRV64] in -def PLI_W : RVPUnaryImm10<0b1011001, "pli.w">; +def PLI_W : RVPLoadImm10<0b1011001, "pli.w">; let Predicates = [HasStdExtP] in -def PLI_B : RVPUnaryImm8<0b10110100, "pli.b">; +def PLI_B : RVPLoadImm8<0b10110100, "pli.b">; let Predicates = [HasStdExtP] in { def PSEXT_H_B : RVPUnaryWUF<0b00, 0b00100, "psext.h.b">; @@ -162,6 +165,6 @@ def PSEXT_W_H : RVPUnaryWUF<0b01, 0b00101, "psext.w.h">; } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in -def PLUI_H : RVPUnaryImm10<0b1111000, "plui.h", simm10_unsigned>; +def PLUI_H : RVPLoadImm10<0b1111000, "plui.h", simm10_unsigned>; let Predicates = [HasStdExtP, IsRV64] in -def PLUI_W : RVPUnaryImm10<0b1111001, "plui.w", simm10_unsigned>; +def PLUI_W : RVPLoadImm10<0b1111001, "plui.w", simm10_unsigned>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index 5d13a87..33c7138 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -1642,7 +1642,7 @@ def VFIRST_M : RVInstV<0b010000, 0b10001, OPMVV, (outs GPR:$vd), def : MnemonicAlias<"vpopc.m", "vcpop.m">; -let Constraints = "@earlyclobber $vd", RVVConstraint = Iota, ElementsDependOn = EltDepsVLMask in { +let Constraints = "@earlyclobber $vd", RVVConstraint = Iota, ElementsDependOn = EltDepsMask in { let DestEEW = EEW1 in { // vmsbf.m set-before-first mask bit @@ -1655,7 +1655,7 @@ defm VMSOF_M : VMSFS_MV_V<"vmsof.m", 0b010100, 0b00010>; // Vector Iota Instruction defm VIOTA_M : VIOTA_MV_V<"viota.m", 0b010100, 0b10000>; -} // Constraints = "@earlyclobber $vd", RVVConstraint = Iota, ElementsDependOn = EltDepsVLMask +} // Constraints = "@earlyclobber $vd", RVVConstraint = Iota, ElementsDependOn = EltDepsMask // Vector Element Index Instruction let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index c7cb6e2..f391300 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -1377,9 +1377,9 @@ let Predicates = [HasVendorXqciac, IsRV32] in { def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12:$imm12))), (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12:$imm12)>; def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, uimm5gt3:$imm), GPRNoX0:$rs2)), - (QC_SHLADD GPRNoX0:$rs2, GPRNoX0:$rs1, uimm5gt3:$imm)>; + (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, uimm5gt3:$imm, GPRNoX0:$rs2)), - (QC_SHLADD GPRNoX0:$rs2, GPRNoX0:$rs1, uimm5gt3:$imm)>; + (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; } // Predicates = [HasVendorXqciac, IsRV32] /// Simple arithmetic operations diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index dd68a55..878401e 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -131,25 +131,56 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy, : Constant::getAllOnesValue(XLenTy); return true; } - auto *VPLdSt = cast<VPIntrinsic>(I); - assert((VPLdSt->getIntrinsicID() == Intrinsic::vp_load || - VPLdSt->getIntrinsicID() == Intrinsic::vp_store) && - "Unexpected intrinsic"); - Ptr = VPLdSt->getMemoryPointerParam(); - Alignment = VPLdSt->getPointerAlignment().value_or( - 
DL.getABITypeAlign(VTy->getElementType())); - assert(Mask && "vp.load and vp.store needs a mask!"); + auto *II = cast<IntrinsicInst>(I); + switch (II->getIntrinsicID()) { + default: + llvm_unreachable("Unsupported intrinsic type"); + case Intrinsic::vp_load: + case Intrinsic::vp_store: { + auto *VPLdSt = cast<VPIntrinsic>(I); + Ptr = VPLdSt->getMemoryPointerParam(); + Alignment = VPLdSt->getPointerAlignment().value_or( + DL.getABITypeAlign(VTy->getElementType())); + + assert(Mask && "vp.load and vp.store needs a mask!"); + + Value *WideEVL = VPLdSt->getVectorLengthParam(); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. + if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor)) + return false; - Value *WideEVL = VPLdSt->getVectorLengthParam(); - // Conservatively check if EVL is a multiple of factor, otherwise some - // (trailing) elements might be lost after the transformation. - if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor)) - return false; + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); + return true; + } + case Intrinsic::masked_load: { + Ptr = II->getOperand(0); + Alignment = cast<ConstantInt>(II->getArgOperand(1))->getAlignValue(); - auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); - VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); - return true; + if (!isa<UndefValue>(II->getOperand(3))) + return false; + + assert(Mask && "masked.load needs a mask!"); + + VL = isa<FixedVectorType>(VTy) + ? Builder.CreateElementCount(XLenTy, VTy->getElementCount()) + : Constant::getAllOnesValue(XLenTy); + return true; + } + case Intrinsic::masked_store: { + Ptr = II->getOperand(1); + Alignment = cast<ConstantInt>(II->getArgOperand(2))->getAlignValue(); + + assert(Mask && "masked.store needs a mask!"); + + VL = isa<FixedVectorType>(VTy) + ? Builder.CreateElementCount(XLenTy, VTy->getElementCount()) + : Constant::getAllOnesValue(XLenTy); + return true; + } + } } /// Lower an interleaved load into a vlsegN intrinsic. 
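A note on the vp.load/vp.store path in the hunk above: the wide EVL is only safe to split when it is an exact multiple of the interleave factor, and the per-segment VL is then the exact quotient. A small standalone sketch of that guard, in plain C++ with no LLVM IR types (the real code checks a Value conservatively via isMultipleOfN; here a constant check stands in for it):

#include <cstdint>
#include <cstdio>
#include <optional>

// Returns the per-segment vector length for an interleaved access, or
// std::nullopt when the wide EVL cannot be proven to be a multiple of the
// factor, in which case the transformation must be skipped so no trailing
// elements are dropped.
std::optional<uint64_t> segmentVL(uint64_t wideEVL, uint64_t factor) {
  if (factor == 0 || wideEVL % factor != 0)
    return std::nullopt;
  return wideEVL / factor; // exact division, mirrors CreateExactUDiv
}

int main() {
  if (auto vl = segmentVL(12, 4))
    std::printf("vl = %llu\n", (unsigned long long)*vl); // vl = 3
  if (!segmentVL(10, 4))
    std::printf("skip: EVL not a multiple of factor\n");
}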
@@ -201,6 +232,7 @@ bool RISCVTargetLowering::lowerInterleavedLoad( Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, {VTy, BasePtr->getType(), Stride->getType()}, {BasePtr, Stride, Mask, VL}); + Alignment = commonAlignment(Alignment, Indices[0] * ScalarSizeInBytes); CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), Alignment)); Shuffles[0]->replaceAllUsesWith(CI); @@ -272,8 +304,9 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, Intrinsic::experimental_vp_strided_store, {Data->getType(), BasePtr->getType(), Stride->getType()}, {Data, BasePtr, Stride, Mask, VL}); + Align Alignment = commonAlignment(SI->getAlign(), Index * ScalarSizeInBytes); CI->addParamAttr( - 1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign())); + 1, Attribute::getWithAlignment(CI->getContext(), Alignment)); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp index 28d6403..3b19c34 100644 --- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp +++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp @@ -48,6 +48,8 @@ using namespace llvm; STATISTIC(NumRemovedSExtW, "Number of removed sign-extensions"); STATISTIC(NumTransformedToWInstrs, "Number of instructions transformed to W-ops"); +STATISTIC(NumTransformedToNonWInstrs, + "Number of instructions transformed to non-W-ops"); static cl::opt<bool> DisableSExtWRemoval("riscv-disable-sextw-removal", cl::desc("Disable removal of sext.w"), @@ -67,10 +69,9 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; bool removeSExtWInstrs(MachineFunction &MF, const RISCVInstrInfo &TII, const RISCVSubtarget &ST, MachineRegisterInfo &MRI); - bool stripWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII, - const RISCVSubtarget &ST, MachineRegisterInfo &MRI); - bool appendWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII, - const RISCVSubtarget &ST, MachineRegisterInfo &MRI); + bool canonicalizeWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII, + const RISCVSubtarget &ST, + MachineRegisterInfo &MRI); void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -721,45 +722,39 @@ bool RISCVOptWInstrs::removeSExtWInstrs(MachineFunction &MF, return MadeChange; } -bool RISCVOptWInstrs::stripWSuffixes(MachineFunction &MF, - const RISCVInstrInfo &TII, - const RISCVSubtarget &ST, - MachineRegisterInfo &MRI) { +// Strips or adds W suffixes to eligible instructions depending on the +// subtarget preferences. +bool RISCVOptWInstrs::canonicalizeWSuffixes(MachineFunction &MF, + const RISCVInstrInfo &TII, + const RISCVSubtarget &ST, + MachineRegisterInfo &MRI) { + bool ShouldStripW = !(DisableStripWSuffix || ST.preferWInst()); + bool ShouldPreferW = ST.preferWInst(); bool MadeChange = false; - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - unsigned Opc; - switch (MI.getOpcode()) { - default: - continue; - case RISCV::ADDW: Opc = RISCV::ADD; break; - case RISCV::ADDIW: Opc = RISCV::ADDI; break; - case RISCV::MULW: Opc = RISCV::MUL; break; - case RISCV::SLLIW: Opc = RISCV::SLLI; break; - } - if (hasAllWUsers(MI, ST, MRI)) { - MI.setDesc(TII.get(Opc)); - MadeChange = true; - } - } - } - - return MadeChange; -} - -bool RISCVOptWInstrs::appendWSuffixes(MachineFunction &MF, - const RISCVInstrInfo &TII, - const RISCVSubtarget &ST, - MachineRegisterInfo &MRI) { - bool MadeChange = false; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { - unsigned WOpc; - // TODO: Add more? 
- switch (MI.getOpcode()) { + std::optional<unsigned> WOpc; + std::optional<unsigned> NonWOpc; + unsigned OrigOpc = MI.getOpcode(); + switch (OrigOpc) { default: continue; + case RISCV::ADDW: + NonWOpc = RISCV::ADD; + break; + case RISCV::ADDIW: + NonWOpc = RISCV::ADDI; + break; + case RISCV::MULW: + NonWOpc = RISCV::MUL; + break; + case RISCV::SLLIW: + NonWOpc = RISCV::SLLI; + break; + case RISCV::SUBW: + NonWOpc = RISCV::SUB; + break; case RISCV::ADD: WOpc = RISCV::ADDW; break; @@ -773,7 +768,7 @@ bool RISCVOptWInstrs::appendWSuffixes(MachineFunction &MF, WOpc = RISCV::MULW; break; case RISCV::SLLI: - // SLLIW reads the lowest 5 bits, while SLLI reads lowest 6 bits + // SLLIW reads the lowest 5 bits, while SLLI reads lowest 6 bits. if (MI.getOperand(2).getImm() >= 32) continue; WOpc = RISCV::SLLIW; @@ -784,19 +779,30 @@ bool RISCVOptWInstrs::appendWSuffixes(MachineFunction &MF, break; } - if (hasAllWUsers(MI, ST, MRI)) { + if (ShouldStripW && NonWOpc.has_value() && hasAllWUsers(MI, ST, MRI)) { + LLVM_DEBUG(dbgs() << "Replacing " << MI); + MI.setDesc(TII.get(NonWOpc.value())); + LLVM_DEBUG(dbgs() << " with " << MI); + ++NumTransformedToNonWInstrs; + MadeChange = true; + continue; + } + // LWU is always converted to LW when possible as 1) LW is compressible + // and 2) it helps minimise differences vs RV32. + if ((ShouldPreferW || OrigOpc == RISCV::LWU) && WOpc.has_value() && + hasAllWUsers(MI, ST, MRI)) { LLVM_DEBUG(dbgs() << "Replacing " << MI); - MI.setDesc(TII.get(WOpc)); + MI.setDesc(TII.get(WOpc.value())); MI.clearFlag(MachineInstr::MIFlag::NoSWrap); MI.clearFlag(MachineInstr::MIFlag::NoUWrap); MI.clearFlag(MachineInstr::MIFlag::IsExact); LLVM_DEBUG(dbgs() << " with " << MI); ++NumTransformedToWInstrs; MadeChange = true; + continue; } } } - return MadeChange; } @@ -813,12 +819,6 @@ bool RISCVOptWInstrs::runOnMachineFunction(MachineFunction &MF) { bool MadeChange = false; MadeChange |= removeSExtWInstrs(MF, TII, ST, MRI); - - if (!(DisableStripWSuffix || ST.preferWInst())) - MadeChange |= stripWSuffixes(MF, TII, ST, MRI); - - if (ST.preferWInst()) - MadeChange |= appendWSuffixes(MF, TII, ST, MRI); - + MadeChange |= canonicalizeWSuffixes(MF, TII, ST, MRI); return MadeChange; } diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index c754de4..e35ffaf 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -216,7 +216,7 @@ unsigned RISCVSubtarget::getMinimumJumpTableEntries() const { } void RISCVSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const { + const SchedRegion &Region) const { // Do bidirectional scheduling since it provides a more balanced scheduling // leading to better performance. This will increase compile time. 
Policy.OnlyTopDown = false; @@ -231,8 +231,8 @@ void RISCVSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, Policy.ShouldTrackPressure = true; } -void RISCVSubtarget::overridePostRASchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const { +void RISCVSubtarget::overridePostRASchedPolicy( + MachineSchedPolicy &Policy, const SchedRegion &Region) const { MISched::Direction PostRASchedDirection = getPostRASchedDirection(); if (PostRASchedDirection == MISched::TopDown) { Policy.OnlyTopDown = true; diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 4f560cc..fd57e02 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -395,11 +395,11 @@ public: } void overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const override; + const SchedRegion &Region) const override; void overridePostRASchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const override; + const SchedRegion &Region) const override; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 12bf8c1..d62d99c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -116,8 +116,8 @@ public: } TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const override { - return ST->hasVInstructions() ? TailFoldingStyle::Data - : TailFoldingStyle::DataWithoutLaneMask; + return ST->hasVInstructions() ? TailFoldingStyle::DataWithEVL + : TailFoldingStyle::None; } std::optional<unsigned> getMaxVScale() const override; std::optional<unsigned> getVScaleForTuning() const override; diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index e656e8b..b53d919 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -33,6 +33,7 @@ namespace { class RISCVVLOptimizer : public MachineFunctionPass { const MachineRegisterInfo *MRI; const MachineDominatorTree *MDT; + const TargetInstrInfo *TII; public: static char ID; @@ -1291,7 +1292,8 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { return false; } - assert(!RISCVII::elementsDependOnVL(RISCV::getRVVMCOpcode(MI.getOpcode())) && + assert(!RISCVII::elementsDependOnVL( + TII->get(RISCV::getRVVMCOpcode(MI.getOpcode())).TSFlags) && "Instruction shouldn't be supported if elements depend on VL"); assert(MI.getOperand(0).isReg() && @@ -1484,7 +1486,6 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const { } bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) { - assert(DemandedVLs.size() == 0); if (skipFunction(MF.getFunction())) return false; @@ -1495,6 +1496,10 @@ bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) { if (!ST.hasVInstructions()) return false; + TII = ST.getInstrInfo(); + + assert(DemandedVLs.empty()); + // For each instruction that defines a vector, compute what VL its // downstream users demand. 
for (MachineBasicBlock *MBB : post_order(&MF)) { diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 84ef539..c1cc19b 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -434,6 +434,15 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) { if (!isKnownSameDefs(TrueMask.getReg(), MIMask.getReg())) return false; + // Masked off lanes past TrueVL will come from False, and converting to vmv + // will lose these lanes unless MIVL <= TrueVL. + // TODO: We could relax this for False == Passthru and True policy == TU + const MachineOperand &MIVL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc())); + const MachineOperand &TrueVL = + True->getOperand(RISCVII::getVLOpNum(True->getDesc())); + if (!RISCV::isVLKnownLE(MIVL, TrueVL)) + return false; + // True's passthru needs to be equivalent to False Register TruePassthruReg = True->getOperand(1).getReg(); Register FalseReg = MI.getOperand(2).getReg(); diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 6608b3f..d4fa62a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -296,6 +296,8 @@ private: bool selectImageWriteIntrinsic(MachineInstr &I) const; bool selectResourceGetPointer(Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const; + bool selectModf(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; // Utilities std::pair<Register, bool> @@ -3235,6 +3237,9 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, case Intrinsic::spv_discard: { return selectDiscard(ResVReg, ResType, I); } + case Intrinsic::modf: { + return selectModf(ResVReg, ResType, I); + } default: { std::string DiagMsg; raw_string_ostream OS(DiagMsg); @@ -4018,6 +4023,83 @@ bool SPIRVInstructionSelector::selectLog10(Register ResVReg, .constrainAllUses(TII, TRI, RBI); } +bool SPIRVInstructionSelector::selectModf(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + // llvm.modf has a single arg --the number to be decomposed-- and returns a + // struct { restype, restype }, while OpenCLLIB::modf has two args --the + // number to be decomposed and a pointer--, returns the fractional part and + // the integral part is stored in the pointer argument. Therefore, we can't + // use directly the OpenCLLIB::modf intrinsic. However, we can do some + // scaffolding to make it work. The idea is to create an alloca instruction + // to get a ptr, pass this ptr to OpenCL::modf, and then load the value + // from this ptr to place it in the struct. llvm.modf returns the fractional + // part as the first element of the result, and the integral part as the + // second element of the result. + + // At this point, the return type is not a struct anymore, but rather two + // independent elements of SPIRVResType. We can get each independent element + // from I.getDefs() or I.getOperands(). + if (STI.canUseExtInstSet(SPIRV::InstructionSet::OpenCL_std)) { + MachineIRBuilder MIRBuilder(I); + // Get pointer type for alloca variable. + const SPIRVType *PtrType = GR.getOrCreateSPIRVPointerType( + ResType, MIRBuilder, SPIRV::StorageClass::Function); + // Create new register for the pointer type of alloca variable. 
+ Register PtrTyReg = + MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::iIDRegClass); + MIRBuilder.getMRI()->setType( + PtrTyReg, + LLT::pointer(storageClassToAddressSpace(SPIRV::StorageClass::Function), + GR.getPointerSize())); + // Assign SPIR-V type of the pointer type of the alloca variable to the + // new register. + GR.assignSPIRVTypeToVReg(PtrType, PtrTyReg, MIRBuilder.getMF()); + MachineBasicBlock &EntryBB = I.getMF()->front(); + MachineBasicBlock::iterator VarPos = + getFirstValidInstructionInsertPoint(EntryBB); + auto AllocaMIB = + BuildMI(EntryBB, VarPos, I.getDebugLoc(), TII.get(SPIRV::OpVariable)) + .addDef(PtrTyReg) + .addUse(GR.getSPIRVTypeID(PtrType)) + .addImm(static_cast<uint32_t>(SPIRV::StorageClass::Function)); + Register Variable = AllocaMIB->getOperand(0).getReg(); + // Modf must have 4 operands, the first two are the 2 parts of the result, + // the third is the operand, and the last one is the floating point value. + assert(I.getNumOperands() == 4 && + "Expected 4 operands for modf instruction"); + MachineBasicBlock &BB = *I.getParent(); + // Create the OpenCLLIB::modf instruction. + auto MIB = + BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addImm(static_cast<uint32_t>(SPIRV::InstructionSet::OpenCL_std)) + .addImm(CL::modf) + .setMIFlags(I.getFlags()) + .add(I.getOperand(3)) // Floating point value. + .addUse(Variable); // Pointer to integral part. + // Assign the integral part stored in the ptr to the second element of the + // result. + Register IntegralPartReg = I.getOperand(1).getReg(); + if (IntegralPartReg.isValid()) { + // Load the value from the pointer to integral part. + auto LoadMIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpLoad)) + .addDef(IntegralPartReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(Variable); + return LoadMIB.constrainAllUses(TII, TRI, RBI); + } + + return MIB.constrainAllUses(TII, TRI, RBI); + } else if (STI.canUseExtInstSet(SPIRV::InstructionSet::GLSL_std_450)) { + assert(false && "GLSL::Modf is deprecated."); + // FIXME: GL::Modf is deprecated, use Modfstruct instead. + return false; + } + return false; +} + // Generate the instructions to load 3-element vector builtin input // IDs/Indices. // Like: GlobalInvocationId, LocalInvocationId, etc.... 
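The scaffolding built by selectModf above mirrors the C library contract of modf: the callee writes the integral part through a pointer and returns the fractional part, and the caller then packs the two values into the pair that llvm.modf promises. A standalone illustration of that shape, in plain C++ using std::modf (the struct name is illustrative only):

#include <cmath>
#include <cstdio>

// What llvm.modf conceptually returns: { fractional, integral }.
struct ModfResult {
  double Fractional;
  double Integral;
};

// Emulates the lowering: reserve a slot for the integral part (the alloca /
// OpVariable), call the pointer-based modf (the OpExtInst), then read the
// slot back (the OpLoad) and assemble the two-element result.
ModfResult lowerModf(double x) {
  double integralSlot = 0.0;
  double frac = std::modf(x, &integralSlot);
  return {frac, integralSlot};
}

int main() {
  ModfResult r = lowerModf(3.75);
  std::printf("frac = %g, int = %g\n", r.Fractional, r.Integral); // 0.75, 3
}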
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 2bffbf7..595424b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -380,7 +380,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { bool Changed = false; const SPIRVSubtarget &STI = TM.getSubtarget<SPIRVSubtarget>(*F); for (BasicBlock &BB : *F) { - for (Instruction &I : BB) { + for (Instruction &I : make_early_inc_range(BB)) { auto Call = dyn_cast<CallInst>(&I); if (!Call) continue; @@ -408,12 +408,18 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { if (!STI.isShader()) { Changed |= toSpvOverloadedIntrinsic( II, Intrinsic::SPVIntrinsics::spv_lifetime_start, {1}); + } else { + II->eraseFromParent(); + Changed = true; } break; case Intrinsic::lifetime_end: if (!STI.isShader()) { Changed |= toSpvOverloadedIntrinsic( II, Intrinsic::SPVIntrinsics::spv_lifetime_end, {1}); + } else { + II->eraseFromParent(); + Changed = true; } break; case Intrinsic::ptr_annotation: diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index 768efb9..416d811 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -995,4 +995,27 @@ unsigned getArrayComponentCount(const MachineRegisterInfo *MRI, return foldImm(ResType->getOperand(2), MRI); } +MachineBasicBlock::iterator +getFirstValidInstructionInsertPoint(MachineBasicBlock &BB) { + // Find the position to insert the OpVariable instruction. + // We will insert it after the last OpFunctionParameter, if any, or + // after OpFunction otherwise. + MachineBasicBlock::iterator VarPos = BB.begin(); + while (VarPos != BB.end() && VarPos->getOpcode() != SPIRV::OpFunction) { + ++VarPos; + } + // Advance VarPos to the next instruction after OpFunction, it will either + // be an OpFunctionParameter, so that we can start the next loop, or the + // position to insert the OpVariable instruction. + ++VarPos; + while (VarPos != BB.end() && + VarPos->getOpcode() == SPIRV::OpFunctionParameter) { + ++VarPos; + } + // VarPos is now pointing at after the last OpFunctionParameter, if any, + // or after OpFunction, if no parameters. + return VarPos != BB.end() && VarPos->getOpcode() == SPIRV::OpLabel ? 
++VarPos + : VarPos; +} + } // namespace llvm diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h index d732188..45c520a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.h +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h @@ -506,6 +506,8 @@ MachineInstr *getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI); int64_t foldImm(const MachineOperand &MO, const MachineRegisterInfo *MRI); unsigned getArrayComponentCount(const MachineRegisterInfo *MRI, const MachineInstr *ResType); +MachineBasicBlock::iterator +getFirstValidInstructionInsertPoint(MachineBasicBlock &BB); } // namespace llvm #endif // LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 9b434d8..1aa8efe 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -2201,7 +2201,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op, SDValue Chain = DAG.getEntryNode(); SDValue InGlue; - Chain = DAG.getCALLSEQ_START(Chain, 1, 0, DL); + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InGlue); InGlue = Chain.getValue(1); SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT); @@ -2219,7 +2219,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op, InGlue}; Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, Ops); InGlue = Chain.getValue(1); - Chain = DAG.getCALLSEQ_END(Chain, 1, 0, InGlue, DL); + Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL); InGlue = Chain.getValue(1); SDValue Ret = DAG.getCopyFromReg(Chain, DL, SP::O0, PtrVT, InGlue); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index bf2e04c..09b8864 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -46,6 +46,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( : TargetLowering(TM), Subtarget(&STI) { auto MVTPtr = Subtarget->hasAddr64() ? MVT::i64 : MVT::i32; + // Set the load count for memcmp expand optimization + MaxLoadsPerMemcmp = 8; + MaxLoadsPerMemcmpOptSize = 4; + // Booleans always contain 0 or 1. setBooleanContents(ZeroOrOneBooleanContent); // Except in SIMD vectors @@ -2935,6 +2939,25 @@ performVectorExtendToFPCombine(SDNode *N, } static SDValue +performVectorNonNegToFPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + auto &DAG = DCI.DAG; + + SDNodeFlags Flags = N->getFlags(); + SDValue Op0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + // Optimize uitofp to sitofp when the sign bit is known to be zero. + // Depending on the target (runtime) backend, this might be performance + // neutral (e.g. AArch64) or a significant improvement (e.g. x86_64). 
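The WebAssembly combine above rests on a simple fact: when the sign bit of the integer source is known to be zero, unsigned and signed conversion to floating point produce the same value, so the cheaper signed form can be substituted. A standalone scalar illustration in plain C++ (the vector case in the patch applies the same reasoning lane by lane):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // For any 32-bit value with the sign bit clear, the unsigned and signed
  // interpretations coincide, so uint->float and int->float agree.
  for (uint32_t u : {0u, 1u, 123456u, 0x7fffffffu}) {
    assert((u >> 31) == 0);                       // "nonneg" / SignBitIsZero
    float fromUnsigned = static_cast<float>(u);                     // uitofp
    float fromSigned = static_cast<float>(static_cast<int32_t>(u)); // sitofp
    assert(fromUnsigned == fromSigned);
  }
  std::printf("uitofp == sitofp for all non-negative inputs tested\n");
}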
+ if (VT.isVector() && (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0))) { + return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0); + } + + return SDValue(); +} + +static SDValue performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { auto &DAG = DCI.DAG; assert(N->getOpcode() == ISD::SIGN_EXTEND || @@ -3515,6 +3538,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, case ISD::ZERO_EXTEND: return performVectorExtendCombine(N, DCI); case ISD::UINT_TO_FP: + if (auto ExtCombine = performVectorExtendToFPCombine(N, DCI)) + return ExtCombine; + return performVectorNonNegToFPCombine(N, DCI); case ISD::SINT_TO_FP: return performVectorExtendToFPCombine(N, DCI); case ISD::FP_TO_SINT_SAT: diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 4f15999..52e7065 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -141,6 +141,21 @@ InstructionCost WebAssemblyTTIImpl::getCastInstrCost( return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); } +WebAssemblyTTIImpl::TTI::MemCmpExpansionOptions +WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { + TTI::MemCmpExpansionOptions Options; + + Options.AllowOverlappingLoads = true; + + // TODO: Teach WebAssembly backend about load v128. + + Options.LoadSizes.append({8, 4, 2, 1}); + Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); + Options.NumLoadsPerBlock = Options.MaxNumLoads; + + return Options; +} + InstructionCost WebAssemblyTTIImpl::getMemoryOpCost( unsigned Opcode, Type *Ty, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index d83b8d1..c915eeb0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -73,6 +73,10 @@ public: getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr) const override; + + TTI::MemCmpExpansionOptions + enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override; + InstructionCost getMemoryOpCost( unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 3d060c6..e213923 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -127,7 +127,6 @@ class X86AsmBackend : public MCAsmBackend { unsigned PrevInstOpcode = 0; MCBoundaryAlignFragment *PendingBA = nullptr; std::pair<MCFragment *, size_t> PrevInstPosition; - bool IsRightAfterData = false; uint8_t determinePaddingPrefix(const MCInst &Inst) const; bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const; @@ -156,10 +155,13 @@ public: AlignBranchType = X86AlignBranchKindLoc; if (X86PadMaxPrefixSize.getNumOccurrences()) TargetPrefixMax = X86PadMaxPrefixSize; + + AllowAutoPadding = + AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone; + AllowEnhancedRelaxation = + AllowAutoPadding && TargetPrefixMax != 0 && X86PadForBranchAlign; } - bool allowAutoPadding() const override; - bool allowEnhancedRelaxation() const 
override; void emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst, const MCSubtargetInfo &STI); void emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst); @@ -365,14 +367,6 @@ static bool hasVariantSymbol(const MCInst &MI) { return false; } -bool X86AsmBackend::allowAutoPadding() const { - return (AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone); -} - -bool X86AsmBackend::allowEnhancedRelaxation() const { - return allowAutoPadding() && TargetPrefixMax != 0 && X86PadForBranchAlign; -} - /// X86 has certain instructions which enable interrupts exactly one /// instruction *after* the instruction which stores to SS. Return true if the /// given instruction may have such an interrupt delay slot. @@ -447,7 +441,7 @@ bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const { // semantic. return false; - if (IsRightAfterData) + if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition)) // If this instruction follows any data, there is no clear // instruction boundary, inserting a nop/prefix would change semantic. return false; @@ -484,13 +478,26 @@ bool X86AsmBackend::needAlign(const MCInst &Inst) const { (AlignBranchType & X86::AlignBranchIndirect)); } +void X86_MC::emitInstruction(MCObjectStreamer &S, const MCInst &Inst, + const MCSubtargetInfo &STI) { + bool AutoPadding = S.getAllowAutoPadding(); + if (LLVM_LIKELY(!AutoPadding && !X86PadForAlign)) { + S.MCObjectStreamer::emitInstruction(Inst, STI); + return; + } + + auto &Backend = static_cast<X86AsmBackend &>(S.getAssembler().getBackend()); + Backend.emitInstructionBegin(S, Inst, STI); + S.MCObjectStreamer::emitInstruction(Inst, STI); + Backend.emitInstructionEnd(S, Inst); +} + /// Insert BoundaryAlignFragment before instructions to align branches. void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst, const MCSubtargetInfo &STI) { - // Used by canPadInst. Done here, because in emitInstructionEnd, the current - // fragment will have changed. - IsRightAfterData = - isRightAfterData(OS.getCurrentFragment(), PrevInstPosition); + bool CanPadInst = canPadInst(Inst, OS); + if (CanPadInst) + OS.getCurrentFragment()->setAllowAutoPadding(true); if (!canPadBranches(OS)) return; @@ -504,7 +511,7 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, // we call canPadInst (not cheap) twice. However, in the common case, we can // avoid unnecessary calls to that, as this is otherwise only used for // relaxable fragments. - if (!canPadInst(Inst, OS)) + if (!CanPadInst) return; if (PendingBA && PendingBA->getNext() == OS.getCurrentFragment()) { @@ -542,11 +549,8 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, /// Set the last fragment to be aligned for the BoundaryAlignFragment. void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) { - MCFragment *CF = OS.getCurrentFragment(); - if (CF->getKind() == MCFragment::FT_Relaxable) - CF->setAllowAutoPadding(canPadInst(Inst, OS)); - // Update PrevInstOpcode here, canPadInst() reads that. + MCFragment *CF = OS.getCurrentFragment(); PrevInstOpcode = Inst.getOpcode(); PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF)); @@ -567,11 +571,10 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, // DataFragment, so that we can get the size of instructions later in // MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty // DataFragment. 
- OS.insert(OS.getContext().allocFragment<MCFragment>()); + OS.newFragment(); // Update the maximum alignment on the current section if necessary. - MCSection *Sec = OS.getCurrentSectionOnly(); - Sec->ensureMinAlignment(AlignBoundary); + CF->getParent()->ensureMinAlignment(AlignBoundary); } std::optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const { @@ -923,13 +926,11 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const { continue; } - const uint64_t OrigSize = Asm.computeFragmentSize(F); - // To keep the effects local, prefer to relax instructions closest to // the align directive. This is purely about human understandability // of the resulting code. If we later find a reason to expand // particular instructions over others, we can adjust. - unsigned RemainingSize = OrigSize; + unsigned RemainingSize = Asm.computeFragmentSize(F) - F.getFixedSize(); while (!Relaxable.empty() && RemainingSize != 0) { auto &RF = *Relaxable.pop_back_val(); // Give the backend a chance to play any tricks it wishes to increase @@ -1542,14 +1543,6 @@ public: }; } // end anonymous namespace -void X86_MC::emitInstruction(MCObjectStreamer &S, const MCInst &Inst, - const MCSubtargetInfo &STI) { - auto &Backend = static_cast<X86AsmBackend &>(S.getAssembler().getBackend()); - Backend.emitInstructionBegin(S, Inst, STI); - S.MCObjectStreamer::emitInstruction(Inst, STI); - Backend.emitInstructionEnd(S, Inst); -} - void X86ELFStreamer::emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) { X86_MC::emitInstruction(*this, Inst, STI); diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h index efb951b..e02b556 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/llvm/lib/Target/X86/X86AsmPrinter.h @@ -151,6 +151,7 @@ private: MCSymbol *LazyPointer) override; void emitCallInstruction(const llvm::MCInst &MCI); + void maybeEmitNopAfterCallForWindowsEH(const MachineInstr *MI); // Emits a label to mark the next instruction as being relevant to Import Call // Optimization. diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6281124..568a8c4 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -45059,6 +45059,10 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( unsigned NumElts = DemandedElts.getBitWidth(); switch (Op.getOpcode()) { + case X86ISD::GlobalBaseReg: + case X86ISD::Wrapper: + case X86ISD::WrapperRIP: + return true; case X86ISD::BLENDI: case X86ISD::PSHUFD: case X86ISD::UNPCKL: @@ -45098,27 +45102,34 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const { switch (Op.getOpcode()) { + // SSE vector insert/extracts use modulo indices. + case X86ISD::PINSRB: + case X86ISD::PINSRW: + case X86ISD::PEXTRB: + case X86ISD::PEXTRW: + return false; // SSE vector multiplies are either inbounds or saturate. case X86ISD::VPMADDUBSW: case X86ISD::VPMADDWD: + return false; // SSE vector shifts handle out of bounds shift amounts. case X86ISD::VSHLI: case X86ISD::VSRLI: case X86ISD::VSRAI: return false; - // SSE blends. + // SSE blends. case X86ISD::BLENDI: case X86ISD::BLENDV: return false; - // SSE target shuffles. + // SSE target shuffles. case X86ISD::PSHUFD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPI: case X86ISD::VPERMV3: return false; - // SSE comparisons handle all icmp/fcmp cases. - // TODO: Add CMPM/MM with test coverage. 
+ // SSE comparisons handle all icmp/fcmp cases. + // TODO: Add CMPM/MM with test coverage. case X86ISD::CMPP: case X86ISD::PCMPEQ: case X86ISD::PCMPGT: diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 45d596b..481a9be 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -32,6 +32,7 @@ #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Mangler.h" @@ -833,6 +834,7 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, CallInst.setOpcode(CallOpcode); CallInst.addOperand(CallTargetMCOp); OutStreamer->emitInstruction(CallInst, getSubtargetInfo()); + maybeEmitNopAfterCallForWindowsEH(&MI); } // Record our statepoint node in the same section used by STACKMAP @@ -1430,21 +1432,6 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, OutStreamer->emitLabel(FallthroughLabel); } -// Returns instruction preceding MBBI in MachineFunction. -// If MBBI is the first instruction of the first basic block, returns null. -static MachineBasicBlock::const_iterator -PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) { - const MachineBasicBlock *MBB = MBBI->getParent(); - while (MBBI == MBB->begin()) { - if (MBB == &MBB->getParent()->front()) - return MachineBasicBlock::const_iterator(); - MBB = MBB->getPrevNode(); - MBBI = MBB->end(); - } - --MBBI; - return MBBI; -} - static unsigned getSrcIdx(const MachineInstr* MI, unsigned SrcIdx) { if (X86II::isKMasked(MI->getDesc().TSFlags)) { // Skip mask operand. @@ -2271,6 +2258,9 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { OutStreamer->AddComment("EVEX TO EVEX Compression ", false); } + // We use this to suppress NOP padding for Windows EH. + bool IsTailJump = false; + switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: llvm_unreachable("Should be handled target independently"); @@ -2325,6 +2315,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { // Lower this as normal, but add a comment. OutStreamer->AddComment("TAILCALL"); + IsTailJump = true; break; case X86::TAILJMPr: @@ -2340,6 +2331,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { // Lower these as normal, but add some comments. OutStreamer->AddComment("TAILCALL"); + IsTailJump = true; break; case X86::TAILJMPm64_REX: @@ -2349,6 +2341,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { } OutStreamer->AddComment("TAILCALL"); + IsTailJump = true; break; case X86::TAILJMPr64_REX: { @@ -2361,6 +2354,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { } OutStreamer->AddComment("TAILCALL"); + IsTailJump = true; break; } @@ -2537,26 +2531,6 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { case X86::SEH_BeginEpilogue: { assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); - // Windows unwinder will not invoke function's exception handler if IP is - // either in prologue or in epilogue. This behavior causes a problem when a - // call immediately precedes an epilogue, because the return address points - // into the epilogue. To cope with that, we insert a 'nop' if it ends up - // immediately after a CALL in the final emitted code. - MachineBasicBlock::const_iterator MBBI(MI); - // Check if preceded by a call and emit nop if so. 
- for (MBBI = PrevCrossBBInst(MBBI); - MBBI != MachineBasicBlock::const_iterator(); - MBBI = PrevCrossBBInst(MBBI)) { - // Pseudo instructions that aren't a call are assumed to not emit any - // code. If they do, we worst case generate unnecessary noops after a - // call. - if (MBBI->isCall() || !MBBI->isPseudo()) { - if (MBBI->isCall()) - EmitAndCountInstruction(MCInstBuilder(X86::NOOP)); - break; - } - } - EmitSEHInstruction(MI); return; } @@ -2585,6 +2559,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX)); emitCallInstruction(TmpInst); emitNop(*OutStreamer, 5, Subtarget); + maybeEmitNopAfterCallForWindowsEH(MI); return; } @@ -2605,6 +2580,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { // For Import Call Optimization to work, we need a 3-byte nop after the // call instruction. emitNop(*OutStreamer, 3, Subtarget); + maybeEmitNopAfterCallForWindowsEH(MI); return; } break; @@ -2638,6 +2614,10 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { if (MI->isCall()) { emitCallInstruction(TmpInst); + // Since tail calls transfer control without leaving a stack frame, there is + // never a need for NOP padding tail calls. + if (!IsTailJump) + maybeEmitNopAfterCallForWindowsEH(MI); return; } @@ -2659,6 +2639,164 @@ void X86AsmPrinter::emitCallInstruction(const llvm::MCInst &MCI) { OutStreamer->emitInstruction(MCI, getSubtargetInfo()); } +// Determines whether a NOP is required after a CALL, so that Windows EH +// IP2State tables have the correct information. +// +// On most Windows platforms (AMD64, ARM64, ARM32, IA64, but *not* x86-32), +// exception handling works by looking up instruction pointers in lookup +// tables. These lookup tables are stored in .xdata sections in executables. +// One element of the lookup tables are the "IP2State" tables (Instruction +// Pointer to State). +// +// If a function has any instructions that require cleanup during exception +// unwinding, then it will have an IP2State table. Each entry in the IP2State +// table describes a range of bytes in the function's instruction stream, and +// associates an "EH state number" with that range of instructions. A value of +// -1 means "the null state", which does not require any code to execute. +// A value other than -1 is an index into the State table. +// +// The entries in the IP2State table contain byte offsets within the instruction +// stream of the function. The Windows ABI requires that these offsets are +// aligned to instruction boundaries; they are not permitted to point to a byte +// that is not the first byte of an instruction. +// +// Unfortunately, CALL instructions present a problem during unwinding. CALL +// instructions push the address of the instruction after the CALL instruction, +// so that execution can resume after the CALL. If the CALL is the last +// instruction within an IP2State region, then the return address (on the stack) +// points to the *next* IP2State region. This means that the unwinder will +// use the wrong cleanup funclet during unwinding. +// +// To fix this problem, the Windows AMD64 ABI requires that CALL instructions +// are never placed at the end of an IP2State region. Stated equivalently, the +// end of a CALL instruction cannot be aligned to an IP2State boundary. If a +// CALL instruction would occur at the end of an IP2State region, then the +// compiler must insert a NOP instruction after the CALL. 
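Aside (illustrative, not part of the patch): a minimal standalone model of the rule the comment above describes. Region, needsNopAfterCall and the offsets are invented for the example; the real check works on IP2State/MC data, but the core idea is only that a CALL whose last byte coincides with the end of its EH range would leave its return address in the *next* range, so a byte of padding is required.

#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical IP2State region: byte range [Begin, End) with an EH state.
struct Region { uint64_t Begin, End; int State; };

// The return address pushed by a call is exactly CallEnd. If that equals the
// end of some region, the return address lands in the following region.
static bool needsNopAfterCall(uint64_t CallEnd, const std::vector<Region> &Map) {
  for (const Region &R : Map)
    if (CallEnd == R.End)
      return true;
  return false;
}

int main() {
  std::vector<Region> Map = {{0, 0x40, -1}, {0x40, 0x80, 0}};
  // A call occupying bytes [0x3b, 0x40): its return address is 0x40, the first
  // byte of the next region, so without a NOP the unwinder would pick state 0
  // instead of -1.
  std::printf("pad: %d\n", needsNopAfterCall(0x40, Map));
  // A call ending at 0x30 is safely inside its region; no padding needed.
  std::printf("pad: %d\n", needsNopAfterCall(0x30, Map));
}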
The NOP instruction +// is placed in the same EH region as the CALL instruction, so that the return +// address points to the NOP and the unwinder will locate the correct region. +// +// NOP padding is only necessary on Windows AMD64 targets. On ARM64 and ARM32, +// instructions have a fixed size so the unwinder knows how to "back up" by +// one instruction. +// +// Interaction with Import Call Optimization (ICO): +// +// Import Call Optimization (ICO) is a compiler + OS feature on Windows which +// improves the performance and security of DLL imports. ICO relies on using a +// specific CALL idiom that can be replaced by the OS DLL loader. This removes +// a load and indirect CALL and replaces it with a single direct CALL. +// +// To achieve this, ICO also inserts NOPs after the CALL instruction. If the +// end of the CALL is aligned with an EH state transition, we *also* insert +// a single-byte NOP. **Both forms of NOPs must be preserved.** They cannot +// be combined into a single larger NOP; nor can the second NOP be removed. +// +// This is necessary because, if ICO is active and the call site is modified +// by the loader, the loader will end up overwriting the NOPs that were inserted +// for ICO. That means that those NOPs cannot be used for the correct +// termination of the exception handling region (the IP2State transition), +// so we still need an additional NOP instruction. The NOPs cannot be combined +// into a longer NOP (which is ordinarily desirable) because then ICO would +// split one instruction, producing a malformed instruction after the ICO call. +void X86AsmPrinter::maybeEmitNopAfterCallForWindowsEH(const MachineInstr *MI) { + // We only need to insert NOPs after CALLs when targeting Windows on AMD64. + // (Don't let the name fool you: Itanium refers to table-based exception + // handling, not the Itanium architecture.) + if (MAI->getExceptionHandlingType() != ExceptionHandling::WinEH || + MAI->getWinEHEncodingType() != WinEH::EncodingType::Itanium) { + return; + } + + bool HasEHPersonality = MF->getWinEHFuncInfo() != nullptr; + + // Set up MBB iterator, initially positioned on the same MBB as MI. + MachineFunction::const_iterator MFI(MI->getParent()); + MachineFunction::const_iterator MFE(MF->end()); + + // Set up instruction iterator, positioned immediately *after* MI. + MachineBasicBlock::const_iterator MBBI(MI); + MachineBasicBlock::const_iterator MBBE = MI->getParent()->end(); + ++MBBI; // Step over MI + + // This loop iterates MBBs + for (;;) { + // This loop iterates instructions + for (; MBBI != MBBE; ++MBBI) { + // Check the instruction that follows this CALL. + const MachineInstr &NextMI = *MBBI; + + // If there is an EH_LABEL after this CALL, then there is an EH state + // transition after this CALL. This is exactly the situation which + // requires NOP padding. + if (NextMI.isEHLabel()) { + if (HasEHPersonality) { + EmitAndCountInstruction(MCInstBuilder(X86::NOOP)); + return; + } + // We actually want to continue, in case there is an SEH_BeginEpilogue + // instruction after the EH_LABEL. In some situations, IR is produced + // that contains EH_LABEL pseudo-instructions, even when we are not + // generating IP2State tables. We still need to insert a NOP before + // SEH_BeginEpilogue in that case. + continue; + } + + // Somewhat similarly, if the CALL is the last instruction before the + // SEH prologue, then we also need a NOP. 
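Aside (illustrative, not part of the patch): a sketch of the forward scan that maybeEmitNopAfterCallForWindowsEH performs, reduced to plain enums. Kind, Pad and padAfterCall are invented stand-ins for the MachineInstr queries (isEHLabel, isPseudo/isMetaInstruction, the SEH_BeginEpilogue opcode) and for the NOOP/INT3 emission.

#include <cstdio>
#include <vector>

enum class Kind { Real, Pseudo, EHLabel, SEHBeginEpilogue };
enum class Pad { None, Nop, Int3 };

// Decide what to emit after a CALL, given the instructions that follow it in
// program order (possibly across blocks).
static Pad padAfterCall(const std::vector<Kind> &After, bool HasEHPersonality,
                        bool CallHasNoSuccessors) {
  for (Kind K : After) {
    switch (K) {
    case Kind::EHLabel:
      if (HasEHPersonality)
        return Pad::Nop;   // EH state transition right after the call
      break;               // otherwise keep scanning for SEH_BeginEpilogue
    case Kind::SEHBeginEpilogue:
      return Pad::Nop;     // return address must not point into the epilogue
    case Kind::Real:
      return Pad::None;    // next real instruction shares the call's EH state
    case Kind::Pseudo:
      break;               // emits no bytes, keep scanning
    }
  }
  // Ran off the end of the function (noreturn call).
  if (HasEHPersonality)
    return CallHasNoSuccessors ? Pad::Int3 : Pad::Nop;
  return Pad::None;
}

int main() {
  std::printf("%d\n", (int)padAfterCall({Kind::Pseudo, Kind::EHLabel}, true, false)); // Nop
  std::printf("%d\n", (int)padAfterCall({Kind::Real}, true, false));                  // None
  std::printf("%d\n", (int)padAfterCall({}, true, true));                             // Int3
}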
This is necessary because the + // Windows stack unwinder will not invoke a function's exception handler + // if the instruction pointer is in the function prologue or epilogue. + // + // We always emit a NOP before SEH_BeginEpilogue, even if there is no + // personality function (unwind info) for this frame. This is the same + // behavior as MSVC. + if (NextMI.getOpcode() == X86::SEH_BeginEpilogue) { + EmitAndCountInstruction(MCInstBuilder(X86::NOOP)); + return; + } + + if (!NextMI.isPseudo() && !NextMI.isMetaInstruction()) { + // We found a real instruction. During the CALL, the return IP will + // point to this instruction. Since this instruction has the same EH + // state as the call itself (because there is no intervening EH_LABEL), + // the IP2State table will be accurate; there is no need to insert a + // NOP. + return; + } + + // The next instruction is a pseudo-op. Ignore it and keep searching. + // Because these instructions do not generate any machine code, they + // cannot prevent the IP2State table from pointing at the wrong + // instruction during a CALL. + } + + // We've reached the end of this MBB. Find the next MBB in program order. + // MBB order should be finalized by this point, so falling across MBBs is + // expected. + ++MFI; + if (MFI == MFE) { + // No more blocks; we've reached the end of the function. This should + // only happen with no-return functions, but double-check to be sure. + if (HasEHPersonality) { + // If the CALL has no successors, then it is a noreturn function. + // Insert an INT3 instead of a NOP. This accomplishes the same purpose, + // but is more clear to read. Also, analysis tools will understand + // that they should not continue disassembling after the CALL (unless + // there are other branches to that label). + if (MI->getParent()->succ_empty()) + EmitAndCountInstruction(MCInstBuilder(X86::INT3)); + else + EmitAndCountInstruction(MCInstBuilder(X86::NOOP)); + } + return; + } + + // Set up iterator to scan the next basic block. + const MachineBasicBlock *NextMBB = &*MFI; + MBBI = NextMBB->instr_begin(); + MBBE = NextMBB->instr_end(); + } +} + void X86AsmPrinter::emitLabelAndRecordForImportCallOptimization( ImportCallKind Kind) { assert(EnableImportCallOptimization); diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 8c156c9..7fa6e6c5 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -842,6 +842,156 @@ static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL, return true; } +/// ValWidth bits starting at ValOffset of Val stored at PtrBase+PtrOffset. 
+struct PartStore { + Value *PtrBase; + APInt PtrOffset; + Value *Val; + uint64_t ValOffset; + uint64_t ValWidth; + StoreInst *Store; + + bool isCompatibleWith(const PartStore &Other) const { + return PtrBase == Other.PtrBase && Val == Other.Val; + } + + bool operator<(const PartStore &Other) const { + return PtrOffset.slt(Other.PtrOffset); + } +}; + +static std::optional<PartStore> matchPartStore(Instruction &I, + const DataLayout &DL) { + auto *Store = dyn_cast<StoreInst>(&I); + if (!Store || !Store->isSimple()) + return std::nullopt; + + Value *StoredVal = Store->getValueOperand(); + Type *StoredTy = StoredVal->getType(); + if (!StoredTy->isIntegerTy() || !DL.typeSizeEqualsStoreSize(StoredTy)) + return std::nullopt; + + uint64_t ValWidth = StoredTy->getPrimitiveSizeInBits(); + uint64_t ValOffset = 0; + Value *Val; + if (!match(StoredVal, m_CombineOr(m_Trunc(m_LShr(m_Value(Val), + m_ConstantInt(ValOffset))), + m_Trunc(m_Value(Val))))) + return std::nullopt; + + Value *Ptr = Store->getPointerOperand(); + APInt PtrOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); + Value *PtrBase = Ptr->stripAndAccumulateConstantOffsets( + DL, PtrOffset, /*AllowNonInbounds=*/true); + return {{PtrBase, PtrOffset, Val, ValOffset, ValWidth, Store}}; +} + +static bool mergeConsecutivePartStores(ArrayRef<PartStore> Parts, + unsigned Width, const DataLayout &DL, + TargetTransformInfo &TTI) { + if (Parts.size() < 2) + return false; + + // Check whether combining the stores is profitable. + // FIXME: We could generate smaller stores if we can't produce a large one. + const PartStore &First = Parts.front(); + LLVMContext &Ctx = First.Store->getContext(); + Type *NewTy = Type::getIntNTy(Ctx, Width); + unsigned Fast = 0; + if (!TTI.isTypeLegal(NewTy) || + !TTI.allowsMisalignedMemoryAccesses(Ctx, Width, + First.Store->getPointerAddressSpace(), + First.Store->getAlign(), &Fast) || + !Fast) + return false; + + // Generate the combined store. + IRBuilder<> Builder(First.Store); + Value *Val = First.Val; + if (First.ValOffset != 0) + Val = Builder.CreateLShr(Val, First.ValOffset); + Val = Builder.CreateTrunc(Val, NewTy); + StoreInst *Store = Builder.CreateAlignedStore( + Val, First.Store->getPointerOperand(), First.Store->getAlign()); + + AAMDNodes AATags = First.Store->getAAMetadata(); + for (const PartStore &Part : drop_begin(Parts)) + AATags = AATags.concat(Part.Store->getAAMetadata()); + Store->setAAMetadata(AATags); + + // Remove the old stores. + for (const PartStore &Part : Parts) + Part.Store->eraseFromParent(); + + return true; +} + +static bool mergePartStores(SmallVectorImpl<PartStore> &Parts, + const DataLayout &DL, TargetTransformInfo &TTI) { + if (Parts.size() < 2) + return false; + + // We now have multiple parts of the same value stored to the same pointer. + // Sort the parts by pointer offset, and make sure they are consistent with + // the value offsets. Also check that the value is fully covered without + // overlaps. 
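Aside (illustrative, not part of the patch): the sort-then-check-contiguity step described in the comment above, modelled on plain integers. Part and contiguousRuns are invented names; the real code additionally checks target legality for the wide type and then emits the merged store.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// One "part store": ValWidth bits taken from bit ValOffset of some value and
// stored at byte PtrOffset (field names mirror the patch, but this model
// works on plain integers only).
struct Part { int64_t PtrOffset; uint64_t ValOffset, ValWidth; };

// Split parts (sorted by pointer offset) into maximal runs that are
// contiguous both in memory and in the source value, little endian.
static std::vector<std::vector<Part>> contiguousRuns(std::vector<Part> Parts) {
  std::sort(Parts.begin(), Parts.end(),
            [](const Part &A, const Part &B) { return A.PtrOffset < B.PtrOffset; });
  std::vector<std::vector<Part>> Runs;
  for (const Part &P : Parts) {
    if (!Runs.empty()) {
      const Part &First = Runs.back().front();
      const Part &Last = Runs.back().back();
      int64_t ByteDelta = P.PtrOffset - First.PtrOffset;
      int64_t BitDelta = (int64_t)(P.ValOffset - First.ValOffset);
      bool SameStride = ByteDelta * 8 == BitDelta;                // ptr vs. value offsets agree
      bool NoGap = Last.ValOffset + Last.ValWidth == P.ValOffset; // no gap, no overlap
      if (SameStride && NoGap) {
        Runs.back().push_back(P);
        continue;
      }
    }
    Runs.push_back({P});
  }
  return Runs;
}

int main() {
  // An i32 stored as four byte-sized pieces at bytes 0..3 forms a single run
  // that could be replaced by one 32-bit store.
  auto Runs = contiguousRuns({{0, 0, 8}, {1, 8, 8}, {2, 16, 8}, {3, 24, 8}});
  std::printf("runs: %zu, parts in first run: %zu\n", Runs.size(), Runs.front().size());
}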
+ bool Changed = false; + llvm::sort(Parts); + int64_t LastEndOffsetFromFirst = 0; + const PartStore *First = &Parts[0]; + for (const PartStore &Part : Parts) { + APInt PtrOffsetFromFirst = Part.PtrOffset - First->PtrOffset; + int64_t ValOffsetFromFirst = Part.ValOffset - First->ValOffset; + if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst || + LastEndOffsetFromFirst != ValOffsetFromFirst) { + Changed |= mergeConsecutivePartStores(ArrayRef(First, &Part), + LastEndOffsetFromFirst, DL, TTI); + First = &Part; + LastEndOffsetFromFirst = Part.ValWidth; + continue; + } + + LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth; + } + + Changed |= mergeConsecutivePartStores(ArrayRef(First, Parts.end()), + LastEndOffsetFromFirst, DL, TTI); + return Changed; +} + +static bool foldConsecutiveStores(BasicBlock &BB, const DataLayout &DL, + TargetTransformInfo &TTI, AliasAnalysis &AA) { + // FIXME: Add big endian support. + if (DL.isBigEndian()) + return false; + + SmallVector<PartStore, 8> Parts; + bool MadeChange = false; + for (Instruction &I : make_early_inc_range(BB)) { + if (std::optional<PartStore> Part = matchPartStore(I, DL)) { + if (Parts.empty() || Part->isCompatibleWith(Parts[0])) { + Parts.push_back(std::move(*Part)); + continue; + } + + MadeChange |= mergePartStores(Parts, DL, TTI); + Parts.clear(); + Parts.push_back(std::move(*Part)); + continue; + } + + // FIXME: Use AA to make this more precise. + if (I.mayReadOrWriteMemory() || I.mayThrow()) { + MadeChange |= mergePartStores(Parts, DL, TTI); + Parts.clear(); + continue; + } + } + + MadeChange |= mergePartStores(Parts, DL, TTI); + return MadeChange; +} + /// Combine away instructions providing they are still equivalent when compared /// against 0. i.e do they have any bits set. static Value *optimizeShiftInOrChain(Value *V, IRBuilder<> &Builder) { @@ -1330,6 +1480,9 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT, // bugs. MadeChange |= foldLibCalls(I, TTI, TLI, AC, DT, DL, MadeCFGChange); } + + // Do this separately to avoid redundantly scanning stores multiple times. + MadeChange |= foldConsecutiveStores(BB, DL, TTI, AA); } // We're done with transforms, so remove dead instructions. diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp index e279fec..6561b1c 100644 --- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp @@ -170,6 +170,12 @@ void Lowerer::hidePromiseAlloca(CoroIdInst *CoroId, CoroBeginInst *CoroBegin) { auto *PI = Builder.CreateIntrinsic( Builder.getPtrTy(), Intrinsic::coro_promise, Arg, {}, "promise.addr"); PI->setCannotDuplicate(); + // Remove lifetime markers, as these are only allowed on allocas. 
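Aside (illustrative, not part of the patch): the same clean-up idiom — erase lifetime.start/lifetime.end users before replacing an alloca-derived pointer with something that is no longer an alloca — appears again below in the AddressSanitizer hunks. A sketch of the pattern using the same LLVM calls as the patch; eraseLifetimeMarkers is a hypothetical helper name, and the includes assume a normal LLVM build.

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instructions.h"

// Hypothetical helper: erase any lifetime.start/lifetime.end users of Ptr so
// that it can safely be RAUW'd with a value that is not an alloca.
static void eraseLifetimeMarkers(llvm::Value *Ptr) {
  for (llvm::User *U : llvm::make_early_inc_range(Ptr->users()))
    if (auto *I = llvm::dyn_cast<llvm::Instruction>(U))
      if (I->isLifetimeStartOrEnd())
        I->eraseFromParent();
}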
+ for (User *U : make_early_inc_range(PA->users())) { + auto *I = cast<Instruction>(U); + if (I->isLifetimeStartOrEnd()) + I->eraseFromParent(); + } PA->replaceUsesWithIf(PI, [CoroId](Use &U) { bool IsBitcast = U == U.getUser()->stripPointerCasts(); bool IsCoroId = U.getUser() == CoroId; diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index a65d0fb..3320508 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -553,7 +553,6 @@ static void cacheDIVar(FrameDataInfo &FrameData, if (I != Container.end()) DIVarCache.insert({V, (*I)->getVariable()}); }; - CacheIt(findDbgDeclares(V)); CacheIt(findDVRDeclares(V)); } } @@ -1219,10 +1218,8 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { auto *G = GetFramePointer(Alloca); G->setName(Alloca->getName() + Twine(".reload.addr")); - SmallVector<DbgVariableIntrinsic *, 4> DIs; SmallVector<DbgVariableRecord *> DbgVariableRecords; - findDbgUsers(DIs, Alloca, &DbgVariableRecords); - assert(DIs.empty() && "Should never see debug-intrinsics"); + findDbgUsers(Alloca, DbgVariableRecords); for (auto *DVR : DbgVariableRecords) DVR->replaceVariableLocationOp(Alloca, G); diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp index 5fd5f7d..4e71768 100644 --- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp +++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp @@ -519,10 +519,8 @@ void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F, // We would handle the dbg.values for allocas specially for (auto &Iter : Spills) { auto *V = Iter.first; - SmallVector<DbgValueInst *, 16> DVIs; SmallVector<DbgVariableRecord *, 16> DVRs; - findDbgValues(DVIs, V, &DVRs); - assert(DVIs.empty()); + findDbgValues(V, DVRs); // Add the instructions which carry debug info that is in the frame. for (DbgVariableRecord *DVR : DVRs) if (Checker.isDefinitionAcrossSuspend(*V, DVR->Marker->MarkedInstr)) diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 469f435..b803c97 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -3998,6 +3998,24 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); } +// Update the debug information attached to NewFunc to use the clone Name. Note +// this needs to be done for both any existing DISubprogram for the definition, +// as well as any separate declaration DISubprogram. 
+static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) { + assert(Name == NewFunc->getName()); + auto *SP = NewFunc->getSubprogram(); + if (!SP) + return; + auto *MDName = MDString::get(NewFunc->getParent()->getContext(), Name); + SP->replaceLinkageName(MDName); + DISubprogram *Decl = SP->getDeclaration(); + if (!Decl) + return; + TempDISubprogram NewDecl = Decl->clone(); + NewDecl->replaceLinkageName(MDName); + SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl))); +} + CallsiteContextGraph<ModuleCallsiteContextGraph, Function, Instruction *>::FuncInfo ModuleCallsiteContextGraph::cloneFunctionForCallsite( @@ -4009,9 +4027,7 @@ ModuleCallsiteContextGraph::cloneFunctionForCallsite( std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo); assert(!Func.func()->getParent()->getFunction(Name)); NewFunc->setName(Name); - if (auto *SP = NewFunc->getSubprogram()) - SP->replaceLinkageName( - MDString::get(NewFunc->getParent()->getContext(), Name)); + updateSubprogramLinkageName(NewFunc, Name); for (auto &Inst : CallsWithMetadataInFunc) { // This map always has the initial version in it. assert(Inst.cloneNo() == 0); @@ -4950,9 +4966,7 @@ static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones( PrevF->eraseFromParent(); } else NewF->setName(Name); - if (auto *SP = NewF->getSubprogram()) - SP->replaceLinkageName( - MDString::get(NewF->getParent()->getContext(), Name)); + updateSubprogramLinkageName(NewF, Name); ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F) << "created clone " << ore::NV("NewFunction", NewF)); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 3321435..d88bc2c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4352,6 +4352,13 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) { Call, Builder.CreateBitOrPointerCast(ReturnedArg, CallTy)); } + // Drop unnecessary callee_type metadata from calls that were converted + // into direct calls. + if (Call.getMetadata(LLVMContext::MD_callee_type) && !Call.isIndirectCall()) { + Call.setMetadata(LLVMContext::MD_callee_type, nullptr); + Changed = true; + } + // Drop unnecessary kcfi operand bundles from calls that were converted // into direct calls. 
auto Bundle = Call.getOperandBundle(LLVMContext::OB_kcfi); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 9df0855..c90ff2a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/ADT/APFloat.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" @@ -21,8 +22,10 @@ #include "llvm/Analysis/Utils/Local.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/KnownBits.h" @@ -8222,6 +8225,98 @@ static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI, return new FCmpInst(Pred, LHSI->getOperand(1), RHSC, "", &I); } +// Transform 'fptrunc(x) cmp C' to 'x cmp ext(C)' if possible. +// Patterns include: +// fptrunc(x) < C --> x < ext(C) +// fptrunc(x) <= C --> x <= ext(C) +// fptrunc(x) > C --> x > ext(C) +// fptrunc(x) >= C --> x >= ext(C) +// where 'ext(C)' is the extension of 'C' to the type of 'x' with a small bias +// due to precision loss. +static Instruction *foldFCmpFpTrunc(FCmpInst &I, const Instruction &FPTrunc, + const Constant &C) { + FCmpInst::Predicate Pred = I.getPredicate(); + bool RoundDown = false; + + if (Pred == FCmpInst::FCMP_OGE || Pred == FCmpInst::FCMP_UGE || + Pred == FCmpInst::FCMP_OLT || Pred == FCmpInst::FCMP_ULT) + RoundDown = true; + else if (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_UGT || + Pred == FCmpInst::FCMP_OLE || Pred == FCmpInst::FCMP_ULE) + RoundDown = false; + else + return nullptr; + + const APFloat *CValue; + if (!match(&C, m_APFloat(CValue))) + return nullptr; + + if (CValue->isNaN() || CValue->isInfinity()) + return nullptr; + + auto ConvertFltSema = [](const APFloat &Src, const fltSemantics &Sema) { + bool LosesInfo; + APFloat Dest = Src; + Dest.convert(Sema, APFloat::rmNearestTiesToEven, &LosesInfo); + return Dest; + }; + + auto NextValue = [](const APFloat &Value, bool RoundDown) { + APFloat NextValue = Value; + NextValue.next(RoundDown); + return NextValue; + }; + + APFloat NextCValue = NextValue(*CValue, RoundDown); + + Type *DestType = FPTrunc.getOperand(0)->getType(); + const fltSemantics &DestFltSema = + DestType->getScalarType()->getFltSemantics(); + + APFloat ExtCValue = ConvertFltSema(*CValue, DestFltSema); + APFloat ExtNextCValue = ConvertFltSema(NextCValue, DestFltSema); + + // When 'NextCValue' is infinity, use an imaged 'NextCValue' that equals + // 'CValue + bias' to avoid the infinity after conversion. The bias is + // estimated as 'CValue - PrevCValue', where 'PrevCValue' is the previous + // value of 'CValue'. + if (NextCValue.isInfinity()) { + APFloat PrevCValue = NextValue(*CValue, !RoundDown); + APFloat Bias = ConvertFltSema(*CValue - PrevCValue, DestFltSema); + + ExtNextCValue = ExtCValue + Bias; + } + + APFloat ExtMidValue = + scalbn(ExtCValue + ExtNextCValue, -1, APFloat::rmNearestTiesToEven); + + const fltSemantics &SrcFltSema = + C.getType()->getScalarType()->getFltSemantics(); + + // 'MidValue' might be rounded to 'NextCValue'. Correct it here. 
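Aside (illustrative, not part of the patch): a small libm-based demonstration of the midpoint idea behind this fold, using C = 1.0f and a double source type rather than APFloat. The threshold is the midpoint between C and the float just below it, widened to double; sampling values around that midpoint shows `(float)x < C` and `x < Mid` agreeing. The tie-parity correction the patch applies via MidValue happens to be a no-op for this particular C.

#include <cmath>
#include <cstdio>

int main() {
  const float C = 1.0f;
  // The float immediately below C, and the midpoint of the two computed in
  // double: this is (roughly) the widened threshold used by the fold.
  const float PrevC = std::nextafter(C, 0.0f);
  const double Mid = ((double)C + (double)PrevC) / 2.0;

  const double Samples[] = {
      std::nextafter(Mid, 0.0), // just below the threshold
      Mid,                      // the tie case (rounds to even, i.e. to 1.0f)
      std::nextafter(Mid, 2.0), // just above the threshold
      0.75, 1.0, 1.25};
  for (double X : Samples) {
    bool Narrowed = (float)X < C; // fptrunc(x) olt C
    bool Widened = X < Mid;       // x olt widened threshold
    std::printf("x=%.17g  trunc<C=%d  x<Mid=%d\n", X, Narrowed, Widened);
  }
}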
+ APFloat MidValue = ConvertFltSema(ExtMidValue, SrcFltSema); + if (MidValue != *CValue) + ExtMidValue.next(!RoundDown); + + // Check whether 'ExtMidValue' is a valid result since the assumption on + // imaged 'NextCValue' might not hold for new float types. + // ppc_fp128 can't pass here when converting from max float because of + // APFloat implementation. + if (NextCValue.isInfinity()) { + // ExtMidValue --- narrowed ---> Finite + if (ConvertFltSema(ExtMidValue, SrcFltSema).isInfinity()) + return nullptr; + + // NextExtMidValue --- narrowed ---> Infinity + APFloat NextExtMidValue = NextValue(ExtMidValue, RoundDown); + if (ConvertFltSema(NextExtMidValue, SrcFltSema).isFinite()) + return nullptr; + } + + return new FCmpInst(Pred, FPTrunc.getOperand(0), + ConstantFP::get(DestType, ExtMidValue), "", &I); +} + /// Optimize fabs(X) compared with zero. static Instruction *foldFabsWithFcmpZero(FCmpInst &I, InstCombinerImpl &IC) { Value *X; @@ -8712,6 +8807,10 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) { cast<LoadInst>(LHSI), GEP, GV, I)) return Res; break; + case Instruction::FPTrunc: + if (Instruction *NV = foldFCmpFpTrunc(I, *LHSI, *RHSC)) + return NV; + break; } } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 503611a..e2a9255 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -219,18 +219,64 @@ Value *InstCombinerImpl::EmitGEPOffset(GEPOperator *GEP, bool RewriteGEP) { Value *InstCombinerImpl::EmitGEPOffsets(ArrayRef<GEPOperator *> GEPs, GEPNoWrapFlags NW, Type *IdxTy, bool RewriteGEPs) { - Value *Sum = nullptr; - for (GEPOperator *GEP : reverse(GEPs)) { - Value *Offset = EmitGEPOffset(GEP, RewriteGEPs); - if (Offset->getType() != IdxTy) - Offset = Builder.CreateVectorSplat( - cast<VectorType>(IdxTy)->getElementCount(), Offset); + auto Add = [&](Value *Sum, Value *Offset) -> Value * { if (Sum) - Sum = Builder.CreateAdd(Sum, Offset, "", NW.hasNoUnsignedWrap(), - NW.isInBounds()); + return Builder.CreateAdd(Sum, Offset, "", NW.hasNoUnsignedWrap(), + NW.isInBounds()); else - Sum = Offset; + return Offset; + }; + + Value *Sum = nullptr; + Value *OneUseSum = nullptr; + Value *OneUseBase = nullptr; + GEPNoWrapFlags OneUseFlags = GEPNoWrapFlags::all(); + for (GEPOperator *GEP : reverse(GEPs)) { + Value *Offset; + { + // Expand the offset at the point of the previous GEP to enable rewriting. + // However, use the original insertion point for calculating Sum. + IRBuilderBase::InsertPointGuard Guard(Builder); + auto *Inst = dyn_cast<Instruction>(GEP); + if (RewriteGEPs && Inst) + Builder.SetInsertPoint(Inst); + + Offset = llvm::emitGEPOffset(&Builder, DL, GEP); + if (Offset->getType() != IdxTy) + Offset = Builder.CreateVectorSplat( + cast<VectorType>(IdxTy)->getElementCount(), Offset); + if (GEP->hasOneUse()) { + // Offsets of one-use GEPs will be merged into the next multi-use GEP. + OneUseSum = Add(OneUseSum, Offset); + OneUseFlags = OneUseFlags.intersectForOffsetAdd(GEP->getNoWrapFlags()); + if (!OneUseBase) + OneUseBase = GEP->getPointerOperand(); + continue; + } + + if (OneUseSum) + Offset = Add(OneUseSum, Offset); + + // Rewrite the GEP to reuse the computed offset. This also includes + // offsets from preceding one-use GEPs. 
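Aside (illustrative, not part of the patch): a toy model of the bookkeeping in the EmitGEPOffsets hunk above, with GEPs reduced to (offset, one-use) pairs. Link and accumulate are invented names; offsets of one-use links are buffered and folded into the next multi-use link (the one worth rewriting to a single ptradd), while the running total is kept separately.

#include <cstdint>
#include <cstdio>
#include <vector>

struct Link { int64_t Offset; bool OneUse; };

// Walk the chain, printing the combined offset each multi-use link would be
// rewritten with, and return the total offset of the whole chain.
static int64_t accumulate(const std::vector<Link> &Chain) {
  int64_t Sum = 0, Buffered = 0;
  for (const Link &L : Chain) {
    if (L.OneUse) {          // defer: fold into the next multi-use link
      Buffered += L.Offset;
      continue;
    }
    int64_t Off = Buffered + L.Offset;
    std::printf("rewrite multi-use link as ptradd(base, %lld)\n", (long long)Off);
    Sum += Off;
    Buffered = 0;
  }
  return Sum + Buffered;     // trailing one-use links still count toward the total
}

int main() {
  // one-use(+4), one-use(+8), multi-use(+16), one-use(+32)
  std::printf("total offset: %lld\n",
              (long long)accumulate({{4, true}, {8, true}, {16, false}, {32, true}}));
}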
+ if (RewriteGEPs && Inst && + !(GEP->getSourceElementType()->isIntegerTy(8) && + GEP->getOperand(1) == Offset)) { + replaceInstUsesWith( + *Inst, + Builder.CreatePtrAdd( + OneUseBase ? OneUseBase : GEP->getPointerOperand(), Offset, "", + OneUseFlags.intersectForOffsetAdd(GEP->getNoWrapFlags()))); + eraseInstFromFunction(*Inst); + } + } + + Sum = Add(Sum, Offset); + OneUseSum = OneUseBase = nullptr; + OneUseFlags = GEPNoWrapFlags::all(); } + if (OneUseSum) + Sum = Add(Sum, OneUseSum); if (!Sum) return Constant::getNullValue(IdxTy); return Sum; @@ -1417,10 +1463,8 @@ void InstCombinerImpl::freelyInvertAllUsersOf(Value *I, Value *IgnoredUser) { } // Update pre-existing debug value uses. - SmallVector<DbgValueInst *, 4> DbgValues; SmallVector<DbgVariableRecord *, 4> DbgVariableRecords; - llvm::findDbgValues(DbgValues, I, &DbgVariableRecords); - assert(DbgValues.empty()); + llvm::findDbgValues(I, DbgVariableRecords); for (DbgVariableRecord *DbgVal : DbgVariableRecords) { SmallVector<uint64_t, 1> Ops = {dwarf::DW_OP_not}; @@ -3565,12 +3609,10 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { // If we are removing an alloca with a dbg.declare, insert dbg.value calls // before each store. - SmallVector<DbgVariableIntrinsic *, 8> DVIs; SmallVector<DbgVariableRecord *, 8> DVRs; std::unique_ptr<DIBuilder> DIB; if (isa<AllocaInst>(MI)) { - findDbgUsers(DVIs, &MI, &DVRs); - assert(DVIs.empty()); + findDbgUsers(&MI, DVRs); DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false)); } @@ -3692,9 +3734,6 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { // // FIXME: the Assignment Tracking project has now likely made this // redundant (and it's sometimes harmful). - for (auto *DVI : DVIs) - if (DVI->isAddressOfVariable() || DVI->getExpression()->startsWithDeref()) - DVI->eraseFromParent(); for (auto *DVR : DVRs) if (DVR->isAddressOfVariable() || DVR->getExpression()->startsWithDeref()) DVR->eraseFromParent(); @@ -5246,10 +5285,8 @@ bool InstCombinerImpl::tryToSinkInstruction(Instruction *I, // maximise the range variables have location for. If we cannot salvage, then // mark the location undef: we know it was supposed to receive a new location // here, but that computation has been sunk. - SmallVector<DbgVariableIntrinsic *, 2> DbgUsers; SmallVector<DbgVariableRecord *, 2> DbgVariableRecords; - findDbgUsers(DbgUsers, I, &DbgVariableRecords); - assert(DbgUsers.empty()); + findDbgUsers(I, DbgVariableRecords); if (!DbgVariableRecords.empty()) tryToSinkInstructionDbgVariableRecords(I, InsertPos, SrcBlock, DestBlock, DbgVariableRecords); @@ -5376,7 +5413,7 @@ void InstCombinerImpl::tryToSinkInstructionDbgVariableRecords( if (DVRClones.empty()) return; - salvageDebugInfoForDbgValues(*I, {}, DbgVariableRecordsToSalvage); + salvageDebugInfoForDbgValues(*I, DbgVariableRecordsToSalvage); // The clones are in reverse order of original appearance. Assert that the // head bit is set on the iterator as we _should_ have received it via diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 5957940..fbaa651 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -3637,6 +3637,7 @@ void FunctionStackPoisoner::processStaticAllocas() { "Variable descriptions relative to ASan stack base will be dropped"); // Replace Alloca instructions with base+offset. 
+ SmallVector<Value *> NewAllocaPtrs; for (const auto &Desc : SVD) { AllocaInst *AI = Desc.AI; replaceDbgDeclare(AI, LocalStackBaseAllocaPtr, DIB, DIExprFlags, @@ -3645,6 +3646,7 @@ void FunctionStackPoisoner::processStaticAllocas() { IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)), AI->getType()); AI->replaceAllUsesWith(NewAllocaPtr); + NewAllocaPtrs.push_back(NewAllocaPtr); } // The left-most redzone has enough space for at least 4 pointers. @@ -3694,6 +3696,15 @@ void FunctionStackPoisoner::processStaticAllocas() { } } + // Remove lifetime markers now that these are no longer allocas. + for (Value *NewAllocaPtr : NewAllocaPtrs) { + for (User *U : make_early_inc_range(NewAllocaPtr->users())) { + auto *I = cast<Instruction>(U); + if (I->isLifetimeStartOrEnd()) + I->eraseFromParent(); + } + } + SmallVector<uint8_t, 64> ShadowClean(ShadowAfterScope.size(), 0); SmallVector<uint8_t, 64> ShadowAfterReturn; @@ -3829,6 +3840,13 @@ void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) { Value *NewAddressPtr = IRB.CreateIntToPtr(NewAddress, AI->getType()); + // Remove lifetime markers now that this is no longer an alloca. + for (User *U : make_early_inc_range(AI->users())) { + auto *I = cast<Instruction>(U); + if (I->isLifetimeStartOrEnd()) + I->eraseFromParent(); + } + // Replace all uses of AddessReturnedByAlloca with NewAddressPtr. AI->replaceAllUsesWith(NewAddressPtr); diff --git a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp index 55f3239..2486e77 100644 --- a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp +++ b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp @@ -72,7 +72,7 @@ static void emitRemark(IntrinsicInst *II, OptimizationRemarkEmitter &ORE, } } -static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI, +static bool lowerAllowChecks(Function &F, const BlockFrequencyInfo &BFI, const ProfileSummaryInfo *PSI, OptimizationRemarkEmitter &ORE, const LowerAllowCheckPass::Options &Opts) { @@ -160,7 +160,7 @@ PreservedAnalyses LowerAllowCheckPass::run(Function &F, OptimizationRemarkEmitter &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); - return removeUbsanTraps(F, BFI, PSI, ORE, Opts) + return lowerAllowChecks(F, BFI, PSI, ORE, Opts) // We do not change the CFG, we only replace the intrinsics with // true or false. ? PreservedAnalyses::none().preserveSet<CFGAnalyses>() diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp index e5b357f..a9a0731 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp @@ -361,6 +361,131 @@ static void addVPMetadata(Module &M, Instruction &I, } } +static void +handleAllocSite(Instruction &I, CallBase *CI, + ArrayRef<uint64_t> InlinedCallStack, LLVMContext &Ctx, + OptimizationRemarkEmitter &ORE, uint64_t MaxColdSize, + const std::set<const AllocationInfo *> &AllocInfoSet, + std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo> + &FullStackIdToAllocMatchInfo) { + // We may match this instruction's location list to multiple MIB + // contexts. Add them to a Trie specialized for trimming the contexts to + // the minimal needed to disambiguate contexts with unique behavior. + CallStackTrie AllocTrie(&ORE, MaxColdSize); + uint64_t TotalSize = 0; + uint64_t TotalColdSize = 0; + for (auto *AllocInfo : AllocInfoSet) { + // Check the full inlined call stack against this one. 
+ // If we found and thus matched all frames on the call, include + // this MIB. + if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack, + InlinedCallStack)) { + NumOfMemProfMatchedAllocContexts++; + uint64_t FullStackId = 0; + if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis()) + FullStackId = computeFullStackId(AllocInfo->CallStack); + auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId); + TotalSize += AllocInfo->Info.getTotalSize(); + if (AllocType == AllocationType::Cold) + TotalColdSize += AllocInfo->Info.getTotalSize(); + // Record information about the allocation if match info printing + // was requested. + if (ClPrintMemProfMatchInfo) { + assert(FullStackId != 0); + FullStackIdToAllocMatchInfo[std::make_pair(FullStackId, + InlinedCallStack.size())] = { + AllocInfo->Info.getTotalSize(), AllocType}; + } + } + } + // If the threshold for the percent of cold bytes is less than 100%, + // and not all bytes are cold, see if we should still hint this + // allocation as cold without context sensitivity. + if (TotalColdSize < TotalSize && MinMatchedColdBytePercent < 100 && + TotalColdSize * 100 >= MinMatchedColdBytePercent * TotalSize) { + AllocTrie.addSingleAllocTypeAttribute(CI, AllocationType::Cold, "dominant"); + return; + } + + // We might not have matched any to the full inlined call stack. + // But if we did, create and attach metadata, or a function attribute if + // all contexts have identical profiled behavior. + if (!AllocTrie.empty()) { + NumOfMemProfMatchedAllocs++; + // MemprofMDAttached will be false if a function attribute was + // attached. + bool MemprofMDAttached = AllocTrie.buildAndAttachMIBMetadata(CI); + assert(MemprofMDAttached == I.hasMetadata(LLVMContext::MD_memprof)); + if (MemprofMDAttached) { + // Add callsite metadata for the instruction's location list so that + // it simpler later on to identify which part of the MIB contexts + // are from this particular instruction (including during inlining, + // when the callsite metadata will be updated appropriately). + // FIXME: can this be changed to strip out the matching stack + // context ids from the MIB contexts and not add any callsite + // metadata here to save space? + addCallsiteMetadata(I, InlinedCallStack, Ctx); + } + } +} + +// Helper struct for maintaining refs to callsite data. As an alternative we +// could store a pointer to the CallSiteInfo struct but we also need the frame +// index. Using ArrayRefs instead makes it a little easier to read. +struct CallSiteEntry { + // Subset of frames for the corresponding CallSiteInfo. + ArrayRef<Frame> Frames; + // Potential targets for indirect calls. + ArrayRef<GlobalValue::GUID> CalleeGuids; + + // Only compare Frame contents. + // Use pointer-based equality instead of ArrayRef's operator== which does + // element-wise comparison. We want to check if it's the same slice of the + // underlying array, not just equivalent content. 
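Aside (illustrative, not part of the patch): why the CallSiteEntry comparison above is by slice identity rather than element values, shown with a minimal ArrayRef-like view. View and sameSlice are invented names.

#include <cstddef>
#include <cstdio>

// A minimal read-only view (data pointer + length), standing in for ArrayRef.
struct View { const int *Data; std::size_t Size; };

// Compare slice identity, not element values, so equal-looking frame
// sequences from different records stay distinct in the hash set.
static bool sameSlice(View A, View B) {
  return A.Data == B.Data && A.Size == B.Size;
}

int main() {
  int BufA[] = {1, 2, 3};
  int BufB[] = {1, 2, 3};                     // equal contents, different storage
  View S1{BufA, 3}, S2{BufA, 3}, S3{BufB, 3};
  std::printf("%d %d\n", sameSlice(S1, S2), sameSlice(S1, S3)); // prints: 1 0
}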
+ bool operator==(const CallSiteEntry &Other) const { + return Frames.data() == Other.Frames.data() && + Frames.size() == Other.Frames.size(); + } +}; + +struct CallSiteEntryHash { + size_t operator()(const CallSiteEntry &Entry) const { + return computeFullStackId(Entry.Frames); + } +}; + +static void handleCallSite( + Instruction &I, const Function *CalledFunction, + ArrayRef<uint64_t> InlinedCallStack, + const std::unordered_set<CallSiteEntry, CallSiteEntryHash> &CallSiteEntries, + Module &M, std::set<std::vector<uint64_t>> &MatchedCallSites) { + auto &Ctx = M.getContext(); + for (const auto &CallSiteEntry : CallSiteEntries) { + // If we found and thus matched all frames on the call, create and + // attach call stack metadata. + if (stackFrameIncludesInlinedCallStack(CallSiteEntry.Frames, + InlinedCallStack)) { + NumOfMemProfMatchedCallSites++; + addCallsiteMetadata(I, InlinedCallStack, Ctx); + + // Try to attach indirect call metadata if possible. + if (!CalledFunction) + addVPMetadata(M, I, CallSiteEntry.CalleeGuids); + + // Only need to find one with a matching call stack and add a single + // callsite metadata. + + // Accumulate call site matching information upon request. + if (ClPrintMemProfMatchInfo) { + std::vector<uint64_t> CallStack; + append_range(CallStack, InlinedCallStack); + MatchedCallSites.insert(std::move(CallStack)); + } + break; + } + } +} + static void readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, const TargetLibraryInfo &TLI, @@ -431,31 +556,6 @@ static void readMemprof(Module &M, Function &F, // (allocation info and the callsites). std::map<uint64_t, std::set<const AllocationInfo *>> LocHashToAllocInfo; - // Helper struct for maintaining refs to callsite data. As an alternative we - // could store a pointer to the CallSiteInfo struct but we also need the frame - // index. Using ArrayRefs instead makes it a little easier to read. - struct CallSiteEntry { - // Subset of frames for the corresponding CallSiteInfo. - ArrayRef<Frame> Frames; - // Potential targets for indirect calls. - ArrayRef<GlobalValue::GUID> CalleeGuids; - - // Only compare Frame contents. - // Use pointer-based equality instead of ArrayRef's operator== which does - // element-wise comparison. We want to check if it's the same slice of the - // underlying array, not just equivalent content. - bool operator==(const CallSiteEntry &Other) const { - return Frames.data() == Other.Frames.data() && - Frames.size() == Other.Frames.size(); - } - }; - - struct CallSiteEntryHash { - size_t operator()(const CallSiteEntry &Entry) const { - return computeFullStackId(Entry.Frames); - } - }; - // For the callsites we need to record slices of the frame array (see comments // below where the map entries are added) along with their CalleeGuids. std::map<uint64_t, std::unordered_set<CallSiteEntry, CallSiteEntryHash>> @@ -553,100 +653,15 @@ static void readMemprof(Module &M, Function &F, // allocation context with the same leaf. if (AllocInfoIter != LocHashToAllocInfo.end() && // Only consider allocations which support hinting. - isAllocationWithHotColdVariant(CI->getCalledFunction(), TLI)) { - // We may match this instruction's location list to multiple MIB - // contexts. Add them to a Trie specialized for trimming the contexts to - // the minimal needed to disambiguate contexts with unique behavior. 
- CallStackTrie AllocTrie(&ORE, MaxColdSize); - uint64_t TotalSize = 0; - uint64_t TotalColdSize = 0; - for (auto *AllocInfo : AllocInfoIter->second) { - // Check the full inlined call stack against this one. - // If we found and thus matched all frames on the call, include - // this MIB. - if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack, - InlinedCallStack)) { - NumOfMemProfMatchedAllocContexts++; - uint64_t FullStackId = 0; - if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis()) - FullStackId = computeFullStackId(AllocInfo->CallStack); - auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId); - TotalSize += AllocInfo->Info.getTotalSize(); - if (AllocType == AllocationType::Cold) - TotalColdSize += AllocInfo->Info.getTotalSize(); - // Record information about the allocation if match info printing - // was requested. - if (ClPrintMemProfMatchInfo) { - assert(FullStackId != 0); - FullStackIdToAllocMatchInfo[std::make_pair( - FullStackId, InlinedCallStack.size())] = { - AllocInfo->Info.getTotalSize(), AllocType}; - } - } - } - // If the threshold for the percent of cold bytes is less than 100%, - // and not all bytes are cold, see if we should still hint this - // allocation as cold without context sensitivity. - if (TotalColdSize < TotalSize && MinMatchedColdBytePercent < 100 && - TotalColdSize * 100 >= MinMatchedColdBytePercent * TotalSize) { - AllocTrie.addSingleAllocTypeAttribute(CI, AllocationType::Cold, - "dominant"); - continue; - } - - // We might not have matched any to the full inlined call stack. - // But if we did, create and attach metadata, or a function attribute if - // all contexts have identical profiled behavior. - if (!AllocTrie.empty()) { - NumOfMemProfMatchedAllocs++; - // MemprofMDAttached will be false if a function attribute was - // attached. - bool MemprofMDAttached = AllocTrie.buildAndAttachMIBMetadata(CI); - assert(MemprofMDAttached == I.hasMetadata(LLVMContext::MD_memprof)); - if (MemprofMDAttached) { - // Add callsite metadata for the instruction's location list so that - // it simpler later on to identify which part of the MIB contexts - // are from this particular instruction (including during inlining, - // when the callsite metadata will be updated appropriately). - // FIXME: can this be changed to strip out the matching stack - // context ids from the MIB contexts and not add any callsite - // metadata here to save space? - addCallsiteMetadata(I, InlinedCallStack, Ctx); - } - } - continue; - } - - if (CallSitesIter == LocHashToCallSites.end()) - continue; - - // Otherwise, add callsite metadata. If we reach here then we found the - // instruction's leaf location in the callsites map and not the allocation - // map. - for (const auto &CallSiteEntry : CallSitesIter->second) { - // If we found and thus matched all frames on the call, create and - // attach call stack metadata. - if (stackFrameIncludesInlinedCallStack(CallSiteEntry.Frames, - InlinedCallStack)) { - NumOfMemProfMatchedCallSites++; - addCallsiteMetadata(I, InlinedCallStack, Ctx); - - // Try to attach indirect call metadata if possible. - if (!CalledFunction) - addVPMetadata(M, I, CallSiteEntry.CalleeGuids); - - // Only need to find one with a matching call stack and add a single - // callsite metadata. - - // Accumulate call site matching information upon request. 
- if (ClPrintMemProfMatchInfo) { - std::vector<uint64_t> CallStack; - append_range(CallStack, InlinedCallStack); - MatchedCallSites.insert(std::move(CallStack)); - } - break; - } - } + isAllocationWithHotColdVariant(CI->getCalledFunction(), TLI)) + handleAllocSite(I, CI, InlinedCallStack, Ctx, ORE, MaxColdSize, + AllocInfoIter->second, FullStackIdToAllocMatchInfo); + else if (CallSitesIter != LocHashToCallSites.end()) + // Otherwise, add callsite metadata. If we reach here then we found the + // instruction's leaf location in the callsites map and not the + // allocation map. + handleCallSite(I, CalledFunction, InlinedCallStack, + CallSitesIter->second, M, MatchedCallSites); } } } diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index df31602..1ddb8ae 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -1486,10 +1486,8 @@ static bool checkAndReplaceCondition( // Update the debug value records that satisfy the same condition used // in replaceUsesWithIf. - SmallVector<DbgVariableIntrinsic *> DbgUsers; SmallVector<DbgVariableRecord *> DVRUsers; - findDbgUsers(DbgUsers, Cmp, &DVRUsers); - assert(DbgUsers.empty()); + findDbgUsers(Cmp, DVRUsers); for (auto *DVR : DVRUsers) { auto *DTN = DT.getNode(DVR->getParent()); diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 66836ef..85ee824 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -430,6 +430,8 @@ bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II, } case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { + // Always force lifetime markers to work directly on the alloca. + NewV = NewV->stripPointerCasts(); Function *NewDecl = Intrinsic::getOrInsertDeclaration( M, II->getIntrinsicID(), {NewV->getType()}); II->setArgOperand(1, NewV); diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 4d1f4407..c2a737d 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1960,7 +1960,6 @@ void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB, // PHI insertion, of which we are prepared to do, clean these up now. 
SSAUpdater SSAUpdate; SmallVector<Use *, 16> UsesToRename; - SmallVector<DbgValueInst *, 4> DbgValues; SmallVector<DbgVariableRecord *, 4> DbgVariableRecords; for (Instruction &I : *BB) { @@ -1978,8 +1977,7 @@ void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB, } // Find debug values outside of the block - findDbgValues(DbgValues, &I, &DbgVariableRecords); - assert(DbgValues.empty()); + findDbgValues(&I, DbgVariableRecords); llvm::erase_if(DbgVariableRecords, [&](const DbgVariableRecord *DbgVarRec) { return DbgVarRec->getParent() == BB; }); @@ -2000,7 +1998,6 @@ void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB, SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); if (!DbgVariableRecords.empty()) { SSAUpdate.UpdateDebugValues(&I, DbgVariableRecords); - DbgValues.clear(); DbgVariableRecords.clear(); } diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index 221094f..b9546c5 100644 --- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -128,6 +128,8 @@ private: // from any other block. So this variable set to true means that loop's latch // has become unreachable from loop header. bool DeleteCurrentLoop = false; + // Whether or not we enter the loop through an indirectbr. + bool HasIndirectEntry = false; // The blocks of the original loop that will still be reachable from entry // after the constant folding. @@ -216,6 +218,19 @@ private: return; } + // We need a loop preheader to split in handleDeadExits(). If LoopSimplify + // wasn't able to form one because the loop can be entered through an + // indirectbr we cannot continue. + if (!L.getLoopPreheader()) { + assert(any_of(predecessors(L.getHeader()), + [&](BasicBlock *Pred) { + return isa<IndirectBrInst>(Pred->getTerminator()); + }) && + "Loop should have preheader if it is not entered indirectly"); + HasIndirectEntry = true; + return; + } + // Collect live and dead loop blocks and exits. LiveLoopBlocks.insert(L.getHeader()); for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) { @@ -546,6 +561,12 @@ public: return false; } + if (HasIndirectEntry) { + LLVM_DEBUG(dbgs() << "Loops which can be entered indirectly are not" + " supported!\n"); + return false; + } + // Nothing to constant-fold. 
if (FoldCandidates.empty()) { LLVM_DEBUG( diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 7eeaaa0..6a3f656 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -82,6 +82,7 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" @@ -3044,6 +3045,7 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B, if (isInstructionTriviallyDead(&I, TLI)) { InstrDFS[&I] = 0; LLVM_DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n"); + salvageDebugInfo(I); markInstructionForDeletion(&I); continue; } @@ -4076,6 +4078,12 @@ bool NewGVN::eliminateInstructions(Function &F) { if (!match(DefI, m_Intrinsic<Intrinsic::ssa_copy>())) patchReplacementInstruction(DefI, DominatingLeader); + SmallVector<DbgVariableRecord *> DVRUsers; + findDbgUsers(DefI, DVRUsers); + + for (auto *DVR : DVRUsers) + DVR->replaceVariableLocationOp(DefI, DominatingLeader); + markInstructionForDeletion(DefI); } } diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 820c8e1..ced61cb 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -1105,7 +1105,9 @@ bool ScalarizerVisitor::visitExtractValueInst(ExtractValueInst &EVI) { Res.push_back(ResElem); } - gather(&EVI, Res, *VS); + Type *ActualVecType = cast<FixedVectorType>(OpTy->getContainedType(Index)); + std::optional<VectorSplit> AVS = getVectorSplit(ActualVecType); + gather(&EVI, Res, *AVS); return true; } diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 1d1af42..7a9dd37 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1219,10 +1219,8 @@ void CodeExtractor::calculateNewCallTerminatorWeights( /// \p F. static void eraseDebugIntrinsicsWithNonLocalRefs(Function &F) { for (Instruction &I : instructions(F)) { - SmallVector<DbgVariableIntrinsic *, 4> DbgUsers; SmallVector<DbgVariableRecord *, 4> DbgVariableRecords; - findDbgUsers(DbgUsers, &I, &DbgVariableRecords); - assert(DbgUsers.empty()); + findDbgUsers(&I, DbgVariableRecords); for (DbgVariableRecord *DVR : DbgVariableRecords) if (DVR->getFunction() != &F) DVR->eraseFromParent(); @@ -1284,10 +1282,8 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, NewFunc.getEntryBlock().getTerminator()->getIterator()); }; for (auto [Input, NewVal] : zip_equal(Inputs, NewValues)) { - SmallVector<DbgVariableIntrinsic *, 1> DbgUsers; SmallVector<DbgVariableRecord *, 1> DPUsers; - findDbgUsers(DbgUsers, Input, &DPUsers); - assert(DbgUsers.empty()); + findDbgUsers(Input, DPUsers); DIExpression *Expr = DIB.createExpression(); // Iterate the debud users of the Input values. 
If they are in the extracted diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp index c3c3cdf..8d18c75 100644 --- a/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -243,26 +243,10 @@ formLCSSAForInstructionsImpl(SmallVectorImpl<Instruction *> &Worklist, SSAUpdate.RewriteUse(*UseToRewrite); } - SmallVector<DbgValueInst *, 4> DbgValues; SmallVector<DbgVariableRecord *, 4> DbgVariableRecords; - llvm::findDbgValues(DbgValues, I, &DbgVariableRecords); + llvm::findDbgValues(I, DbgVariableRecords); // Update pre-existing debug value uses that reside outside the loop. - for (auto *DVI : DbgValues) { - BasicBlock *UserBB = DVI->getParent(); - if (InstBB == UserBB || L->contains(UserBB)) - continue; - // We currently only handle debug values residing in blocks that were - // traversed while rewriting the uses. If we inserted just a single PHI, - // we will handle all relevant debug values. - Value *V = AddedPHIs.size() == 1 ? AddedPHIs[0] - : SSAUpdate.FindValueForBlock(UserBB); - if (V) - DVI->replaceVariableLocationOp(I, V); - } - - // RemoveDIs: copy-paste of block above, using non-instruction debug-info - // records. for (DbgVariableRecord *DVR : DbgVariableRecords) { BasicBlock *UserBB = DVR->getMarker()->getParent(); if (InstBB == UserBB || L->contains(UserBB)) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index d481ad9..f89d36f 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -610,10 +610,8 @@ void llvm::RecursivelyDeleteTriviallyDeadInstructions( } bool llvm::replaceDbgUsesWithUndef(Instruction *I) { - SmallVector<DbgVariableIntrinsic *, 1> DbgUsers; SmallVector<DbgVariableRecord *, 1> DPUsers; - findDbgUsers(DbgUsers, I, &DPUsers); - assert(DbgUsers.empty()); + findDbgUsers(I, DPUsers); for (auto *DVR : DPUsers) DVR->setKillLocation(); return !DPUsers.empty(); @@ -1603,10 +1601,8 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar, // Since we can't guarantee that the original dbg.declare intrinsic // is removed by LowerDbgDeclare(), we need to make sure that we are // not inserting the same dbg.value intrinsic over and over. - SmallVector<DbgValueInst *, 1> DbgValues; SmallVector<DbgVariableRecord *, 1> DbgVariableRecords; - findDbgValues(DbgValues, APN, &DbgVariableRecords); - assert(DbgValues.empty()); + findDbgValues(APN, DbgVariableRecords); for (DbgVariableRecord *DVR : DbgVariableRecords) { assert(is_contained(DVR->location_ops(), APN)); if ((DVR->getVariable() == DIVar) && (DVR->getExpression() == DIExpr)) @@ -1987,10 +1983,8 @@ static void updateOneDbgValueForAlloca(const DebugLoc &Loc, void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress, DIBuilder &Builder, int Offset) { - SmallVector<DbgValueInst *, 1> DbgUsers; SmallVector<DbgVariableRecord *, 1> DPUsers; - findDbgValues(DbgUsers, AI, &DPUsers); - assert(DbgUsers.empty()); + findDbgValues(AI, DPUsers); // Replace any DbgVariableRecords that use this alloca. for (DbgVariableRecord *DVR : DPUsers) @@ -2002,11 +1996,9 @@ void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress, /// Where possible to salvage debug information for \p I do so. /// If not possible mark undef. 
void llvm::salvageDebugInfo(Instruction &I) { - SmallVector<DbgVariableIntrinsic *, 1> DbgUsers; SmallVector<DbgVariableRecord *, 1> DPUsers; - findDbgUsers(DbgUsers, &I, &DPUsers); - assert(DbgUsers.empty()); - salvageDebugInfoForDbgValues(I, DbgUsers, DPUsers); + findDbgUsers(&I, DPUsers); + salvageDebugInfoForDbgValues(I, DPUsers); } template <typename T> static void salvageDbgAssignAddress(T *Assign) { @@ -2044,9 +2036,8 @@ template <typename T> static void salvageDbgAssignAddress(T *Assign) { } } -void llvm::salvageDebugInfoForDbgValues( - Instruction &I, ArrayRef<DbgVariableIntrinsic *> DbgUsers, - ArrayRef<DbgVariableRecord *> DPUsers) { +void llvm::salvageDebugInfoForDbgValues(Instruction &I, + ArrayRef<DbgVariableRecord *> DPUsers) { // These are arbitrary chosen limits on the maximum number of values and the // maximum size of a debug expression we can salvage up to, used for // performance reasons. @@ -2054,9 +2045,6 @@ void llvm::salvageDebugInfoForDbgValues( const unsigned MaxExpressionSize = 128; bool Salvaged = false; - // We should never see debug intrinsics nowadays. - assert(DbgUsers.empty()); - for (auto *DVR : DPUsers) { if (DVR->isDbgAssign()) { if (DVR->getAddress() == &I) { @@ -2343,16 +2331,11 @@ static bool rewriteDebugUsers( Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT, function_ref<DbgValReplacement(DbgVariableRecord &DVR)> RewriteDVRExpr) { // Find debug users of From. - SmallVector<DbgVariableIntrinsic *, 1> Users; SmallVector<DbgVariableRecord *, 1> DPUsers; - findDbgUsers(Users, &From, &DPUsers); - if (Users.empty() && DPUsers.empty()) + findDbgUsers(&From, DPUsers); + if (DPUsers.empty()) return false; - // Ignore intrinsic-users: they are no longer supported and should never - // appear. - assert(Users.empty()); - // Prevent use-before-def of To. bool Changed = false; @@ -3005,6 +2988,12 @@ static void combineMetadata(Instruction *K, const Instruction *J, case LLVMContext::MD_memprof: case LLVMContext::MD_callsite: break; + case LLVMContext::MD_callee_type: + if (!AAOnly) { + K->setMetadata(LLVMContext::MD_callee_type, + MDNode::getMergedCalleeTypeMetadata(KMD, JMD)); + } + break; case LLVMContext::MD_align: if (!AAOnly && (DoesKMove || !K->hasMetadata(LLVMContext::MD_noundef))) K->setMetadata( @@ -3350,10 +3339,8 @@ void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI, } void llvm::dropDebugUsers(Instruction &I) { - SmallVector<DbgVariableIntrinsic *, 1> DbgUsers; SmallVector<DbgVariableRecord *, 1> DPUsers; - findDbgUsers(DbgUsers, &I, &DPUsers); - assert(DbgUsers.empty()); + findDbgUsers(&I, DPUsers); for (auto *DVR : DPUsers) DVR->eraseFromParent(); } @@ -3870,6 +3857,10 @@ bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) { if (Op->isSwiftError()) return false; + // Cannot replace alloca argument with phi/select. + if (I->isLifetimeStartOrEnd()) + return false; + // Early exit. if (!isa<Constant, InlineAsm>(Op)) return true; diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 06115e0..7cc9ff8 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -158,10 +158,8 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug // intrinsics. 
- SmallVector<DbgValueInst *, 1> DbgValues; SmallVector<DbgVariableRecord *, 1> DbgVariableRecords; - llvm::findDbgValues(DbgValues, OrigHeaderVal, &DbgVariableRecords); - assert(DbgValues.empty()); + llvm::findDbgValues(OrigHeaderVal, DbgVariableRecords); for (DbgVariableRecord *DVR : DbgVariableRecords) { // The original users in the OrigHeader are already using the original diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 200d1fb..e7623aa 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -938,8 +938,10 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) { case RecurKind::UMin: return Intrinsic::vector_reduce_umin; case RecurKind::FMax: + case RecurKind::FMaxNum: return Intrinsic::vector_reduce_fmax; case RecurKind::FMin: + case RecurKind::FMinNum: return Intrinsic::vector_reduce_fmin; case RecurKind::FMaximum: return Intrinsic::vector_reduce_fmaximum; @@ -1037,8 +1039,10 @@ Intrinsic::ID llvm::getMinMaxReductionIntrinsicOp(RecurKind RK) { case RecurKind::SMax: return Intrinsic::smax; case RecurKind::FMin: + case RecurKind::FMinNum: return Intrinsic::minnum; case RecurKind::FMax: + case RecurKind::FMaxNum: return Intrinsic::maxnum; case RecurKind::FMinimum: return Intrinsic::minimum; @@ -1096,9 +1100,9 @@ Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right) { Type *Ty = Left->getType(); if (Ty->isIntOrIntVectorTy() || - (RK == RecurKind::FMinimum || RK == RecurKind::FMaximum || + (RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum || + RK == RecurKind::FMinimum || RK == RecurKind::FMaximum || RK == RecurKind::FMinimumNum || RK == RecurKind::FMaximumNum)) { - // TODO: Add float minnum/maxnum support when FMF nnan is set. Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK); return Builder.CreateIntrinsic(Ty, Id, {Left, Right}, nullptr, "rdx.minmax"); @@ -1308,6 +1312,8 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src, case RecurKind::UMin: case RecurKind::FMax: case RecurKind::FMin: + case RecurKind::FMinNum: + case RecurKind::FMaxNum: case RecurKind::FMinimum: case RecurKind::FMaximum: case RecurKind::FMinimumNum: diff --git a/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp b/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp index 8f55d7b..2743931 100644 --- a/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp +++ b/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp @@ -319,9 +319,9 @@ void MemoryOpRemark::visitVariable(const Value *V, // If we find some information in the debug info, take that. bool FoundDI = false; - // Try to get an llvm.dbg.declare, which has a DILocalVariable giving us the + // Try to get a dbg.declare, which has a DILocalVariable giving us the // real debug info name and size of the variable. 
- auto FindDI = [&](const auto *DVI) { + auto FindDI = [&](const DbgVariableRecord *DVI) { if (DILocalVariable *DILV = DVI->getVariable()) { std::optional<uint64_t> DISize = getSizeInBytes(DILV->getSizeInBits()); VariableInfo Var{DILV->getName(), DISize}; @@ -331,7 +331,6 @@ void MemoryOpRemark::visitVariable(const Value *V, } } }; - for_each(findDbgDeclares(const_cast<Value *>(V)), FindDI); for_each(findDVRDeclares(const_cast<Value *>(V)), FindDI); if (FoundDI) { diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 73b5f48..d96f1d6 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -243,10 +243,8 @@ struct AllocaInfo { OnlyUsedInOneBlock = false; } } - SmallVector<DbgVariableIntrinsic *> AllDbgUsers; SmallVector<DbgVariableRecord *> AllDPUsers; - findDbgUsers(AllDbgUsers, AI, &AllDPUsers); - assert(AllDbgUsers.empty()); + findDbgUsers(AI, AllDPUsers); std::copy_if(AllDPUsers.begin(), AllDPUsers.end(), std::back_inserter(DPUsers), [](DbgVariableRecord *DVR) { return !DVR->isDbgAssign(); }); diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 586874f..b9292af 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -19,7 +19,9 @@ #include "llvm/Analysis/ValueLattice.h" #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/NoFolder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" @@ -245,11 +247,43 @@ static Value *simplifyInstruction(SCCPSolver &Solver, const APInt *RHSC; // Remove masking operations. if (match(&Inst, m_And(m_Value(X), m_LowBitMask(RHSC)))) { - ConstantRange LRange = GetRange(Inst.getOperand(0)); + ConstantRange LRange = GetRange(X); if (LRange.getUnsignedMax().ule(*RHSC)) return X; } + // Check if we can simplify [us]cmp(X, Y) to X - Y. + if (auto *Cmp = dyn_cast<CmpIntrinsic>(&Inst)) { + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); + unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); + // Bail out on 1-bit comparisons. + if (BitWidth == 1) + return nullptr; + ConstantRange LRange = GetRange(LHS); + if (LRange.isSizeLargerThan(3)) + return nullptr; + ConstantRange RRange = GetRange(RHS); + if (RRange.isSizeLargerThan(3)) + return nullptr; + ConstantRange RHSLower = RRange.sub(APInt(BitWidth, 1)); + ConstantRange RHSUpper = RRange.add(APInt(BitWidth, 1)); + ICmpInst::Predicate Pred = + Cmp->isSigned() ? 
CmpInst::ICMP_SLE : CmpInst::ICMP_ULE; + if (!RHSLower.icmp(Pred, LRange) || !LRange.icmp(Pred, RHSUpper)) + return nullptr; + + IRBuilder<NoFolder> Builder(&Inst); + Value *Sub = Builder.CreateSub(LHS, RHS, Inst.getName(), /*HasNUW=*/false, + /*HasNSW=*/Cmp->isSigned()); + InsertedValues.insert(Sub); + if (Sub->getType() != Inst.getType()) { + Sub = Builder.CreateSExtOrTrunc(Sub, Inst.getType()); + InsertedValues.insert(Sub); + } + return Sub; + } + return nullptr; } diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 561c898..49d0d95 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -197,10 +197,8 @@ void SSAUpdater::RewriteUse(Use &U) { } void SSAUpdater::UpdateDebugValues(Instruction *I) { - SmallVector<DbgValueInst *, 4> DbgValues; SmallVector<DbgVariableRecord *, 4> DbgVariableRecords; - llvm::findDbgValues(DbgValues, I, &DbgVariableRecords); - assert(DbgValues.empty()); + llvm::findDbgValues(I, DbgVariableRecords); for (auto &DVR : DbgVariableRecords) { if (DVR->getParent() == I->getParent()) continue; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 75c9650..94b0ab8 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -2227,16 +2227,6 @@ static bool canSinkInstructions( return I->getOperand(OI) == I0->getOperand(OI); }; if (!all_of(Insts, SameAsI0)) { - // SROA can't speculate lifetime markers of selects/phis, and the - // backend may handle such lifetimes incorrectly as well (#104776). - // Don't sink lifetimes if it would introduce a phi on the pointer - // argument. - if (isa<LifetimeIntrinsic>(I0) && OI == 1 && - any_of(Insts, [](const Instruction *I) { - return isa<AllocaInst>(I->getOperand(1)->stripPointerCasts()); - })) - return false; - if ((isa<Constant>(Op) && !replacingOperandWithVariableIsCheap(I0, OI)) || !canReplaceOperandWithVariable(I0, OI)) // We can't create a PHI from this GEP. diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp index 7ba95e2..8d8a60b 100644 --- a/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -987,6 +987,13 @@ void Mapper::remapInstruction(Instruction *I) { "Referenced value not in value map!"); } + // Drop callee_type metadata from calls that were remapped + // into a direct call from an indirect one. + if (auto *CB = dyn_cast<CallBase>(I)) { + if (CB->getMetadata(LLVMContext::MD_callee_type) && !CB->isIndirectCall()) + CB->setMetadata(LLVMContext::MD_callee_type, nullptr); + } + // Remap phi nodes' incoming blocks. if (PHINode *PN = dyn_cast<PHINode>(I)) { for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 1185385..f57ce0c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -230,7 +230,6 @@ public: /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A /// and \p B. - /// TODO: add createFCmp when needed. 
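The SCCPSolver change above folds llvm.scmp/llvm.ucmp calls to a plain subtraction once range analysis proves the two operands never differ by more than one. A minimal standalone sketch of the underlying identity; the scmp helper and the sample range below are illustrative only, not LLVM API:

#include <cassert>
#include <cstdint>

// Three-way signed compare mirroring the semantics of llvm.scmp:
// returns -1, 0, or 1.
static int64_t scmp(int64_t X, int64_t Y) { return (X > Y) - (X < Y); }

int main() {
  // When the RHSLower/RHSUpper range check in the hunk above proves
  // |X - Y| <= 1, the three-way compare and the subtraction coincide,
  // so scmp(X, Y) can be rewritten as X - Y (with nsw in the signed case,
  // since no overflow is possible within that range).
  for (int64_t X = 10; X <= 12; ++X)
    for (int64_t Y = 10; Y <= 12; ++Y)
      if (X - Y >= -1 && X - Y <= 1)
        assert(scmp(X, Y) == X - Y);
  return 0;
}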
VPInstruction *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { @@ -240,6 +239,17 @@ public: new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name)); } + /// Create a new FCmp VPInstruction with predicate \p Pred and operands \p A + /// and \p B. + VPInstruction *createFCmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, + DebugLoc DL = DebugLoc::getUnknown(), + const Twine &Name = "") { + assert(Pred >= CmpInst::FIRST_FCMP_PREDICATE && + Pred <= CmpInst::LAST_FCMP_PREDICATE && "invalid predicate"); + return tryInsertInstruction( + new VPInstruction(Instruction::FCmp, {A, B}, Pred, DL, Name)); + } + VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f142e07..46bc26c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1354,9 +1354,10 @@ public: ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(), ForceTailFoldingStyle.getValue()}; - if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL) + if (ChosenTailFoldingStyle->first != TailFoldingStyle::DataWithEVL && + ChosenTailFoldingStyle->second != TailFoldingStyle::DataWithEVL) return; - // Override forced styles if needed. + // Override EVL styles if needed. // FIXME: Investigate opportunity for fixed vector factor. bool EVLIsLegal = UserIC <= 1 && IsScalableVF && TTI.hasActiveVectorLength() && !EnableVPlanNativePath; @@ -4361,10 +4362,14 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( ElementCount VF) const { - // Cross iteration phis such as reductions need special handling and are - // currently unsupported. - if (any_of(OrigLoop->getHeader()->phis(), - [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) + // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum + // reductions need special handling and are currently unsupported. + if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { + if (!Legal->isReductionVariable(&Phi)) + return Legal->isFixedOrderRecurrence(&Phi); + RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind(); + return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum; + })) return false; // Phis with uses outside of the loop require special handling and are @@ -4475,6 +4480,28 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( Type *TCType = Legal->getWidestInductionType(); const SCEV *RemainingIterations = nullptr; unsigned MaxTripCount = 0; + if (MainLoopVF.isFixed()) { + // TODO: extend to support scalable VFs. + const SCEV *TC = vputils::getSCEVExprForVPValue( + getPlanFor(MainLoopVF).getTripCount(), SE); + assert(!isa<SCEVCouldNotCompute>(TC) && + "Trip count SCEV must be computable"); + RemainingIterations = SE.getURemExpr( + TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC)); + + // No iterations left to process in the epilogue. 
+ if (RemainingIterations->isZero()) + return Result; + + MaxTripCount = MainLoopVF.getFixedValue() * IC - 1; + if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations, + SE.getConstant(TCType, MaxTripCount))) { + MaxTripCount = SE.getUnsignedRangeMax(RemainingIterations).getZExtValue(); + } + LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: " + << MaxTripCount << "\n"); + } + for (auto &NextVF : ProfitableVFs) { // Skip candidate VFs without a corresponding VPlan. if (!hasPlanWithVF(NextVF.Width)) @@ -4492,24 +4519,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( // If NextVF is greater than the number of remaining iterations, the // epilogue loop would be dead. Skip such factors. - if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { - // TODO: extend to support scalable VFs. - if (!RemainingIterations) { - const SCEV *TC = vputils::getSCEVExprForVPValue( - getPlanFor(NextVF.Width).getTripCount(), SE); - assert(!isa<SCEVCouldNotCompute>(TC) && - "Trip count SCEV must be computable"); - RemainingIterations = SE.getURemExpr( - TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC)); - MaxTripCount = MainLoopVF.getFixedValue() * IC - 1; - if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations, - SE.getConstant(TCType, MaxTripCount))) { - MaxTripCount = - SE.getUnsignedRangeMax(RemainingIterations).getZExtValue(); - } - LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: " - << MaxTripCount << "\n"); - } + if (RemainingIterations && !NextVF.Width.isScalable()) { if (SE.isKnownPredicate( CmpInst::ICMP_UGT, SE.getConstant(TCType, NextVF.Width.getFixedValue()), @@ -8787,6 +8797,12 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); + // Apply mandatory transformation to handle FP maxnum/minnum reduction with + // NaNs if possible, bail out otherwise. + if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions, + *Plan)) + return nullptr; + // Transform recipes to abstract recipes if it is legal and beneficial and // clamp the range for better cost estimation. 
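In the epilogue-VF selection above, the remaining iteration count is now computed once from the main loop's trip count before the candidate VFs are visited. A rough numeric sketch of that arithmetic; the trip count, VF, and interleave values are made-up examples:

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical main loop: trip count 1000, fixed VF 8, interleave count 2.
  uint64_t TC = 1000, MainLoopVF = 8, IC = 2;
  // Iterations left for the epilogue after the main vector loop finishes.
  uint64_t RemainingIterations = TC % (MainLoopVF * IC); // 1000 % 16 == 8
  // A candidate epilogue VF larger than the remaining iterations would leave
  // the epilogue loop dead, so such factors are skipped.
  const uint64_t Candidates[] = {2, 4, 8, 16};
  for (uint64_t NextVF : Candidates)
    std::printf("VF=%llu %s\n", (unsigned long long)NextVF,
                NextVF > RemainingIterations ? "skipped" : "considered");
  return 0;
}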
// TODO: Enable following transform when the EVL-version of extended-reduction diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 6ad5c601..0d0b342 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -23202,6 +23202,8 @@ private: case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FMaxNum: + case RecurKind::FMinNum: case RecurKind::FMaximumNum: case RecurKind::FMinimumNum: case RecurKind::None: @@ -23339,6 +23341,8 @@ private: case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FMaxNum: + case RecurKind::FMinNum: case RecurKind::FMaximumNum: case RecurKind::FMinimumNum: case RecurKind::None: @@ -23441,6 +23445,8 @@ private: case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FMaxNum: + case RecurKind::FMinNum: case RecurKind::FMaximumNum: case RecurKind::FMinimumNum: case RecurKind::None: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 204268e..db40ce2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -4188,13 +4188,11 @@ public: return VPB; } - /// Create a new VPRegionBlock with \p Name and entry and exiting blocks set - /// to nullptr. If \p IsReplicator is true, the region is a replicate region. - /// The returned block is owned by the VPlan and deleted once the VPlan is - /// destroyed. - VPRegionBlock *createVPRegionBlock(const std::string &Name = "", - bool IsReplicator = false) { - auto *VPB = new VPRegionBlock(Name, IsReplicator); + /// Create a new loop VPRegionBlock with \p Name and entry and exiting blocks set + /// to nullptr. The returned block is owned by the VPlan and deleted once the + /// VPlan is destroyed. + VPRegionBlock *createVPRegionBlock(const std::string &Name = "") { + auto *VPB = new VPRegionBlock(Name); CreatedBlocks.push_back(VPB); return VPB; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index ca8729a..3499e65 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -84,6 +84,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return ResTy; } case Instruction::ICmp: + case Instruction::FCmp: case VPInstruction::ActiveLaneMask: assert(inferScalarType(R->getOperand(0)) == inferScalarType(R->getOperand(1)) && diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 7fb5e82..194874a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -411,7 +411,7 @@ static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) { // LatchExitVPB, taking care to preserve the original predecessor & successor // order of blocks. Set region entry and exiting after both HeaderVPB and // LatchVPBB have been disconnected from their predecessors/successors. 
- auto *R = Plan.createVPRegionBlock("", false /*isReplicator*/); + auto *R = Plan.createVPRegionBlock(); VPBlockUtils::insertOnEdge(LatchVPBB, LatchExitVPB, R); VPBlockUtils::disconnectBlocks(LatchVPBB, R); VPBlockUtils::connectBlocks(PreheaderVPBB, R); @@ -652,3 +652,164 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, Term->addMetadata(LLVMContext::MD_prof, BranchWeights); } } + +bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { + auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * { + auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>( + RedPhiR->getBackedgeValue()->getDefiningRecipe()); + if (!MinMaxR) + return nullptr; + + auto *RepR = dyn_cast<VPReplicateRecipe>(MinMaxR); + if (!isa<VPWidenIntrinsicRecipe>(MinMaxR) && + !(RepR && isa<IntrinsicInst>(RepR->getUnderlyingInstr()))) + return nullptr; + +#ifndef NDEBUG + Intrinsic::ID RdxIntrinsicId = + RedPhiR->getRecurrenceKind() == RecurKind::FMaxNum ? Intrinsic::maxnum + : Intrinsic::minnum; + assert((isa<VPWidenIntrinsicRecipe>(MinMaxR) && + cast<VPWidenIntrinsicRecipe>(MinMaxR)->getVectorIntrinsicID() == + RdxIntrinsicId) || + (RepR && + cast<IntrinsicInst>(RepR->getUnderlyingInstr())->getIntrinsicID() == + RdxIntrinsicId) && + "Intrinsic did not match recurrence kind"); +#endif + + if (MinMaxR->getOperand(0) == RedPhiR) + return MinMaxR->getOperand(1); + + assert(MinMaxR->getOperand(1) == RedPhiR && + "Reduction phi operand expected"); + return MinMaxR->getOperand(0); + }; + + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + VPReductionPHIRecipe *RedPhiR = nullptr; + bool HasUnsupportedPhi = false; + for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) { + if (isa<VPCanonicalIVPHIRecipe, VPWidenIntOrFpInductionRecipe>(&R)) + continue; + auto *Cur = dyn_cast<VPReductionPHIRecipe>(&R); + if (!Cur) { + // TODO: Also support fixed-order recurrence phis. + HasUnsupportedPhi = true; + continue; + } + // For now, only a single reduction is supported. + // TODO: Support multiple MaxNum/MinNum reductions and other reductions. + if (RedPhiR) + return false; + if (Cur->getRecurrenceKind() != RecurKind::FMaxNum && + Cur->getRecurrenceKind() != RecurKind::FMinNum) { + HasUnsupportedPhi = true; + continue; + } + RedPhiR = Cur; + } + + if (!RedPhiR) + return true; + + // We won't be able to resume execution in the scalar tail, if there are + // unsupported header phis or there is no scalar tail at all, due to + // tail-folding. + if (HasUnsupportedPhi || !Plan.hasScalarTail()) + return false; + + VPValue *MinMaxOp = GetMinMaxCompareValue(RedPhiR); + if (!MinMaxOp) + return false; + + RecurKind RedPhiRK = RedPhiR->getRecurrenceKind(); + assert((RedPhiRK == RecurKind::FMaxNum || RedPhiRK == RecurKind::FMinNum) && + "unsupported reduction"); + (void)RedPhiRK; + + /// Check if the vector loop of \p Plan can early exit and restart + /// execution of last vector iteration in the scalar loop. This requires all + /// recipes up to early exit point be side-effect free as they are + /// re-executed. Currently we check that the loop is free of any recipe that + /// may write to memory. Expected to operate on an early VPlan w/o nested + /// regions. 
+ for (VPBlockBase *VPB : vp_depth_first_shallow( + Plan.getVectorLoopRegion()->getEntryBasicBlock())) { + auto *VPBB = cast<VPBasicBlock>(VPB); + for (auto &R : *VPBB) { + if (R.mayWriteToMemory() && + !match(&R, m_BranchOnCount(m_VPValue(), m_VPValue()))) + return false; + } + } + + VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock(); + VPBuilder Builder(LatchVPBB->getTerminator()); + auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator()); + assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount && + "Unexpected terminator"); + auto *IsLatchExitTaken = + Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0), + LatchExitingBranch->getOperand(1)); + + VPValue *IsNaN = Builder.createFCmp(CmpInst::FCMP_UNO, MinMaxOp, MinMaxOp); + VPValue *AnyNaN = Builder.createNaryOp(VPInstruction::AnyOf, {IsNaN}); + auto *AnyExitTaken = + Builder.createNaryOp(Instruction::Or, {AnyNaN, IsLatchExitTaken}); + Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken); + LatchExitingBranch->eraseFromParent(); + + // If we exit early due to NaNs, compute the final reduction result based on + // the reduction phi at the beginning of the last vector iteration. + auto *RdxResult = find_singleton<VPSingleDefRecipe>( + RedPhiR->users(), [](VPUser *U, bool) -> VPSingleDefRecipe * { + auto *VPI = dyn_cast<VPInstruction>(U); + if (VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult) + return VPI; + return nullptr; + }); + + auto *MiddleVPBB = Plan.getMiddleBlock(); + Builder.setInsertPoint(MiddleVPBB, MiddleVPBB->begin()); + auto *NewSel = + Builder.createSelect(AnyNaN, RedPhiR, RdxResult->getOperand(1)); + RdxResult->setOperand(1, NewSel); + + auto *ScalarPH = Plan.getScalarPreheader(); + // Update resume phis for inductions in the scalar preheader. If AnyNaN is + // true, the resume from the start of the last vector iteration via the + // canonical IV, otherwise from the original value. + for (auto &R : ScalarPH->phis()) { + auto *ResumeR = cast<VPPhi>(&R); + VPValue *VecV = ResumeR->getOperand(0); + if (VecV == RdxResult) + continue; + if (auto *DerivedIV = dyn_cast<VPDerivedIVRecipe>(VecV)) { + if (DerivedIV->getNumUsers() == 1 && + DerivedIV->getOperand(1) == &Plan.getVectorTripCount()) { + auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(), + &Plan.getVectorTripCount()); + DerivedIV->moveAfter(&*Builder.getInsertPoint()); + DerivedIV->setOperand(1, NewSel); + continue; + } + } + // Bail out and abandon the current, partially modified, VPlan if we + // encounter resume phi that cannot be updated yet. 
+ if (VecV != &Plan.getVectorTripCount()) { + LLVM_DEBUG(dbgs() << "Found resume phi we cannot update for VPlan with " + "FMaxNum/FMinNum reduction.\n"); + return false; + } + auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(), VecV); + ResumeR->setOperand(0, NewSel); + } + + auto *MiddleTerm = MiddleVPBB->getTerminator(); + Builder.setInsertPoint(MiddleTerm); + VPValue *MiddleCond = MiddleTerm->getOperand(0); + VPValue *NewCond = Builder.createAnd(MiddleCond, Builder.createNot(AnyNaN)); + MiddleTerm->setOperand(0, NewCond); + return true; +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1664bcc..1fbc3f3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -587,6 +587,7 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this)); return Builder.CreateFreeze(Op, Name); } + case Instruction::FCmp: case Instruction::ICmp: { bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); @@ -860,7 +861,7 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *Res = State.get(getOperand(0)); for (VPValue *Op : drop_begin(operands())) Res = Builder.CreateOr(Res, State.get(Op)); - return Builder.CreateOrReduce(Res); + return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res); } case VPInstruction::FirstActiveLane: { if (getNumOperands() == 1) { @@ -1033,6 +1034,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { switch (getOpcode()) { case Instruction::ExtractElement: case Instruction::Freeze: + case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: case VPInstruction::AnyOf: @@ -1068,6 +1070,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { return Op == getOperand(1); case Instruction::PHI: return true; + case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: case Instruction::Or: @@ -1100,6 +1103,7 @@ bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { switch (getOpcode()) { default: return false; + case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: return vputils::onlyFirstPartUsed(this); @@ -1786,7 +1790,7 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { return Opcode == Instruction::ZExt; break; case OperationType::Cmp: - return Opcode == Instruction::ICmp; + return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp; case OperationType::Other: return true; } @@ -3441,7 +3445,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { VPValue *BlockInMask = getMask(); VPValue *Addr = getAddr(); Value *ResAddr = State.get(Addr, VPLane(0)); - Value *PoisonVec = PoisonValue::get(VecTy); auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor](Value *MaskForGaps) -> Value * { @@ -3480,6 +3483,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { Instruction *NewLoad; if (BlockInMask || MaskForGaps) { Value *GroupMask = CreateGroupMask(MaskForGaps); + Value *PoisonVec = PoisonValue::get(VecTy); NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr, Group->getAlign(), GroupMask, PoisonVec, "wide.masked.vec"); @@ -3489,57 +3493,39 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { Group->addMetadata(NewLoad); ArrayRef<VPValue *> VPDefs = definedValues(); - const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); if (VecTy->isScalableTy()) { // 
Scalable vectors cannot use arbitrary shufflevectors (only splats), // so must use intrinsics to deinterleave. assert(InterleaveFactor <= 8 && "Unsupported deinterleave factor for scalable vectors"); - Value *Deinterleave = State.Builder.CreateIntrinsic( + NewLoad = State.Builder.CreateIntrinsic( getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(), NewLoad, /*FMFSource=*/nullptr, "strided.vec"); + } - for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { - Instruction *Member = Group->getMember(I); - Value *StridedVec = State.Builder.CreateExtractValue(Deinterleave, I); - if (!Member) { - // This value is not needed as it's not used - cast<Instruction>(StridedVec)->eraseFromParent(); - continue; - } - // If this member has different type, cast the result type. - if (Member->getType() != ScalarTy) { - VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); - StridedVec = - createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); - } - - if (Group->isReverse()) - StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); - - State.set(VPDefs[J], StridedVec); - ++J; - } + auto CreateStridedVector = [&InterleaveFactor, &State, + &NewLoad](unsigned Index) -> Value * { + assert(Index < InterleaveFactor && "Illegal group index"); + if (State.VF.isScalable()) + return State.Builder.CreateExtractValue(NewLoad, Index); - return; - } - assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); + // For fixed length VF, use shuffle to extract the sub-vectors from the + // wide load. + auto StrideMask = + createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue()); + return State.Builder.CreateShuffleVector(NewLoad, StrideMask, + "strided.vec"); + }; - // For each member in the group, shuffle out the appropriate data from the - // wide loads. - unsigned J = 0; - for (unsigned I = 0; I < InterleaveFactor; ++I) { + for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { Instruction *Member = Group->getMember(I); // Skip the gaps in the group. if (!Member) continue; - auto StrideMask = - createStrideMask(I, InterleaveFactor, State.VF.getFixedValue()); - Value *StridedVec = - State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec"); + Value *StridedVec = CreateStridedVector(I); // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 6a3b3e6..cb370fe 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1481,9 +1481,9 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, // (BranchOnCond true). 
auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry()); auto *CanIVTy = Plan.getCanonicalIV()->getScalarType(); - if (all_of( - Header->phis(), - IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) { + if (all_of(Header->phis(), + IsaPred<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe, + VPFirstOrderRecurrencePHIRecipe>)) { for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) { auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(&HeaderR); HeaderPhiR->replaceAllUsesWith(HeaderPhiR->getStartValue()); @@ -3275,10 +3275,13 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, } auto *WideLoad = cast<VPWidenLoadRecipe>(R); + VPValue *PtrOp = WideLoad->getAddr(); + if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp)) + PtrOp = VecPtr->getOperand(0); // Narrow wide load to uniform scalar load, as transformed VPlan will only // process one original iteration. - auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), - WideLoad->operands(), /*IsUniform*/ true, + auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp}, + /*IsUniform*/ true, /*Mask*/ nullptr, *WideLoad); N->insertBefore(WideLoad); return N; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 84a1247..ab189f6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -103,6 +103,12 @@ struct VPlanTransforms { /// not valid. static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder); + /// Check if \p Plan contains any FMaxNum or FMinNum reductions. If they do, + /// try to update the vector loop to exit early if any input is NaN and resume + /// executing in the scalar loop to handle the NaNs there. Return false if + /// this attempt was unsuccessful. + static bool handleMaxMinNumReductions(VPlan &Plan); + /// Clear NSW/NUW flags from reduction instructions if necessary. 
static void clearReductionWrapFlags(VPlan &Plan); diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index fe8d74c..82adc34 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -115,7 +115,7 @@ private: bool foldInsExtFNeg(Instruction &I); bool foldInsExtBinop(Instruction &I); bool foldInsExtVectorToShuffle(Instruction &I); - bool foldBitOpOfBitcasts(Instruction &I); + bool foldBitOpOfCastops(Instruction &I); bool foldBitcastShuffle(Instruction &I); bool scalarizeOpOrCmp(Instruction &I); bool scalarizeVPIntrinsic(Instruction &I); @@ -808,48 +808,87 @@ bool VectorCombine::foldInsExtBinop(Instruction &I) { return true; } -bool VectorCombine::foldBitOpOfBitcasts(Instruction &I) { - // Match: bitop(bitcast(x), bitcast(y)) -> bitcast(bitop(x, y)) - Value *LHSSrc, *RHSSrc; - if (!match(&I, m_BitwiseLogic(m_BitCast(m_Value(LHSSrc)), - m_BitCast(m_Value(RHSSrc))))) +/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y)) +/// Supports: bitcast, trunc, sext, zext +bool VectorCombine::foldBitOpOfCastops(Instruction &I) { + // Check if this is a bitwise logic operation + auto *BinOp = dyn_cast<BinaryOperator>(&I); + if (!BinOp || !BinOp->isBitwiseLogicOp()) return false; + // Get the cast instructions + auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0)); + auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1)); + if (!LHSCast || !RHSCast) { + LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n"); + return false; + } + + // Both casts must be the same type + Instruction::CastOps CastOpcode = LHSCast->getOpcode(); + if (CastOpcode != RHSCast->getOpcode()) + return false; + + // Only handle supported cast operations + switch (CastOpcode) { + case Instruction::BitCast: + case Instruction::Trunc: + case Instruction::SExt: + case Instruction::ZExt: + break; + default: + return false; + } + + Value *LHSSrc = LHSCast->getOperand(0); + Value *RHSSrc = RHSCast->getOperand(0); + // Source types must match if (LHSSrc->getType() != RHSSrc->getType()) return false; - if (!LHSSrc->getType()->getScalarType()->isIntegerTy()) - return false; - // Only handle vector types + // Only handle vector types with integer elements auto *SrcVecTy = dyn_cast<FixedVectorType>(LHSSrc->getType()); auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType()); if (!SrcVecTy || !DstVecTy) return false; - // Same total bit width - assert(SrcVecTy->getPrimitiveSizeInBits() == - DstVecTy->getPrimitiveSizeInBits() && - "Bitcast should preserve total bit width"); + if (!SrcVecTy->getScalarType()->isIntegerTy() || + !DstVecTy->getScalarType()->isIntegerTy()) + return false; // Cost Check : - // OldCost = bitlogic + 2*bitcasts - // NewCost = bitlogic + bitcast - auto *BinOp = cast<BinaryOperator>(&I); + // OldCost = bitlogic + 2*casts + // NewCost = bitlogic + cast + + // Calculate specific costs for each cast with instruction context + InstructionCost LHSCastCost = + TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy, + TTI::CastContextHint::None, CostKind, LHSCast); + InstructionCost RHSCastCost = + TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy, + TTI::CastContextHint::None, CostKind, RHSCast); + InstructionCost OldCost = - TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy) + - TTI.getCastInstrCost(Instruction::BitCast, DstVecTy, LHSSrc->getType(), - TTI::CastContextHint::None) + - TTI.getCastInstrCost(Instruction::BitCast, DstVecTy, RHSSrc->getType(), - 
TTI::CastContextHint::None); + TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy, CostKind) + + LHSCastCost + RHSCastCost; + + // For new cost, we can't provide an instruction (it doesn't exist yet) + InstructionCost GenericCastCost = TTI.getCastInstrCost( + CastOpcode, DstVecTy, SrcVecTy, TTI::CastContextHint::None, CostKind); + InstructionCost NewCost = - TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy) + - TTI.getCastInstrCost(Instruction::BitCast, DstVecTy, SrcVecTy, - TTI::CastContextHint::None); + TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy, CostKind) + + GenericCastCost; - LLVM_DEBUG(dbgs() << "Found a bitwise logic op of bitcasted values: " << I - << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost - << "\n"); + // Account for multi-use casts using specific costs + if (!LHSCast->hasOneUse()) + NewCost += LHSCastCost; + if (!RHSCast->hasOneUse()) + NewCost += RHSCastCost; + + LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost + << " NewCost=" << NewCost << "\n"); if (NewCost > OldCost) return false; @@ -862,8 +901,16 @@ bool VectorCombine::foldBitOpOfBitcasts(Instruction &I) { Worklist.pushValue(NewOp); - // Bitcast the result back - Value *Result = Builder.CreateBitCast(NewOp, I.getType()); + // Create the cast operation directly to ensure we get a new instruction + Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType()); + + // Preserve cast instruction flags + NewCast->copyIRFlags(LHSCast); + NewCast->andIRFlags(RHSCast); + + // Insert the new instruction + Value *Result = Builder.Insert(NewCast); + replaceValue(I, *Result); return true; } @@ -3773,7 +3820,7 @@ bool VectorCombine::run() { case Instruction::And: case Instruction::Or: case Instruction::Xor: - MadeChange |= foldBitOpOfBitcasts(I); + MadeChange |= foldBitOpOfCastops(I); break; default: MadeChange |= shrinkType(I); |
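The VectorCombine change above widens the old bitcast-only fold to trunc, sext, and zext, which is sound because bitwise logic commutes element-wise with those casts. A small self-contained check of the scalar identity for zero extension; the values are arbitrary, and sext and trunc behave the same way for and/or/xor:

#include <cassert>
#include <cstdint>

int main() {
  // bitop(zext(x), zext(y)) == zext(bitop(x, y)): the scalar identity the
  // vector fold relies on.
  for (uint32_t I = 0; I < 256; ++I) {
    for (uint32_t J = 0; J < 256; ++J) {
      uint8_t X = static_cast<uint8_t>(I), Y = static_cast<uint8_t>(J);
      uint32_t WideAnd = static_cast<uint32_t>(X) & static_cast<uint32_t>(Y);
      assert(WideAnd == static_cast<uint32_t>(static_cast<uint8_t>(X & Y)));
      uint32_t WideXor = static_cast<uint32_t>(X) ^ static_cast<uint32_t>(Y);
      assert(WideXor == static_cast<uint32_t>(static_cast<uint8_t>(X ^ Y)));
    }
  }
  return 0;
}

The rewrite still only fires when the TTI cost of the narrowed bit operation plus one cast does not exceed the original operation plus two casts, with multi-use casts charged back into the new cost, as the hunk above shows.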