Diffstat (limited to 'llvm/lib/Transforms')
22 files changed, 204 insertions, 114 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8d9933b..92fca90 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3496,7 +3496,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     if (isPowerOf2_64(AlignMask + 1)) {
       uint64_t Offset = 0;
       match(A, m_Add(m_Value(A), m_ConstantInt(Offset)));
-      if (match(A, m_PtrToInt(m_Value(A)))) {
+      if (match(A, m_PtrToIntOrAddr(m_Value(A)))) {
         /// Note: this doesn't preserve the offset information but merges
         /// offset and alignment.
         /// TODO: we can generate a GEP instead of merging the alignment with
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 9b9fe26..614c6eb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1525,7 +1525,15 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) {
   }

   // Try to extend the entire expression tree to the wide destination type.
-  if (shouldChangeType(SrcTy, DestTy) && canEvaluateSExtd(Src, DestTy)) {
+  bool ShouldExtendExpression = true;
+  Value *TruncSrc = nullptr;
+  // It is not desirable to extend the expression in the trunc + sext pattern
+  // when the destination type is narrower than the original (pre-trunc) type.
+  if (match(Src, m_Trunc(m_Value(TruncSrc))))
+    if (TruncSrc->getType()->getScalarSizeInBits() > DestBitSize)
+      ShouldExtendExpression = false;
+  if (ShouldExtendExpression && shouldChangeType(SrcTy, DestTy) &&
+      canEvaluateSExtd(Src, DestTy)) {
     // Okay, we can transform this! Insert the new expression now.
     LLVM_DEBUG(
         dbgs() << "ICE: EvaluateInDifferentType converting expression type"
@@ -1545,13 +1553,18 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) {
         ShAmt);
   }

-  Value *X;
-  if (match(Src, m_Trunc(m_Value(X)))) {
+  Value *X = TruncSrc;
+  if (X) {
     // If the input has more sign bits than bits truncated, then convert
     // directly to final type.
     unsigned XBitSize = X->getType()->getScalarSizeInBits();
-    if (ComputeNumSignBits(X, &Sext) > XBitSize - SrcBitSize)
-      return CastInst::CreateIntegerCast(X, DestTy, /* isSigned */ true);
+    bool HasNSW = cast<TruncInst>(Src)->hasNoSignedWrap();
+    if (HasNSW || (ComputeNumSignBits(X, &Sext) > XBitSize - SrcBitSize)) {
+      auto *Res = CastInst::CreateIntegerCast(X, DestTy, /* isSigned */ true);
+      if (auto *ResTrunc = dyn_cast<TruncInst>(Res); ResTrunc && HasNSW)
+        ResTrunc->setHasNoSignedWrap(true);
+      return Res;
+    }

     // If input is a trunc from the destination type, then convert into shifts.
     if (Src->hasOneUse() && X->getType() == DestTy) {
@@ -2135,7 +2148,7 @@ Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) {
   return nullptr;
 }

-Value *InstCombinerImpl::foldPtrToIntOfGEP(Type *IntTy, Value *Ptr) {
+Value *InstCombinerImpl::foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr) {
   // Look through chain of one-use GEPs.
   Type *PtrTy = Ptr->getType();
   SmallVector<GEPOperator *> GEPs;
@@ -2197,7 +2210,7 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
       Mask->getType() == Ty)
     return BinaryOperator::CreateAnd(Builder.CreatePtrToInt(Ptr, Ty), Mask);

-  if (Value *V = foldPtrToIntOfGEP(Ty, SrcOp))
+  if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp))
     return replaceInstUsesWith(CI, V);

   Value *Vec, *Scalar, *Index;
@@ -2215,6 +2228,21 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
 }

 Instruction *InstCombinerImpl::visitPtrToAddr(PtrToAddrInst &CI) {
+  Value *SrcOp = CI.getPointerOperand();
+  Type *Ty = CI.getType();
+
+  // (ptrtoaddr (ptrmask P, M))
+  //    -> (and (ptrtoaddr P), M)
+  // This is generally beneficial as `and` is better supported than `ptrmask`.
+  Value *Ptr, *Mask;
+  if (match(SrcOp, m_OneUse(m_Intrinsic<Intrinsic::ptrmask>(m_Value(Ptr),
+                                                            m_Value(Mask)))) &&
+      Mask->getType() == Ty)
+    return BinaryOperator::CreateAnd(Builder.CreatePtrToAddr(Ptr), Mask);
+
+  if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp))
+    return replaceInstUsesWith(CI, V);
+
   // FIXME: Implement variants of ptrtoint folds.
   return commonCastTransforms(CI);
 }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 9c75d9a..d85e4f7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -700,7 +700,7 @@ public:
   /// folded operation.
   void PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN);

-  Value *foldPtrToIntOfGEP(Type *IntTy, Value *Ptr);
+  Value *foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr);
   Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, CmpPredicate Cond,
                            Instruction &I);
   Instruction *foldSelectICmp(CmpPredicate Pred, SelectInst *SI, Value *RHS,
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 651e305..550dfc5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -105,6 +105,8 @@ static Value *simplifyShiftSelectingPackedElement(Instruction *I,
   if (~KnownShrBits.Zero != ShlAmt)
     return nullptr;

+  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
+  IC.Builder.SetInsertPoint(I);
   Value *ShrAmtZ =
       IC.Builder.CreateICmpEQ(ShrAmt, Constant::getNullValue(ShrAmt->getType()),
                               ShrAmt->getName() + ".z");
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index cb6ca72..7c364f8 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1539,7 +1539,7 @@ void AddressSanitizer::getInterestingMemoryOperands(
                IID == Intrinsic::experimental_vp_strided_load) {
       Stride = VPI->getOperand(PtrOpNo + 1);
       // Use the pointer alignment as the element alignment if the stride is a
-      // mutiple of the pointer alignment. Otherwise, the element alignment
+      // multiple of the pointer alignment. Otherwise, the element alignment
       // should be Align(1).
       unsigned PointerAlign = Alignment.valueOrOne().value();
       if (!isa<ConstantInt>(Stride) ||
@@ -2399,7 +2399,7 @@ void ModuleAddressSanitizer::instrumentGlobalsELF(
   // Putting globals in a comdat changes the semantic and potentially cause
   // false negative odr violations at link time. If odr indicators are used, we
-  // keep the comdat sections, as link time odr violations will be dectected on
+  // keep the comdat sections, as link time odr violations will be detected on
   // the odr indicator symbols.
   bool UseComdatForGlobalsGC = UseOdrIndicator && !UniqueModuleId.empty();

@@ -3858,7 +3858,7 @@ void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {
     I->eraseFromParent();
   }

-  // Replace all uses of AddessReturnedByAlloca with NewAddressPtr.
+  // Replace all uses of AddressReturnedByAlloca with NewAddressPtr.
   AI->replaceAllUsesWith(NewAddressPtr);

   // We are done. Erase old alloca from parent.
diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index 72e8e50..0688bc7 100644
--- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -359,7 +359,7 @@ class CHR {
     unsigned Count = 0;
     // Find out how many times region R is cloned. Note that if the parent
     // of R is cloned, R is also cloned, but R's clone count is not updated
-    // from the clone of the parent. We need to accumlate all the counts
+    // from the clone of the parent. We need to accumulate all the counts
     // from the ancestors to get the clone count.
     while (R) {
       Count += DuplicationCount[R];
@@ -1513,7 +1513,7 @@ static bool negateICmpIfUsedByBranchOrSelectOnly(ICmpInst *ICmp,
       BI->swapSuccessors();
       // Don't need to swap this in terms of
       // TrueBiasedRegions/FalseBiasedRegions because true-based/false-based
-      // mean whehter the branch is likely go into the if-then rather than
+      // mean whether the branch is likely to go into the if-then rather than
       // successor0/successor1 and because we can tell which edge is the then or
       // the else one by comparing the destination to the region exit block.
       continue;
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index cf87e35..1e5946a 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -83,7 +83,7 @@ static cl::opt<unsigned>
 // ICP the candidate function even when only a declaration is present.
 static cl::opt<bool> ICPAllowDecls(
     "icp-allow-decls", cl::init(false), cl::Hidden,
-    cl::desc("Promote the target candidate even when the defintion "
+    cl::desc("Promote the target candidate even when the definition "
             " is not available"));

 // ICP hot candidate functions only. When setting to false, non-cold functions
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 5e7548b..7795cce 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -139,7 +139,7 @@ cl::opt<bool> ConditionalCounterUpdate(
     cl::init(false));

 // If the option is not specified, the default behavior about whether
-// counter promotion is done depends on how instrumentaiton lowering
+// counter promotion is done depends on how instrumentation lowering
 // pipeline is setup, i.e., the default value of true of this option
 // does not mean the promotion will be done by default. Explicitly
 // setting this option can override the default behavior.
@@ -1052,7 +1052,7 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
   GlobalVariable *Name = Ind->getName();
   auto It = ProfileDataMap.find(Name);
   assert(It != ProfileDataMap.end() && It->second.DataVar &&
-         "value profiling detected in function with no counter incerement");
+         "value profiling detected in function with no counter increment");
   GlobalVariable *DataVar = It->second.DataVar;
   uint64_t ValueKind = Ind->getValueKind()->getZExtValue();

diff --git a/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp
index 3c0f185..05616d8 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp
@@ -490,7 +490,7 @@ void createProfileFileNameVar(Module &M) {
   }
 }

-// Set MemprofHistogramFlag as a Global veriable in IR. This makes it accessible
+// Set MemprofHistogramFlag as a Global variable in IR. This makes it accessible
 // to the runtime, changing shadow count behavior.
 void createMemprofHistogramFlagVar(Module &M) {
   const StringRef VarName(MemProfHistogramFlagVar);
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 10b03bb..471c6ec 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3136,7 +3136,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   /// If we don't instrument it and it gets inlined,
   /// our interceptor will not kick in and we will lose the memmove.
   /// If we instrument the call here, but it does not get inlined,
-  /// we will memove the shadow twice: which is bad in case
+  /// we will memmove the shadow twice: which is bad in case
   /// of overlapping regions. So, we simply lower the intrinsic to a call.
   ///
   /// Similar situation exists for memcpy and memset.
@@ -4775,7 +4775,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   // _mm_round_pd / _mm_round_ps.
   //
   // Similar to maybeHandleSimpleNomemIntrinsic except
-  // the second argument is guranteed to be a constant integer.
+  // the second argument is guaranteed to be a constant integer.
   void handleRoundPdPsIntrinsic(IntrinsicInst &I) {
     assert(I.getArgOperand(0)->getType() == I.getType());
     assert(I.arg_size() == 2);
diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp
index f5b6686..5f87ed6 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp
@@ -176,7 +176,7 @@ PreservedAnalyses PGOCtxProfFlatteningPass::run(Module &M,
     assert(areAllBBsReachable(
                F, MAM.getResult<FunctionAnalysisManagerModuleProxy>(M)
                       .getManager()) &&
-           "Function has unreacheable basic blocks. The expectation was that "
+           "Function has unreachable basic blocks. The expectation was that "
           "DCE was run before.");

     auto It = FlattenedProfile.find(AssignGUIDPass::getGUID(F));
diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
index 0a358d4..de7c169 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
@@ -253,7 +253,7 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
   Value *RealContext = nullptr;

   StructType *ThisContextType = nullptr;
-  Value *TheRootFuctionData = nullptr;
+  Value *TheRootFunctionData = nullptr;
   Value *ExpectedCalleeTLSAddr = nullptr;
   Value *CallsiteInfoTLSAddr = nullptr;
   const bool HasMusttail = [&F]() {
@@ -283,7 +283,7 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
         Guid = Builder.getInt64(
             AssignGUIDPass::getGUID(cast<Function>(*Mark->getNameValue())));
         // The type of the context of this function is now knowable since we have
-        // NumCallsites and NumCounters. We delcare it here because it's more
+        // NumCallsites and NumCounters. We declare it here because it's more
         // convenient - we have the Builder.
         ThisContextType = StructType::get(
             F.getContext(),
@@ -291,28 +291,27 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
             ArrayType::get(Builder.getPtrTy(), NumCallsites)});
         // Figure out which way we obtain the context object for this function -
        // if it's an entrypoint, then we call StartCtx, otherwise GetCtx. In the
-        // former case, we also set TheRootFuctionData since we need to release it
-        // at the end (plus it can be used to know if we have an entrypoint or a
-        // regular function)
-        // Don't set a name, they end up taking a lot of space and we don't need
-        // them.
+        // former case, we also set TheRootFunctionData since we need to release
+        // it at the end (plus it can be used to know if we have an entrypoint or
+        // a regular function). Don't set a name, they end up taking a lot of
+        // space and we don't need them.
         // Zero-initialize the FunctionData, except for functions that have
         // musttail calls. There, we set the CtxRoot field to 1, which will be
         // treated as a "can't be set as root".
-        TheRootFuctionData = new GlobalVariable(
+        TheRootFunctionData = new GlobalVariable(
             M, FunctionDataTy, false, GlobalVariable::InternalLinkage,
             HasMusttail ? CannotBeRootInitializer
                         : Constant::getNullValue(FunctionDataTy));

         if (ContextRootSet.contains(&F)) {
           Context = Builder.CreateCall(
-              StartCtx, {TheRootFuctionData, Guid, Builder.getInt32(NumCounters),
+              StartCtx, {TheRootFunctionData, Guid, Builder.getInt32(NumCounters),
                         Builder.getInt32(NumCallsites)});
           ORE.emit(
               [&] { return OptimizationRemark(DEBUG_TYPE, "Entrypoint", &F); });
         } else {
-          Context = Builder.CreateCall(GetCtx, {TheRootFuctionData, &F, Guid,
+          Context = Builder.CreateCall(GetCtx, {TheRootFunctionData, &F, Guid,
                                                Builder.getInt32(NumCounters),
                                                Builder.getInt32(NumCallsites)});
           ORE.emit([&] {
@@ -399,7 +398,7 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
       } else if (!HasMusttail && isa<ReturnInst>(I)) {
         // Remember to release the context if we are an entrypoint.
         IRBuilder<> Builder(&I);
-        Builder.CreateCall(ReleaseCtx, {TheRootFuctionData});
+        Builder.CreateCall(ReleaseCtx, {TheRootFunctionData});
         ContextWasReleased = true;
       }
     }
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 120c4f6..71736cf 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1957,7 +1957,7 @@ static bool InstrumentAllFunctions(
     function_ref<BlockFrequencyInfo *(Function &)> LookupBFI,
     function_ref<LoopInfo *(Function &)> LookupLI,
     PGOInstrumentationType InstrumentationType) {
-  // For the context-sensitve instrumentation, we should have a separated pass
+  // For the context-sensitive instrumentation, we should have a separate pass
   // (before LTO/ThinLTO linking) to create these variables.
   if (InstrumentationType == PGOInstrumentationType::FDO)
     createIRLevelProfileFlagVar(M, InstrumentationType);
@@ -2248,7 +2248,7 @@ static bool annotateAllFunctions(
       Func.populateCoverage();
       continue;
     }
-    // When PseudoKind is set to a vaule other than InstrProfRecord::NotPseudo,
+    // When PseudoKind is set to a value other than InstrProfRecord::NotPseudo,
     // it means the profile for the function is unrepresentative and this
     // function is actually hot / warm. We will reset the function hot / cold
     // attribute and drop all the profile counters.
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerBinaryMetadata.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerBinaryMetadata.cpp
index 4801ac7..210b126 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerBinaryMetadata.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerBinaryMetadata.cpp
@@ -481,15 +481,18 @@ StringRef SanitizerBinaryMetadata::getSectionEnd(StringRef SectionSuffix) {
 } // namespace

 SanitizerBinaryMetadataPass::SanitizerBinaryMetadataPass(
-    SanitizerBinaryMetadataOptions Opts, ArrayRef<std::string> IgnorelistFiles)
-    : Options(std::move(Opts)), IgnorelistFiles(std::move(IgnorelistFiles)) {}
+    SanitizerBinaryMetadataOptions Opts,
+    IntrusiveRefCntPtr<vfs::FileSystem> VFS,
+    ArrayRef<std::string> IgnorelistFiles)
+    : Options(std::move(Opts)),
+      VFS(VFS ? std::move(VFS) : vfs::getRealFileSystem()),
+      IgnorelistFiles(std::move(IgnorelistFiles)) {}

 PreservedAnalyses SanitizerBinaryMetadataPass::run(Module &M,
                                                    AnalysisManager<Module> &AM) {
   std::unique_ptr<SpecialCaseList> Ignorelist;
   if (!IgnorelistFiles.empty()) {
-    Ignorelist = SpecialCaseList::createOrDie(IgnorelistFiles,
-                                              *vfs::getRealFileSystem());
+    Ignorelist = SpecialCaseList::createOrDie(IgnorelistFiles, *VFS);
     if (Ignorelist->inSection("metadata", "src", M.getSourceFileName()))
       return PreservedAnalyses::all();
   }
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index b74a070..09abf6a 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -318,6 +318,18 @@ private:
 };

 } // namespace

+SanitizerCoveragePass::SanitizerCoveragePass(
+    SanitizerCoverageOptions Options, IntrusiveRefCntPtr<vfs::FileSystem> VFS,
+    const std::vector<std::string> &AllowlistFiles,
+    const std::vector<std::string> &BlocklistFiles)
+    : Options(std::move(Options)),
+      VFS(VFS ? std::move(VFS) : vfs::getRealFileSystem()) {
+  if (AllowlistFiles.size() > 0)
+    Allowlist = SpecialCaseList::createOrDie(AllowlistFiles, *this->VFS);
+  if (BlocklistFiles.size() > 0)
+    Blocklist = SpecialCaseList::createOrDie(BlocklistFiles, *this->VFS);
+}
+
 PreservedAnalyses SanitizerCoveragePass::run(Module &M,
                                              ModuleAnalysisManager &MAM) {
   auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 0d48a35..fd0e9f1 100644
--- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -355,7 +355,7 @@ static bool isVtableAccess(Instruction *I) {
 }

 // Do not instrument known races/"benign races" that come from compiler
-// instrumentatin. The user has no way of suppressing them.
+// instrumentation. The user has no way of suppressing them.
 static bool shouldInstrumentReadWriteFromAddress(const Module *M, Value *Addr) {
   // Peel off GEPs and BitCasts.
   Addr = Addr->stripInBoundsOffsets();
diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
index 9471ae3..78d4a57e 100644
--- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
@@ -643,7 +643,7 @@ bool TypeSanitizer::instrumentWithShadowUpdate(
   // doesn't match, then we call the runtime (which may yet determine that
   // the mismatch is okay).
   //
-  // The checks generated below have the following strucutre.
+  // The checks generated below have the following structure.
   //
   // ; First we load the descriptor for the load from shadow memory and
   // ; compare it against the type descriptor for the current access type.
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index 371d9e6..a9ab3b3 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -819,7 +819,7 @@ public:
       OS << "]";
     } else if (const auto *PS = dyn_cast<PredicateSwitch>(PI)) {
       OS << "; switch predicate info { CaseValue: " << *PS->CaseValue
-         << " Switch:" << *PS->Switch << " Edge: [";
+         << " Edge: [";
       PS->From->printAsOperand(OS);
       OS << ",";
       PS->To->printAsOperand(OS);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index facb0fa..f7968ab 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7543,12 +7543,13 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   }

   if (LoadInst *Load = dyn_cast<LoadInst>(I))
     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
-                                 VPIRMetadata(*Load, LVer), I->getDebugLoc());
+                                 Load->getAlign(), VPIRMetadata(*Load, LVer),
+                                 I->getDebugLoc());

   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
-                                Reverse, VPIRMetadata(*Store, LVer),
-                                I->getDebugLoc());
+                                Reverse, Store->getAlign(),
+                                VPIRMetadata(*Store, LVer), I->getDebugLoc());
 }

 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5b9f005..1f10058 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3179,6 +3179,9 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
 protected:
   Instruction &Ingredient;

+  /// Alignment information for this memory access.
+  Align Alignment;
+
   /// Whether the accessed addresses are consecutive.
   bool Consecutive;

@@ -3198,10 +3201,10 @@ protected:
   VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
                       std::initializer_list<VPValue *> Operands,
-                      bool Consecutive, bool Reverse,
+                      bool Consecutive, bool Reverse, Align Alignment,
                       const VPIRMetadata &Metadata, DebugLoc DL)
       : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
-        Consecutive(Consecutive), Reverse(Reverse) {
+        Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) {
     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
   }

@@ -3242,6 +3245,9 @@ public:
     return isMasked() ? getOperand(getNumOperands() - 1) : nullptr;
   }

+  /// Returns the alignment of the memory access.
+  Align getAlign() const { return Alignment; }
+
   /// Generate the wide load/store.
   void execute(VPTransformState &State) override {
     llvm_unreachable("VPWidenMemoryRecipe should not be instantiated.");
@@ -3259,18 +3265,18 @@ public:
 struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
                                                    public VPValue {
   VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
-                    bool Consecutive, bool Reverse,
+                    bool Consecutive, bool Reverse, Align Alignment,
                     const VPIRMetadata &Metadata, DebugLoc DL)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
-                            Reverse, Metadata, DL),
+                            Reverse, Alignment, Metadata, DL),
         VPValue(this, &Load) {
     setMask(Mask);
   }

   VPWidenLoadRecipe *clone() override {
     return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
-                                 getMask(), Consecutive, Reverse, *this,
-                                 getDebugLoc());
+                                 getMask(), Consecutive, Reverse, getAlign(),
+                                 *this, getDebugLoc());
   }

   VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
@@ -3301,8 +3307,8 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
                        VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
-                            {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
-                            L.getDebugLoc()),
+                            {Addr, &EVL}, L.isConsecutive(), L.isReverse(),
+                            L.getAlign(), L, L.getDebugLoc()),
         VPValue(this, &getIngredient()) {
     setMask(Mask);
   }
@@ -3340,16 +3346,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
 struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
   VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
                      VPValue *Mask, bool Consecutive, bool Reverse,
-                     const VPIRMetadata &Metadata, DebugLoc DL)
+                     Align Alignment, const VPIRMetadata &Metadata, DebugLoc DL)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
-                            Consecutive, Reverse, Metadata, DL) {
+                            Consecutive, Reverse, Alignment, Metadata, DL) {
     setMask(Mask);
   }

   VPWidenStoreRecipe *clone() override {
     return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
                                   getStoredValue(), getMask(), Consecutive,
-                                  Reverse, *this, getDebugLoc());
+                                  Reverse, getAlign(), *this, getDebugLoc());
   }

   VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -3384,7 +3390,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
                        VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
                             {Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
-                            S.isReverse(), S, S.getDebugLoc()) {
+                            S.isReverse(), S.getAlign(), S, S.getDebugLoc()) {
     setMask(Mask);
   }

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 931a5b7..9a63c80 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -70,6 +70,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
     return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
   case VPCanonicalIVPHISC:
   case VPBranchOnMaskSC:
+  case VPDerivedIVSC:
   case VPFirstOrderRecurrencePHISC:
   case VPReductionPHISC:
   case VPScalarIVStepsSC:
@@ -86,6 +87,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
   case VPWidenPHISC:
+  case VPWidenPointerInductionSC:
   case VPWidenSC:
   case VPWidenSelectSC: {
     const Instruction *I =
@@ -119,6 +121,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
   case VPWidenIntrinsicSC:
     return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
   case VPBranchOnMaskSC:
+  case VPDerivedIVSC:
   case VPFirstOrderRecurrencePHISC:
   case VPPredInstPHISC:
   case VPScalarIVStepsSC:
@@ -134,6 +137,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
   case VPWidenGEPSC:
   case VPWidenIntOrFpInductionSC:
   case VPWidenPHISC:
+  case VPWidenPointerInductionSC:
   case VPWidenSC:
   case VPWidenSelectSC: {
     const Instruction *I =
@@ -3358,7 +3362,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
     Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
     Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
     const Align Alignment = getLoadStoreAlignment(UI);
-    unsigned AS = getLoadStoreAddressSpace(UI);
+    unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
     InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
         UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
@@ -3525,7 +3529,6 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
   unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
                     ->getAddressSpace();
   unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
@@ -3575,7 +3578,6 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
 void VPWidenLoadRecipe::execute(VPTransformState &State) {
   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
   bool CreateGather = !isConsecutive();

   auto &Builder = State.Builder;
@@ -3630,7 +3632,6 @@ static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
   bool CreateGather = !isConsecutive();

   auto &Builder = State.Builder;
@@ -3674,8 +3675,8 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
   // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we
   // don't need to compare to the legacy cost model.
   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
-  unsigned AS = getLoadStoreAddressSpace(&Ingredient);
+  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
+                    ->getAddressSpace();
   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
       Instruction::Load, Ty, Alignment, AS, Ctx.CostKind);
   if (!Reverse)
@@ -3699,7 +3700,6 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
 void VPWidenStoreRecipe::execute(VPTransformState &State) {
   VPValue *StoredVPValue = getStoredValue();
   bool CreateScatter = !isConsecutive();
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);

   auto &Builder = State.Builder;
@@ -3742,7 +3742,6 @@ void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   VPValue *StoredValue = getStoredValue();
   bool CreateScatter = !isConsecutive();
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);

   auto &Builder = State.Builder;
@@ -3785,8 +3784,8 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
   // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we
   // don't need to compare to the legacy cost model.
   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
-  unsigned AS = getLoadStoreAddressSpace(&Ingredient);
+  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
+                    ->getAddressSpace();
   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
       Instruction::Store, Ty, Alignment, AS, Ctx.CostKind);
   if (!Reverse)
@@ -4252,7 +4251,8 @@ InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
       getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
                                 : getStoredValues()[InsertPosIdx]);
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
-  unsigned AS = getLoadStoreAddressSpace(InsertPos);
+  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
+                    ->getAddressSpace();

   unsigned InterleaveFactor = IG->getFactor();
   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 84817d7..d9ac26bb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -91,13 +91,14 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
       if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
         NewRecipe = new VPWidenLoadRecipe(
             *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
-            false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
-            Ingredient.getDebugLoc());
+            false /*Consecutive*/, false /*Reverse*/, Load->getAlign(),
+            VPIRMetadata(*Load), Ingredient.getDebugLoc());
       } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
         NewRecipe = new VPWidenStoreRecipe(
             *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
             nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
-            VPIRMetadata(*Store), Ingredient.getDebugLoc());
+            Store->getAlign(), VPIRMetadata(*Store),
+            Ingredient.getDebugLoc());
       } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
         NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
       } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
@@ -130,6 +131,24 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
   return true;
 }

+/// Return true if we do not know how to (mechanically) hoist or sink \p R out
+/// of a loop region.
+static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
+  // Assumes don't alias anything or throw; as long as they're guaranteed to
+  // execute, they're safe to hoist.
+  if (match(&R, m_Intrinsic<Intrinsic::assume>()))
+    return false;
+
+  // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
+  // memory location is not modified in the vector loop.
+  if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
+    return true;
+
+  // Allocas cannot be hoisted.
+  auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+  return RepR && RepR->getOpcode() == Instruction::Alloca;
+}
+
 static bool sinkScalarOperands(VPlan &Plan) {
   auto Iter = vp_depth_first_deep(Plan.getEntry());
   bool Changed = false;
@@ -1825,7 +1844,7 @@ sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
         VPDT.properlyDominates(Previous, SinkCandidate))
       return true;

-    if (SinkCandidate->mayHaveSideEffects())
+    if (cannotHoistOrSinkRecipe(*SinkCandidate))
       return false;

     WorkList.push_back(SinkCandidate);
@@ -1865,7 +1884,7 @@ sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
 static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
                                         VPRecipeBase *Previous,
                                         VPDominatorTree &VPDT) {
-  if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
+  if (cannotHoistOrSinkRecipe(*Previous))
     return false;

   // Collect recipes that need hoisting.
@@ -1912,11 +1931,6 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
       return nullptr;
     return HoistCandidate;
   };
-  auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
-    // Avoid hoisting candidates with side-effects, as we do not yet analyze
-    // associated dependencies.
-    return !HoistCandidate->mayHaveSideEffects();
-  };

   if (!NeedsHoisting(Previous->getVPSingleValue()))
     return true;
@@ -1928,7 +1942,7 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
     VPRecipeBase *Current = HoistCandidates[I];
     assert(Current->getNumDefinedValues() == 1 &&
            "only recipes with a single defined value expected");
-    if (!CanHoist(Current))
+    if (cannotHoistOrSinkRecipe(*Current))
       return false;

     for (VPValue *Op : Current->operands()) {
@@ -2143,24 +2157,6 @@ void VPlanTransforms::cse(VPlan &Plan) {
 static void licm(VPlan &Plan) {
   VPBasicBlock *Preheader = Plan.getVectorPreheader();

-  // Return true if we do not know how to (mechanically) hoist a given recipe
-  // out of a loop region.
-  auto CannotHoistRecipe = [](VPRecipeBase &R) {
-    // Assumes don't alias anything or throw; as long as they're guaranteed to
-    // execute, they're safe to hoist.
-    if (match(&R, m_Intrinsic<Intrinsic::assume>()))
-      return false;
-
-    // TODO: Relax checks in the future, e.g. we could also hoist reads, if
-    // their memory location is not modified in the vector loop.
-    if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
-      return true;
-
-    // Allocas cannot be hoisted.
-    auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
-    return RepR && RepR->getOpcode() == Instruction::Alloca;
-  };
-
   // Hoist any loop invariant recipes from the vector loop region to the
   // preheader. Perform a shallow traversal of the vector loop region, to
   // exclude recipes in replicate regions. Since the top-level blocks in the
@@ -2172,7 +2168,7 @@ static void licm(VPlan &Plan) {
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_shallow(LoopRegion->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      if (CannotHoistRecipe(R))
+      if (cannotHoistOrSinkRecipe(R))
         continue;
       if (any_of(R.operands(), [](VPValue *Op) {
             return !Op->isDefinedOutsideLoopRegions();
@@ -3652,6 +3648,37 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
     Sub = VecOp->getDefiningRecipe();
     VecOp = Tmp;
   }
+
+  // If ValB is a constant and can be safely extended, truncate it to the same
+  // type as ExtA's operand, then extend it to the same type as ExtA. This
+  // creates two uniform extends that can more easily be matched by the rest of
+  // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
+  // replaced with the new extend of the constant.
+  auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
+                                           VPWidenCastRecipe *&ExtB,
+                                           VPValue *&ValB, VPWidenRecipe *Mul) {
+    if (!ExtA || ExtB || !ValB->isLiveIn())
+      return;
+    Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
+    Instruction::CastOps ExtOpc = ExtA->getOpcode();
+    const APInt *Const;
+    if (!match(ValB, m_APInt(Const)) ||
+        !llvm::canConstantBeExtended(
+            Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
+      return;
+    // The truncate ensures that the type of each extended operand is the
+    // same, and it's been proven that the constant can be extended from
+    // NarrowTy safely. Necessary since ExtA's extended operand would be
+    // e.g. an i8, while the const will likely be an i32. This will be
+    // elided by later optimisations.
+    VPBuilder Builder(Mul);
+    auto *Trunc =
+        Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
+    Type *WideTy = Ctx.Types.inferScalarType(ExtA);
+    ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
+    Mul->setOperand(1, ExtB);
+  };
+
   // Try to match reduce.add(mul(...)).
   if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
     auto *RecipeA =
@@ -3660,6 +3687,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
         dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
     auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());

+    // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
+    ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
+
     // Match reduce.add/sub(mul(ext, ext)).
     if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
         match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
@@ -3669,7 +3699,6 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
                                  cast<VPWidenRecipe>(Sub), Red);
       return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
     }
-
     // Match reduce.add(mul).
     // TODO: Add an expression type for this variant with a negated mul
     if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
       return new VPExpressionRecipe(Mul, Red);
@@ -3678,18 +3707,26 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
   // variants.
   if (Sub)
     return nullptr;
-  // Match reduce.add(ext(mul(ext(A), ext(B)))).
-  // All extend recipes must have same opcode or A == B
-  // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))).
-  if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
-                                      m_ZExtOrSExt(m_VPValue()))))) {
+
+  // Match reduce.add(ext(mul(A, B))).
+  if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
     auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
     auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
-    auto *Ext0 =
-        cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
-    auto *Ext1 =
-        cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
-    if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
+    auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
+    auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
+
+    // reduce.add(ext(mul(ext, const)))
+    //   -> reduce.add(ext(mul(ext, ext(const))))
+    ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
+
+    // reduce.add(ext(mul(ext(A), ext(B))))
+    //   -> reduce.add(mul(wider_ext(A), wider_ext(B)))
+    // The inner extends must either have the same opcode as the outer extend or
+    // be the same, in which case the multiply can never result in a negative
+    // value and the outer extend can be folded away by doing wider
+    // extends for the operands of the mul.
+    if (Ext0 && Ext1 &&
+        (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
         Ext0->getOpcode() == Ext1->getOpcode() &&
         IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
       auto *NewExt0 = new VPWidenCastRecipe(
@@ -4234,10 +4271,11 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
       // Narrow interleave group to wide load, as transformed VPlan will only
       // process one original iteration.
+      auto *LI =
+          cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
       auto *L = new VPWidenLoadRecipe(
-          *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
-          LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
-          /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
+          *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
+          /*Reverse=*/false, LI->getAlign(), {}, LoadGroup->getDebugLoc());
       L->insertBefore(LoadGroup);
       NarrowedOps.insert(L);
       return L;
@@ -4280,10 +4318,11 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
         Res = NarrowOp(Member0);
       }

+      auto *SI =
+          cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
       auto *S = new VPWidenStoreRecipe(
-          *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
-          StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
-          /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
+          *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
+          /*Reverse=*/false, SI->getAlign(), {}, StoreGroup->getDebugLoc());
       S->insertBefore(StoreGroup);
       StoreGroup->eraseFromParent();
     }
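To illustrate two of the InstCombine changes above, here is a hand-written LLVM IR sketch; the function names and the exact post-fold output are illustrative assumptions, not taken from the patch's tests. Both functions should simplify as commented when run through opt -passes=instcombine with this patch applied:

  declare ptr @llvm.ptrmask.p0.i64(ptr, i64)

  ; New visitPtrToAddr fold: ptrtoaddr of a one-use ptrmask whose mask type
  ; matches the result type becomes (and (ptrtoaddr P), M).
  define i64 @ptrtoaddr_of_ptrmask(ptr %p) {
    %masked = call ptr @llvm.ptrmask.p0.i64(ptr %p, i64 -16)
    %addr = ptrtoaddr ptr %masked to i64
    ; expected: %a = ptrtoaddr ptr %p to i64
    ;           %addr = and i64 %a, -16
    ret i64 %addr
  }

  ; visitSExt change: the nsw flag on the trunc proves the narrow value
  ; sign-extends back to the original, so sext(trunc nsw X) becomes X
  ; directly, without needing ComputeNumSignBits to prove it.
  define i64 @sext_of_trunc_nsw(i64 %x) {
    %t = trunc nsw i64 %x to i32
    %s = sext i32 %t to i64
    ; expected: ret i64 %x
    ret i64 %s
  }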
