Diffstat (limited to 'llvm/lib')
75 files changed, 1526 insertions, 410 deletions
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index 757f689..c4fee39 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -751,7 +751,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { if (CA.analyze().isSuccess()) { // We were able to inline the indirect call! Subtract the cost from the // threshold to get the bonus we want to apply, but don't go below zero. - Cost -= std::max(0, CA.getThreshold() - CA.getCost()); + addCost(-std::max(0, CA.getThreshold() - CA.getCost())); } } else // Otherwise simply add the cost for merely making the call. @@ -1191,7 +1191,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { // If this function uses the coldcc calling convention, prefer not to inline // it. if (F.getCallingConv() == CallingConv::Cold) - Cost += InlineConstants::ColdccPenalty; + addCost(InlineConstants::ColdccPenalty); LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n"); @@ -2193,7 +2193,7 @@ void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) { // the cost of inlining it drops dramatically. It may seem odd to update // Cost in updateThreshold, but the bonus depends on the logic in this method. if (isSoleCallToLocalFunction(Call, F)) { - Cost -= LastCallToStaticBonus; + addCost(-LastCallToStaticBonus); StaticBonusApplied = LastCallToStaticBonus; } } diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp index 050c327..424a7fe 100644 --- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp +++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp @@ -436,10 +436,9 @@ bool IndexedReference::delinearize(const LoopInfo &LI) { const SCEV *StepRec = AccessFnAR ? AccessFnAR->getStepRecurrence(SE) : nullptr; if (StepRec && SE.isKnownNegative(StepRec)) - AccessFn = SE.getAddRecExpr(AccessFnAR->getStart(), - SE.getNegativeSCEV(StepRec), - AccessFnAR->getLoop(), - AccessFnAR->getNoWrapFlags()); + AccessFn = SE.getAddRecExpr( + AccessFnAR->getStart(), SE.getNegativeSCEV(StepRec), + AccessFnAR->getLoop(), SCEV::NoWrapFlags::FlagAnyWrap); const SCEV *Div = SE.getUDivExactExpr(AccessFn, ElemSize); Subscripts.push_back(Div); Sizes.push_back(ElemSize); diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp index 1c5f08e..edca387 100644 --- a/llvm/lib/Analysis/MemoryLocation.cpp +++ b/llvm/lib/Analysis/MemoryLocation.cpp @@ -288,6 +288,34 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, LocationSize::precise(DL.getTypeStoreSize( II->getArgOperand(1)->getType())), AATags); + case Intrinsic::matrix_column_major_load: + case Intrinsic::matrix_column_major_store: { + bool IsLoad = II->getIntrinsicID() == Intrinsic::matrix_column_major_load; + assert(ArgIdx == (IsLoad ? 0 : 1) && "Invalid argument index"); + + auto *Stride = dyn_cast<ConstantInt>(II->getArgOperand(IsLoad ? 1 : 2)); + uint64_t Rows = + cast<ConstantInt>(II->getArgOperand(IsLoad ? 3 : 4))->getZExtValue(); + uint64_t Cols = + cast<ConstantInt>(II->getArgOperand(IsLoad ? 4 : 5))->getZExtValue(); + + // The stride is dynamic, so there's nothing we can say. + if (!Stride) + return MemoryLocation(Arg, LocationSize::afterPointer(), AATags); + + uint64_t ConstStride = Stride->getZExtValue(); + auto *VT = cast<VectorType>(IsLoad ? 
II->getType() + : II->getArgOperand(0)->getType()); + assert(Cols != 0 && "Matrix cannot have 0 columns"); + TypeSize Size = DL.getTypeAllocSize(VT->getScalarType()) * + (ConstStride * (Cols - 1) + Rows); + + // In the unstrided case, we have a precise size, ... + if (ConstStride == Rows) + return MemoryLocation(Arg, LocationSize::precise(Size), AATags); + // otherwise we merely obtain an upper bound. + return MemoryLocation(Arg, LocationSize::upperBound(Size), AATags); + } } assert( diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index bf62623..c47a1c1 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1001,13 +1001,25 @@ InstructionCost TargetTransformInfo::getShuffleCost( TargetTransformInfo::PartialReductionExtendKind TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) { - if (isa<SExtInst>(I)) - return PR_SignExtend; - if (isa<ZExtInst>(I)) - return PR_ZeroExtend; + if (auto *Cast = dyn_cast<CastInst>(I)) + return getPartialReductionExtendKind(Cast->getOpcode()); return PR_None; } +TargetTransformInfo::PartialReductionExtendKind +TargetTransformInfo::getPartialReductionExtendKind( + Instruction::CastOps CastOpc) { + switch (CastOpc) { + case Instruction::CastOps::ZExt: + return PR_ZeroExtend; + case Instruction::CastOps::SExt: + return PR_SignExtend; + default: + return PR_None; + } + llvm_unreachable("Unhandled cast opcode"); +} + TTI::CastContextHint TargetTransformInfo::getCastContextHint(const Instruction *I) { if (!I) diff --git a/llvm/lib/CGData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp index b1cd939..aeb4a4d 100644 --- a/llvm/lib/CGData/CodeGenDataReader.cpp +++ b/llvm/lib/CGData/CodeGenDataReader.cpp @@ -125,7 +125,7 @@ Error IndexedCodeGenDataReader::read() { FunctionMapRecord.setReadStableFunctionMapNames( IndexedCodeGenDataReadFunctionMapNames); if (IndexedCodeGenDataLazyLoading) - FunctionMapRecord.lazyDeserialize(SharedDataBuffer, + FunctionMapRecord.lazyDeserialize(std::move(SharedDataBuffer), Header.StableFunctionMapOffset); else FunctionMapRecord.deserialize(Ptr); diff --git a/llvm/lib/CGData/StableFunctionMap.cpp b/llvm/lib/CGData/StableFunctionMap.cpp index 46e04bd..d0fae3a 100644 --- a/llvm/lib/CGData/StableFunctionMap.cpp +++ b/llvm/lib/CGData/StableFunctionMap.cpp @@ -137,6 +137,7 @@ size_t StableFunctionMap::size(SizeType Type) const { const StableFunctionMap::StableFunctionEntries & StableFunctionMap::at(HashFuncsMapType::key_type FunctionHash) const { auto It = HashToFuncs.find(FunctionHash); + assert(It != HashToFuncs.end() && "FunctionHash not found!"); if (isLazilyLoaded()) deserializeLazyLoadingEntry(It); return It->second.Entries; diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index fefde64f..8aa488f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -41,6 +41,7 @@ #include "llvm/CodeGen/GCMetadataPrinter.h" #include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBlockHashInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" @@ -184,6 +185,8 @@ static cl::opt<bool> PrintLatency( cl::desc("Print instruction latencies as verbose asm comments"), cl::Hidden, cl::init(false)); +extern cl::opt<bool> EmitBBHash; + STATISTIC(EmittedInsts, 
"Number of machine instrs printed"); char AsmPrinter::ID = 0; @@ -474,6 +477,8 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<GCModuleInfo>(); AU.addRequired<LazyMachineBlockFrequencyInfoPass>(); AU.addRequired<MachineBranchProbabilityInfoWrapperPass>(); + if (EmitBBHash) + AU.addRequired<MachineBlockHashInfo>(); } bool AsmPrinter::doInitialization(Module &M) { @@ -1434,14 +1439,11 @@ getBBAddrMapFeature(const MachineFunction &MF, int NumMBBSectionRanges, "BB entries info is required for BBFreq and BrProb " "features"); } - return {FuncEntryCountEnabled, - BBFreqEnabled, - BrProbEnabled, + return {FuncEntryCountEnabled, BBFreqEnabled, BrProbEnabled, MF.hasBBSections() && NumMBBSectionRanges > 1, // Use static_cast to avoid breakage of tests on windows. - static_cast<bool>(BBAddrMapSkipEmitBBEntries), - HasCalls, - false}; + static_cast<bool>(BBAddrMapSkipEmitBBEntries), HasCalls, + static_cast<bool>(EmitBBHash)}; } void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { @@ -1500,6 +1502,9 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { PrevMBBEndSymbol = MBBSymbol; } + auto MBHI = + Features.BBHash ? &getAnalysis<MachineBlockHashInfo>() : nullptr; + if (!Features.OmitBBEntries) { OutStreamer->AddComment("BB id"); // Emit the BB ID for this basic block. @@ -1527,6 +1532,10 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { emitLabelDifferenceAsULEB128(MBB.getEndSymbol(), CurrentLabel); // Emit the Metadata. OutStreamer->emitULEB128IntValue(getBBAddrMapMetadata(MBB)); + // Emit the Hash. + if (MBHI) { + OutStreamer->emitInt64(MBHI->getMBBHash(MBB)); + } } PrevMBBEndSymbol = MBB.getEndSymbol(); } diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index b6872605..4373c53 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -108,6 +108,7 @@ add_llvm_component_library(LLVMCodeGen LowerEmuTLS.cpp MachineBasicBlock.cpp MachineBlockFrequencyInfo.cpp + MachineBlockHashInfo.cpp MachineBlockPlacement.cpp MachineBranchProbabilityInfo.cpp MachineCFGPrinter.cpp diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp index 2b5ced3..f44eb22 100644 --- a/llvm/lib/CodeGen/ExpandFp.cpp +++ b/llvm/lib/CodeGen/ExpandFp.cpp @@ -1108,8 +1108,8 @@ public: }; } // namespace -ExpandFpPass::ExpandFpPass(const TargetMachine *TM, CodeGenOptLevel OptLevel) - : TM(TM), OptLevel(OptLevel) {} +ExpandFpPass::ExpandFpPass(const TargetMachine &TM, CodeGenOptLevel OptLevel) + : TM(&TM), OptLevel(OptLevel) {} void ExpandFpPass::printPipeline( raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 1fe38d6..b49040b 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1862,15 +1862,19 @@ bool IRTranslator::translateVectorDeinterleave2Intrinsic( void IRTranslator::getStackGuard(Register DstReg, MachineIRBuilder &MIRBuilder) { + Value *Global = TLI->getSDagStackGuard(*MF->getFunction().getParent()); + if (!Global) { + LLVMContext &Ctx = MIRBuilder.getContext(); + Ctx.diagnose(DiagnosticInfoGeneric("unable to lower stackguard")); + MIRBuilder.buildUndef(DstReg); + return; + } + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); MRI->setRegClass(DstReg, TRI->getPointerRegClass()); auto MIB = MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD, {DstReg}, {}); - 
Value *Global = TLI->getSDagStackGuard(*MF->getFunction().getParent()); - if (!Global) - return; - unsigned AddrSpace = Global->getType()->getPointerAddressSpace(); LLT PtrTy = LLT::pointer(AddrSpace, DL->getPointerSizeInBits(AddrSpace)); diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index d6e8505..c3e0964 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -721,6 +721,9 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) { // Allocate a new register for the remat. Register NewVReg = Edit->createFrom(Original); + // Constrain it to the register class of MI. + MRI.constrainRegClass(NewVReg, MRI.getRegClass(VirtReg.reg())); + // Finally we can rematerialize OrigMI before MI. SlotIndex DefIdx = Edit->rematerializeAt(*MI.getParent(), MI, NewVReg, RM, TRI); diff --git a/llvm/lib/CodeGen/MIR2Vec.cpp b/llvm/lib/CodeGen/MIR2Vec.cpp index 75ca06a..00b37e7 100644 --- a/llvm/lib/CodeGen/MIR2Vec.cpp +++ b/llvm/lib/CodeGen/MIR2Vec.cpp @@ -417,24 +417,39 @@ Expected<MIRVocabulary> MIRVocabulary::createDummyVocabForTest( } //===----------------------------------------------------------------------===// -// MIR2VecVocabLegacyAnalysis Implementation +// MIR2VecVocabProvider and MIR2VecVocabLegacyAnalysis //===----------------------------------------------------------------------===// -char MIR2VecVocabLegacyAnalysis::ID = 0; -INITIALIZE_PASS_BEGIN(MIR2VecVocabLegacyAnalysis, "mir2vec-vocab-analysis", - "MIR2Vec Vocabulary Analysis", false, true) -INITIALIZE_PASS_DEPENDENCY(MachineModuleInfoWrapperPass) -INITIALIZE_PASS_END(MIR2VecVocabLegacyAnalysis, "mir2vec-vocab-analysis", - "MIR2Vec Vocabulary Analysis", false, true) +Expected<mir2vec::MIRVocabulary> +MIR2VecVocabProvider::getVocabulary(const Module &M) { + VocabMap OpcVocab, CommonOperandVocab, PhyRegVocabMap, VirtRegVocabMap; -StringRef MIR2VecVocabLegacyAnalysis::getPassName() const { - return "MIR2Vec Vocabulary Analysis"; + if (Error Err = readVocabulary(OpcVocab, CommonOperandVocab, PhyRegVocabMap, + VirtRegVocabMap)) + return std::move(Err); + + for (const auto &F : M) { + if (F.isDeclaration()) + continue; + + if (auto *MF = MMI.getMachineFunction(F)) { + auto &Subtarget = MF->getSubtarget(); + if (const auto *TII = Subtarget.getInstrInfo()) + if (const auto *TRI = Subtarget.getRegisterInfo()) + return mir2vec::MIRVocabulary::create( + std::move(OpcVocab), std::move(CommonOperandVocab), + std::move(PhyRegVocabMap), std::move(VirtRegVocabMap), *TII, *TRI, + MF->getRegInfo()); + } + } + return createStringError(errc::invalid_argument, + "No machine functions found in module"); } -Error MIR2VecVocabLegacyAnalysis::readVocabulary(VocabMap &OpcodeVocab, - VocabMap &CommonOperandVocab, - VocabMap &PhyRegVocabMap, - VocabMap &VirtRegVocabMap) { +Error MIR2VecVocabProvider::readVocabulary(VocabMap &OpcodeVocab, + VocabMap &CommonOperandVocab, + VocabMap &PhyRegVocabMap, + VocabMap &VirtRegVocabMap) { if (VocabFile.empty()) return createStringError( errc::invalid_argument, @@ -483,49 +498,15 @@ Error MIR2VecVocabLegacyAnalysis::readVocabulary(VocabMap &OpcodeVocab, return Error::success(); } -Expected<mir2vec::MIRVocabulary> -MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) { - if (Vocab.has_value()) - return std::move(Vocab.value()); - - VocabMap OpcMap, CommonOperandMap, PhyRegMap, VirtRegMap; - if (Error Err = - readVocabulary(OpcMap, CommonOperandMap, PhyRegMap, VirtRegMap)) - return std::move(Err); - - // Get machine module 
info to access machine functions and target info - MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); - - // Find first available machine function to get target instruction info - for (const auto &F : M) { - if (F.isDeclaration()) - continue; - - if (auto *MF = MMI.getMachineFunction(F)) { - auto &Subtarget = MF->getSubtarget(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - if (!TII) { - return createStringError(errc::invalid_argument, - "No TargetInstrInfo available; cannot create " - "MIR2Vec vocabulary"); - } - - const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); - if (!TRI) { - return createStringError(errc::invalid_argument, - "No TargetRegisterInfo available; cannot " - "create MIR2Vec vocabulary"); - } - - return mir2vec::MIRVocabulary::create( - std::move(OpcMap), std::move(CommonOperandMap), std::move(PhyRegMap), - std::move(VirtRegMap), *TII, *TRI, MF->getRegInfo()); - } - } +char MIR2VecVocabLegacyAnalysis::ID = 0; +INITIALIZE_PASS_BEGIN(MIR2VecVocabLegacyAnalysis, "mir2vec-vocab-analysis", + "MIR2Vec Vocabulary Analysis", false, true) +INITIALIZE_PASS_DEPENDENCY(MachineModuleInfoWrapperPass) +INITIALIZE_PASS_END(MIR2VecVocabLegacyAnalysis, "mir2vec-vocab-analysis", + "MIR2Vec Vocabulary Analysis", false, true) - // No machine functions available - return error - return createStringError(errc::invalid_argument, - "No machine functions found in module"); +StringRef MIR2VecVocabLegacyAnalysis::getPassName() const { + return "MIR2Vec Vocabulary Analysis"; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/MachineBlockHashInfo.cpp b/llvm/lib/CodeGen/MachineBlockHashInfo.cpp new file mode 100644 index 0000000..c4d9c0f --- /dev/null +++ b/llvm/lib/CodeGen/MachineBlockHashInfo.cpp @@ -0,0 +1,115 @@ +//===- llvm/CodeGen/MachineBlockHashInfo.cpp---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Compute the hashes of basic blocks. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineBlockHashInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/InitializePasses.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +uint64_t hashBlock(const MachineBasicBlock &MBB, bool HashOperands) { + uint64_t Hash = 0; + for (const MachineInstr &MI : MBB) { + if (MI.isMetaInstruction() || MI.isTerminator()) + continue; + Hash = hashing::detail::hash_16_bytes(Hash, MI.getOpcode()); + if (HashOperands) { + for (unsigned i = 0; i < MI.getNumOperands(); i++) { + Hash = + hashing::detail::hash_16_bytes(Hash, hash_value(MI.getOperand(i))); + } + } + } + return Hash; +} + +/// Fold a 64-bit integer to a 16-bit one. 
+uint16_t fold_64_to_16(const uint64_t Value) { + uint16_t Res = static_cast<uint16_t>(Value); + Res ^= static_cast<uint16_t>(Value >> 16); + Res ^= static_cast<uint16_t>(Value >> 32); + Res ^= static_cast<uint16_t>(Value >> 48); + return Res; +} + +INITIALIZE_PASS(MachineBlockHashInfo, "machine-block-hash", + "Machine Block Hash Analysis", true, true) + +char MachineBlockHashInfo::ID = 0; + +MachineBlockHashInfo::MachineBlockHashInfo() : MachineFunctionPass(ID) { + initializeMachineBlockHashInfoPass(*PassRegistry::getPassRegistry()); +} + +void MachineBlockHashInfo::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +struct CollectHashInfo { + uint64_t Offset; + uint64_t OpcodeHash; + uint64_t InstrHash; + uint64_t NeighborHash; +}; + +bool MachineBlockHashInfo::runOnMachineFunction(MachineFunction &F) { + DenseMap<const MachineBasicBlock *, CollectHashInfo> HashInfos; + uint16_t Offset = 0; + // Initialize hash components + for (const MachineBasicBlock &MBB : F) { + // offset of the machine basic block + HashInfos[&MBB].Offset = Offset; + Offset += MBB.size(); + // Hashing opcodes + HashInfos[&MBB].OpcodeHash = hashBlock(MBB, /*HashOperands=*/false); + // Hash complete instructions + HashInfos[&MBB].InstrHash = hashBlock(MBB, /*HashOperands=*/true); + } + + // Initialize neighbor hash + for (const MachineBasicBlock &MBB : F) { + uint64_t Hash = HashInfos[&MBB].OpcodeHash; + // Append hashes of successors + for (const MachineBasicBlock *SuccMBB : MBB.successors()) { + uint64_t SuccHash = HashInfos[SuccMBB].OpcodeHash; + Hash = hashing::detail::hash_16_bytes(Hash, SuccHash); + } + // Append hashes of predecessors + for (const MachineBasicBlock *PredMBB : MBB.predecessors()) { + uint64_t PredHash = HashInfos[PredMBB].OpcodeHash; + Hash = hashing::detail::hash_16_bytes(Hash, PredHash); + } + HashInfos[&MBB].NeighborHash = Hash; + } + + // Assign hashes + for (const MachineBasicBlock &MBB : F) { + const auto &HashInfo = HashInfos[&MBB]; + BlendedBlockHash BlendedHash(fold_64_to_16(HashInfo.Offset), + fold_64_to_16(HashInfo.OpcodeHash), + fold_64_to_16(HashInfo.InstrHash), + fold_64_to_16(HashInfo.NeighborHash)); + MBBHashInfo[&MBB] = BlendedHash.combine(); + } + + return false; +} + +uint64_t MachineBlockHashInfo::getMBBHash(const MachineBasicBlock &MBB) { + return MBBHashInfo[&MBB]; +} + +MachineFunctionPass *llvm::createMachineBlockHashInfoPass() { + return new MachineBlockHashInfo(); +} diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index f54e2f2..620d3d3 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -593,7 +593,7 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const { case Intrinsic::log: Changed |= forEachCall(F, [&](CallInst *CI) { Type *Ty = CI->getArgOperand(0)->getType(); - if (!isa<ScalableVectorType>(Ty)) + if (!TM || !isa<ScalableVectorType>(Ty)) return false; const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering(); unsigned Op = TL->IntrinsicIDToISD(F.getIntrinsicID()); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 603dc34..9656a30 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -890,6 +890,7 @@ private: SDValue ScalarizeVecRes_UnaryOpWithExtraInput(SDNode *N); SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N); SDValue 
ScalarizeVecRes_LOAD(LoadSDNode *N); + SDValue ScalarizeVecRes_ATOMIC_LOAD(AtomicSDNode *N); SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N); SDValue ScalarizeVecRes_VSELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 3b5f83f..bb4a8d9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -69,6 +69,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { R = ScalarizeVecRes_UnaryOpWithExtraInput(N); break; case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break; + case ISD::ATOMIC_LOAD: + R = ScalarizeVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N)); + break; case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break; case ISD::SCALAR_TO_VECTOR: R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break; case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break; @@ -475,6 +478,18 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) { return Op; } +SDValue DAGTypeLegalizer::ScalarizeVecRes_ATOMIC_LOAD(AtomicSDNode *N) { + SDValue Result = DAG.getAtomicLoad( + N->getExtensionType(), SDLoc(N), N->getMemoryVT().getVectorElementType(), + N->getValueType(0).getVectorElementType(), N->getChain(), N->getBasePtr(), + N->getMemOperand()); + + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + ReplaceValueWith(SDValue(N, 1), Result.getValue(1)); + return Result; +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) { assert(N->isUnindexed() && "Indexed vector load?"); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 90edaf3..379242e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8620,7 +8620,10 @@ SDValue SelectionDAG::getMemBasePlusOffset(SDValue Ptr, SDValue Offset, if (TLI->shouldPreservePtrArith(this->getMachineFunction().getFunction(), BasePtrVT)) return getNode(ISD::PTRADD, DL, BasePtrVT, Ptr, Offset, Flags); - return getNode(ISD::ADD, DL, BasePtrVT, Ptr, Offset, Flags); + // InBounds only applies to PTRADD, don't set it if we generate ADD. + SDNodeFlags AddFlags = Flags; + AddFlags.setInBounds(false); + return getNode(ISD::ADD, DL, BasePtrVT, Ptr, Offset, AddFlags); } /// Returns true if memcpy source is constant data. 
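The getMemBasePlusOffset hunk above hinges on inbounds being a PTRADD-only flag: when the helper falls back to a plain ISD::ADD, forwarding the caller's flags unchanged would carry pointer-arithmetic semantics onto an integer add, so the flag is cleared first. A minimal self-contained model of that rule, using toy stand-ins rather than the real SelectionDAG types:

#include <cstdio>

// Toy stand-ins for SDNodeFlags and the two opcodes involved.
struct NodeFlags {
  bool NoUnsignedWrap = false;
  bool InBounds = false;
};
enum class Opc { PTRADD, ADD };

// Mirrors the hunk: keep inbounds on PTRADD, clear it when lowering to ADD.
Opc basePlusOffset(bool PreservePtrArith, NodeFlags &Flags) {
  if (PreservePtrArith)
    return Opc::PTRADD;     // pointer arithmetic preserved, flag stays
  Flags.InBounds = false;   // inbounds has no meaning on an integer ADD
  return Opc::ADD;
}

int main() {
  NodeFlags F{/*NoUnsignedWrap=*/true, /*InBounds=*/true};
  Opc O = basePlusOffset(/*PreservePtrArith=*/false, F);
  std::printf("%s inbounds=%d\n", O == Opc::ADD ? "ADD" : "PTRADD",
              F.InBounds); // prints: ADD inbounds=0
}

The SelectionDAGBuilder.cpp hunks that follow show the producer side: visitGetElementPtr copies each GEP's inbounds bit onto the node flags via Flags.setInBounds(NW.isInBounds()) before calling getMemBasePlusOffset.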
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index dcf2df3..bfa566a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3131,12 +3131,16 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, if (TLI.useLoadStackGuardNode(M)) { Guard = getLoadStackGuard(DAG, dl, Chain); } else { - const Value *IRGuard = TLI.getSDagStackGuard(M); - SDValue GuardPtr = getValue(IRGuard); - - Guard = DAG.getLoad(PtrMemTy, dl, Chain, GuardPtr, - MachinePointerInfo(IRGuard, 0), Align, - MachineMemOperand::MOVolatile); + if (const Value *IRGuard = TLI.getSDagStackGuard(M)) { + SDValue GuardPtr = getValue(IRGuard); + Guard = DAG.getLoad(PtrMemTy, dl, Chain, GuardPtr, + MachinePointerInfo(IRGuard, 0), Align, + MachineMemOperand::MOVolatile); + } else { + LLVMContext &Ctx = *DAG.getContext(); + Ctx.diagnose(DiagnosticInfoGeneric("unable to lower stackguard")); + Guard = DAG.getPOISON(PtrMemTy); + } } // Perform the comparison via a getsetcc. @@ -4386,6 +4390,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { if (NW.hasNoUnsignedWrap() || (int64_t(Offset) >= 0 && NW.hasNoUnsignedSignedWrap())) Flags |= SDNodeFlags::NoUnsignedWrap; + Flags.setInBounds(NW.isInBounds()); N = DAG.getMemBasePlusOffset( N, DAG.getConstant(Offset, dl, N.getValueType()), dl, Flags); @@ -4429,6 +4434,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { if (NW.hasNoUnsignedWrap() || (Offs.isNonNegative() && NW.hasNoUnsignedSignedWrap())) Flags.setNoUnsignedWrap(true); + Flags.setInBounds(NW.isInBounds()); OffsVal = DAG.getSExtOrTrunc(OffsVal, dl, N.getValueType()); @@ -4498,6 +4504,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { // pointer index type (add nuw). 
SDNodeFlags AddFlags; AddFlags.setNoUnsignedWrap(NW.hasNoUnsignedWrap()); + AddFlags.setInBounds(NW.isInBounds()); N = DAG.getMemBasePlusOffset(N, IdxN, dl, AddFlags); } @@ -7324,6 +7331,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Res = DAG.getPtrExtOrTrunc(Res, sdl, PtrTy); } else { const Value *Global = TLI.getSDagStackGuard(M); + if (!Global) { + LLVMContext &Ctx = *DAG.getContext(); + Ctx.diagnose(DiagnosticInfoGeneric("unable to lower stackguard")); + setValue(&I, DAG.getPOISON(PtrTy)); + return; + } + Align Align = DAG.getDataLayout().getPrefTypeAlign(Global->getType()); Res = DAG.getLoad(PtrTy, sdl, Chain, getValue(Global), MachinePointerInfo(Global, 0), Align, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 39cbfad..77377d3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -689,6 +689,9 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { if (getFlags().hasSameSign()) OS << " samesign"; + if (getFlags().hasInBounds()) + OS << " inbounds"; + if (getFlags().hasNonNeg()) OS << " nneg"; diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index b6169e6..10b7238 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -272,6 +272,12 @@ static cl::opt<bool> cl::desc("Split static data sections into hot and cold " "sections using profile information")); +cl::opt<bool> EmitBBHash( + "emit-bb-hash", + cl::desc( + "Emit the hash of basic block in the SHT_LLVM_BB_ADDR_MAP section."), + cl::init(false), cl::Optional); + /// Allow standard passes to be disabled by command line options. This supports /// simple binary flags that either suppress the pass or do nothing. /// i.e. -disable-mypass=false has no effect. @@ -1281,6 +1287,8 @@ void TargetPassConfig::addMachinePasses() { // address map (or both). 
if (TM->getBBSectionsType() != llvm::BasicBlockSection::None || TM->Options.BBAddrMap) { + if (EmitBBHash) + addPass(llvm::createMachineBlockHashInfoPass()); if (TM->getBBSectionsType() == llvm::BasicBlockSection::List) { addPass(llvm::createBasicBlockSectionsProfileReaderWrapperPass( TM->getBBSectionsFuncListBuf())); diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp index 6c7e27e..fa04976 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp @@ -247,7 +247,7 @@ public: StandardSegments(std::move(StandardSegments)), FinalizationSegments(std::move(FinalizationSegments)) {} - ~IPInFlightAlloc() { + ~IPInFlightAlloc() override { assert(!G && "InFlight alloc neither abandoned nor finalized"); } diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp index 75ae80f..4ceff48 100644 --- a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp @@ -38,7 +38,7 @@ public: MachODebugObjectSynthesizerBase(LinkGraph &G, ExecutorAddr RegisterActionAddr) : G(G), RegisterActionAddr(RegisterActionAddr) {} - virtual ~MachODebugObjectSynthesizerBase() = default; + ~MachODebugObjectSynthesizerBase() override = default; Error preserveDebugSections() { if (G.findSectionByName(SynthDebugSectionName)) { diff --git a/llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp index d1a6eaf..a2990ab 100644 --- a/llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp @@ -55,7 +55,7 @@ public: Plugins = Layer.Plugins; } - ~JITLinkCtx() { + ~JITLinkCtx() override { // If there is an object buffer return function then use it to // return ownership of the buffer. 
if (Layer.ReturnObjectBuffer && ObjBuffer) diff --git a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp index fd805fbf..cdde733 100644 --- a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp +++ b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp @@ -92,7 +92,7 @@ public: Name(std::move(Name)), Ctx(Ctx), Materialize(Materialize), Discard(Discard), Destroy(Destroy) {} - ~OrcCAPIMaterializationUnit() { + ~OrcCAPIMaterializationUnit() override { if (Ctx) Destroy(Ctx); } @@ -264,7 +264,7 @@ public: LLVMOrcCAPIDefinitionGeneratorTryToGenerateFunction TryToGenerate) : Dispose(Dispose), Ctx(Ctx), TryToGenerate(TryToGenerate) {} - ~CAPIDefinitionGenerator() { + ~CAPIDefinitionGenerator() override { if (Dispose) Dispose(Ctx); } diff --git a/llvm/lib/Frontend/HLSL/CBuffer.cpp b/llvm/lib/Frontend/HLSL/CBuffer.cpp index 407b6ad..1f53c87 100644 --- a/llvm/lib/Frontend/HLSL/CBuffer.cpp +++ b/llvm/lib/Frontend/HLSL/CBuffer.cpp @@ -43,8 +43,13 @@ std::optional<CBufferMetadata> CBufferMetadata::get(Module &M) { for (const MDNode *MD : CBufMD->operands()) { assert(MD->getNumOperands() && "Invalid cbuffer metadata"); - auto *Handle = cast<GlobalVariable>( - cast<ValueAsMetadata>(MD->getOperand(0))->getValue()); + // For an unused cbuffer, the handle may have been optimized out + Metadata *OpMD = MD->getOperand(0); + if (!OpMD) + continue; + + auto *Handle = + cast<GlobalVariable>(cast<ValueAsMetadata>(OpMD)->getValue()); CBufferMapping &Mapping = Result->Mappings.emplace_back(Handle); for (int I = 1, E = MD->getNumOperands(); I < E; ++I) { diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 03da154..7917712 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -4446,10 +4446,12 @@ void Verifier::visitLoadInst(LoadInst &LI) { Check(LI.getOrdering() != AtomicOrdering::Release && LI.getOrdering() != AtomicOrdering::AcquireRelease, "Load cannot have Release ordering", &LI); - Check(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(), - "atomic load operand must have integer, pointer, or floating point " - "type!", + Check(ElTy->getScalarType()->isIntOrPtrTy() || + ElTy->getScalarType()->isFloatingPointTy(), + "atomic load operand must have integer, pointer, floating point, " + "or vector type!", ElTy, &LI); + checkAtomicMemAccessSize(ElTy, &LI); } else { Check(LI.getSyncScopeID() == SyncScope::System, @@ -4472,9 +4474,10 @@ void Verifier::visitStoreInst(StoreInst &SI) { Check(SI.getOrdering() != AtomicOrdering::Acquire && SI.getOrdering() != AtomicOrdering::AcquireRelease, "Store cannot have Acquire ordering", &SI); - Check(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(), - "atomic store operand must have integer, pointer, or floating point " - "type!", + Check(ElTy->getScalarType()->isIntOrPtrTy() || + ElTy->getScalarType()->isFloatingPointTy(), + "atomic store operand must have integer, pointer, floating point, " + "or vector type!", ElTy, &SI); checkAtomicMemAccessSize(ElTy, &SI); } else { diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 72ae064..9d0fa11 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -477,6 +477,10 @@ static void thinLTOInternalizeAndPromoteGUID( return !GlobalValue::isLocalLinkage(Summary->linkage()); }); + // Before performing index-based internalization and promotion for this GUID, + // the local flag should be consistent with the summary list linkage types. 
+ VI.verifyLocal(); + for (auto &S : VI.getSummaryList()) { // First see if we need to promote an internal value because it is not // exported. @@ -2220,6 +2224,7 @@ class OutOfProcessThinBackend : public CGThinBackend { ArrayRef<StringRef> DistributorArgs; SString RemoteCompiler; + ArrayRef<StringRef> RemoteCompilerPrependArgs; ArrayRef<StringRef> RemoteCompilerArgs; bool SaveTemps; @@ -2256,12 +2261,14 @@ public: bool ShouldEmitIndexFiles, bool ShouldEmitImportsFiles, StringRef LinkerOutputFile, StringRef Distributor, ArrayRef<StringRef> DistributorArgs, StringRef RemoteCompiler, + ArrayRef<StringRef> RemoteCompilerPrependArgs, ArrayRef<StringRef> RemoteCompilerArgs, bool SaveTemps) : CGThinBackend(Conf, CombinedIndex, ModuleToDefinedGVSummaries, AddStream, OnWrite, ShouldEmitIndexFiles, ShouldEmitImportsFiles, ThinLTOParallelism), LinkerOutputFile(LinkerOutputFile), DistributorPath(Distributor), DistributorArgs(DistributorArgs), RemoteCompiler(RemoteCompiler), + RemoteCompilerPrependArgs(RemoteCompilerPrependArgs), RemoteCompilerArgs(RemoteCompilerArgs), SaveTemps(SaveTemps) {} virtual void setup(unsigned ThinLTONumTasks, unsigned ThinLTOTaskOffset, @@ -2383,6 +2390,11 @@ public: JOS.attributeArray("args", [&]() { JOS.value(RemoteCompiler); + // Forward any supplied prepend options. + if (!RemoteCompilerPrependArgs.empty()) + for (auto &A : RemoteCompilerPrependArgs) + JOS.value(A); + JOS.value("-c"); JOS.value(Saver.save("--target=" + Triple.str())); @@ -2513,6 +2525,7 @@ ThinBackend lto::createOutOfProcessThinBackend( bool ShouldEmitIndexFiles, bool ShouldEmitImportsFiles, StringRef LinkerOutputFile, StringRef Distributor, ArrayRef<StringRef> DistributorArgs, StringRef RemoteCompiler, + ArrayRef<StringRef> RemoteCompilerPrependArgs, ArrayRef<StringRef> RemoteCompilerArgs, bool SaveTemps) { auto Func = [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex, @@ -2522,7 +2535,7 @@ ThinBackend lto::createOutOfProcessThinBackend( Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, AddStream, OnWrite, ShouldEmitIndexFiles, ShouldEmitImportsFiles, LinkerOutputFile, Distributor, DistributorArgs, RemoteCompiler, - RemoteCompilerArgs, SaveTemps); + RemoteCompilerPrependArgs, RemoteCompilerArgs, SaveTemps); }; return ThinBackend(Func, Parallelism); } diff --git a/llvm/lib/MC/MCAsmInfoELF.cpp b/llvm/lib/MC/MCAsmInfoELF.cpp index 98090d3..6670971 100644 --- a/llvm/lib/MC/MCAsmInfoELF.cpp +++ b/llvm/lib/MC/MCAsmInfoELF.cpp @@ -197,6 +197,8 @@ void MCAsmInfoELF::printSwitchToSection(const MCSection &Section, OS << "llvm_jt_sizes"; else if (Sec.Type == ELF::SHT_LLVM_CFI_JUMP_TABLE) OS << "llvm_cfi_jump_table"; + else if (Sec.Type == ELF::SHT_LLVM_CALL_GRAPH) + OS << "llvm_call_graph"; else OS << "0x" << Twine::utohexstr(Sec.Type); diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp index aee3c3b..b2f5000 100644 --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -554,7 +554,7 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) { Ctx->getELFSection(".sframe", ELF::SHT_GNU_SFRAME, ELF::SHF_ALLOC); CallGraphSection = - Ctx->getELFSection(".llvm.callgraph", ELF::SHT_PROGBITS, 0); + Ctx->getELFSection(".llvm.callgraph", ELF::SHT_LLVM_CALL_GRAPH, 0); StackSizesSection = Ctx->getELFSection(".stack_sizes", ELF::SHT_PROGBITS, 0); @@ -1172,7 +1172,7 @@ MCObjectFileInfo::getCallGraphSection(const MCSection &TextSec) const { } return Ctx->getELFSection( - ".llvm.callgraph", ELF::SHT_PROGBITS, Flags, 0, GroupName, + 
".llvm.callgraph", ELF::SHT_LLVM_CALL_GRAPH, Flags, 0, GroupName, /*IsComdat=*/true, ElfSec.getUniqueID(), static_cast<const MCSymbolELF *>(TextSec.getBeginSymbol())); } diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index 6195355..1a3752f 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -637,6 +637,8 @@ EndStmt: Type = ELF::SHT_LLVM_JT_SIZES; else if (TypeName == "llvm_cfi_jump_table") Type = ELF::SHT_LLVM_CFI_JUMP_TABLE; + else if (TypeName == "llvm_call_graph") + Type = ELF::SHT_LLVM_CALL_GRAPH; else if (TypeName.getAsInteger(0, Type)) return TokError("unknown section type"); } diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp index f256e7b..6da97f9 100644 --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -322,6 +322,7 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) { STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_LTO); STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_JT_SIZES) STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_CFI_JUMP_TABLE) + STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_CALL_GRAPH); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_SFRAME); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_ATTRIBUTES); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_HASH); diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index c3a27c9..f8a84b0 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -744,6 +744,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration( ECase(SHT_LLVM_BB_ADDR_MAP); ECase(SHT_LLVM_OFFLOADING); ECase(SHT_LLVM_LTO); + ECase(SHT_LLVM_CALL_GRAPH); ECase(SHT_GNU_SFRAME); ECase(SHT_GNU_ATTRIBUTES); ECase(SHT_GNU_HASH); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 048c58d..3c9a27a 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -669,7 +669,14 @@ void PassBuilder::registerFunctionAnalyses(FunctionAnalysisManager &FAM) { FAM.registerPass([&] { return buildDefaultAAPipeline(); }); #define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \ - FAM.registerPass([&] { return CREATE_PASS; }); + if constexpr (std::is_constructible_v< \ + std::remove_reference_t<decltype(CREATE_PASS)>, \ + const TargetMachine &>) { \ + if (TM) \ + FAM.registerPass([&] { return CREATE_PASS; }); \ + } else { \ + FAM.registerPass([&] { return CREATE_PASS; }); \ + } #include "PassRegistry.def" for (auto &C : FunctionAnalysisRegistrationCallbacks) @@ -2038,6 +2045,14 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM, } #define FUNCTION_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ + if constexpr (std::is_constructible_v< \ + std::remove_reference_t<decltype(CREATE_PASS)>, \ + const TargetMachine &>) { \ + if (!TM) \ + return make_error<StringError>( \ + formatv("pass '{0}' requires TargetMachine", Name).str(), \ + inconvertibleErrorCode()); \ + } \ MPM.addPass(createModuleToFunctionPassAdaptor(CREATE_PASS)); \ return Error::success(); \ } @@ -2046,6 +2061,18 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM, auto Params = parsePassParameters(PARSER, Name, NAME); \ if (!Params) \ return Params.takeError(); \ + auto CreatePass = CREATE_PASS; \ + if constexpr (std::is_constructible_v< \ + std::remove_reference_t<decltype(CreatePass( \ + Params.get()))>, \ + const TargetMachine &, \ + std::remove_reference_t<decltype(Params.get())>>) { \ + if (!TM) { \ + return make_error<StringError>( \ + formatv("pass '{0}' requires TargetMachine", Name).str(), \ + inconvertibleErrorCode()); \ + } \ + } \ 
MPM.addPass(createModuleToFunctionPassAdaptor(CREATE_PASS(Params.get()))); \ return Error::success(); \ } @@ -2152,6 +2179,14 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM, } #define FUNCTION_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ + if constexpr (std::is_constructible_v< \ + std::remove_reference_t<decltype(CREATE_PASS)>, \ + const TargetMachine &>) { \ + if (!TM) \ + return make_error<StringError>( \ + formatv("pass '{0}' requires TargetMachine", Name).str(), \ + inconvertibleErrorCode()); \ + } \ CGPM.addPass(createCGSCCToFunctionPassAdaptor(CREATE_PASS)); \ return Error::success(); \ } @@ -2160,6 +2195,18 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM, auto Params = parsePassParameters(PARSER, Name, NAME); \ if (!Params) \ return Params.takeError(); \ + auto CreatePass = CREATE_PASS; \ + if constexpr (std::is_constructible_v< \ + std::remove_reference_t<decltype(CreatePass( \ + Params.get()))>, \ + const TargetMachine &, \ + std::remove_reference_t<decltype(Params.get())>>) { \ + if (!TM) { \ + return make_error<StringError>( \ + formatv("pass '{0}' requires TargetMachine", Name).str(), \ + inconvertibleErrorCode()); \ + } \ + } \ CGPM.addPass(createCGSCCToFunctionPassAdaptor(CREATE_PASS(Params.get()))); \ return Error::success(); \ } @@ -2239,6 +2286,14 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM, // Now expand the basic registered passes from the .inc file. #define FUNCTION_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ + if constexpr (std::is_constructible_v< \ + std::remove_reference_t<decltype(CREATE_PASS)>, \ + const TargetMachine &>) { \ + if (!TM) \ + return make_error<StringError>( \ + formatv("pass '{0}' requires TargetMachine", Name).str(), \ + inconvertibleErrorCode()); \ + } \ FPM.addPass(CREATE_PASS); \ return Error::success(); \ } @@ -2247,14 +2302,34 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM, auto Params = parsePassParameters(PARSER, Name, NAME); \ if (!Params) \ return Params.takeError(); \ + auto CreatePass = CREATE_PASS; \ + if constexpr (std::is_constructible_v< \ + std::remove_reference_t<decltype(CreatePass( \ + Params.get()))>, \ + const TargetMachine &, \ + std::remove_reference_t<decltype(Params.get())>>) { \ + if (!TM) { \ + return make_error<StringError>( \ + formatv("pass '{0}' requires TargetMachine", Name).str(), \ + inconvertibleErrorCode()); \ + } \ + } \ FPM.addPass(CREATE_PASS(Params.get())); \ return Error::success(); \ } #define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \ if (Name == "require<" NAME ">") { \ + if constexpr (std::is_constructible_v< \ + std::remove_reference_t<decltype(CREATE_PASS)>, \ + const TargetMachine &>) { \ + if (!TM) \ + return make_error<StringError>( \ + formatv("pass '{0}' requires TargetMachine", Name).str(), \ + inconvertibleErrorCode()); \ + } \ FPM.addPass( \ - RequireAnalysisPass< \ - std::remove_reference_t<decltype(CREATE_PASS)>, Function>()); \ + RequireAnalysisPass<std::remove_reference_t<decltype(CREATE_PASS)>, \ + Function>()); \ return Error::success(); \ } \ if (Name == "invalidate<" NAME ">") { \ diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index a66b6e4..1853cdd 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -345,7 +345,7 @@ FUNCTION_ANALYSIS("aa", AAManager()) FUNCTION_ANALYSIS("access-info", LoopAccessAnalysis()) FUNCTION_ANALYSIS("assumptions", AssumptionAnalysis()) FUNCTION_ANALYSIS("bb-sections-profile-reader", - 
BasicBlockSectionsProfileReaderAnalysis(TM)) + BasicBlockSectionsProfileReaderAnalysis(*TM)) FUNCTION_ANALYSIS("block-freq", BlockFrequencyAnalysis()) FUNCTION_ANALYSIS("branch-prob", BranchProbabilityAnalysis()) FUNCTION_ANALYSIS("cycles", CycleAnalysis()) @@ -356,7 +356,7 @@ FUNCTION_ANALYSIS("domfrontier", DominanceFrontierAnalysis()) FUNCTION_ANALYSIS("domtree", DominatorTreeAnalysis()) FUNCTION_ANALYSIS("ephemerals", EphemeralValuesAnalysis()) FUNCTION_ANALYSIS("func-properties", FunctionPropertiesAnalysis()) -FUNCTION_ANALYSIS("machine-function-info", MachineFunctionAnalysis(TM)) +FUNCTION_ANALYSIS("machine-function-info", MachineFunctionAnalysis(*TM)) FUNCTION_ANALYSIS("gc-function", GCFunctionAnalysis()) FUNCTION_ANALYSIS("inliner-size-estimator", InlineSizeEstimatorAnalysis()) FUNCTION_ANALYSIS("last-run-tracking", LastRunTrackingAnalysis()) @@ -406,14 +406,14 @@ FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass()) FUNCTION_PASS("annotation-remarks", AnnotationRemarksPass()) FUNCTION_PASS("assume-builder", AssumeBuilderPass()) FUNCTION_PASS("assume-simplify", AssumeSimplifyPass()) -FUNCTION_PASS("atomic-expand", AtomicExpandPass(TM)) +FUNCTION_PASS("atomic-expand", AtomicExpandPass(*TM)) FUNCTION_PASS("bdce", BDCEPass()) FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass()) FUNCTION_PASS("callbr-prepare", CallBrPreparePass()) FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass()) FUNCTION_PASS("chr", ControlHeightReductionPass()) -FUNCTION_PASS("codegenprepare", CodeGenPreparePass(TM)) -FUNCTION_PASS("complex-deinterleaving", ComplexDeinterleavingPass(TM)) +FUNCTION_PASS("codegenprepare", CodeGenPreparePass(*TM)) +FUNCTION_PASS("complex-deinterleaving", ComplexDeinterleavingPass(*TM)) FUNCTION_PASS("consthoist", ConstantHoistingPass()) FUNCTION_PASS("constraint-elimination", ConstraintEliminationPass()) FUNCTION_PASS("coro-elide", CoroElidePass()) @@ -430,10 +430,10 @@ FUNCTION_PASS("dot-dom-only", DomOnlyPrinter()) FUNCTION_PASS("dot-post-dom", PostDomPrinter()) FUNCTION_PASS("dot-post-dom-only", PostDomOnlyPrinter()) FUNCTION_PASS("dse", DSEPass()) -FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass(TM)) +FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass(*TM)) FUNCTION_PASS("drop-unnecessary-assumes", DropUnnecessaryAssumesPass()) -FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass(TM)) -FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM)) +FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass(*TM)) +FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(*TM)) FUNCTION_PASS("expand-reductions", ExpandReductionsPass()) FUNCTION_PASS("extra-vector-passes", ExtraFunctionPassManager<ShouldRunExtraVectorPasses>()) @@ -446,15 +446,15 @@ FUNCTION_PASS("guard-widening", GuardWideningPass()) FUNCTION_PASS("gvn-hoist", GVNHoistPass()) FUNCTION_PASS("gvn-sink", GVNSinkPass()) FUNCTION_PASS("helloworld", HelloWorldPass()) -FUNCTION_PASS("indirectbr-expand", IndirectBrExpandPass(TM)) +FUNCTION_PASS("indirectbr-expand", IndirectBrExpandPass(*TM)) FUNCTION_PASS("infer-address-spaces", InferAddressSpacesPass()) FUNCTION_PASS("infer-alignment", InferAlignmentPass()) FUNCTION_PASS("inject-tli-mappings", InjectTLIMappings()) FUNCTION_PASS("instcount", InstCountPass()) FUNCTION_PASS("instnamer", InstructionNamerPass()) FUNCTION_PASS("instsimplify", InstSimplifyPass()) -FUNCTION_PASS("interleaved-access", InterleavedAccessPass(TM)) -FUNCTION_PASS("interleaved-load-combine", InterleavedLoadCombinePass(TM)) +FUNCTION_PASS("interleaved-access", 
InterleavedAccessPass(*TM)) +FUNCTION_PASS("interleaved-load-combine", InterleavedLoadCombinePass(*TM)) FUNCTION_PASS("invalidate<all>", InvalidateAllAnalysesPass()) FUNCTION_PASS("irce", IRCEPass()) FUNCTION_PASS("jump-threading", JumpThreadingPass()) @@ -533,25 +533,25 @@ FUNCTION_PASS("reassociate", ReassociatePass()) FUNCTION_PASS("redundant-dbg-inst-elim", RedundantDbgInstEliminationPass()) FUNCTION_PASS("replace-with-veclib", ReplaceWithVeclib()) FUNCTION_PASS("reg2mem", RegToMemPass()) -FUNCTION_PASS("safe-stack", SafeStackPass(TM)) +FUNCTION_PASS("safe-stack", SafeStackPass(*TM)) FUNCTION_PASS("sandbox-vectorizer", SandboxVectorizerPass()) FUNCTION_PASS("scalarize-masked-mem-intrin", ScalarizeMaskedMemIntrinPass()) FUNCTION_PASS("sccp", SCCPPass()) -FUNCTION_PASS("select-optimize", SelectOptimizePass(TM)) +FUNCTION_PASS("select-optimize", SelectOptimizePass(*TM)) FUNCTION_PASS("separate-const-offset-from-gep", SeparateConstOffsetFromGEPPass()) FUNCTION_PASS("sink", SinkingPass()) FUNCTION_PASS("sjlj-eh-prepare", SjLjEHPreparePass(TM)) FUNCTION_PASS("slp-vectorizer", SLPVectorizerPass()) FUNCTION_PASS("slsr", StraightLineStrengthReducePass()) -FUNCTION_PASS("stack-protector", StackProtectorPass(TM)) +FUNCTION_PASS("stack-protector", StackProtectorPass(*TM)) FUNCTION_PASS("strip-gc-relocates", StripGCRelocates()) FUNCTION_PASS("tailcallelim", TailCallElimPass()) FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass()) FUNCTION_PASS("trigger-crash-function", TriggerCrashFunctionPass()) FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass()) FUNCTION_PASS("tsan", ThreadSanitizerPass()) -FUNCTION_PASS("typepromotion", TypePromotionPass(TM)) +FUNCTION_PASS("typepromotion", TypePromotionPass(*TM)) FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass()) FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass()) FUNCTION_PASS("vector-combine", VectorCombinePass()) @@ -730,7 +730,7 @@ FUNCTION_PASS_WITH_PARAMS( FUNCTION_PASS_WITH_PARAMS( "expand-fp", "ExpandFpPass", [TM = TM](CodeGenOptLevel OL) { - return ExpandFpPass(TM, OL); + return ExpandFpPass(*TM, OL); }, parseExpandFpOptions, "O0;O1;O2;O3") diff --git a/llvm/lib/Support/GlobPattern.cpp b/llvm/lib/Support/GlobPattern.cpp index 0ecf47d..2715229 100644 --- a/llvm/lib/Support/GlobPattern.cpp +++ b/llvm/lib/Support/GlobPattern.cpp @@ -132,24 +132,70 @@ parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) { return std::move(SubPatterns); } +static StringRef maxPlainSubstring(StringRef S) { + StringRef Best; + while (!S.empty()) { + size_t PrefixSize = S.find_first_of("?*[{\\"); + if (PrefixSize == std::string::npos) + PrefixSize = S.size(); + + if (Best.size() < PrefixSize) + Best = S.take_front(PrefixSize); + + S = S.drop_front(PrefixSize); + + // It's impossible, as the first and last characters of the input string + // must be Glob special characters, otherwise they would be parts of + // the prefix or the suffix. + assert(!S.empty()); + + switch (S.front()) { + case '\\': + S = S.drop_front(2); + break; + case '[': { + // Drop '[' and the first character which can be ']'. + S = S.drop_front(2); + size_t EndBracket = S.find_first_of("]"); + // Should not be possible, SubGlobPattern::create should fail on invalid + // pattern before we get here. + assert(EndBracket != std::string::npos); + S = S.drop_front(EndBracket + 1); + break; + } + case '{': + // TODO: implement. + // Fallback to whatever is best for now. 
+ return Best; + default: + S = S.drop_front(1); + } + } + + return Best; +} + Expected<GlobPattern> GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) { GlobPattern Pat; + Pat.Pattern = S; // Store the prefix that does not contain any metacharacter. - size_t PrefixSize = S.find_first_of("?*[{\\"); - Pat.Prefix = S.substr(0, PrefixSize); - if (PrefixSize == std::string::npos) + Pat.PrefixSize = S.find_first_of("?*[{\\"); + if (Pat.PrefixSize == std::string::npos) { + Pat.PrefixSize = S.size(); return Pat; - S = S.substr(PrefixSize); + } + S = S.substr(Pat.PrefixSize); // Just in case we stop on unmatched opening brackets. size_t SuffixStart = S.find_last_of("?*[]{}\\"); assert(SuffixStart != std::string::npos); if (S[SuffixStart] == '\\') ++SuffixStart; - ++SuffixStart; - Pat.Suffix = S.substr(SuffixStart); + if (SuffixStart < S.size()) + ++SuffixStart; + Pat.SuffixSize = S.size() - SuffixStart; S = S.substr(0, SuffixStart); SmallVector<std::string, 1> SubPats; @@ -199,10 +245,15 @@ GlobPattern::SubGlobPattern::create(StringRef S) { return Pat; } +StringRef GlobPattern::longest_substr() const { + return maxPlainSubstring( + Pattern.drop_front(PrefixSize).drop_back(SuffixSize)); +} + bool GlobPattern::match(StringRef S) const { - if (!S.consume_front(Prefix)) + if (!S.consume_front(prefix())) return false; - if (!S.consume_back(Suffix)) + if (!S.consume_back(suffix())) return false; if (SubGlobs.empty() && S.empty()) return true; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index f788c75..92f260f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4005,24 +4005,20 @@ def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; // load zero-extended i32, bitcast to f64 -def : Pat <(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), - (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; - +def : Pat<(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; // load zero-extended i16, bitcast to f64 -def : Pat <(f64 (bitconvert (i64 (zextloadi16 (am_indexed32 GPR64sp:$Rn, uimm12s2:$offset))))), - (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; - +def : Pat<(f64 (bitconvert (i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; // load zero-extended i8, bitcast to f64 -def : Pat <(f64 (bitconvert (i64 (zextloadi8 (am_indexed32 GPR64sp:$Rn, uimm12s1:$offset))))), - (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; - +def : Pat<(f64 (bitconvert (i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; // load zero-extended i16, bitcast to f32 -def : Pat <(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), - (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; - +def : Pat<(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; // load zero-extended i8, bitcast to f32 -def : Pat <(f32 (bitconvert (i32 (zextloadi8 (am_indexed16 GPR64sp:$Rn, uimm12s1:$offset))))), - 
(SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; +def : Pat<(f32 (bitconvert (i32 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; // Pre-fetch. def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm", diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e3370d3..2053fc4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1577,18 +1577,26 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) { } static bool isAllActivePredicate(Value *Pred) { - // Look through convert.from.svbool(convert.to.svbool(...) chain. Value *UncastedPred; + + // Look through predicate casts that only remove lanes. if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( - m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( - m_Value(UncastedPred))))) - // If the predicate has the same or less lanes than the uncasted - // predicate then we know the casting has no effect. - if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <= - cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements()) - Pred = UncastedPred; + m_Value(UncastedPred)))) { + auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType()); + Pred = UncastedPred; + + if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( + m_Value(UncastedPred)))) + // If the predicate has the same or less lanes than the uncasted predicate + // then we know the casting has no effect. + if (OrigPredTy->getMinNumElements() <= + cast<ScalableVectorType>(UncastedPred->getType()) + ->getMinNumElements()) + Pred = UncastedPred; + } + auto *C = dyn_cast<Constant>(Pred); - return (C && C->isAllOnesValue()); + return C && C->isAllOnesValue(); } // Simplify `V` by only considering the operations that affect active lanes. 
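The reworked isAllActivePredicate above rests on a lane-count argument: aarch64.sve.convert.to.svbool widens a predicate to nxv16i1 with the padding lanes known to be zero, and aarch64.sve.convert.from.svbool narrows by dropping lanes, so the pair can be looked through for an all-active check only when the outer predicate type has no more lanes than the uncasted value. A small model of that guard, with plain integers standing in for scalable predicate types:

#include <cassert>

// MinLanes stands in for ScalableVectorType::getMinNumElements(). If the
// outer (result) predicate is no wider than the uncasted value, an all-ones
// uncasted predicate guarantees an all-ones result, because any lanes the
// round trip adds beyond the uncasted width are zero padding.
static bool canLookThroughCastPair(unsigned OuterMinLanes,
                                   unsigned InnerMinLanes) {
  return OuterMinLanes <= InnerMinLanes;
}

int main() {
  assert(canLookThroughCastPair(4, 4));  // nxv4i1 -> svbool -> nxv4i1
  assert(canLookThroughCastPair(2, 4));  // nxv4i1 -> svbool -> nxv2i1
  // nxv4i1 -> svbool -> nxv8i1 would expose zero padding lanes as if they
  // were part of the predicate, so the casts cannot be looked through:
  assert(!canLookThroughCastPair(8, 4));
}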
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index ea32748..1c8383c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1430,6 +1430,18 @@ def FeatureAddSubU64Insts def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst", "true", "Has v_mad_u32 instruction">; +def FeatureAddMinMaxInsts : SubtargetFeature<"add-min-max-insts", + "HasAddMinMaxInsts", + "true", + "Has v_add_{min|max}_{i|u}32 instructions" +>; + +def FeaturePkAddMinMaxInsts : SubtargetFeature<"pk-add-min-max-insts", + "HasPkAddMinMaxInsts", + "true", + "Has v_pk_add_{min|max}_{i|u}16 instructions" +>; + def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts", "HasVMemToLDSLoad", "true", @@ -2115,6 +2127,8 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureLshlAddU64Inst, FeatureAddSubU64Insts, FeatureMadU32Inst, + FeatureAddMinMaxInsts, + FeaturePkAddMinMaxInsts, FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, Feature45BitNumRecordsBufferResource, @@ -2658,11 +2672,11 @@ def HasFmaakFmamkF64Insts : def HasAddMinMaxInsts : Predicate<"Subtarget->hasAddMinMaxInsts()">, - AssemblerPredicate<(any_of FeatureGFX1250Insts)>; + AssemblerPredicate<(any_of FeatureAddMinMaxInsts)>; def HasPkAddMinMaxInsts : Predicate<"Subtarget->hasPkAddMinMaxInsts()">, - AssemblerPredicate<(any_of FeatureGFX1250Insts)>; + AssemblerPredicate<(any_of FeaturePkAddMinMaxInsts)>; def HasPkMinMax3Insts : Predicate<"Subtarget->hasPkMinMax3Insts()">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 56807a4..54ba2f8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4835,6 +4835,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_perm_pk16_b4_u4: case Intrinsic::amdgcn_perm_pk16_b6_u4: case Intrinsic::amdgcn_perm_pk16_b8_u4: + case Intrinsic::amdgcn_add_max_i32: + case Intrinsic::amdgcn_add_max_u32: + case Intrinsic::amdgcn_add_min_i32: + case Intrinsic::amdgcn_add_min_u32: + case Intrinsic::amdgcn_pk_add_max_i16: + case Intrinsic::amdgcn_pk_add_max_u16: + case Intrinsic::amdgcn_pk_add_min_i16: + case Intrinsic::amdgcn_pk_add_min_u16: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_log: case Intrinsic::amdgcn_exp2: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 996b55f..02c5390 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -2086,7 +2086,7 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy)); - addPass(AtomicExpandPass(&TM)); + addPass(AtomicExpandPass(TM)); if (TM.getOptLevel() > CodeGenOptLevel::None) { addPass(AMDGPUPromoteAllocaPass(TM)); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index a466780..ac660d5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -277,6 +277,8 @@ protected: bool HasLshlAddU64Inst = false; bool HasAddSubU64Insts = false; bool HasMadU32Inst = false; + bool HasAddMinMaxInsts = false; + bool HasPkAddMinMaxInsts = false; bool HasPointSampleAccel = false; bool HasLdsBarrierArriveAtomic = false; bool HasSetPrioIncWgInst = false; @@ -1567,10 +1569,10 @@ public: bool 
hasIntMinMax64() const { return GFX1250Insts; } // \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions. - bool hasAddMinMaxInsts() const { return GFX1250Insts; } + bool hasAddMinMaxInsts() const { return HasAddMinMaxInsts; } // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions. - bool hasPkAddMinMaxInsts() const { return GFX1250Insts; } + bool hasPkAddMinMaxInsts() const { return HasPkAddMinMaxInsts; } // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions. bool hasPkMinMax3Insts() const { return GFX1250Insts; } diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 7cce033..ee10190 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -775,10 +775,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in { } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in { - defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; - defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; - defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; - defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; + defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_i32>; + defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_u32>; + defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_i32>; + defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_u32>; } defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 6500fce..c4692b7 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -75,7 +75,7 @@ multiclass VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit IsDOT = 0> { def NAME : VOP3P_Pseudo<OpName, P, !if (P.HasModifiers, - getVOP3PModPat<P, node, IsDOT, IsDOT>.ret, + getVOP3PModPat<P, node, !or(P.EnableClamp, IsDOT), IsDOT>.ret, getVOP3Pat<P, node>.ret)>; let SubtargetPredicate = isGFX11Plus in { if P.HasExtVOP3DPP then @@ -434,15 +434,16 @@ defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_BF16_t16>; } // End SubtargetPredicate = HasFmaMixBF16Insts def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> { - let HasModifiers = 0; + let HasNeg = 0; + let EnableClamp = 1; } let isCommutable = 1, isReMaterializable = 1 in { let SubtargetPredicate = HasPkAddMinMaxInsts in { -defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile>; -defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile>; -defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile>; -defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile>; +defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_i16>; +defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_u16>; +defm V_PK_ADD_MIN_I16 : 
VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_i16>; +defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_u16>; } let SubtargetPredicate = HasPkMinMax3Insts in { defm V_PK_MAX3_I16 : VOP3PInst<"v_pk_max3_i16", PK_ADD_MINMAX_Profile>; diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 3368a50..36b9908 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -1471,6 +1471,435 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { // instructions) auto-generated. #include "ARMGenMCPseudoLowering.inc" +// Helper function to check if a register is live (used as an implicit operand) +// in the given call instruction. +static bool isRegisterLiveInCall(const MachineInstr &Call, MCRegister Reg) { + for (const MachineOperand &MO : Call.implicit_operands()) { + if (MO.isReg() && MO.getReg() == Reg && MO.isUse()) { + return true; + } + } + return false; +} + +void ARMAsmPrinter::EmitKCFI_CHECK_ARM32(Register AddrReg, int64_t Type, + const MachineInstr &Call, + int64_t PrefixNops) { + // Choose scratch register: r12 primary, r3 if target is r12. + unsigned ScratchReg = ARM::R12; + if (AddrReg == ARM::R12) { + ScratchReg = ARM::R3; + } + + // Calculate ESR for ARM mode (16-bit): 0x8000 | (scratch_reg << 5) | addr_reg + // Note: scratch_reg is always 0x1F since the EOR sequence clobbers it. + const ARMBaseRegisterInfo *TRI = static_cast<const ARMBaseRegisterInfo *>( + MF->getSubtarget().getRegisterInfo()); + unsigned AddrIndex = TRI->getEncodingValue(AddrReg); + unsigned ESR = 0x8000 | (31 << 5) | (AddrIndex & 31); + + // Check if r3 is live and needs to be spilled. + bool NeedSpillR3 = + (ScratchReg == ARM::R3) && isRegisterLiveInCall(Call, ARM::R3); + + // If we need to spill r3, push it first. + if (NeedSpillR3) { + // push {r3} + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::STMDB_UPD) + .addReg(ARM::SP) + .addReg(ARM::SP) + .addImm(ARMCC::AL) + .addReg(0) + .addReg(ARM::R3)); + } + + // Clear bit 0 of target address to handle Thumb function pointers. + // In 32-bit ARM, function pointers may have the low bit set to indicate + // Thumb state when ARM/Thumb interworking is enabled (ARMv4T and later). + // We need to clear it to avoid an alignment fault when loading. + // bic scratch, target, #1 + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::BICri) + .addReg(ScratchReg) + .addReg(AddrReg) + .addImm(1) + .addImm(ARMCC::AL) + .addReg(0) + .addReg(0)); + + // ldr scratch, [scratch, #-(PrefixNops * 4 + 4)] + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm(-(PrefixNops * 4 + 4)) + .addImm(ARMCC::AL) + .addReg(0)); + + // Each EOR instruction XORs one byte of the type, shifted to its position. + for (int i = 0; i < 4; i++) { + uint8_t byte = (Type >> (i * 8)) & 0xFF; + uint32_t imm = byte << (i * 8); + bool isLast = (i == 3); + + // Encode as ARM modified immediate. + int SOImmVal = ARM_AM::getSOImmVal(imm); + assert(SOImmVal != -1 && + "Cannot encode immediate as ARM modified immediate"); + + // eor[s] scratch, scratch, #imm (last one sets flags with CPSR) + EmitToStreamer(*OutStreamer, + MCInstBuilder(ARM::EORri) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm(SOImmVal) + .addImm(ARMCC::AL) + .addReg(0) + .addReg(isLast ? ARM::CPSR : ARM::NoRegister)); + } + + // If we spilled r3, restore it immediately after the comparison. 
+ // This must happen before the branch so r3 is valid on both paths. + if (NeedSpillR3) { + // pop {r3} + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDMIA_UPD) + .addReg(ARM::SP) + .addReg(ARM::SP) + .addImm(ARMCC::AL) + .addReg(0) + .addReg(ARM::R3)); + } + + // beq .Lpass (branch if types match, i.e., scratch is zero) + MCSymbol *Pass = OutContext.createTempSymbol(); + EmitToStreamer(*OutStreamer, + MCInstBuilder(ARM::Bcc) + .addExpr(MCSymbolRefExpr::create(Pass, OutContext)) + .addImm(ARMCC::EQ) + .addReg(ARM::CPSR)); + + // udf #ESR (trap with encoded diagnostic) + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::UDF).addImm(ESR)); + + OutStreamer->emitLabel(Pass); +} + +void ARMAsmPrinter::EmitKCFI_CHECK_Thumb2(Register AddrReg, int64_t Type, + const MachineInstr &Call, + int64_t PrefixNops) { + // Choose scratch register: r12 primary, r3 if target is r12. + unsigned ScratchReg = ARM::R12; + if (AddrReg == ARM::R12) { + ScratchReg = ARM::R3; + } + + // Calculate ESR for Thumb mode (8-bit): 0x80 | addr_reg + // Bit 7: KCFI trap indicator + // Bits 6-5: Reserved + // Bits 4-0: Address register encoding + const ARMBaseRegisterInfo *TRI = static_cast<const ARMBaseRegisterInfo *>( + MF->getSubtarget().getRegisterInfo()); + unsigned AddrIndex = TRI->getEncodingValue(AddrReg); + unsigned ESR = 0x80 | (AddrIndex & 0x1F); + + // Check if r3 is live and needs to be spilled. + bool NeedSpillR3 = + (ScratchReg == ARM::R3) && isRegisterLiveInCall(Call, ARM::R3); + + // If we need to spill r3, push it first. + if (NeedSpillR3) { + // push {r3} + EmitToStreamer( + *OutStreamer, + MCInstBuilder(ARM::tPUSH).addImm(ARMCC::AL).addReg(0).addReg(ARM::R3)); + } + + // Clear bit 0 of target address to handle Thumb function pointers. + // In 32-bit ARM, function pointers may have the low bit set to indicate + // Thumb state when ARM/Thumb interworking is enabled (ARMv4T and later). + // We need to clear it to avoid an alignment fault when loading. + // bic scratch, target, #1 + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2BICri) + .addReg(ScratchReg) + .addReg(AddrReg) + .addImm(1) + .addImm(ARMCC::AL) + .addReg(0) + .addReg(0)); + + // ldr scratch, [scratch, #-(PrefixNops * 4 + 4)] + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi8) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm(-(PrefixNops * 4 + 4)) + .addImm(ARMCC::AL) + .addReg(0)); + + // Each EOR instruction XORs one byte of the type, shifted to its position. + for (int i = 0; i < 4; i++) { + uint8_t byte = (Type >> (i * 8)) & 0xFF; + uint32_t imm = byte << (i * 8); + bool isLast = (i == 3); + + // Verify the immediate can be encoded as Thumb2 modified immediate. + assert(ARM_AM::getT2SOImmVal(imm) != -1 && + "Cannot encode immediate as Thumb2 modified immediate"); + + // eor[s] scratch, scratch, #imm (last one sets flags with CPSR) + EmitToStreamer(*OutStreamer, + MCInstBuilder(ARM::t2EORri) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm(imm) + .addImm(ARMCC::AL) + .addReg(0) + .addReg(isLast ? ARM::CPSR : ARM::NoRegister)); + } + + // If we spilled r3, restore it immediately after the comparison. + // This must happen before the branch so r3 is valid on both paths. 
+ if (NeedSpillR3) { + // pop {r3} + EmitToStreamer( + *OutStreamer, + MCInstBuilder(ARM::tPOP).addImm(ARMCC::AL).addReg(0).addReg(ARM::R3)); + } + + // beq .Lpass (branch if types match, i.e., scratch is zero) + MCSymbol *Pass = OutContext.createTempSymbol(); + EmitToStreamer(*OutStreamer, + MCInstBuilder(ARM::t2Bcc) + .addExpr(MCSymbolRefExpr::create(Pass, OutContext)) + .addImm(ARMCC::EQ) + .addReg(ARM::CPSR)); + + // udf #ESR (trap with encoded diagnostic) + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tUDF).addImm(ESR)); + + OutStreamer->emitLabel(Pass); +} + +void ARMAsmPrinter::EmitKCFI_CHECK_Thumb1(Register AddrReg, int64_t Type, + const MachineInstr &Call, + int64_t PrefixNops) { + // For Thumb1, use R2 unconditionally as scratch register (a low register + // required for tLDRi). R3 is used for building the type hash. + unsigned ScratchReg = ARM::R2; + unsigned TempReg = ARM::R3; + + // Check if r3 is live and needs to be spilled. + bool NeedSpillR3 = isRegisterLiveInCall(Call, ARM::R3); + + // Spill r3 if needed + if (NeedSpillR3) { + EmitToStreamer( + *OutStreamer, + MCInstBuilder(ARM::tPUSH).addImm(ARMCC::AL).addReg(0).addReg(ARM::R3)); + } + + // Check if r2 is live and needs to be spilled. + bool NeedSpillR2 = isRegisterLiveInCall(Call, ARM::R2); + + // Push R2 if it's live + if (NeedSpillR2) { + EmitToStreamer( + *OutStreamer, + MCInstBuilder(ARM::tPUSH).addImm(ARMCC::AL).addReg(0).addReg(ARM::R2)); + } + + // Clear bit 0 from target address + // TempReg (R3) is used first as helper for BIC, then later for building type + // hash. + + // movs temp, #1 + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVi8) + .addReg(TempReg) + .addReg(ARM::CPSR) + .addImm(1) + .addImm(ARMCC::AL) + .addReg(0)); + + // mov scratch, target + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr) + .addReg(ScratchReg) + .addReg(AddrReg) + .addImm(ARMCC::AL)); + + // bics scratch, temp (scratch = scratch & ~temp) + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBIC) + .addReg(ScratchReg) + .addReg(ARM::CPSR) + .addReg(ScratchReg) + .addReg(TempReg) + .addImm(ARMCC::AL) + .addReg(0)); + + // Load type hash. Thumb1 doesn't support negative offsets, so subtract. 
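+  // For example, with two prefix NOPs the hash word sits 12 bytes below the +  // bit-0-cleared target: 2 * 4 bytes of prefix padding plus the 4-byte type +  // hash itself, which is exactly the PrefixNops * 4 + 4 computed below.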
+ int offset = PrefixNops * 4 + 4; + + // subs scratch, #offset + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tSUBi8) + .addReg(ScratchReg) + .addReg(ARM::CPSR) + .addReg(ScratchReg) + .addImm(offset) + .addImm(ARMCC::AL) + .addReg(0)); + + // ldr scratch, [scratch, #0] + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm(0) + .addImm(ARMCC::AL) + .addReg(0)); + + // Load expected type inline (instead of EOR sequence) + // + // This creates the 32-bit value byte-by-byte in the temp register: + // movs temp, #byte3 (high byte) + // lsls temp, temp, #8 + // adds temp, #byte2 + // lsls temp, temp, #8 + // adds temp, #byte1 + // lsls temp, temp, #8 + // adds temp, #byte0 (low byte) + + uint8_t byte0 = (Type >> 0) & 0xFF; + uint8_t byte1 = (Type >> 8) & 0xFF; + uint8_t byte2 = (Type >> 16) & 0xFF; + uint8_t byte3 = (Type >> 24) & 0xFF; + + // movs temp, #byte3 (start with high byte) + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVi8) + .addReg(TempReg) + .addReg(ARM::CPSR) + .addImm(byte3) + .addImm(ARMCC::AL) + .addReg(0)); + + // lsls temp, temp, #8 + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLSLri) + .addReg(TempReg) + .addReg(ARM::CPSR) + .addReg(TempReg) + .addImm(8) + .addImm(ARMCC::AL) + .addReg(0)); + + // adds temp, #byte2 + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDi8) + .addReg(TempReg) + .addReg(ARM::CPSR) + .addReg(TempReg) + .addImm(byte2) + .addImm(ARMCC::AL) + .addReg(0)); + + // lsls temp, temp, #8 + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLSLri) + .addReg(TempReg) + .addReg(ARM::CPSR) + .addReg(TempReg) + .addImm(8) + .addImm(ARMCC::AL) + .addReg(0)); + + // adds temp, #byte1 + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDi8) + .addReg(TempReg) + .addReg(ARM::CPSR) + .addReg(TempReg) + .addImm(byte1) + .addImm(ARMCC::AL) + .addReg(0)); + + // lsls temp, temp, #8 + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLSLri) + .addReg(TempReg) + .addReg(ARM::CPSR) + .addReg(TempReg) + .addImm(8) + .addImm(ARMCC::AL) + .addReg(0)); + + // adds temp, #byte0 (low byte) + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDi8) + .addReg(TempReg) + .addReg(ARM::CPSR) + .addReg(TempReg) + .addImm(byte0) + .addImm(ARMCC::AL) + .addReg(0)); + + // cmp scratch, temp + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tCMPr) + .addReg(ScratchReg) + .addReg(TempReg) + .addImm(ARMCC::AL) + .addReg(0)); + + // Restore registers if spilled (pop in reverse order of push: R2, then R3) + if (NeedSpillR2) { + // pop {r2} + EmitToStreamer( + *OutStreamer, + MCInstBuilder(ARM::tPOP).addImm(ARMCC::AL).addReg(0).addReg(ARM::R2)); + } + + // Restore r3 if spilled + if (NeedSpillR3) { + // pop {r3} + EmitToStreamer( + *OutStreamer, + MCInstBuilder(ARM::tPOP).addImm(ARMCC::AL).addReg(0).addReg(ARM::R3)); + } + + // beq .Lpass (branch if types match, i.e., scratch == temp) + MCSymbol *Pass = OutContext.createTempSymbol(); + EmitToStreamer(*OutStreamer, + MCInstBuilder(ARM::tBcc) + .addExpr(MCSymbolRefExpr::create(Pass, OutContext)) + .addImm(ARMCC::EQ) + .addReg(ARM::CPSR)); + + // bkpt #0 (trap with encoded diagnostic) + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBKPT).addImm(0)); + + OutStreamer->emitLabel(Pass); +} + +void ARMAsmPrinter::LowerKCFI_CHECK(const MachineInstr &MI) { + Register AddrReg = MI.getOperand(0).getReg(); + const int64_t Type = MI.getOperand(1).getImm(); + + // Get the call instruction that follows this KCFI_CHECK. 
+ assert(std::next(MI.getIterator())->isCall() && + "KCFI_CHECK not followed by a call instruction"); + const MachineInstr &Call = *std::next(MI.getIterator()); + + // Adjust the offset for patchable-function-prefix. + int64_t PrefixNops = 0; + MI.getMF() + ->getFunction() + .getFnAttribute("patchable-function-prefix") + .getValueAsString() + .getAsInteger(10, PrefixNops); + + // Emit the appropriate instruction sequence based on the opcode variant. + switch (MI.getOpcode()) { + case ARM::KCFI_CHECK_ARM: + EmitKCFI_CHECK_ARM32(AddrReg, Type, Call, PrefixNops); + break; + case ARM::KCFI_CHECK_Thumb2: + EmitKCFI_CHECK_Thumb2(AddrReg, Type, Call, PrefixNops); + break; + case ARM::KCFI_CHECK_Thumb1: + EmitKCFI_CHECK_Thumb1(AddrReg, Type, Call, PrefixNops); + break; + default: + llvm_unreachable("Unexpected KCFI_CHECK opcode"); + } +} + void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) { ARM_MC::verifyInstructionPredicates(MI->getOpcode(), getSubtargetInfo().getFeatureBits()); @@ -1504,6 +1933,11 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) { switch (Opc) { case ARM::t2MOVi32imm: llvm_unreachable("Should be lowered by thumb2it pass"); case ARM::DBG_VALUE: llvm_unreachable("Should be handled by generic printing"); + case ARM::KCFI_CHECK_ARM: + case ARM::KCFI_CHECK_Thumb2: + case ARM::KCFI_CHECK_Thumb1: + LowerKCFI_CHECK(*MI); + return; case ARM::LEApcrel: case ARM::tLEApcrel: case ARM::t2LEApcrel: { diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.h b/llvm/lib/Target/ARM/ARMAsmPrinter.h index 2b067c7..9e92b5a 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.h +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.h @@ -123,9 +123,20 @@ public: void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI); void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI); + // KCFI check lowering + void LowerKCFI_CHECK(const MachineInstr &MI); + private: void EmitSled(const MachineInstr &MI, SledKind Kind); + // KCFI check emission helpers + void EmitKCFI_CHECK_ARM32(Register AddrReg, int64_t Type, + const MachineInstr &Call, int64_t PrefixNops); + void EmitKCFI_CHECK_Thumb2(Register AddrReg, int64_t Type, + const MachineInstr &Call, int64_t PrefixNops); + void EmitKCFI_CHECK_Thumb1(Register AddrReg, int64_t Type, + const MachineInstr &Call, int64_t PrefixNops); + // Helpers for emitStartOfAsmFile() and emitEndOfAsmFile() void emitAttributes(); diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 0d7b6d1..fffb6373 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -2301,6 +2301,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i) NewMI->addOperand(MBBI->getOperand(i)); + NewMI->setCFIType(*MBB.getParent(), MI.getCFIType()); + // Update call info and delete the pseudo instruction TCRETURN. 
if (MI.isCandidateForAdditionalCallInfo()) MI.getMF()->moveAdditionalCallInfo(&MI, &*NewMI); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index b1a668e..8122db2 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -2849,6 +2849,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (isTailCall) { MF.getFrameInfo().setHasTailCall(); SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, MVT::Other, Ops); + if (CLI.CFIType) + Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue()); DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); return Ret; @@ -2856,6 +2858,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(CallOpc, dl, {MVT::Other, MVT::Glue}, Ops); + if (CLI.CFIType) + Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue()); DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); InGlue = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); @@ -12008,6 +12012,71 @@ static void genTPLoopBody(MachineBasicBlock *TpLoopBody, .add(predOps(ARMCC::AL)); } +bool ARMTargetLowering::supportKCFIBundles() const { + // KCFI is supported in all ARM/Thumb modes + return true; +} + +MachineInstr * +ARMTargetLowering::EmitKCFICheck(MachineBasicBlock &MBB, + MachineBasicBlock::instr_iterator &MBBI, + const TargetInstrInfo *TII) const { + assert(MBBI->isCall() && MBBI->getCFIType() && + "Invalid call instruction for a KCFI check"); + + MachineOperand *TargetOp = nullptr; + switch (MBBI->getOpcode()) { + // ARM mode opcodes + case ARM::BLX: + case ARM::BLX_pred: + case ARM::BLX_noip: + case ARM::BLX_pred_noip: + case ARM::BX_CALL: + TargetOp = &MBBI->getOperand(0); + break; + case ARM::TCRETURNri: + case ARM::TCRETURNrinotr12: + case ARM::TAILJMPr: + case ARM::TAILJMPr4: + TargetOp = &MBBI->getOperand(0); + break; + // Thumb mode opcodes (Thumb1 and Thumb2) + // Note: Most Thumb call instructions have predicate operands before the + // target register. Format: tBLXr pred, predreg, target_register, ...
+ case ARM::tBLXr: // Thumb1/Thumb2: BLX register (requires V5T) + case ARM::tBLXr_noip: // Thumb1/Thumb2: BLX register, no IP clobber + case ARM::tBX_CALL: // Thumb1 only: BX call (push LR, BX) + TargetOp = &MBBI->getOperand(2); + break; + // Tail call instructions don't have predicates, target is operand 0 + case ARM::tTAILJMPr: // Thumb1/Thumb2: Tail call via register + TargetOp = &MBBI->getOperand(0); + break; + default: + llvm_unreachable("Unexpected CFI call opcode"); + } + + assert(TargetOp && TargetOp->isReg() && "Invalid target operand"); + TargetOp->setIsRenamable(false); + + // Select the appropriate KCFI_CHECK variant based on the instruction set + unsigned KCFICheckOpcode; + if (Subtarget->isThumb()) { + if (Subtarget->isThumb2()) { + KCFICheckOpcode = ARM::KCFI_CHECK_Thumb2; + } else { + KCFICheckOpcode = ARM::KCFI_CHECK_Thumb1; + } + } else { + KCFICheckOpcode = ARM::KCFI_CHECK_ARM; + } + + return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(KCFICheckOpcode)) + .addReg(TargetOp->getReg()) + .addImm(MBBI->getCFIType()) + .getInstr(); +} + MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 70aa001..8c5e0cf 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -447,6 +447,12 @@ class VectorType; void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override; + bool supportKCFIBundles() const override; + + MachineInstr *EmitKCFICheck(MachineBasicBlock &MBB, + MachineBasicBlock::instr_iterator &MBBI, + const TargetInstrInfo *TII) const override; + SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const; SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const; SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 282ff53..53be167 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -6536,6 +6536,36 @@ def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPRPair:$addr_temp_out), def : Pat<(atomic_fence (timm), 0), (MEMBARRIER)>; //===----------------------------------------------------------------------===// +// KCFI check pseudo-instruction. +//===----------------------------------------------------------------------===// +// KCFI_CHECK pseudo-instruction for Kernel Control-Flow Integrity. +// Expands to a sequence that verifies the function pointer's type hash. +// Different sizes for different architectures due to different expansions. + +def KCFI_CHECK_ARM + : PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>, + Sched<[]>, + Requires<[IsARM]> { + let Size = 28; // 7 instructions (bic, ldr, 4x eor, beq, udf) +} + +def KCFI_CHECK_Thumb2 + : PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>, + Sched<[]>, + Requires<[IsThumb2]> { + let Size = + 32; // worst-case 9 instructions (push, bic, ldr, 4x eor, pop, beq.w, udf) +} + +def KCFI_CHECK_Thumb1 + : PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>, + Sched<[]>, + Requires<[IsThumb1Only]> { + let Size = 50; // worst-case 25 instructions (pushes, bic helper, type + // building, cmp, pops) +} + +//===----------------------------------------------------------------------===// // Instructions used for emitting unwind opcodes on Windows. 
//===----------------------------------------------------------------------===// let isPseudo = 1 in { diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 86740a9..590d4c7 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -111,6 +111,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() { initializeMVELaneInterleavingPass(Registry); initializeARMFixCortexA57AES1742098Pass(Registry); initializeARMDAGToDAGISelLegacyPass(Registry); + initializeKCFIPass(Registry); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -487,6 +488,9 @@ void ARMPassConfig::addPreSched2() { // proper scheduling. addPass(createARMExpandPseudoPass()); + // Emit KCFI checks for indirect calls. + addPass(createKCFIPass()); + if (getOptLevel() != CodeGenOptLevel::None) { // When optimising for size, always run the Thumb2SizeReduction pass before // IfConversion. Otherwise, check whether IT blocks are restricted @@ -517,9 +521,12 @@ void ARMPassConfig::addPreSched2() { void ARMPassConfig::addPreEmitPass() { addPass(createThumb2SizeReductionPass()); - // Constant island pass work on unbundled instructions. + // Unpack bundles for: + // - Thumb2: Constant island pass requires unbundled instructions + // - KCFI: KCFI_CHECK pseudo instructions need to be unbundled for AsmPrinter addPass(createUnpackMachineBundles([](const MachineFunction &MF) { - return MF.getSubtarget<ARMSubtarget>().isThumb2(); + return MF.getSubtarget<ARMSubtarget>().isThumb2() || + MF.getFunction().getParent()->getModuleFlag("kcfi"); })); // Don't optimize barriers or block placement at -O0. @@ -530,6 +537,7 @@ void ARMPassConfig::addPreEmitPass() { } void ARMPassConfig::addPreEmitPass2() { + // Inserts fixup instructions before unsafe AES operations. Instructions may // be inserted at the start of blocks and within blocks so this pass has to // come before those below. diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index f7deeaf..ca4a655 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2614,6 +2614,9 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget))) return Result; + // Try to widen vectors to gain more optimization opportunities.
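+  // For example (illustrative): a v8i32 mask such as <0,1,8,9,4,5,12,13> moves +  // lanes in adjacent pairs, so the shuffle can be retried as a v4i64 shuffle +  // with mask <0,4,2,6>, which the wider lowerings may then match directly.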
+ if (SDValue NewShuffle = widenShuffleMask(DL, Mask, VT, V1, V2, DAG)) + return NewShuffle; if ((Result = lowerVECTOR_SHUFFLE_XVPERMI(DL, Mask, VT, V1, DAG, Subtarget))) return Result; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 22cf3a7..598735f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -4675,7 +4675,7 @@ class WMMA_INSTR<string _Intr, list<dag> _Args> // class WMMA_LOAD<WMMA_REGINFO Frag, string Layout, string Space, bit WithStride> - : WMMA_INSTR<WMMA_NAME_LDST<"load", Frag, Layout, WithStride>.record, + : WMMA_INSTR<WMMA_NAME_LDST<"load", Frag, Layout, WithStride>.record_name, [!con((ins ADDR:$src), !if(WithStride, (ins B32:$ldm), (ins)))]>, Requires<Frag.Predicates> { @@ -4714,7 +4714,7 @@ class WMMA_LOAD<WMMA_REGINFO Frag, string Layout, string Space, bit WithStride> // class WMMA_STORE_D<WMMA_REGINFO Frag, string Layout, string Space, bit WithStride> - : WMMA_INSTR<WMMA_NAME_LDST<"store", Frag, Layout, WithStride>.record, + : WMMA_INSTR<WMMA_NAME_LDST<"store", Frag, Layout, WithStride>.record_name, [!con((ins ADDR:$dst), Frag.Ins, !if(WithStride, (ins B32:$ldm), (ins)))]>, @@ -4778,7 +4778,7 @@ class MMA_OP_PREDICATES<WMMA_REGINFO FragA, string b1op> { class WMMA_MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB, WMMA_REGINFO FragC, WMMA_REGINFO FragD, string ALayout, string BLayout, int Satfinite, string rnd, string b1op> - : WMMA_INSTR<WMMA_NAME<ALayout, BLayout, Satfinite, rnd, b1op, FragA, FragB, FragC, FragD>.record, + : WMMA_INSTR<WMMA_NAME<ALayout, BLayout, Satfinite, rnd, b1op, FragA, FragB, FragC, FragD>.record_name, [FragA.Ins, FragB.Ins, FragC.Ins]>, // Requires does not seem to have effect on Instruction w/o Patterns. // We set it here anyways and propagate to the Pat<> we construct below. @@ -4837,7 +4837,7 @@ defset list<WMMA_INSTR> WMMAs = { class MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB, WMMA_REGINFO FragC, WMMA_REGINFO FragD, string ALayout, string BLayout, int Satfinite, string b1op, string Kind> - : WMMA_INSTR<MMA_NAME<ALayout, BLayout, Satfinite, b1op, Kind, FragA, FragB, FragC, FragD>.record, + : WMMA_INSTR<MMA_NAME<ALayout, BLayout, Satfinite, b1op, Kind, FragA, FragB, FragC, FragD>.record_name, [FragA.Ins, FragB.Ins, FragC.Ins]>, // Requires does not seem to have effect on Instruction w/o Patterns. // We set it here anyways and propagate to the Pat<> we construct below. @@ -4891,7 +4891,7 @@ class MMA_SP<WMMA_REGINFO FragA, WMMA_REGINFO FragB, WMMA_REGINFO FragC, WMMA_REGINFO FragD, string Metadata, string Kind, int Satfinite> : WMMA_INSTR<MMA_SP_NAME<Metadata, Kind, Satfinite, - FragA, FragB, FragC, FragD>.record, + FragA, FragB, FragC, FragD>.record_name, [FragA.Ins, FragB.Ins, FragC.Ins, (ins B32:$metadata, i32imm:$selector)]>, // Requires does not seem to have effect on Instruction w/o Patterns. @@ -4946,7 +4946,7 @@ defset list<WMMA_INSTR> MMA_SPs = { // ldmatrix.sync.aligned.m8n8[|.trans][|.shared].b16 // class LDMATRIX<WMMA_REGINFO Frag, bit Transposed, string Space> - : WMMA_INSTR<LDMATRIX_NAME<Frag, Transposed>.record, [(ins ADDR:$src)]>, + : WMMA_INSTR<LDMATRIX_NAME<Frag, Transposed>.record_name, [(ins ADDR:$src)]>, Requires<Frag.Predicates> { // Build PatFrag that only matches particular address space. 
PatFrag IntrFrag = PatFrag<(ops node:$src), (Intr node:$src), @@ -4981,7 +4981,7 @@ defset list<WMMA_INSTR> LDMATRIXs = { // stmatrix.sync.aligned.m8n8[|.trans][|.shared].b16 // class STMATRIX<WMMA_REGINFO Frag, bit Transposed, string Space> - : WMMA_INSTR<STMATRIX_NAME<Frag, Transposed>.record, [!con((ins ADDR:$dst), Frag.Ins)]>, + : WMMA_INSTR<STMATRIX_NAME<Frag, Transposed>.record_name, [!con((ins ADDR:$dst), Frag.Ins)]>, Requires<Frag.Predicates> { // Build PatFrag that only matches particular address space. dag PFOperands = !con((ops node:$dst), @@ -5376,7 +5376,7 @@ class Tcgen05MMAInst<bit Sp, string KindStr, string ASpace, Requires<PTXPredicates> { Intrinsic Intrin = !cast<Intrinsic>( - NVVM_TCGEN05_MMA<Sp, ASpace, AShift, ScaleInputD>.record + NVVM_TCGEN05_MMA<Sp, ASpace, AShift, ScaleInputD>.record_name ); dag ScaleInpIns = !if(!eq(ScaleInputD, 1), (ins i64imm:$scale_input_d), (ins)); @@ -5618,7 +5618,7 @@ class Tcgen05MMABlockScaleInst<bit Sp, string ASpace, string KindStr, Requires<[hasTcgen05Instructions, PTXPredicate]> { Intrinsic Intrin = !cast<Intrinsic>( - NVVM_TCGEN05_MMA_BLOCKSCALE<Sp, ASpace, KindStr, ScaleVecSize>.record); + NVVM_TCGEN05_MMA_BLOCKSCALE<Sp, ASpace, KindStr, ScaleVecSize>.record_name); dag SparseMetadataIns = !if(!eq(Sp, 1), (ins B32:$spmetadata), (ins)); dag SparseMetadataIntr = !if(!eq(Sp, 1), (Intrin i32:$spmetadata), (Intrin)); @@ -5702,7 +5702,7 @@ class Tcgen05MMAWSInst<bit Sp, string ASpace, string KindStr, Requires<[hasTcgen05Instructions]> { Intrinsic Intrin = !cast<Intrinsic>( - NVVM_TCGEN05_MMA_WS<Sp, ASpace, HasZeroColMask>.record); + NVVM_TCGEN05_MMA_WS<Sp, ASpace, HasZeroColMask>.record_name); dag ZeroColMaskIns = !if(!eq(HasZeroColMask, 1), (ins B64:$zero_col_mask), (ins)); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp index 67b510d..f2b216b 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/FormatVariadic.h" #define GET_GICOMBINER_DEPS #include "RISCVGenPostLegalizeGICombiner.inc" @@ -42,6 +43,56 @@ namespace { #include "RISCVGenPostLegalizeGICombiner.inc" #undef GET_GICOMBINER_TYPES +/// Match: G_STORE (G_FCONSTANT +0.0), addr +/// Return the source vreg in MatchInfo if matched. 
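+/// For example, on RV64 with D, `G_STORE (G_FCONSTANT double 0.0), %p` is +/// rewritten to `G_STORE (G_CONSTANT i64 0), %p`, which selection can +/// typically feed from x0 rather than materializing +0.0 in an FPR.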
+bool matchFoldFPZeroStore(MachineInstr &MI, MachineRegisterInfo &MRI, + const RISCVSubtarget &STI, Register &MatchInfo) { + if (MI.getOpcode() != TargetOpcode::G_STORE) + return false; + + Register SrcReg = MI.getOperand(0).getReg(); + if (!SrcReg.isVirtual()) + return false; + + MachineInstr *Def = MRI.getVRegDef(SrcReg); + if (!Def || Def->getOpcode() != TargetOpcode::G_FCONSTANT) + return false; + + auto *CFP = Def->getOperand(1).getFPImm(); + if (!CFP || !CFP->getValueAPF().isPosZero()) + return false; + + unsigned ValBits = MRI.getType(SrcReg).getSizeInBits(); + if ((ValBits == 16 && !STI.hasStdExtZfh()) || + (ValBits == 32 && !STI.hasStdExtF()) || + (ValBits == 64 && (!STI.hasStdExtD() || !STI.is64Bit()))) + return false; + + MatchInfo = SrcReg; + return true; +} + +/// Apply: rewrite to G_STORE (G_CONSTANT 0 [XLEN]), addr +void applyFoldFPZeroStore(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, const RISCVSubtarget &STI, + Register &MatchInfo) { + const unsigned XLen = STI.getXLen(); + + auto Zero = B.buildConstant(LLT::scalar(XLen), 0); + MI.getOperand(0).setReg(Zero.getReg(0)); + + MachineInstr *Def = MRI.getVRegDef(MatchInfo); + if (Def && MRI.use_nodbg_empty(MatchInfo)) + Def->eraseFromParent(); + +#ifndef NDEBUG + unsigned ValBits = MRI.getType(MatchInfo).getSizeInBits(); + LLVM_DEBUG(dbgs() << formatv("[{0}] Fold FP zero store -> int zero " + "(XLEN={1}, ValBits={2}):\n {3}\n", + DEBUG_TYPE, XLen, ValBits, MI)); +#endif +} + class RISCVPostLegalizerCombinerImpl : public Combiner { protected: const CombinerHelper Helper; diff --git a/llvm/lib/Target/RISCV/RISCVCombine.td b/llvm/lib/Target/RISCV/RISCVCombine.td index 995dd0c..a06b60d 100644 --- a/llvm/lib/Target/RISCV/RISCVCombine.td +++ b/llvm/lib/Target/RISCV/RISCVCombine.td @@ -19,11 +19,20 @@ def RISCVO0PreLegalizerCombiner: GICombiner< "RISCVO0PreLegalizerCombinerImpl", [optnone_combines]> { } +// Rule: fold store (fp +0.0) -> store (int zero [XLEN]) +def fp_zero_store_matchdata : GIDefMatchData<"Register">; +def fold_fp_zero_store : GICombineRule< + (defs root:$root, fp_zero_store_matchdata:$matchinfo), + (match (G_STORE $src, $addr):$root, + [{ return matchFoldFPZeroStore(*${root}, MRI, STI, ${matchinfo}); }]), + (apply [{ applyFoldFPZeroStore(*${root}, MRI, B, STI, ${matchinfo}); }])>; + // Post-legalization combines which are primarily optimizations. // TODO: Add more combines. 
def RISCVPostLegalizerCombiner : GICombiner<"RISCVPostLegalizerCombinerImpl", [sub_to_add, combines_for_extload, redundant_and, identity_combines, shift_immed_chain, - commute_constant_to_rhs, simplify_neg_minmax]> { + commute_constant_to_rhs, simplify_neg_minmax, + fold_fp_zero_store]> { } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 4104abd..4c2f7f6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -482,7 +482,7 @@ let Predicates = [HasVendorXSfvfwmaccqqq] in { defm SF_VFWMACC_4x4x4 : VPseudoSiFiveVFWMACC; } -let Predicates = [HasVendorXSfvfnrclipxfqf] in { +let Predicates = [HasVendorXSfvfnrclipxfqf], AltFmtType = IS_NOT_ALTFMT in { defm SF_VFNRCLIP_XU_F_QF : VPseudoSiFiveVFNRCLIP; defm SF_VFNRCLIP_X_F_QF : VPseudoSiFiveVFNRCLIP; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td index f7d1a09..b9c5b75 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td @@ -668,4 +668,38 @@ foreach vti = NoGroupBF16Vectors in { def : Pat<(vti.Scalar (extractelt (vti.Vector vti.RegClass:$rs2), 0)), (vfmv_f_s_inst vti.RegClass:$rs2, vti.Log2SEW)>; } + +let Predicates = [HasStdExtZvfbfa] in { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { + defvar fvti = fvtiToFWti.Vti; + defvar fwti = fvtiToFWti.Wti; + def : Pat<(fwti.Vector (any_riscv_fpextend_vl + (fvti.Vector fvti.RegClass:$rs1), + (fvti.Mask VMV0:$vm), + VLOpFrag)), + (!cast<Instruction>("PseudoVFWCVT_F_F_ALT_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") + (fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, + (fvti.Mask VMV0:$vm), + GPR:$vl, fvti.Log2SEW, TA_MA)>; + + def : Pat<(fvti.Vector (any_riscv_fpround_vl + (fwti.Vector fwti.RegClass:$rs1), + (fwti.Mask VMV0:$vm), VLOpFrag)), + (!cast<Instruction>("PseudoVFNCVT_F_F_ALT_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") + (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, + (fwti.Mask VMV0:$vm), + // Value to indicate no rounding mode change in + // RISCVInsertReadWriteCSR + FRM_DYN, + GPR:$vl, fvti.Log2SEW, TA_MA)>; + def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))), + (!cast<Instruction>("PseudoVFNCVT_F_F_ALT_W_"#fvti.LMul.MX#"_E"#fvti.SEW) + (fvti.Vector (IMPLICIT_DEF)), + fwti.RegClass:$rs1, + // Value to indicate no rounding mode change in + // RISCVInsertReadWriteCSR + FRM_DYN, + fvti.AVL, fvti.Log2SEW, TA_MA)>; + } +} } // Predicates = [HasStdExtZvfbfa] diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index d91923b..56a38bb 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -1499,18 +1499,25 @@ static bool generateKernelClockInst(const SPIRV::IncomingCall *Call, Register ResultReg = Call->ReturnRegister; - // Deduce the `Scope` operand from the builtin function name. 
- SPIRV::Scope::Scope ScopeArg = - StringSwitch<SPIRV::Scope::Scope>(Builtin->Name) - .EndsWith("device", SPIRV::Scope::Scope::Device) - .EndsWith("work_group", SPIRV::Scope::Scope::Workgroup) - .EndsWith("sub_group", SPIRV::Scope::Scope::Subgroup); - Register ScopeReg = buildConstantIntReg32(ScopeArg, MIRBuilder, GR); - - MIRBuilder.buildInstr(SPIRV::OpReadClockKHR) - .addDef(ResultReg) - .addUse(GR->getSPIRVTypeID(Call->ReturnType)) - .addUse(ScopeReg); + if (Builtin->Name == "__spirv_ReadClockKHR") { + MIRBuilder.buildInstr(SPIRV::OpReadClockKHR) + .addDef(ResultReg) + .addUse(GR->getSPIRVTypeID(Call->ReturnType)) + .addUse(Call->Arguments[0]); + } else { + // Deduce the `Scope` operand from the builtin function name. + SPIRV::Scope::Scope ScopeArg = + StringSwitch<SPIRV::Scope::Scope>(Builtin->Name) + .EndsWith("device", SPIRV::Scope::Scope::Device) + .EndsWith("work_group", SPIRV::Scope::Scope::Workgroup) + .EndsWith("sub_group", SPIRV::Scope::Scope::Subgroup); + Register ScopeReg = buildConstantIntReg32(ScopeArg, MIRBuilder, GR); + + MIRBuilder.buildInstr(SPIRV::OpReadClockKHR) + .addDef(ResultReg) + .addUse(GR->getSPIRVTypeID(Call->ReturnType)) + .addUse(ScopeReg); + } return true; } diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index 3b8764a..c259cce 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -1174,6 +1174,7 @@ defm : DemangledNativeBuiltin<"clock_read_sub_group", OpenCL_std, KernelClock, 0 defm : DemangledNativeBuiltin<"clock_read_hilo_device", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; defm : DemangledNativeBuiltin<"clock_read_hilo_work_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; defm : DemangledNativeBuiltin<"clock_read_hilo_sub_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; +defm : DemangledNativeBuiltin<"__spirv_ReadClockKHR", OpenCL_std, KernelClock, 1, 1, OpReadClockKHR>; //===----------------------------------------------------------------------===// // Class defining an atomic instruction on floating-point numbers. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index f0ac26b..14097d7 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1336,22 +1336,25 @@ def pmax : PatFrags<(ops node:$lhs, node:$rhs), [ ]>; defm PMAX : SIMDBinaryFP<pmax, "pmax", 235>; +multiclass PMinMaxInt<Vec vec, NI baseMinInst, NI baseMaxInst> { + def : Pat<(vec.int_vt (vselect + (setolt (vec.vt (bitconvert V128:$rhs)), + (vec.vt (bitconvert V128:$lhs))), + V128:$rhs, V128:$lhs)), + (baseMinInst $lhs, $rhs)>; + def : Pat<(vec.int_vt (vselect + (setolt (vec.vt (bitconvert V128:$lhs)), + (vec.vt (bitconvert V128:$rhs))), + V128:$rhs, V128:$lhs)), + (baseMaxInst $lhs, $rhs)>; +} // Also match the pmin/pmax cases where the operands are int vectors (but the // comparison is still a floating point comparison). This can happen when using // the wasm_simd128.h intrinsics because v128_t is an integer vector. 
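// For instance, a pmin written against v128_t values reaches ISel as a setolt // on the operands bitconverted to the FP type, feeding a vselect that still // chooses between the original integer-typed vectors; the foreach below // instantiates these patterns for each floating-point vector type.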
foreach vec = [F32x4, F64x2, F16x8] in { -defvar pmin = !cast<NI>("PMIN_"#vec); -defvar pmax = !cast<NI>("PMAX_"#vec); -def : Pat<(vec.int_vt (vselect - (setolt (vec.vt (bitconvert V128:$rhs)), - (vec.vt (bitconvert V128:$lhs))), - V128:$rhs, V128:$lhs)), - (pmin $lhs, $rhs)>; -def : Pat<(vec.int_vt (vselect - (setolt (vec.vt (bitconvert V128:$lhs)), - (vec.vt (bitconvert V128:$rhs))), - V128:$rhs, V128:$lhs)), - (pmax $lhs, $rhs)>; + defvar pmin = !cast<NI>("PMIN_"#vec); + defvar pmax = !cast<NI>("PMAX_"#vec); + defm : PMinMaxInt<vec, pmin, pmax>; } // And match the pmin/pmax LLVM intrinsics as well @@ -1756,6 +1759,15 @@ let Predicates = [HasRelaxedSIMD] in { (relaxed_max V128:$lhs, V128:$rhs)>; def : Pat<(vec.vt (fmaximumnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))), (relaxed_max V128:$lhs, V128:$rhs)>; + + // Also select the relaxed min/max instructions for patterns that would + // otherwise match pmin/pmax. + let AddedComplexity = 1 in { + def : Pat<(vec.vt (pmin (vec.vt V128:$lhs), (vec.vt V128:$rhs))), + (relaxed_min $lhs, $rhs)>; + def : Pat<(vec.vt (pmax (vec.vt V128:$lhs), (vec.vt V128:$rhs))), + (relaxed_max $lhs, $rhs)>; + defm : PMinMaxInt<vec, relaxed_min, relaxed_max>; + } } } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d49f25a..4dfc400 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2632,6 +2632,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(Op, MVT::f32, Promote); } + setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16); + setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32); + setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64); + // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine({ISD::VECTOR_SHUFFLE, ISD::SCALAR_TO_VECTOR, diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 62a3c88..975a271 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -433,6 +433,8 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["fp8e5m3-insts"] = true; Features["permlane16-swap"] = true; Features["ashr-pk-insts"] = true; + Features["add-min-max-insts"] = true; + Features["pk-add-min-max-insts"] = true; Features["atomic-buffer-pk-add-bf16-inst"] = true; Features["vmem-pref-insts"] = true; Features["atomic-fadd-rtn-insts"] = true; diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index a0f7ec6..2dd0fde 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -948,17 +948,17 @@ void llvm::updateVCallVisibilityInIndex( // linker, as we have no information on their eventual use. if (DynamicExportSymbols.count(P.first)) continue; + // With validation enabled, we want to exclude symbols visible to regular + // objects. Local symbols will be in this group due to the current + // implementation but those with VCallVisibilityTranslationUnit will have + // already been marked in clang so are unaffected. + if (VisibleToRegularObjSymbols.count(P.first)) + continue; for (auto &S : P.second.getSummaryList()) { auto *GVar = dyn_cast<GlobalVarSummary>(S.get()); if (!GVar || GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic) continue; - // With validation enabled, we want to exclude symbols visible to regular - // objects.
Local symbols will be in this group due to the current - // implementation but those with VCallVisibilityTranslationUnit will have - // already been marked in clang so are unaffected. - if (VisibleToRegularObjSymbols.count(P.first)) - continue; GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit); } } @@ -1161,14 +1161,10 @@ bool DevirtIndex::tryFindVirtualCallTargets( // and therefore the same GUID. This can happen if there isn't enough // distinguishing path when compiling the source file. In that case we // conservatively return false early. + if (P.VTableVI.hasLocal() && P.VTableVI.getSummaryList().size() > 1) + return false; const GlobalVarSummary *VS = nullptr; - bool LocalFound = false; for (const auto &S : P.VTableVI.getSummaryList()) { - if (GlobalValue::isLocalLinkage(S->linkage())) { - if (LocalFound) - return false; - LocalFound = true; - } auto *CurVS = cast<GlobalVarSummary>(S->getBaseObject()); if (!CurVS->vTableFuncs().empty() || // Previously clang did not attach the necessary type metadata to @@ -1184,6 +1180,7 @@ bool DevirtIndex::tryFindVirtualCallTargets( // with public LTO visibility. if (VS->getVCallVisibility() == GlobalObject::VCallVisibilityPublic) return false; + break; } } // There will be no VS if all copies are available_externally having no @@ -1411,9 +1408,8 @@ bool DevirtIndex::trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot, // If the summary list contains multiple summaries where at least one is // a local, give up, as we won't know which (possibly promoted) name to use. - for (const auto &S : TheFn.getSummaryList()) - if (GlobalValue::isLocalLinkage(S->linkage()) && Size > 1) - return false; + if (TheFn.hasLocal() && Size > 1) + return false; // Collect functions devirtualized at least for one call site for stats. if (PrintSummaryDevirt || AreStatisticsEnabled()) @@ -2591,6 +2587,11 @@ void DevirtIndex::run() { if (ExportSummary.typeIdCompatibleVtableMap().empty()) return; + // Assert that we haven't made any changes that would affect the hasLocal() + // flag on the GUID summary info. + assert(!ExportSummary.withInternalizeAndPromote() && + "Expect index-based WPD to run before internalization and promotion"); + DenseMap<GlobalValue::GUID, std::vector<StringRef>> NameByGUID; for (const auto &P : ExportSummary.typeIdCompatibleVtableMap()) { NameByGUID[GlobalValue::getGUIDAssumingExternalLinkage(P.first)].push_back( diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 73ec451..9bee523 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -2760,21 +2760,34 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { // Optimize pointer differences into the same array into a size. Consider: // &A[10] - &A[0]: we should compile this to "10". 
Value *LHSOp, *RHSOp; - if (match(Op0, m_PtrToInt(m_Value(LHSOp))) && - match(Op1, m_PtrToInt(m_Value(RHSOp)))) + if (match(Op0, m_PtrToIntOrAddr(m_Value(LHSOp))) && + match(Op1, m_PtrToIntOrAddr(m_Value(RHSOp)))) if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(), I.hasNoUnsignedWrap())) return replaceInstUsesWith(I, Res); // trunc(p)-trunc(q) -> trunc(p-q) - if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) && - match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp))))) + if (match(Op0, m_Trunc(m_PtrToIntOrAddr(m_Value(LHSOp)))) && + match(Op1, m_Trunc(m_PtrToIntOrAddr(m_Value(RHSOp))))) if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(), /* IsNUW */ false)) return replaceInstUsesWith(I, Res); - if (match(Op0, m_ZExt(m_PtrToIntSameSize(DL, m_Value(LHSOp)))) && - match(Op1, m_ZExtOrSelf(m_PtrToInt(m_Value(RHSOp))))) { + auto MatchSubOfZExtOfPtrToIntOrAddr = [&]() { + if (match(Op0, m_ZExt(m_PtrToIntSameSize(DL, m_Value(LHSOp)))) && + match(Op1, m_ZExt(m_PtrToIntSameSize(DL, m_Value(RHSOp))))) + return true; + if (match(Op0, m_ZExt(m_PtrToAddr(m_Value(LHSOp)))) && + match(Op1, m_ZExt(m_PtrToAddr(m_Value(RHSOp))))) + return true; + // Special case for non-canonical ptrtoint in constant expression, + // where the zext has been folded into the ptrtoint. + if (match(Op0, m_ZExt(m_PtrToIntSameSize(DL, m_Value(LHSOp)))) && + match(Op1, m_PtrToInt(m_Value(RHSOp)))) + return true; + return false; + }; + if (MatchSubOfZExtOfPtrToIntOrAddr()) { if (auto *GEP = dyn_cast<GEPOperator>(LHSOp)) { if (GEP->getPointerOperand() == RHSOp) { if (GEP->hasNoUnsignedWrap() || GEP->hasNoUnsignedSignedWrap()) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index dab200d..669d4f0 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4003,18 +4003,29 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // Try to fold intrinsic into select/phi operands. This is legal if: // * The intrinsic is speculatable. - // * The select condition is not a vector, or the intrinsic does not - // perform cross-lane operations. - if (isSafeToSpeculativelyExecuteWithVariableReplaced(&CI) && - isNotCrossLaneOperation(II)) + // * The operand is one of the following: + // - a phi. + // - a select with a scalar condition. + // - a select with a vector condition and II is not a cross lane operation. + if (isSafeToSpeculativelyExecuteWithVariableReplaced(&CI)) { for (Value *Op : II->args()) { - if (auto *Sel = dyn_cast<SelectInst>(Op)) - if (Instruction *R = FoldOpIntoSelect(*II, Sel)) + if (auto *Sel = dyn_cast<SelectInst>(Op)) { + bool IsVectorCond = Sel->getCondition()->getType()->isVectorTy(); + if (IsVectorCond && !isNotCrossLaneOperation(II)) + continue; + // Don't replace a scalar select with a more expensive vector select if + // we can't simplify both arms of the select. 
+ bool SimplifyBothArms = + !Op->getType()->isVectorTy() && II->getType()->isVectorTy(); + if (Instruction *R = FoldOpIntoSelect( + *II, Sel, /*FoldWithMultiUse=*/false, SimplifyBothArms)) return R; + } if (auto *Phi = dyn_cast<PHINode>(Op)) if (Instruction *R = foldOpIntoPhi(*II, Phi)) return R; } + } if (Instruction *Shuf = foldShuffledIntrinsicOperands(II)) return Shuf; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 943c223..ede73f8 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -664,7 +664,8 @@ public: /// This also works for Cast instructions, which obviously do not have a /// second operand. Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI, - bool FoldWithMultiUse = false); + bool FoldWithMultiUse = false, + bool SimplifyBothArms = false); /// This is a convenience wrapper function for the above two functions. Instruction *foldBinOpIntoSelectOrPhi(BinaryOperator &I); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 3f11cae..67e2aae 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1777,7 +1777,8 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, SelectInst *SI, } Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI, - bool FoldWithMultiUse) { + bool FoldWithMultiUse, + bool SimplifyBothArms) { // Don't modify shared select instructions unless set FoldWithMultiUse if (!SI->hasOneUse() && !FoldWithMultiUse) return nullptr; @@ -1821,6 +1822,9 @@ Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI, if (!NewTV && !NewFV) return nullptr; + if (SimplifyBothArms && !(NewTV && NewFV)) + return nullptr; + // Create an instruction for the arm that did not fold. if (!NewTV) NewTV = foldOperationIntoSelectOperand(Op, SI, TV, *this); diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 4acc3f2..d347ced 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -614,6 +614,16 @@ static Decomposition decompose(Value *V, return {V, IsKnownNonNegative}; } + if (match(V, m_Add(m_Value(Op0), m_ConstantInt(CI))) && CI->isNegative() && + canUseSExt(CI)) { + Preconditions.emplace_back( + CmpInst::ICMP_UGE, Op0, + ConstantInt::get(Op0->getType(), CI->getSExtValue() * -1)); + if (auto Decomp = MergeResults(Op0, CI, true)) + return *Decomp; + return {V, IsKnownNonNegative}; + } + if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1)))) { if (!isKnownNonNegative(Op0, DL)) Preconditions.emplace_back(CmpInst::ICMP_SGE, Op0, @@ -627,16 +637,6 @@ static Decomposition decompose(Value *V, return {V, IsKnownNonNegative}; } - if (match(V, m_Add(m_Value(Op0), m_ConstantInt(CI))) && CI->isNegative() && - canUseSExt(CI)) { - Preconditions.emplace_back( - CmpInst::ICMP_UGE, Op0, - ConstantInt::get(Op0->getType(), CI->getSExtValue() * -1)); - if (auto Decomp = MergeResults(Op0, CI, true)) - return *Decomp; - return {V, IsKnownNonNegative}; - } - // Decompose or as an add if there are no common bits between the operands. 
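// For example, (X | 3) where the low two bits of X are known zero is // equivalent to X + 3, so it merges with the constant just like the adds above.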
if (match(V, m_DisjointOr(m_Value(Op0), m_ConstantInt(CI)))) { if (auto Decomp = MergeResults(Op0, CI, IsSigned)) diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp index a83cbd17a7..f273e9d 100644 --- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp +++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp @@ -64,10 +64,10 @@ using namespace llvm; -namespace { - #define DEBUG_TYPE "mergeicmps" +namespace { + // A BCE atom "Binary Compare Expression Atom" represents an integer load // that is a constant offset from a base value, e.g. `a` or `o.c` in the example // at the top. @@ -128,11 +128,12 @@ private: unsigned Order = 1; DenseMap<const Value*, int> BaseToIndex; }; +} // namespace // If this value is a load from a constant offset w.r.t. a base address, and // there are no other users of the load or address, returns the base address and // the offset. -BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) { +static BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) { auto *const LoadI = dyn_cast<LoadInst>(Val); if (!LoadI) return {}; @@ -175,6 +176,7 @@ BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) { return BCEAtom(GEP, LoadI, BaseId.getBaseId(Base), Offset); } +namespace { // A comparison between two BCE atoms, e.g. `a == o.a` in the example at the // top. // Note: the terminology is misleading: the comparison is symmetric, so there @@ -239,6 +241,7 @@ class BCECmpBlock { private: BCECmp Cmp; }; +} // namespace bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst, AliasAnalysis &AA) const { @@ -302,9 +305,9 @@ bool BCECmpBlock::doesOtherWork() const { // Visit the given comparison. If this is a comparison between two valid // BCE atoms, returns the comparison. -std::optional<BCECmp> visitICmp(const ICmpInst *const CmpI, - const ICmpInst::Predicate ExpectedPredicate, - BaseIdentifier &BaseId) { +static std::optional<BCECmp> +visitICmp(const ICmpInst *const CmpI, + const ICmpInst::Predicate ExpectedPredicate, BaseIdentifier &BaseId) { // The comparison can only be used once: // - For intermediate blocks, as a branch condition. // - For the final block, as an incoming value for the Phi. @@ -332,10 +335,9 @@ std::optional<BCECmp> visitICmp(const ICmpInst *const CmpI, // Visit the given comparison block. If this is a comparison between two valid // BCE atoms, returns the comparison. -std::optional<BCECmpBlock> visitCmpBlock(Value *const Val, - BasicBlock *const Block, - const BasicBlock *const PhiBlock, - BaseIdentifier &BaseId) { +static std::optional<BCECmpBlock> +visitCmpBlock(Value *const Val, BasicBlock *const Block, + const BasicBlock *const PhiBlock, BaseIdentifier &BaseId) { if (Block->empty()) return std::nullopt; auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator()); @@ -397,6 +399,7 @@ static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons, Comparisons.push_back(std::move(Comparison)); } +namespace { // A chain of comparisons. 
class BCECmpChain { public: @@ -420,6 +423,7 @@ private: // The original entry block (before sorting); BasicBlock *EntryBlock_; }; +} // namespace static bool areContiguous(const BCECmpBlock &First, const BCECmpBlock &Second) { return First.Lhs().BaseId == Second.Lhs().BaseId && @@ -742,9 +746,8 @@ bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA, return true; } -std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi, - BasicBlock *const LastBlock, - int NumBlocks) { +static std::vector<BasicBlock *> +getOrderedBlocks(PHINode &Phi, BasicBlock *const LastBlock, int NumBlocks) { // Walk up from the last block to find other blocks. std::vector<BasicBlock *> Blocks(NumBlocks); assert(LastBlock && "invalid last block"); @@ -777,8 +780,8 @@ std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi, return Blocks; } -bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, AliasAnalysis &AA, - DomTreeUpdater &DTU) { +static bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, + AliasAnalysis &AA, DomTreeUpdater &DTU) { LLVM_DEBUG(dbgs() << "processPhi()\n"); if (Phi.getNumIncomingValues() <= 1) { LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n"); @@ -874,6 +877,7 @@ static bool runImpl(Function &F, const TargetLibraryInfo &TLI, return MadeChange; } +namespace { class MergeICmpsLegacyPass : public FunctionPass { public: static char ID; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d2c100c9..3356516 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7231,6 +7231,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( return DenseMap<const SCEV *, Value *>(); } + VPlanTransforms::narrowInterleaveGroups( + BestVPlan, BestVF, + TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)); VPlanTransforms::removeDeadRecipes(BestVPlan); VPlanTransforms::convertToConcreteRecipes(BestVPlan); @@ -8199,10 +8202,6 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, if (CM.foldTailWithEVL()) VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength, *Plan, CM.getMaxSafeElements()); - - if (auto P = VPlanTransforms::narrowInterleaveGroups(*Plan, TTI)) - VPlans.push_back(std::move(P)); - assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); VPlans.push_back(std::move(Plan)); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index c95c887..428a8f4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1191,7 +1191,6 @@ VPlan *VPlan::duplicate() { } Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount; Old2NewVPValues[&VF] = &NewPlan->VF; - Old2NewVPValues[&UF] = &NewPlan->UF; Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF; if (BackedgeTakenCount) { NewPlan->BackedgeTakenCount = new VPValue(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 167ba55..a1ad2db 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2712,7 +2712,8 @@ public: static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPReductionSC || - R->getVPDefID() == VPRecipeBase::VPReductionEVLSC; + R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || + R->getVPDefID() == VPRecipeBase::VPPartialReductionSC; } static inline bool classof(const VPUser *U) { @@ -2783,7 +2784,10 @@ 
public:
   Opcode(Opcode), VFScaleFactor(ScaleFactor) {
     [[maybe_unused]] auto *AccumulatorRecipe =
         getChainOp()->getDefiningRecipe();
-    assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
+    // When cloning as part of a VPExpressionRecipe, the chain op could have
+    // been replaced by a temporary VPValue, so it doesn't have a defining recipe.
+    assert((!AccumulatorRecipe ||
+            isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
             isa<VPPartialReductionRecipe>(AccumulatorRecipe)) &&
            "Unexpected operand order for partial reduction recipe");
   }
@@ -3093,6 +3097,11 @@ public:
   /// removed before codegen.
   void decompose();
+  unsigned getVFScaleFactor() const {
+    auto *PR = dyn_cast<VPPartialReductionRecipe>(ExpressionRecipes.back());
+    return PR ? PR->getVFScaleFactor() : 1;
+  }
+
   /// Method for generating code, must not be called as this recipe is abstract.
   void execute(VPTransformState &State) override {
     llvm_unreachable("recipe must be removed before execute");
   }
@@ -4152,9 +4161,6 @@ class VPlan {
   /// Represents the vectorization factor of the loop.
   VPValue VF;
-  /// Represents the symbolic unroll factor of the loop.
-  VPValue UF;
-
   /// Represents the loop-invariant VF * UF of the vector loop region.
   VPValue VFxUF;
@@ -4308,9 +4314,6 @@ public:
   VPValue &getVF() { return VF; };
   const VPValue &getVF() const { return VF; };
-  /// Returns the symbolic UF of the vector loop region.
-  VPValue &getSymbolicUF() { return UF; };
-
   /// Returns VF * UF of the vector loop region.
   VPValue &getVFxUF() { return VFxUF; }
@@ -4320,12 +4323,6 @@ public:
   void addVF(ElementCount VF) { VFs.insert(VF); }
-  /// Remove \p VF from the plan.
-  void removeVF(ElementCount VF) {
-    assert(hasVF(VF) && "tried to remove VF not present in plan");
-    VFs.remove(VF);
-  }
-
   void setVF(ElementCount VF) {
     assert(hasVF(VF) && "Cannot set VF not already in plan");
     VFs.clear();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1f1b42b..931a5b7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -168,6 +168,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
     return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
   case VPBlendSC:
   case VPReductionEVLSC:
+  case VPPartialReductionSC:
   case VPReductionSC:
   case VPScalarIVStepsSC:
   case VPVectorPointerSC:
@@ -300,14 +301,23 @@ InstructionCost
 VPPartialReductionRecipe::computeCost(ElementCount VF,
                                       VPCostContext &Ctx) const {
   std::optional<unsigned> Opcode;
-  VPValue *Op = getOperand(0);
-  VPRecipeBase *OpR = Op->getDefiningRecipe();
-
-  // If the partial reduction is predicated, a select will be operand 0
-  if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
-    OpR = Op->getDefiningRecipe();
+  VPValue *Op = getVecOp();
+  uint64_t MulConst;
+  // If the partial reduction is predicated, a select will be operand 1.
+  // If it isn't predicated and the mul isn't operating on a constant, then it
+  // should have been turned into a VPExpressionRecipe.
+  // FIXME: Replace the entire function with this once all partial reduction
+  // variants are bundled into VPExpressionRecipe.
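// Illustration (standalone, assumed lane counts): the value returned by the
// new getVFScaleFactor() above is the ratio of input lanes to accumulator
// lanes of the wrapped partial reduction, or 1 when there is none.
#include <cassert>

static unsigned scaleFactor(unsigned InputLanes, unsigned AccumLanes) {
  return InputLanes / AccumLanes;
}

int main() {
  assert(scaleFactor(16, 4) == 4);  // <16 x i8> products into <4 x i32> sums
  assert(scaleFactor(8, 8) == 1);   // a full reduction keeps the VF
  return 0;
}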
+ if (!match(Op, m_Select(m_VPValue(), m_VPValue(Op), m_VPValue())) && + !match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) { + auto *PhiType = Ctx.Types.inferScalarType(getChainOp()); + auto *InputType = Ctx.Types.inferScalarType(getVecOp()); + return Ctx.TTI.getPartialReductionCost(getOpcode(), InputType, InputType, + PhiType, VF, TTI::PR_None, + TTI::PR_None, {}, Ctx.CostKind); } + VPRecipeBase *OpR = Op->getDefiningRecipe(); Type *InputTypeA = nullptr, *InputTypeB = nullptr; TTI::PartialReductionExtendKind ExtAType = TTI::PR_None, ExtBType = TTI::PR_None; @@ -2856,11 +2866,19 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind()); switch (ExpressionType) { case ExpressionTypes::ExtendedReduction: { - return Ctx.TTI.getExtendedReductionCost( - Opcode, - cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == - Instruction::ZExt, - RedTy, SrcVecTy, std::nullopt, Ctx.CostKind); + unsigned Opcode = RecurrenceDescriptor::getOpcode( + cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind()); + auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); + return isa<VPPartialReductionRecipe>(ExpressionRecipes.back()) + ? Ctx.TTI.getPartialReductionCost( + Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, + RedTy, VF, + TargetTransformInfo::getPartialReductionExtendKind( + ExtR->getOpcode()), + TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind) + : Ctx.TTI.getExtendedReductionCost( + Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy, + SrcVecTy, std::nullopt, Ctx.CostKind); } case ExpressionTypes::MulAccReduction: return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy, @@ -2871,6 +2889,19 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, Opcode = Instruction::Sub; [[fallthrough]]; case ExpressionTypes::ExtMulAccReduction: { + if (isa<VPPartialReductionRecipe>(ExpressionRecipes.back())) { + auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); + auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]); + auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]); + return Ctx.TTI.getPartialReductionCost( + Opcode, Ctx.Types.inferScalarType(getOperand(0)), + Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF, + TargetTransformInfo::getPartialReductionExtendKind( + Ext0R->getOpcode()), + TargetTransformInfo::getPartialReductionExtendKind( + Ext1R->getOpcode()), + Mul->getOpcode(), Ctx.CostKind); + } return Ctx.TTI.getMulAccReductionCost( cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == Instruction::ZExt, @@ -2910,12 +2941,13 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, O << " = "; auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back()); unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); + bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red); switch (ExpressionType) { case ExpressionTypes::ExtendedReduction: { getOperand(1)->printAsOperand(O, SlotTracker); - O << " +"; - O << " reduce." << Instruction::getOpcodeName(Opcode) << " ("; + O << " + " << (IsPartialReduction ? "partial." : "") << "reduce."; + O << Instruction::getOpcodeName(Opcode) << " ("; getOperand(0)->printAsOperand(O, SlotTracker); Red->printFlags(O); @@ -2931,8 +2963,8 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, } case ExpressionTypes::ExtNegatedMulAccReduction: { getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); - O << " + reduce." 
- << Instruction::getOpcodeName( + O << " + " << (IsPartialReduction ? "partial." : "") << "reduce."; + O << Instruction::getOpcodeName( RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) << " (sub (0, mul"; auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]); @@ -2956,9 +2988,8 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, case ExpressionTypes::MulAccReduction: case ExpressionTypes::ExtMulAccReduction: { getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); - O << " + "; - O << "reduce." - << Instruction::getOpcodeName( + O << " + " << (IsPartialReduction ? "partial." : "") << "reduce."; + O << Instruction::getOpcodeName( RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) << " ("; O << "mul"; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 48cf763..3e85e6f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -3519,18 +3519,31 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VPValue *VecOp = Red->getVecOp(); // Clamp the range if using extended-reduction is profitable. - auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt, - Type *SrcTy) -> bool { + auto IsExtendedRedValidAndClampRange = + [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool { return LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF)); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost( - Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(), - CostKind); + + InstructionCost ExtRedCost; InstructionCost ExtCost = cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx); InstructionCost RedCost = Red->computeCost(VF, Ctx); + + if (isa<VPPartialReductionRecipe>(Red)) { + TargetTransformInfo::PartialReductionExtendKind ExtKind = + TargetTransformInfo::getPartialReductionExtendKind(ExtOpc); + // FIXME: Move partial reduction creation, costing and clamping + // here from LoopVectorize.cpp. 
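// Worked example (hypothetical costs): if the separate recipes cost
// ExtCost = 1 and RedCost = 2 while the combined form costs 2, the test
// below -- ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost, i.e.
// 2 < 3 -- keeps the combined form for this VF; VFs where the combined cost
// is not strictly cheaper are clamped out of the range.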
+ ExtRedCost = Ctx.TTI.getPartialReductionCost( + Opcode, SrcTy, nullptr, RedTy, VF, ExtKind, + llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind); + } else { + ExtRedCost = Ctx.TTI.getExtendedReductionCost( + Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy, + Red->getFastMathFlags(), CostKind); + } return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost; }, Range); @@ -3541,8 +3554,7 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) && IsExtendedRedValidAndClampRange( RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()), - cast<VPWidenCastRecipe>(VecOp)->getOpcode() == - Instruction::CastOps::ZExt, + cast<VPWidenCastRecipe>(VecOp)->getOpcode(), Ctx.Types.inferScalarType(A))) return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red); @@ -3560,6 +3572,8 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range) { + bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red); + unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); if (Opcode != Instruction::Add && Opcode != Instruction::Sub) return nullptr; @@ -3568,16 +3582,41 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, // Clamp the range if using multiply-accumulate-reduction is profitable. auto IsMulAccValidAndClampRange = - [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, - VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool { + [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1, + VPWidenCastRecipe *OuterExt) -> bool { return LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; Type *SrcTy = Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy; - auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF)); - InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost( - isZExt, Opcode, RedTy, SrcVecTy, CostKind); + InstructionCost MulAccCost; + + if (IsPartialReduction) { + Type *SrcTy2 = + Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr; + // FIXME: Move partial reduction creation, costing and clamping + // here from LoopVectorize.cpp. + MulAccCost = Ctx.TTI.getPartialReductionCost( + Opcode, SrcTy, SrcTy2, RedTy, VF, + Ext0 ? TargetTransformInfo::getPartialReductionExtendKind( + Ext0->getOpcode()) + : TargetTransformInfo::PR_None, + Ext1 ? TargetTransformInfo::getPartialReductionExtendKind( + Ext1->getOpcode()) + : TargetTransformInfo::PR_None, + Mul->getOpcode(), CostKind); + } else { + // Only partial reductions support mixed extends at the moment. + if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode()) + return false; + + bool IsZExt = + !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt; + auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF)); + MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy, + SrcVecTy, CostKind); + } + InstructionCost MulCost = Mul->computeCost(VF, Ctx); InstructionCost RedCost = Red->computeCost(VF, Ctx); InstructionCost ExtCost = 0; @@ -3611,14 +3650,10 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe()); auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe()); - // Match reduce.add(mul(ext, ext)). 
- if (RecipeA && RecipeB && - (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) && - match(RecipeA, m_ZExtOrSExt(m_VPValue())) && + // Match reduce.add/sub(mul(ext, ext)). + if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) && match(RecipeB, m_ZExtOrSExt(m_VPValue())) && - IsMulAccValidAndClampRange(RecipeA->getOpcode() == - Instruction::CastOps::ZExt, - Mul, RecipeA, RecipeB, nullptr)) { + IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) { if (Sub) return new VPExpressionRecipe(RecipeA, RecipeB, Mul, cast<VPWidenRecipe>(Sub), Red); @@ -3626,8 +3661,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, } // Match reduce.add(mul). // TODO: Add an expression type for this variant with a negated mul - if (!Sub && - IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr)) + if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr)) return new VPExpressionRecipe(Mul, Red); } // TODO: Add an expression type for negated versions of other expression @@ -3647,9 +3681,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe()); if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) && Ext0->getOpcode() == Ext1->getOpcode() && - IsMulAccValidAndClampRange(Ext0->getOpcode() == - Instruction::CastOps::ZExt, - Mul, Ext0, Ext1, Ext)) { + IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) { auto *NewExt0 = new VPWidenCastRecipe( Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0, *Ext0, Ext0->getDebugLoc()); @@ -3956,9 +3988,6 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, // used. // TODO: Assert that they aren't used. - VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF())); - Plan.getSymbolicUF().replaceAllUsesWith(UF); - // If there are no users of the runtime VF, compute VFxUF by constant folding // the multiplication of VF and UF. if (VF.getNumUsers() == 0) { @@ -3978,6 +4007,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, } VF.replaceAllUsesWith(RuntimeVF); + VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF())); VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF}); VFxUF.replaceAllUsesWith(MulByUF); } @@ -4045,14 +4075,14 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, return false; } -/// Returns VF from \p VFs if \p IR is a full interleave group with factor and -/// number of members both equal to VF. The interleave group must also access -/// the full vector width. -static std::optional<ElementCount> isConsecutiveInterleaveGroup( - VPInterleaveRecipe *InterleaveR, ArrayRef<ElementCount> VFs, - VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) { +/// Returns true if \p IR is a full interleave group with factor and number of +/// members both equal to \p VF. The interleave group must also access the full +/// vector width \p VectorRegWidth. 
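// Worked example (illustrative numbers): for i32 members at VF = 4 on a
// target with 128-bit vector registers, the predicate below requires
// factor == 4, members == 4, and 32 * 4 == 128 bits, i.e. the group
// saturates exactly one vector register.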
+static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
+                                         unsigned VF, VPTypeAnalysis &TypeInfo,
+                                         unsigned VectorRegWidth) {
   if (!InterleaveR || InterleaveR->getMask())
-    return std::nullopt;
+    return false;
   Type *GroupElementTy = nullptr;
   if (InterleaveR->getStoredValues().empty()) {
@@ -4061,7 +4091,7 @@ static std::optional<ElementCount> isConsecutiveInterleaveGroup(
                      [&TypeInfo, GroupElementTy](VPValue *Op) {
                        return TypeInfo.inferScalarType(Op) == GroupElementTy;
                      }))
-      return std::nullopt;
+      return false;
   } else {
     GroupElementTy =
         TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
@@ -4069,27 +4099,13 @@ static std::optional<ElementCount> isConsecutiveInterleaveGroup(
                      [&TypeInfo, GroupElementTy](VPValue *Op) {
                        return TypeInfo.inferScalarType(Op) == GroupElementTy;
                      }))
-      return std::nullopt;
+      return false;
   }
-  auto GetVectorWidthForVF = [&TTI](ElementCount VF) {
-    TypeSize Size = TTI.getRegisterBitWidth(
-        VF.isFixed() ? TargetTransformInfo::RGK_FixedWidthVector
-                     : TargetTransformInfo::RGK_ScalableVector);
-    assert(Size.isScalable() == VF.isScalable() &&
-           "if Size is scalable, VF must to and vice versa");
-    return Size.getKnownMinValue();
-  };
-
-  for (ElementCount VF : VFs) {
-    unsigned MinVal = VF.getKnownMinValue();
-    unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
-    auto IG = InterleaveR->getInterleaveGroup();
-    if (IG->getFactor() == MinVal && IG->getNumMembers() == MinVal &&
-        GroupSize == GetVectorWidthForVF(VF))
-      return {VF};
-  }
-  return std::nullopt;
+  unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
+  auto IG = InterleaveR->getInterleaveGroup();
+  return IG->getFactor() == VF && IG->getNumMembers() == VF &&
+         GroupSize == VectorRegWidth;
 }
 /// Returns true if \p VPValue is a narrow VPValue.
@@ -4100,18 +4116,16 @@ static bool isAlreadyNarrow(VPValue *VPV) {
   return RepR && RepR->isSingleScalar();
 }
-std::unique_ptr<VPlan>
-VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
-                                        const TargetTransformInfo &TTI) {
-  using namespace llvm::VPlanPatternMatch;
+void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
+                                             unsigned VectorRegWidth) {
   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
   if (!VectorLoop)
-    return nullptr;
+    return;
   VPTypeAnalysis TypeInfo(Plan);
+
+  unsigned VFMinVal = VF.getKnownMinValue();
   SmallVector<VPInterleaveRecipe *> StoreGroups;
-  std::optional<ElementCount> VFToOptimize;
   for (auto &R : *VectorLoop->getEntryBasicBlock()) {
     if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount()))
       continue;
@@ -4125,33 +4139,30 @@ VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
     // * recipes writing to memory except interleave groups
     // Only support plans with a canonical induction phi.
     if (R.isPhi())
-      return nullptr;
+      return;
     auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
     if (R.mayWriteToMemory() && !InterleaveR)
-      return nullptr;
+      return;
+
+    // Do not narrow interleave groups if there are VectorPointer recipes and
+    // the plan was unrolled. The recipe implicitly uses VF from
+    // VPTransformState.
+    // TODO: Remove restriction once the VF for the VectorPointer offset is
+    // modeled explicitly as operand.
+    if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
+      return;
     // All other ops are allowed, but we reject uses that cannot be converted
     // when checking all allowed consumers (store interleave groups) below.
if (!InterleaveR) continue; - // Try to find a single VF, where all interleave groups are consecutive and - // saturate the full vector width. If we already have a candidate VF, check - // if it is applicable for the current InterleaveR, otherwise look for a - // suitable VF across the Plans VFs. - // - if (VFToOptimize) { - if (!isConsecutiveInterleaveGroup(InterleaveR, {*VFToOptimize}, TypeInfo, - TTI)) - return nullptr; - } else { - if (auto VF = isConsecutiveInterleaveGroup( - InterleaveR, to_vector(Plan.vectorFactors()), TypeInfo, TTI)) - VFToOptimize = *VF; - else - return nullptr; - } + // Bail out on non-consecutive interleave groups. + if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo, + VectorRegWidth)) + return; + // Skip read interleave groups. if (InterleaveR->getStoredValues().empty()) continue; @@ -4185,34 +4196,24 @@ VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>( InterleaveR->getStoredValues()[0]->getDefiningRecipe()); if (!WideMember0) - return nullptr; + return; for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) { auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe()); if (!R || R->getOpcode() != WideMember0->getOpcode() || R->getNumOperands() > 2) - return nullptr; + return; if (any_of(enumerate(R->operands()), [WideMember0, Idx = I](const auto &P) { const auto &[OpIdx, OpV] = P; return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx); })) - return nullptr; + return; } StoreGroups.push_back(InterleaveR); } if (StoreGroups.empty()) - return nullptr; - - // All interleave groups in Plan can be narrowed for VFToOptimize. Split the - // original Plan into 2: a) a new clone which contains all VFs of Plan, except - // VFToOptimize, and b) the original Plan with VFToOptimize as single VF. - std::unique_ptr<VPlan> NewPlan; - if (size(Plan.vectorFactors()) != 1) { - NewPlan = std::unique_ptr<VPlan>(Plan.duplicate()); - Plan.setVF(*VFToOptimize); - NewPlan->removeVF(*VFToOptimize); - } + return; // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe. 
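// Note (sketch of the effect): once every store group is narrowed to a
// single member, each vector iteration covers one row per unrolled part, so
// the canonical IV increment below becomes UF (times vscale for scalable
// VFs) rather than VF * UF; e.g. with VF = 4 and UF = 2 the step drops
// from 8 to 2.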
SmallPtrSet<VPValue *, 4> NarrowedOps;
@@ -4283,8 +4284,9 @@
   auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
   VPBuilder PHBuilder(Plan.getVectorPreheader());
-  VPValue *UF = &Plan.getSymbolicUF();
-  if (VFToOptimize->isScalable()) {
+  VPValue *UF = Plan.getOrAddLiveIn(
+      ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
+  if (VF.isScalable()) {
     VPValue *VScale = PHBuilder.createElementCount(
         CanIV->getScalarType(), ElementCount::getScalable(1));
     VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
@@ -4296,10 +4298,6 @@
         Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
   }
   removeDeadRecipes(Plan);
-  assert(none_of(*VectorLoop->getEntryBasicBlock(),
-                 IsaPred<VPVectorPointerRecipe>) &&
-         "All VPVectorPointerRecipes should have been removed");
-  return NewPlan;
 }
 /// Add branch weight metadata, if the \p Plan's middle block is terminated by a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index ca8d956..b28559b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -341,20 +341,14 @@ struct VPlanTransforms {
   static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
                                                      ScalarEvolution &SE);
-  /// Try to find a single VF among \p Plan's VFs for which all interleave
-  /// groups (with known minimum VF elements) can be replaced by wide loads and
-  /// stores processing VF elements, if all transformed interleave groups access
-  /// the full vector width (checked via the maximum vector register width). If
-  /// the transformation can be applied, the original \p Plan will be split in
-  /// 2:
-  /// 1. The original Plan with the single VF containing the optimized recipes
-  ///    using wide loads instead of interleave groups.
-  /// 2. A new clone which contains all VFs of Plan except the optimized VF.
-  ///
-  /// This effectively is a very simple form of loop-aware SLP, where we use
-  /// interleave groups to identify candidates.
-  static std::unique_ptr<VPlan>
-  narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI);
+  /// Try to convert a plan with interleave groups with VF elements to a plan
+  /// with the interleave groups replaced by wide loads and stores processing VF
+  /// elements, if all transformed interleave groups access the full vector
+  /// width (checked via \p VectorRegWidth). This effectively is a very simple
+  /// form of loop-aware SLP, where we use interleave groups to identify
+  /// candidates.
+  static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
+                                     unsigned VectorRegWidth);
   /// Predicate and linearize the control-flow in the only loop region of
   /// \p Plan. If \p FoldTail is true, create a mask guarding the loop
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 32e4b88..06c3d75 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -151,6 +151,8 @@ unsigned vputils::getVFScaleFactor(VPRecipeBase *R) {
     return RR->getVFScaleFactor();
   if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
     return RR->getVFScaleFactor();
+  if (auto *ER = dyn_cast<VPExpressionRecipe>(R))
+    return ER->getVFScaleFactor();
   assert(
       (!isa<VPInstruction>(R) ||
        cast<VPInstruction>(R)->getOpcode() != VPInstruction::ReductionStartVector) &&
