Diffstat (limited to 'llvm/lib')
43 files changed, 918 insertions, 491 deletions
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 424bb7b..dc06609 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1153,6 +1153,15 @@ InstructionCost TargetTransformInfo::getGatherScatterOpCost(
   return Cost;
 }
 
+InstructionCost TargetTransformInfo::getExpandCompressMemoryOpCost(
+    unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
+    TTI::TargetCostKind CostKind, const Instruction *I) const {
+  InstructionCost Cost = TTIImpl->getExpandCompressMemoryOpCost(
+      Opcode, DataTy, VariableMask, Alignment, CostKind, I);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 InstructionCost TargetTransformInfo::getStridedMemoryOpCost(
     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 6b61a35..55feb15 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1426,7 +1426,22 @@ static void computeKnownBitsFromOperator(const Operator *I,
     computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
     // Accumulate the constant indices in a separate variable
     // to minimize the number of calls to computeForAddSub.
-    APInt AccConstIndices(BitWidth, 0, /*IsSigned*/ true);
+    unsigned IndexWidth = Q.DL.getIndexTypeSizeInBits(I->getType());
+    APInt AccConstIndices(IndexWidth, 0);
+
+    auto AddIndexToKnown = [&](KnownBits IndexBits) {
+      if (IndexWidth == BitWidth) {
+        // Note that inbounds does *not* guarantee nsw for the addition, as only
+        // the offset is signed, while the base address is unsigned.
+        Known = KnownBits::add(Known, IndexBits);
+      } else {
+        // If the index width is smaller than the pointer width, only add the
+        // value to the low bits.
+        assert(IndexWidth < BitWidth &&
+               "Index width can't be larger than pointer width");
+        Known.insertBits(KnownBits::add(Known.trunc(IndexWidth), IndexBits), 0);
+      }
+    };
 
     gep_type_iterator GTI = gep_type_begin(I);
     for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) {
@@ -1464,43 +1479,34 @@ static void computeKnownBitsFromOperator(const Operator *I,
         break;
       }
 
-      unsigned IndexBitWidth = Index->getType()->getScalarSizeInBits();
-      KnownBits IndexBits(IndexBitWidth);
-      computeKnownBits(Index, IndexBits, Depth + 1, Q);
-      TypeSize IndexTypeSize = GTI.getSequentialElementStride(Q.DL);
-      uint64_t TypeSizeInBytes = IndexTypeSize.getKnownMinValue();
-      KnownBits ScalingFactor(IndexBitWidth);
+      TypeSize Stride = GTI.getSequentialElementStride(Q.DL);
+      uint64_t StrideInBytes = Stride.getKnownMinValue();
+      if (!Stride.isScalable()) {
+        // Fast path for constant offset.
+        if (auto *CI = dyn_cast<ConstantInt>(Index)) {
+          AccConstIndices +=
+              CI->getValue().sextOrTrunc(IndexWidth) * StrideInBytes;
+          continue;
+        }
+      }
+
+      KnownBits IndexBits =
+          computeKnownBits(Index, Depth + 1, Q).sextOrTrunc(IndexWidth);
+      KnownBits ScalingFactor(IndexWidth);
       // Multiply by current sizeof type.
       // &A[i] == A + i * sizeof(*A[i]).
-      if (IndexTypeSize.isScalable()) {
+      if (Stride.isScalable()) {
         // For scalable types the only thing we know about sizeof is
         // that this is a multiple of the minimum size.
-        ScalingFactor.Zero.setLowBits(llvm::countr_zero(TypeSizeInBytes));
-      } else if (IndexBits.isConstant()) {
-        APInt IndexConst = IndexBits.getConstant();
-        APInt ScalingFactor(IndexBitWidth, TypeSizeInBytes);
-        IndexConst *= ScalingFactor;
-        AccConstIndices += IndexConst.sextOrTrunc(BitWidth);
-        continue;
+        ScalingFactor.Zero.setLowBits(llvm::countr_zero(StrideInBytes));
       } else {
         ScalingFactor =
-            KnownBits::makeConstant(APInt(IndexBitWidth, TypeSizeInBytes));
+            KnownBits::makeConstant(APInt(IndexWidth, StrideInBytes));
       }
-      IndexBits = KnownBits::mul(IndexBits, ScalingFactor);
-
-      // If the offsets have a different width from the pointer, according
-      // to the language reference we need to sign-extend or truncate them
-      // to the width of the pointer.
-      IndexBits = IndexBits.sextOrTrunc(BitWidth);
-
-      // Note that inbounds does *not* guarantee nsw for the addition, as only
-      // the offset is signed, while the base address is unsigned.
-      Known = KnownBits::add(Known, IndexBits);
-    }
-    if (!Known.isUnknown() && !AccConstIndices.isZero()) {
-      KnownBits Index = KnownBits::makeConstant(AccConstIndices);
-      Known = KnownBits::add(Known, Index);
+      AddIndexToKnown(KnownBits::mul(IndexBits, ScalingFactor));
     }
+    if (!Known.isUnknown() && !AccConstIndices.isZero())
+      AddIndexToKnown(KnownBits::makeConstant(AccConstIndices));
     break;
   }
   case Instruction::PHI: {
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 2f96366..6cf05fd 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -33,6 +33,7 @@
 #include "llvm/MC/MCSymbolWasm.h"
 #include "llvm/MC/MachineLocation.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/NVPTXAddrSpace.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
@@ -75,6 +76,26 @@ static dwarf::Tag GetCompileUnitType(UnitKind Kind, DwarfDebug *DW) {
   return dwarf::DW_TAG_compile_unit;
 }
 
+/// Translate an NVVM IR address space code to the corresponding DWARF value.
+static unsigned translateToNVVMDWARFAddrSpace(unsigned AddrSpace) {
+  switch (AddrSpace) {
+  case NVPTXAS::ADDRESS_SPACE_GENERIC:
+    return NVPTXAS::DWARF_ADDR_generic_space;
+  case NVPTXAS::ADDRESS_SPACE_GLOBAL:
+    return NVPTXAS::DWARF_ADDR_global_space;
+  case NVPTXAS::ADDRESS_SPACE_SHARED:
+    return NVPTXAS::DWARF_ADDR_shared_space;
+  case NVPTXAS::ADDRESS_SPACE_CONST:
+    return NVPTXAS::DWARF_ADDR_const_space;
+  case NVPTXAS::ADDRESS_SPACE_LOCAL:
+    return NVPTXAS::DWARF_ADDR_local_space;
+  default:
+    llvm_unreachable(
+        "Cannot translate unknown address space to DWARF address space");
+    return AddrSpace;
+  }
+}
+
 DwarfCompileUnit::DwarfCompileUnit(unsigned UID, const DICompileUnit *Node,
                                    AsmPrinter *A, DwarfDebug *DW,
                                    DwarfFile *DWU, UnitKind Kind)
@@ -264,14 +285,11 @@ void DwarfCompileUnit::addLocationAttribute(
     }
 
     if (Expr) {
-      // According to
-      // https://docs.nvidia.com/cuda/archive/10.0/ptx-writers-guide-to-interoperability/index.html#cuda-specific-dwarf
-      // cuda-gdb requires DW_AT_address_class for all variables to be able to
-      // correctly interpret address space of the variable address.
+      // cuda-gdb special requirement. See NVPTXAS::DWARF_AddressSpace
       // Decode DW_OP_constu <DWARF Address Space> DW_OP_swap DW_OP_xderef
-      // sequence for the NVPTX + gdb target.
-      unsigned LocalNVPTXAddressSpace;
+      // sequence to specify corresponding address space.
if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB()) { + unsigned LocalNVPTXAddressSpace; const DIExpression *NewExpr = DIExpression::extractAddressClass(Expr, LocalNVPTXAddressSpace); if (NewExpr != Expr) { @@ -363,6 +381,10 @@ void DwarfCompileUnit::addLocationAttribute( DD->addArangeLabel(SymbolCU(this, Sym)); addOpAddress(*Loc, Sym); } + if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB() && + !NVPTXAddressSpace) + NVPTXAddressSpace = + translateToNVVMDWARFAddrSpace(Global->getType()->getAddressSpace()); } // Global variables attached to symbols are memory locations. // It would be better if this were unconditional, but malformed input that @@ -373,13 +395,9 @@ void DwarfCompileUnit::addLocationAttribute( DwarfExpr->addExpression(Expr); } if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB()) { - // According to - // https://docs.nvidia.com/cuda/archive/10.0/ptx-writers-guide-to-interoperability/index.html#cuda-specific-dwarf - // cuda-gdb requires DW_AT_address_class for all variables to be able to - // correctly interpret address space of the variable address. - const unsigned NVPTX_ADDR_global_space = 5; + // cuda-gdb special requirement. See NVPTXAS::DWARF_AddressSpace addUInt(*VariableDIE, dwarf::DW_AT_address_class, dwarf::DW_FORM_data1, - NVPTXAddressSpace.value_or(NVPTX_ADDR_global_space)); + NVPTXAddressSpace.value_or(NVPTXAS::DWARF_ADDR_global_space)); } if (Loc) addBlock(*VariableDIE, dwarf::DW_AT_location, DwarfExpr->finalize()); @@ -793,10 +811,10 @@ void DwarfCompileUnit::applyConcreteDbgVariableAttributes( const DbgValueLoc *DVal = &Single.getValueLoc(); if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB() && !Single.getExpr()) { - // Lack of expression means it is a register. Registers for PTX need to - // be marked with DW_AT_address_class = 2. See - // https://docs.nvidia.com/cuda/archive/10.0/ptx-writers-guide-to-interoperability/index.html#cuda-specific-dwarf - addUInt(VariableDie, dwarf::DW_AT_address_class, dwarf::DW_FORM_data1, 2); + // cuda-gdb special requirement. See NVPTXAS::DWARF_AddressSpace + // Lack of expression means it is a register. + addUInt(VariableDie, dwarf::DW_AT_address_class, dwarf::DW_FORM_data1, + NVPTXAS::DWARF_ADDR_reg_space); } if (!DVal->isVariadic()) { const DbgValueLocEntry *Entry = DVal->getLocEntries().begin(); @@ -922,14 +940,11 @@ void DwarfCompileUnit::applyConcreteDbgVariableAttributes(const Loc::MMI &MMI, SmallVector<uint64_t, 8> Ops; TRI->getOffsetOpcodes(Offset, Ops); - // According to - // https://docs.nvidia.com/cuda/archive/10.0/ptx-writers-guide-to-interoperability/index.html#cuda-specific-dwarf - // cuda-gdb requires DW_AT_address_class for all variables to be - // able to correctly interpret address space of the variable - // address. Decode DW_OP_constu <DWARF Address Space> DW_OP_swap - // DW_OP_xderef sequence for the NVPTX + gdb target. - unsigned LocalNVPTXAddressSpace; + // cuda-gdb special requirement. See NVPTXAS::DWARF_AddressSpace. + // Decode DW_OP_constu <DWARF Address Space> DW_OP_swap + // DW_OP_xderef sequence to specify address space. 
if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB()) { + unsigned LocalNVPTXAddressSpace; const DIExpression *NewExpr = DIExpression::extractAddressClass(Expr, LocalNVPTXAddressSpace); if (NewExpr != Expr) { @@ -949,14 +964,9 @@ void DwarfCompileUnit::applyConcreteDbgVariableAttributes(const Loc::MMI &MMI, DwarfExpr.addExpression(std::move(Cursor)); } if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB()) { - // According to - // https://docs.nvidia.com/cuda/archive/10.0/ptx-writers-guide-to-interoperability/index.html#cuda-specific-dwarf - // cuda-gdb requires DW_AT_address_class for all variables to be - // able to correctly interpret address space of the variable - // address. - const unsigned NVPTX_ADDR_local_space = 6; + // cuda-gdb special requirement. See NVPTXAS::DWARF_AddressSpace. addUInt(VariableDie, dwarf::DW_AT_address_class, dwarf::DW_FORM_data1, - NVPTXAddressSpace.value_or(NVPTX_ADDR_local_space)); + NVPTXAddressSpace.value_or(NVPTXAS::DWARF_ADDR_local_space)); } addBlock(VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize()); if (DwarfExpr.TagOffset) diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 5f0c7ec9c..0a7937e 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -77,7 +77,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineCFGPrinterPass(Registry); initializeMachineCSELegacyPass(Registry); initializeMachineCombinerPass(Registry); - initializeMachineCopyPropagationPass(Registry); + initializeMachineCopyPropagationLegacyPass(Registry); initializeMachineCycleInfoPrinterPassPass(Registry); initializeMachineCycleInfoWrapperPassPass(Registry); initializeMachineDominatorTreeWrapperPassPass(Registry); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 3e43299..362d856 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2441,9 +2441,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return true; } case Intrinsic::invariant_start: { - LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL); - Register Undef = MRI->createGenericVirtualRegister(PtrTy); - MIRBuilder.buildUndef(Undef); + MIRBuilder.buildUndef(getOrCreateVReg(CI)); return true; } case Intrinsic::invariant_end: diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index d44b064..460749a 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -48,6 +48,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/MachineCopyPropagation.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" @@ -449,7 +450,7 @@ public: } }; -class MachineCopyPropagation : public MachineFunctionPass { +class MachineCopyPropagation { const TargetRegisterInfo *TRI = nullptr; const TargetInstrInfo *TII = nullptr; const MachineRegisterInfo *MRI = nullptr; @@ -458,24 +459,10 @@ class MachineCopyPropagation : public MachineFunctionPass { bool UseCopyInstr; public: - static char ID; // Pass identification, replacement for typeid - MachineCopyPropagation(bool CopyInstr = false) - : MachineFunctionPass(ID), UseCopyInstr(CopyInstr || MCPUseCopyInstr) { - initializeMachineCopyPropagationPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - 
AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - bool runOnMachineFunction(MachineFunction &MF) override; + : UseCopyInstr(CopyInstr || MCPUseCopyInstr) {} - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); - } + bool run(MachineFunction &MF); private: typedef enum { DebugUse = false, RegularUse = true } DebugType; @@ -510,13 +497,35 @@ private: bool Changed = false; }; +class MachineCopyPropagationLegacy : public MachineFunctionPass { + bool UseCopyInstr; + +public: + static char ID; // pass identification + + MachineCopyPropagationLegacy(bool UseCopyInstr = false) + : MachineFunctionPass(ID), UseCopyInstr(UseCopyInstr) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } +}; + } // end anonymous namespace -char MachineCopyPropagation::ID = 0; +char MachineCopyPropagationLegacy::ID = 0; -char &llvm::MachineCopyPropagationID = MachineCopyPropagation::ID; +char &llvm::MachineCopyPropagationID = MachineCopyPropagationLegacy::ID; -INITIALIZE_PASS(MachineCopyPropagation, DEBUG_TYPE, +INITIALIZE_PASS(MachineCopyPropagationLegacy, DEBUG_TYPE, "Machine Copy Propagation Pass", false, false) void MachineCopyPropagation::ReadRegister(MCRegister Reg, MachineInstr &Reader, @@ -1563,10 +1572,25 @@ void MachineCopyPropagation::EliminateSpillageCopies(MachineBasicBlock &MBB) { Tracker.clear(); } -bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) { +bool MachineCopyPropagationLegacy::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; + return MachineCopyPropagation(UseCopyInstr).run(MF); +} + +PreservedAnalyses +MachineCopyPropagationPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &) { + MFPropsModifier _(*this, MF); + if (!MachineCopyPropagation(UseCopyInstr).run(MF)) + return PreservedAnalyses::all(); + auto PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} + +bool MachineCopyPropagation::run(MachineFunction &MF) { bool isSpillageCopyElimEnabled = false; switch (EnableSpillageCopyElimination) { case cl::BOU_UNSET: @@ -1599,5 +1623,5 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) { MachineFunctionPass * llvm::createMachineCopyPropagationPass(bool UseCopyInstr = false) { - return new MachineCopyPropagation(UseCopyInstr); + return new MachineCopyPropagationLegacy(UseCopyInstr); } diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp index a4b78c1..b5dc487 100644 --- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp +++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp @@ -185,11 +185,11 @@ MachineUniformityAnalysisPass::MachineUniformityAnalysisPass() } INITIALIZE_PASS_BEGIN(MachineUniformityAnalysisPass, "machine-uniformity", - "Machine Uniformity Info Analysis", true, true) + "Machine Uniformity Info Analysis", false, true) INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(MachineUniformityAnalysisPass, "machine-uniformity", - "Machine Uniformity Info Analysis", true, true) + 
"Machine Uniformity Info Analysis", false, true) void MachineUniformityAnalysisPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp index fa60881..59ad9ff 100644 --- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -147,16 +147,7 @@ void ReachingDefAnalysis::processDefs(MachineInstr *MI) { assert(FrameIndex >= 0 && "Can't handle negative frame indicies yet!"); if (!isFIDef(*MI, FrameIndex, TII)) continue; - if (MBBFrameObjsReachingDefs.contains(MBBNumber)) { - auto Frame2InstrIdx = MBBFrameObjsReachingDefs[MBBNumber]; - if (Frame2InstrIdx.count(FrameIndex - ObjectIndexBegin) > 0) - Frame2InstrIdx[FrameIndex - ObjectIndexBegin].push_back(CurInstr); - else - Frame2InstrIdx[FrameIndex - ObjectIndexBegin] = {CurInstr}; - } else { - MBBFrameObjsReachingDefs[MBBNumber] = { - {FrameIndex - ObjectIndexBegin, {CurInstr}}}; - } + MBBFrameObjsReachingDefs[{MBBNumber, FrameIndex}].push_back(CurInstr); } if (!isValidRegDef(MO)) continue; @@ -351,9 +342,13 @@ int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, Register Reg) const { int LatestDef = ReachingDefDefaultVal; if (Reg.isStack()) { + // Check that there was a reaching def. int FrameIndex = Reg.stackSlotIndex(); - for (int Def : MBBFrameObjsReachingDefs.lookup(MBBNumber).lookup( - FrameIndex - ObjectIndexBegin)) { + auto Lookup = MBBFrameObjsReachingDefs.find({MBBNumber, FrameIndex}); + if (Lookup == MBBFrameObjsReachingDefs.end()) + return LatestDef; + auto &Defs = Lookup->second; + for (int Def : Defs) { if (Def >= InstId) break; DefRes = Def; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 882d6015..8858c20 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -385,17 +385,6 @@ namespace { bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val); bool extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val); - /// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed - /// load. - /// - /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced. - /// \param InVecVT type of the input vector to EVE with bitcasts resolved. - /// \param EltNo index of the vector element to load. - /// \param OriginalLoad load that EVE came from to be replaced. - /// \returns EVE on success SDValue() on failure. - SDValue scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, - SDValue EltNo, - LoadSDNode *OriginalLoad); void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad); SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace); SDValue SExtPromoteOperand(SDValue Op, EVT PVT); @@ -22719,81 +22708,6 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { return SDValue(); } -SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, - SDValue EltNo, - LoadSDNode *OriginalLoad) { - assert(OriginalLoad->isSimple()); - - EVT ResultVT = EVE->getValueType(0); - EVT VecEltVT = InVecVT.getVectorElementType(); - - // If the vector element type is not a multiple of a byte then we are unable - // to correctly compute an address to load only the extracted element as a - // scalar. - if (!VecEltVT.isByteSized()) - return SDValue(); - - ISD::LoadExtType ExtTy = - ResultVT.bitsGT(VecEltVT) ? 
ISD::EXTLOAD : ISD::NON_EXTLOAD; - if (!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT) || - !TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT)) - return SDValue(); - - Align Alignment = OriginalLoad->getAlign(); - MachinePointerInfo MPI; - SDLoc DL(EVE); - if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) { - int Elt = ConstEltNo->getZExtValue(); - unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8; - MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff); - Alignment = commonAlignment(Alignment, PtrOff); - } else { - // Discard the pointer info except the address space because the memory - // operand can't represent this new access since the offset is variable. - MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace()); - Alignment = commonAlignment(Alignment, VecEltVT.getSizeInBits() / 8); - } - - unsigned IsFast = 0; - if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VecEltVT, - OriginalLoad->getAddressSpace(), Alignment, - OriginalLoad->getMemOperand()->getFlags(), - &IsFast) || - !IsFast) - return SDValue(); - - SDValue NewPtr = TLI.getVectorElementPointer(DAG, OriginalLoad->getBasePtr(), - InVecVT, EltNo); - - // We are replacing a vector load with a scalar load. The new load must have - // identical memory op ordering to the original. - SDValue Load; - if (ResultVT.bitsGT(VecEltVT)) { - // If the result type of vextract is wider than the load, then issue an - // extending load instead. - ISD::LoadExtType ExtType = - TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, VecEltVT) ? ISD::ZEXTLOAD - : ISD::EXTLOAD; - Load = DAG.getExtLoad(ExtType, DL, ResultVT, OriginalLoad->getChain(), - NewPtr, MPI, VecEltVT, Alignment, - OriginalLoad->getMemOperand()->getFlags(), - OriginalLoad->getAAInfo()); - DAG.makeEquivalentMemoryOrdering(OriginalLoad, Load); - } else { - // The result type is narrower or the same width as the vector element - Load = DAG.getLoad(VecEltVT, DL, OriginalLoad->getChain(), NewPtr, MPI, - Alignment, OriginalLoad->getMemOperand()->getFlags(), - OriginalLoad->getAAInfo()); - DAG.makeEquivalentMemoryOrdering(OriginalLoad, Load); - if (ResultVT.bitsLT(VecEltVT)) - Load = DAG.getNode(ISD::TRUNCATE, DL, ResultVT, Load); - else - Load = DAG.getBitcast(ResultVT, Load); - } - ++OpsNarrowed; - return Load; -} - /// Transform a vector binary operation into a scalar binary operation by moving /// the math/logic after an extract element of a vector. 
static SDValue scalarizeExtractedBinOp(SDNode *ExtElt, SelectionDAG &DAG, @@ -23272,8 +23186,13 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { ISD::isNormalLoad(VecOp.getNode()) && !Index->hasPredecessor(VecOp.getNode())) { auto *VecLoad = dyn_cast<LoadSDNode>(VecOp); - if (VecLoad && VecLoad->isSimple()) - return scalarizeExtractedVectorLoad(N, VecVT, Index, VecLoad); + if (VecLoad && VecLoad->isSimple()) { + if (SDValue Scalarized = TLI.scalarizeExtractedVectorLoad( + ExtVT, SDLoc(N), VecVT, Index, VecLoad, DAG)) { + ++OpsNarrowed; + return Scalarized; + } + } } // Perform only after legalization to ensure build_vector / vector_shuffle @@ -23361,7 +23280,13 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { if (Elt == -1) return DAG.getUNDEF(LVT); - return scalarizeExtractedVectorLoad(N, VecVT, Index, LN0); + if (SDValue Scalarized = + TLI.scalarizeExtractedVectorLoad(LVT, DL, VecVT, Index, LN0, DAG)) { + ++OpsNarrowed; + return Scalarized; + } + + return SDValue(); } // Simplify (build_vec (ext )) to (bitcast (build_vec )) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 625052b..f1a91a7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -566,6 +566,29 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) { } } + // TODO: Handle big endian + if (!NOutVT.isVector() && InOp.getValueType().isVector() && + DAG.getDataLayout().isLittleEndian()) { + // Pad the vector operand with undef and cast to a wider integer. + EVT EltVT = InOp.getValueType().getVectorElementType(); + TypeSize EltSize = EltVT.getSizeInBits(); + TypeSize OutSize = NOutVT.getSizeInBits(); + + if (OutSize.hasKnownScalarFactor(EltSize)) { + unsigned NumEltsWithPadding = OutSize.getKnownScalarFactor(EltSize); + EVT WideVecVT = + EVT::getVectorVT(*DAG.getContext(), EltVT, NumEltsWithPadding); + + if (isTypeLegal(WideVecVT)) { + SDValue Inserted = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT, + DAG.getUNDEF(WideVecVT), InOp, + DAG.getVectorIdxConstant(0, dl)); + + return DAG.getNode(ISD::BITCAST, dl, NOutVT, Inserted); + } + } + } + return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, CreateStackStoreLoad(InOp, OutVT)); } @@ -2181,9 +2204,43 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) { + EVT OutVT = N->getValueType(0); + SDValue InOp = N->getOperand(0); + EVT InVT = InOp.getValueType(); + EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT); + SDLoc dl(N); + + switch (getTypeAction(InVT)) { + case TargetLowering::TypePromoteInteger: { + // TODO: Handle big endian + if (OutVT.isVector() && DAG.getDataLayout().isLittleEndian()) { + EVT EltVT = OutVT.getVectorElementType(); + TypeSize EltSize = EltVT.getSizeInBits(); + TypeSize NInSize = NInVT.getSizeInBits(); + + if (NInSize.hasKnownScalarFactor(EltSize)) { + unsigned NumEltsWithPadding = NInSize.getKnownScalarFactor(EltSize); + EVT WideVecVT = + EVT::getVectorVT(*DAG.getContext(), EltVT, NumEltsWithPadding); + + if (isTypeLegal(WideVecVT)) { + SDValue Promoted = GetPromotedInteger(InOp); + SDValue Cast = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Promoted); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OutVT, Cast, + DAG.getVectorIdxConstant(0, dl)); + } + } + } + + break; + } + default: + break; + } + // This should only occur in unusual situations like bitcasting to an // x86_fp80, so 
just turn it into a store+load - return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0)); + return CreateStackStoreLoad(InOp, OutVT); } SDValue DAGTypeLegalizer::PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo) { diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 98206b7..adfb960 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -12114,3 +12114,77 @@ SDValue TargetLowering::expandVectorNaryOpBySplitting(SDNode *Node, SDValue SplitOpHi = DAG.getNode(Opcode, DL, HiVT, HiOps); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SplitOpLo, SplitOpHi); } + +SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT, + const SDLoc &DL, + EVT InVecVT, SDValue EltNo, + LoadSDNode *OriginalLoad, + SelectionDAG &DAG) const { + assert(OriginalLoad->isSimple()); + + EVT VecEltVT = InVecVT.getVectorElementType(); + + // If the vector element type is not a multiple of a byte then we are unable + // to correctly compute an address to load only the extracted element as a + // scalar. + if (!VecEltVT.isByteSized()) + return SDValue(); + + ISD::LoadExtType ExtTy = + ResultVT.bitsGT(VecEltVT) ? ISD::EXTLOAD : ISD::NON_EXTLOAD; + if (!isOperationLegalOrCustom(ISD::LOAD, VecEltVT) || + !shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT)) + return SDValue(); + + Align Alignment = OriginalLoad->getAlign(); + MachinePointerInfo MPI; + if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) { + int Elt = ConstEltNo->getZExtValue(); + unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8; + MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff); + Alignment = commonAlignment(Alignment, PtrOff); + } else { + // Discard the pointer info except the address space because the memory + // operand can't represent this new access since the offset is variable. + MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace()); + Alignment = commonAlignment(Alignment, VecEltVT.getSizeInBits() / 8); + } + + unsigned IsFast = 0; + if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VecEltVT, + OriginalLoad->getAddressSpace(), Alignment, + OriginalLoad->getMemOperand()->getFlags(), &IsFast) || + !IsFast) + return SDValue(); + + SDValue NewPtr = + getVectorElementPointer(DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo); + + // We are replacing a vector load with a scalar load. The new load must have + // identical memory op ordering to the original. + SDValue Load; + if (ResultVT.bitsGT(VecEltVT)) { + // If the result type of vextract is wider than the load, then issue an + // extending load instead. + ISD::LoadExtType ExtType = isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, VecEltVT) + ? 
ISD::ZEXTLOAD + : ISD::EXTLOAD; + Load = DAG.getExtLoad(ExtType, DL, ResultVT, OriginalLoad->getChain(), + NewPtr, MPI, VecEltVT, Alignment, + OriginalLoad->getMemOperand()->getFlags(), + OriginalLoad->getAAInfo()); + DAG.makeEquivalentMemoryOrdering(OriginalLoad, Load); + } else { + // The result type is narrower or the same width as the vector element + Load = DAG.getLoad(VecEltVT, DL, OriginalLoad->getChain(), NewPtr, MPI, + Alignment, OriginalLoad->getMemOperand()->getFlags(), + OriginalLoad->getAAInfo()); + DAG.makeEquivalentMemoryOrdering(OriginalLoad, Load); + if (ResultVT.bitsLT(VecEltVT)) + Load = DAG.getNode(ISD::TRUNCATE, DL, ResultVT, Load); + else + Load = DAG.getBitcast(ResultVT, Load); + } + + return Load; +} diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 8bf5135..107e79c 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -967,21 +967,20 @@ void DWARFVerifier::verifyDebugLineStmtOffsets() { // here because we validate this in the .debug_info verifier. continue; } - auto Iter = StmtListToDie.find(LineTableOffset); - if (Iter != StmtListToDie.end()) { + auto [Iter, Inserted] = StmtListToDie.try_emplace(LineTableOffset, Die); + if (!Inserted) { ++NumDebugLineErrors; + const auto &OldDie = Iter->second; ErrorCategory.Report("Identical DW_AT_stmt_list section offset", [&]() { error() << "two compile unit DIEs, " - << format("0x%08" PRIx64, Iter->second.getOffset()) << " and " + << format("0x%08" PRIx64, OldDie.getOffset()) << " and " << format("0x%08" PRIx64, Die.getOffset()) << ", have the same DW_AT_stmt_list section offset:\n"; - dump(Iter->second); + dump(OldDie); dump(Die) << '\n'; }); // Already verified this line table before, no need to do it again. - continue; } - StmtListToDie[LineTableOffset] = Die; } } diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index d9096ed..176caa2 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -110,6 +110,7 @@ #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineCSE.h" +#include "llvm/CodeGen/MachineCopyPropagation.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineLICM.h" diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 84f6d42..8617377 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8787,51 +8787,6 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) { return ZExtBool; } -// The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the -// input operands are copy nodes where the source register is in a -// StridedOrContiguous class. For example: -// -// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO .. -// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous -// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous -// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO .. 
-// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous -// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous -// %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr -// -bool shouldUseFormStridedPseudo(MachineInstr &MI) { - MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); - - assert((MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO || - MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) && - "Unexpected opcode."); - - MCRegister SubReg = MCRegister::NoRegister; - for (unsigned I = 1; I < MI.getNumOperands(); ++I) { - MachineOperand &MO = MI.getOperand(I); - assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE"); - - MachineOperand *Def = MRI.getOneDef(MO.getReg()); - if (!Def || !Def->getParent()->isCopy()) - return false; - - const MachineOperand &CopySrc = Def->getParent()->getOperand(1); - unsigned OpSubReg = CopySrc.getSubReg(); - if (SubReg == MCRegister::NoRegister) - SubReg = OpSubReg; - - MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg()); - const TargetRegisterClass *CopySrcClass = - MRI.getRegClass(CopySrcOp->getReg()); - if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg || - (CopySrcClass != &AArch64::ZPR2StridedOrContiguousRegClass && - CopySrcClass != &AArch64::ZPR4StridedOrContiguousRegClass)) - return false; - } - - return true; -} - void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { // Live-in physreg copies that are glued to SMSTART are applied as @@ -8857,27 +8812,6 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, } } - if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO || - MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) { - // If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies - // from a StridedOrContiguous class, fall back on REG_SEQUENCE node. - if (shouldUseFormStridedPseudo(MI)) - return; - - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); - MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), - TII->get(TargetOpcode::REG_SEQUENCE), - MI.getOperand(0).getReg()); - - for (unsigned I = 1; I < MI.getNumOperands(); ++I) { - MIB.add(MI.getOperand(I)); - MIB.addImm(AArch64::zsub0 + (I - 1)); - } - - MI.eraseFromParent(); - return; - } - // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that // have nothing to do with VG, were it not that they are used to materialise a // frame-address. 
If they contain a frame-index to a scalable vector, this diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 3c57ba4..a0928b9 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -428,7 +428,6 @@ def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisVT<2, OtherVT>]>; - def SDT_AArch64CSel : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, @@ -451,6 +450,7 @@ def SDT_AArch64FCCMP : SDTypeProfile<1, 5, def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisFP<1>, SDTCisSameAs<2, 1>]>; +def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>; def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>; def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>; def SDT_AArch64Insr : SDTypeProfile<1, 2, [SDTCisVec<0>]>; @@ -817,11 +817,9 @@ def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>; def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>; def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>; -def AArch64rev16_scalar : SDNode<"AArch64ISD::REV16", SDTIntUnaryOp>; - -def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>; -def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>; -def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>; +def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64Rev>; +def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64Rev>; +def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64Rev>; def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>; def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>; @@ -3000,8 +2998,8 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>; def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>; -def : Pat<(AArch64rev16_scalar GPR32:$Rn), (REV16Wr GPR32:$Rn)>; -def : Pat<(AArch64rev16_scalar GPR64:$Rn), (REV16Xr GPR64:$Rn)>; +def : Pat<(AArch64rev16 GPR32:$Rn), (REV16Wr GPR32:$Rn)>; +def : Pat<(AArch64rev16 GPR64:$Rn), (REV16Xr GPR64:$Rn)>; def : Pat<(or (and (srl GPR64:$Rn, (i64 8)), (i64 0x00ff00ff00ff00ff)), (and (shl GPR64:$Rn, (i64 8)), (i64 0xff00ff00ff00ff00))), diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index aae2fda..a6edcf1 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -940,6 +940,16 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } break; } + case Intrinsic::experimental_cttz_elts: { + EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); + if (!getTLI()->shouldExpandCttzElements(ArgVT)) { + // This will consist of a SVE brkb and a cntp instruction. These + // typically have the same latency and half the throughput as a vector + // add instruction. 
+ return 4; + } + break; + } default: break; } diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 0ac131e..4f6a413 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -36,27 +36,26 @@ let WantsRoot = true in def am_sme_indexed_b4 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0, 15>">; // The FORM_TRANSPOSED_REG_TUPLE pseudos defined below are intended to -// improve register allocation for intrinsics which use strided and contiguous -// multi-vector registers, avoiding unnecessary copies. -// If the operands of the pseudo are copies where the source register is in -// the StridedOrContiguous class, the pseudo is used to provide a hint to the -// register allocator suggesting a contigious multi-vector register which -// matches the subregister sequence used by the operands. -// If the operands do not match this pattern, the pseudos are expanded -// to a REG_SEQUENCE using the post-isel hook. +// improve register allocation for intrinsics which use strided and +// contiguous multi-vector registers, avoiding unnecessary copies. +// The SMEPeepholeOpt pass will replace a REG_SEQUENCE instruction with the +// FORM_TRANSPOSED_REG_TUPLE pseudo if the operands are copies where the +// source register is in the StridedOrContiguous class. The operands in the +// sequence must all have the same subreg index. +// The pseudo is then used to provide a hint to the register allocator +// suggesting a contigious multi-vector register which matches the +// subregister sequence used by the operands. def FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO : Pseudo<(outs ZPR2:$tup), (ins ZPR:$zn0, ZPR:$zn1), []>, Sched<[]>{ let hasSideEffects = 0; - let hasPostISelHook = 1; } def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO : Pseudo<(outs ZPR4:$tup), (ins ZPR:$zn0, ZPR:$zn1, ZPR:$zn2, ZPR:$zn3), []>, Sched<[]>{ let hasSideEffects = 0; - let hasPostISelHook = 1; } def SPILL_PPR_TO_ZPR_SLOT_PSEUDO : @@ -178,14 +177,14 @@ class SME2_ZA_TwoOp_Multi_Single_Pat<string name, SDPatternOperator intrinsic, O class SME2_ZA_TwoOp_VG2_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt, ComplexPattern tileslice> : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm), - (!cast<Instruction>(name # _PSEUDO) $base, $offset, (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1, vt:$Zn2), + (!cast<Instruction>(name # _PSEUDO) $base, $offset, (REG_SEQUENCE ZPR2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm)>; class SME2_ZA_TwoOp_VG4_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt, ComplexPattern tileslice> : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm), (!cast<Instruction>(name # _PSEUDO) $base, $offset, - (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4), + (REG_SEQUENCE ZPR4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3), zpr_ty:$Zm)>; class SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ValueType vt, ComplexPattern tileslice> @@ -211,14 +210,14 @@ class SME2_ZA_TwoOp_VG2_Multi_Index_Pat<string name, SDPatternOperator intrinsic Operand imm_ty, ComplexPattern tileslice> : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)), 
(!cast<Instruction>(name # _PSEUDO) $base, $offset, - (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1,vt:$Zn2), zpr_ty:$Zm, imm_ty:$i)>; + (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm, imm_ty:$i)>; class SME2_ZA_TwoOp_VG4_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt, Operand imm_ty, ComplexPattern tileslice> : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)), (!cast<Instruction>(name # _PSEUDO) $base, $offset, - (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4), + (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3), zpr_ty:$Zm, imm_ty:$i)>; class SME2_Sat_Shift_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, ValueType in_vt, Operand imm_ty> diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp index 4a0312d..2ffd4d7 100644 --- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp @@ -45,6 +45,7 @@ struct SMEPeepholeOpt : public MachineFunctionPass { bool optimizeStartStopPairs(MachineBasicBlock &MBB, bool &HasRemovedAllSMChanges) const; + bool visitRegSequence(MachineInstr &MI); }; char SMEPeepholeOpt::ID = 0; @@ -225,6 +226,81 @@ bool SMEPeepholeOpt::optimizeStartStopPairs( return Changed; } +// Using the FORM_TRANSPOSED_REG_TUPLE pseudo can improve register allocation +// of multi-vector intrinsics. However, the psuedo should only be emitted if +// the input registers of the REG_SEQUENCE are copy nodes where the source +// register is in a StridedOrContiguous class. For example: +// +// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO .. +// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous +// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous +// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO .. +// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous +// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous +// %9:zpr2mul2 = REG_SEQUENCE %5:zpr, %subreg.zsub0, %8:zpr, %subreg.zsub1 +// +// -> %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr +// +bool SMEPeepholeOpt::visitRegSequence(MachineInstr &MI) { + assert(MI.getMF()->getRegInfo().isSSA() && "Expected to be run on SSA form!"); + + MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + switch (MRI.getRegClass(MI.getOperand(0).getReg())->getID()) { + case AArch64::ZPR2RegClassID: + case AArch64::ZPR4RegClassID: + case AArch64::ZPR2Mul2RegClassID: + case AArch64::ZPR4Mul4RegClassID: + break; + default: + return false; + } + + // The first operand is the register class created by the REG_SEQUENCE. + // Each operand pair after this consists of a vreg + subreg index, so + // for example a sequence of 2 registers will have a total of 5 operands. 
+ if (MI.getNumOperands() != 5 && MI.getNumOperands() != 9) + return false; + + MCRegister SubReg = MCRegister::NoRegister; + for (unsigned I = 1; I < MI.getNumOperands(); I += 2) { + MachineOperand &MO = MI.getOperand(I); + + MachineOperand *Def = MRI.getOneDef(MO.getReg()); + if (!Def || !Def->getParent()->isCopy()) + return false; + + const MachineOperand &CopySrc = Def->getParent()->getOperand(1); + unsigned OpSubReg = CopySrc.getSubReg(); + if (SubReg == MCRegister::NoRegister) + SubReg = OpSubReg; + + MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg()); + if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg || + CopySrcOp->getReg().isPhysical()) + return false; + + const TargetRegisterClass *CopySrcClass = + MRI.getRegClass(CopySrcOp->getReg()); + if (CopySrcClass != &AArch64::ZPR2StridedOrContiguousRegClass && + CopySrcClass != &AArch64::ZPR4StridedOrContiguousRegClass) + return false; + } + + unsigned Opc = MI.getNumOperands() == 5 + ? AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO + : AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO; + + const TargetInstrInfo *TII = + MI.getMF()->getSubtarget<AArch64Subtarget>().getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(Opc), MI.getOperand(0).getReg()); + for (unsigned I = 1; I < MI.getNumOperands(); I += 2) + MIB.addReg(MI.getOperand(I).getReg()); + + MI.eraseFromParent(); + return true; +} + INITIALIZE_PASS(SMEPeepholeOpt, "aarch64-sme-peephole-opt", "SME Peephole Optimization", false, false) @@ -247,6 +323,12 @@ bool SMEPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { bool BlockHasAllSMChangesRemoved; Changed |= optimizeStartStopPairs(MBB, BlockHasAllSMChangesRemoved); FunctionHasAllSMChangesRemoved |= BlockHasAllSMChangesRemoved; + + if (MF.getSubtarget<AArch64Subtarget>().isStreaming()) { + for (MachineInstr &MI : make_early_inc_range(MBB)) + if (MI.getOpcode() == AArch64::REG_SEQUENCE) + Changed |= visitRegSequence(MI); + } } AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index cca9fa7..792e17e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4217,18 +4217,21 @@ SDValue AMDGPUTargetLowering::performTruncateCombine( // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y) if (Src.getOpcode() == ISD::SRL && !VT.isVector()) { if (auto *K = isConstOrConstSplat(Src.getOperand(1))) { - if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) { - SDValue BV = stripBitcast(Src.getOperand(0)); - if (BV.getOpcode() == ISD::BUILD_VECTOR && - BV.getValueType().getVectorNumElements() == 2) { - SDValue SrcElt = BV.getOperand(1); - EVT SrcEltVT = SrcElt.getValueType(); - if (SrcEltVT.isFloatingPoint()) { - SrcElt = DAG.getNode(ISD::BITCAST, SL, - SrcEltVT.changeTypeToInteger(), SrcElt); + SDValue BV = stripBitcast(Src.getOperand(0)); + if (BV.getOpcode() == ISD::BUILD_VECTOR) { + EVT SrcEltVT = BV.getOperand(0).getValueType(); + unsigned SrcEltSize = SrcEltVT.getSizeInBits(); + unsigned BitIndex = K->getZExtValue(); + unsigned PartIndex = BitIndex / SrcEltSize; + + if (PartIndex * SrcEltSize == BitIndex && + PartIndex < BV.getNumOperands()) { + if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) { + SDValue SrcElt = + DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(), + BV.getOperand(PartIndex)); + return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt); } - - 
return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt); } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 5bfd891..09f7877 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -416,8 +416,6 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const { return 1024; } -// FIXME: Should we use narrower types for local/region, or account for when -// unaligned access is legal? Type *GCNTTIImpl::getMemcpyLoopLoweringType( LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, @@ -426,29 +424,12 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType( if (AtomicElementSize) return Type::getIntNTy(Context, *AtomicElementSize * 8); - Align MinAlign = std::min(SrcAlign, DestAlign); - - // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the - // hardware into byte accesses. If you assume all alignments are equally - // probable, it's more efficient on average to use short accesses for this - // case. - if (MinAlign == Align(2)) - return Type::getInt16Ty(Context); - - // Not all subtargets have 128-bit DS instructions, and we currently don't - // form them by default. - if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS || - SrcAddrSpace == AMDGPUAS::REGION_ADDRESS || - DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS || - DestAddrSpace == AMDGPUAS::REGION_ADDRESS) { - return FixedVectorType::get(Type::getInt32Ty(Context), 2); - } - - // Global memory works best with 16-byte accesses. + // 16-byte accesses achieve the highest copy throughput. // If the operation has a fixed known length that is large enough, it is // worthwhile to return an even wider type and let legalization lower it into - // multiple accesses, effectively unrolling the memcpy loop. Private memory - // also hits this, although accesses may be decomposed. + // multiple accesses, effectively unrolling the memcpy loop. + // We also rely on legalization to decompose into smaller accesses for + // subtargets and address spaces where it is necessary. 
// // Don't unroll if Length is not a constant, since unrolling leads to worse // performance for length values that are smaller or slightly larger than the @@ -473,26 +454,22 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType( OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign, DestAlign, AtomicCpySize); - Align MinAlign = std::min(SrcAlign, DestAlign); - - if (MinAlign != Align(2)) { - Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4); - while (RemainingBytes >= 16) { - OpsOut.push_back(I32x4Ty); - RemainingBytes -= 16; - } + Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4); + while (RemainingBytes >= 16) { + OpsOut.push_back(I32x4Ty); + RemainingBytes -= 16; + } - Type *I64Ty = Type::getInt64Ty(Context); - while (RemainingBytes >= 8) { - OpsOut.push_back(I64Ty); - RemainingBytes -= 8; - } + Type *I64Ty = Type::getInt64Ty(Context); + while (RemainingBytes >= 8) { + OpsOut.push_back(I64Ty); + RemainingBytes -= 8; + } - Type *I32Ty = Type::getInt32Ty(Context); - while (RemainingBytes >= 4) { - OpsOut.push_back(I32Ty); - RemainingBytes -= 4; - } + Type *I32Ty = Type::getInt32Ty(Context); + while (RemainingBytes >= 4) { + OpsOut.push_back(I32Ty); + RemainingBytes -= 4; } Type *I16Ty = Type::getInt16Ty(Context); diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index a20319e..ac11526 100644 --- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -287,10 +287,10 @@ bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, std::vector<std::pair<unsigned, unsigned>> &RemapChan) { unsigned NeededUndefs = 4 - RSI.UndefReg.size(); - if (PreviousRegSeqByUndefCount[NeededUndefs].empty()) - return false; std::vector<MachineInstr *> &MIs = PreviousRegSeqByUndefCount[NeededUndefs]; + if (MIs.empty()) + return false; CompatibleRSI = PreviousRegSeq[MIs.back()]; tryMergeVector(&CompatibleRSI, &RSI, RemapChan); return true; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index bee4c47..6e08aff 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2703,15 +2703,20 @@ class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, S (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE)) >; -let OtherPredicates = [NotHasTrue16BitInsts] in { +let True16Predicate = NotHasTrue16BitInsts in { def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>; def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>; -} // end OtherPredicates = [NotHasTrue16BitInsts] +} // end True16Predicate = NotHasTrue16BitInsts + +let True16Predicate = UseRealTrue16Insts in { + def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>; + def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>; +} // end True16Predicate = UseRealTrue16BitInsts -let OtherPredicates = [HasTrue16BitInsts] in { +let True16Predicate = UseFakeTrue16Insts in { def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>; def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>; -} // end OtherPredicates = [HasTrue16BitInsts] +} // end True16Predicate = UseFakeTrue16BitInsts def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>; def : FPToI1Pat<V_CMP_EQ_F32_e64, 
CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>; @@ -3790,6 +3795,13 @@ def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse> def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>; } +let True16Predicate = UseRealTrue16Insts in { +def : FPMinMaxPat<V_MINMAX_F16_t16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; +def : FPMinMaxPat<V_MAXMIN_F16_t16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; +def : FPMinCanonMaxPat<V_MINMAX_F16_t16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; +def : FPMinCanonMaxPat<V_MAXMIN_F16_t16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; +} + let True16Predicate = UseFakeTrue16Insts in { def : FPMinMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; def : FPMinMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; @@ -3819,6 +3831,13 @@ def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum> def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>; } +let True16Predicate = UseRealTrue16Insts, SubtargetPredicate = isGFX12Plus in { +def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_t16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>; +def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_t16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>; +def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F16_t16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>; +def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F16_t16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>; +} + let True16Predicate = UseFakeTrue16Insts, SubtargetPredicate = isGFX12Plus in { def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_fake16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>; def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_fake16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index fad7e67..67bebfb3 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -976,8 +976,7 @@ struct Waitcnt { Waitcnt() = default; // Pre-gfx12 constructor. Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt) - : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt), - SampleCnt(~0u), BvhCnt(~0u), KmCnt(~0u) {} + : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt) {} // gfx12+ constructor. Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp index 1e76bf7..296031e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -27,6 +27,28 @@ using namespace llvm; using namespace llvm::AMDGPU; +// Return the PAL metadata hardware shader stage name. +static const char *getStageName(CallingConv::ID CC) { + switch (CC) { + case CallingConv::AMDGPU_PS: + return ".ps"; + case CallingConv::AMDGPU_VS: + return ".vs"; + case CallingConv::AMDGPU_GS: + return ".gs"; + case CallingConv::AMDGPU_ES: + return ".es"; + case CallingConv::AMDGPU_HS: + return ".hs"; + case CallingConv::AMDGPU_LS: + return ".ls"; + case CallingConv::AMDGPU_Gfx: + llvm_unreachable("Callable shader has no hardware stage"); + default: + return ".cs"; + } +} + // Read the PAL metadata from IR metadata, where it was put by the frontend. 
void AMDGPUPALMetadata::readFromIR(Module &M) { auto *NamedMD = M.getNamedMetadata("amdgpu.pal.metadata.msgpack"); @@ -232,8 +254,18 @@ void AMDGPUPALMetadata::setEntryPoint(unsigned CC, StringRef Name) { if (isLegacy()) return; // Msgpack format. + // Entry point is updated to .entry_point_symbol and is set to the function + // name getHwStage(CC)[".entry_point_symbol"] = MsgPackDoc.getNode(Name, /*Copy=*/true); + + // Set .entry_point which is defined + // to be _amdgpu_<stage> and _amdgpu_cs for non-shader functions + SmallString<16> EPName("_amdgpu_"); + raw_svector_ostream EPNameOS(EPName); + EPNameOS << getStageName(CC) + 1; + getHwStage(CC)[".entry_point"] = + MsgPackDoc.getNode(EPNameOS.str(), /*Copy=*/true); } // Set the number of used vgprs in the metadata. This is an optional @@ -943,28 +975,6 @@ msgpack::MapDocNode AMDGPUPALMetadata::getGraphicsRegisters() { return GraphicsRegisters.getMap(); } -// Return the PAL metadata hardware shader stage name. -static const char *getStageName(CallingConv::ID CC) { - switch (CC) { - case CallingConv::AMDGPU_PS: - return ".ps"; - case CallingConv::AMDGPU_VS: - return ".vs"; - case CallingConv::AMDGPU_GS: - return ".gs"; - case CallingConv::AMDGPU_ES: - return ".es"; - case CallingConv::AMDGPU_HS: - return ".hs"; - case CallingConv::AMDGPU_LS: - return ".ls"; - case CallingConv::AMDGPU_Gfx: - llvm_unreachable("Callable shader has no hardware stage"); - default: - return ".cs"; - } -} - msgpack::DocNode &AMDGPUPALMetadata::refHwStage() { auto &N = MsgPackDoc.getRoot() diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 633a99d..74def43 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -163,6 +163,7 @@ def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; +def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">; def True : Predicate<"true">; def False : Predicate<"false">; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 56d8b73..a0d00e4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -7582,3 +7582,44 @@ def GRIDDEPCONTROL_WAIT : Requires<[hasSM<90>, hasPTX<78>]>; def INT_EXIT : NVPTXInst<(outs), (ins), "exit;", [(int_nvvm_exit)]>; + +// Tcgen05 intrinsics +let isConvergent = true in { + +multiclass TCGEN05_ALLOC_INTR<NVPTXRegClass rc, string AS, string num, Intrinsic Intr> { + def NAME : NVPTXInst<(outs), + (ins rc:$dst, Int32Regs:$ncols), + !strconcat("tcgen05.alloc.cta_group::", num, ".sync.aligned", AS, ".b32 [$dst], $ncols;"), + [(Intr rc:$dst, Int32Regs:$ncols)]>, + Requires<[hasTcgen05Instructions]>; +} + +defm TCGEN05_ALLOC_CG1 : TCGEN05_ALLOC_INTR<Int64Regs, "", "1", int_nvvm_tcgen05_alloc_cg1>; +defm TCGEN05_ALLOC_CG2 : TCGEN05_ALLOC_INTR<Int64Regs, "", "2", int_nvvm_tcgen05_alloc_cg2>; + +defm TCGEN05_ALLOC_S64_CG1 : TCGEN05_ALLOC_INTR<Int64Regs, ".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>; +defm TCGEN05_ALLOC_S64_CG2 : TCGEN05_ALLOC_INTR<Int64Regs, ".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>; + +defm TCGEN05_ALLOC_S32_CG1 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>; +defm TCGEN05_ALLOC_S32_CG2 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "2", 
int_nvvm_tcgen05_alloc_shared_cg2>; + +multiclass TCGEN05_DEALLOC_INTR<string num, Intrinsic Intr> { + def NAME : NVPTXInst<(outs), + (ins Int32Regs:$tmem_addr, Int32Regs:$ncols), + !strconcat("tcgen05.dealloc.cta_group::", num, ".sync.aligned.b32 $tmem_addr, $ncols;"), + [(Intr Int32Regs:$tmem_addr, Int32Regs:$ncols)]>, + Requires<[hasTcgen05Instructions]>; +} +defm TCGEN05_DEALLOC_CG1: TCGEN05_DEALLOC_INTR<"1", int_nvvm_tcgen05_dealloc_cg1>; +defm TCGEN05_DEALLOC_CG2: TCGEN05_DEALLOC_INTR<"2", int_nvvm_tcgen05_dealloc_cg2>; + +multiclass TCGEN05_RELINQ_PERMIT_INTR<string num, Intrinsic Intr> { + def NAME : NVPTXInst<(outs), (ins), + !strconcat("tcgen05.relinquish_alloc_permit.cta_group::", num, ".sync.aligned;"), + [(Intr)]>, + Requires<[hasTcgen05Instructions]>; +} +defm TCGEN05_RELINQ_CG1: TCGEN05_RELINQ_PERMIT_INTR<"1", int_nvvm_tcgen05_relinq_alloc_permit_cg1>; +defm TCGEN05_RELINQ_CG2: TCGEN05_RELINQ_PERMIT_INTR<"2", int_nvvm_tcgen05_relinq_alloc_permit_cg2>; + +} // isConvergent diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 919f487..0c4420b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -93,6 +93,21 @@ public: bool hasDotInstructions() const { return SmVersion >= 61 && PTXVersion >= 50; } + // Tcgen05 instructions in Blackwell family + bool hasTcgen05Instructions() const { + bool HasTcgen05 = false; + switch (FullSmVersion) { + default: + break; + case 1001: // sm_100a + case 1011: // sm_101a + HasTcgen05 = true; + break; + } + + return HasTcgen05 && PTXVersion >= 86; + } + // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction // terminates a basic block. Instead, it would assume that control flow // continued to the next instruction. The next instruction could be in the diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index e88027f..f2afa6f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -140,6 +140,9 @@ static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) { else if (UseShortPointers) Ret += "-p3:32:32-p4:32:32-p5:32:32"; + // Tensor Memory (addrspace:6) is always 32-bits. + Ret += "-p6:32:32"; + Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; return Ret; diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index 36b8a24..274c7cb 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -65,9 +65,10 @@ public: /// for a given imm form load/store opcode \p ImmFormOpcode. /// FIXME: move this to PPCInstrInfo class. unsigned getMappedIdxOpcForImmOpc(unsigned ImmOpcode) const { - if (!ImmToIdxMap.count(ImmOpcode)) + auto It = ImmToIdxMap.find(ImmOpcode); + if (It == ImmToIdxMap.end()) return PPC::INSTRUCTION_LIST_END; - return ImmToIdxMap.find(ImmOpcode)->second; + return It->second; } /// getPointerRegClass - Return the register class to use to hold pointers. 
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td index ad06f47..98e05b7 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.td +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td @@ -42,6 +42,8 @@ def CSR_ILP32D_LP64D_V // Needed for implementation of RISCVRegisterInfo::getNoPreservedMask() def CSR_NoRegs : CalleeSavedRegs<(add)>; +def CSR_IPRA : CalleeSavedRegs<(add X1)>; + // Interrupt handler needs to save/restore all registers that are used, // both Caller and Callee saved registers. def CSR_Interrupt : CalleeSavedRegs<(add X1, (sequence "X%u", 5, 31))>; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 8e3caf5..7c3b583 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -17759,6 +17759,83 @@ static SDValue combineScalarCTPOPToVCPOP(SDNode *N, SelectionDAG &DAG, return DAG.getZExtOrTrunc(Pop, DL, VT); } +static SDValue performSHLCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const RISCVSubtarget &Subtarget) { + // (shl (zext x), y) -> (vwsll x, y) + if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget)) + return V; + + // (shl (sext x), C) -> (vwmulsu x, 1u << C) + // (shl (zext x), C) -> (vwmulu x, 1u << C) + + if (!DCI.isAfterLegalizeDAG()) + return SDValue(); + + SDValue LHS = N->getOperand(0); + if (!LHS.hasOneUse()) + return SDValue(); + unsigned Opcode; + switch (LHS.getOpcode()) { + case ISD::SIGN_EXTEND: + case RISCVISD::VSEXT_VL: + Opcode = RISCVISD::VWMULSU_VL; + break; + case ISD::ZERO_EXTEND: + case RISCVISD::VZEXT_VL: + Opcode = RISCVISD::VWMULU_VL; + break; + default: + return SDValue(); + } + + SDValue RHS = N->getOperand(1); + APInt ShAmt; + uint64_t ShAmtInt; + if (ISD::isConstantSplatVector(RHS.getNode(), ShAmt)) + ShAmtInt = ShAmt.getZExtValue(); + else if (RHS.getOpcode() == RISCVISD::VMV_V_X_VL && + RHS.getOperand(1).getOpcode() == ISD::Constant) + ShAmtInt = RHS.getConstantOperandVal(1); + else + return SDValue(); + + // Better foldings: + // (shl (sext x), 1) -> (vwadd x, x) + // (shl (zext x), 1) -> (vwaddu x, x) + if (ShAmtInt <= 1) + return SDValue(); + + SDValue NarrowOp = LHS.getOperand(0); + MVT NarrowVT = NarrowOp.getSimpleValueType(); + uint64_t NarrowBits = NarrowVT.getScalarSizeInBits(); + if (ShAmtInt >= NarrowBits) + return SDValue(); + MVT VT = N->getSimpleValueType(0); + if (NarrowBits * 2 != VT.getScalarSizeInBits()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + SDValue Passthru, Mask, VL; + switch (N->getOpcode()) { + case ISD::SHL: + Passthru = DAG.getUNDEF(VT); + std::tie(Mask, VL) = getDefaultScalableVLOps(VT, DL, DAG, Subtarget); + break; + case RISCVISD::SHL_VL: + Passthru = N->getOperand(2); + Mask = N->getOperand(3); + VL = N->getOperand(4); + break; + default: + llvm_unreachable("Expected SHL"); + } + return DAG.getNode(Opcode, DL, VT, NarrowOp, + DAG.getConstant(1ULL << ShAmtInt, SDLoc(RHS), NarrowVT), + Passthru, Mask, VL); +} + SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -18392,7 +18469,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, break; } case RISCVISD::SHL_VL: - if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget)) + if (SDValue V = performSHLCombine(N, DCI, Subtarget)) return V; [[fallthrough]]; case RISCVISD::SRA_VL: @@ -18417,7 +18494,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SRL: case ISD::SHL: { if 
(N->getOpcode() == ISD::SHL) { - if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget)) + if (SDValue V = performSHLCombine(N, DCI, Subtarget)) return V; } SDValue ShAmt = N->getOperand(1); diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index b0a5269..7a99bfd 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -56,6 +56,11 @@ RISCVRegisterInfo::RISCVRegisterInfo(unsigned HwMode) /*PC*/0, HwMode) {} const MCPhysReg * +RISCVRegisterInfo::getIPRACSRegs(const MachineFunction *MF) const { + return CSR_IPRA_SaveList; +} + +const MCPhysReg * RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { auto &Subtarget = MF->getSubtarget<RISCVSubtarget>(); if (MF->getFunction().getCallingConv() == CallingConv::GHC) diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h index 3ab79694..6c4e9c7 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h @@ -62,6 +62,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg *getIPRACSRegs(const MachineFunction *MF) const override; + BitVector getReservedRegs(const MachineFunction &MF) const override; bool isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const override; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index fa7c7c5..cb2ec1d 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -940,6 +940,44 @@ InstructionCost RISCVTTIImpl::getGatherScatterOpCost( return NumLoads * MemOpCost; } +InstructionCost RISCVTTIImpl::getExpandCompressMemoryOpCost( + unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment, + TTI::TargetCostKind CostKind, const Instruction *I) { + bool IsLegal = (Opcode == Instruction::Store && + isLegalMaskedCompressStore(DataTy, Alignment)) || + (Opcode == Instruction::Load && + isLegalMaskedExpandLoad(DataTy, Alignment)); + if (!IsLegal || CostKind != TTI::TCK_RecipThroughput) + return BaseT::getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask, + Alignment, CostKind, I); + // Example compressstore sequence: + // vsetivli zero, 8, e32, m2, ta, ma (ignored) + // vcompress.vm v10, v8, v0 + // vcpop.m a1, v0 + // vsetvli zero, a1, e32, m2, ta, ma + // vse32.v v10, (a0) + // Example expandload sequence: + // vsetivli zero, 8, e8, mf2, ta, ma (ignored) + // vcpop.m a1, v0 + // vsetvli zero, a1, e32, m2, ta, ma + // vle32.v v10, (a0) + // vsetivli zero, 8, e32, m2, ta, ma + // viota.m v12, v0 + // vrgather.vv v8, v10, v12, v0.t + auto MemOpCost = + getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind); + auto LT = getTypeLegalizationCost(DataTy); + SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI}; + if (VariableMask) + Opcodes.push_back(RISCV::VCPOP_M); + if (Opcode == Instruction::Store) + Opcodes.append({RISCV::VCOMPRESS_VM}); + else + Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV}); + return MemOpCost + + LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind); +} + InstructionCost RISCVTTIImpl::getStridedMemoryOpCost( unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { diff --git 
a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 042530b..5389e9b 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -174,6 +174,12 @@ public: TTI::TargetCostKind CostKind, const Instruction *I); + InstructionCost getExpandCompressMemoryOpCost(unsigned Opcode, Type *Src, + bool VariableMask, + Align Alignment, + TTI::TargetCostKind CostKind, + const Instruction *I = nullptr); + InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 78db841..c202f7f 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -284,7 +284,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, // Adjust stack pointer. int StackAdj = StackAdjust.getImm(); int MaxTCDelta = X86FI->getTCReturnAddrDelta(); - int Offset = 0; + int64_t Offset = 0; assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive"); // Incoporate the retaddr area. @@ -297,7 +297,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, if (Offset) { // Check for possible merge with preceding ADD instruction. - Offset += X86FL->mergeSPUpdates(MBB, MBBI, true); + Offset = X86FL->mergeSPAdd(MBB, MBBI, Offset, true); X86FL->emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue=*/true); } diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index a15db03..50c56c9 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -223,6 +223,8 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) { return false; } +constexpr int64_t MaxSPChunk = (1LL << 31) - 1; + /// emitSPUpdate - Emit a series of instructions to increment / decrement the /// stack pointer by a constant value. void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, @@ -242,7 +244,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, return; } - uint64_t Chunk = (1LL << 31) - 1; + uint64_t Chunk = MaxSPChunk; MachineFunction &MF = *MBB.getParent(); const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); @@ -391,12 +393,15 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( return MI; } -int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - bool doMergeWithPrevious) const { +template <typename FoundT, typename CalcT> +int64_t X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + FoundT FoundStackAdjust, + CalcT CalcNewOffset, + bool doMergeWithPrevious) const { if ((doMergeWithPrevious && MBBI == MBB.begin()) || (!doMergeWithPrevious && MBBI == MBB.end())) - return 0; + return CalcNewOffset(0); MachineBasicBlock::iterator PI = doMergeWithPrevious ? 
std::prev(MBBI) : MBBI; @@ -415,27 +420,38 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, if (doMergeWithPrevious && PI != MBB.begin() && PI->isCFIInstruction()) PI = std::prev(PI); - unsigned Opc = PI->getOpcode(); - int Offset = 0; - - if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) && - PI->getOperand(0).getReg() == StackPtr) { - assert(PI->getOperand(1).getReg() == StackPtr); - Offset = PI->getOperand(2).getImm(); - } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) && - PI->getOperand(0).getReg() == StackPtr && - PI->getOperand(1).getReg() == StackPtr && - PI->getOperand(2).getImm() == 1 && - PI->getOperand(3).getReg() == X86::NoRegister && - PI->getOperand(5).getReg() == X86::NoRegister) { - // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg. - Offset = PI->getOperand(4).getImm(); - } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) && - PI->getOperand(0).getReg() == StackPtr) { - assert(PI->getOperand(1).getReg() == StackPtr); - Offset = -PI->getOperand(2).getImm(); - } else - return 0; + int64_t Offset = 0; + for (;;) { + unsigned Opc = PI->getOpcode(); + + if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) && + PI->getOperand(0).getReg() == StackPtr) { + assert(PI->getOperand(1).getReg() == StackPtr); + Offset = PI->getOperand(2).getImm(); + } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) && + PI->getOperand(0).getReg() == StackPtr && + PI->getOperand(1).getReg() == StackPtr && + PI->getOperand(2).getImm() == 1 && + PI->getOperand(3).getReg() == X86::NoRegister && + PI->getOperand(5).getReg() == X86::NoRegister) { + // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg. + Offset = PI->getOperand(4).getImm(); + } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) && + PI->getOperand(0).getReg() == StackPtr) { + assert(PI->getOperand(1).getReg() == StackPtr); + Offset = -PI->getOperand(2).getImm(); + } else + return CalcNewOffset(0); + + FoundStackAdjust(PI, Offset); + if (std::abs((int64_t)CalcNewOffset(Offset)) < MaxSPChunk) + break; + + if (doMergeWithPrevious ? (PI == MBB.begin()) : (PI == MBB.end())) + return CalcNewOffset(0); + + PI = doMergeWithPrevious ? std::prev(PI) : std::next(PI); + } PI = MBB.erase(PI); if (PI != MBB.end() && PI->isCFIInstruction()) { @@ -448,7 +464,16 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, if (!doMergeWithPrevious) MBBI = skipDebugInstructionsForward(PI, MBB.end()); - return Offset; + return CalcNewOffset(Offset); +} + +int64_t X86FrameLowering::mergeSPAdd(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + int64_t AddOffset, + bool doMergeWithPrevious) const { + return mergeSPUpdates( + MBB, MBBI, [AddOffset](int64_t Offset) { return AddOffset + Offset; }, + doMergeWithPrevious); } void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB, @@ -1975,8 +2000,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // If there is an SUB32ri of ESP immediately before this instruction, merge // the two. This can be the case when tail call elimination is enabled and - // the callee has more arguments then the caller. - NumBytes -= mergeSPUpdates(MBB, MBBI, true); + // the callee has more arguments than the caller. + NumBytes = mergeSPUpdates( + MBB, MBBI, [NumBytes](int64_t Offset) { return NumBytes - Offset; }, + true); // Adjust stack pointer: ESP -= numbytes. @@ -2457,7 +2484,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (HasFP) { if (X86FI->hasSwiftAsyncContext()) { // Discard the context. 
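On the MaxSPChunk constant introduced above: x86 ADD/SUB with a 32-bit immediate sign-extends it, so a single SP update can move the stack pointer by at most INT32_MAX bytes, and larger adjustments get split into chunks. A rough standalone illustration of the chunking arithmetic (hypothetical numbers, not the emitSPUpdate implementation):

#include <cstdint>
#include <cstdio>

int main() {
  constexpr int64_t MaxSPChunk = (1LL << 31) - 1; // largest single adjustment
  int64_t Offset = 5'000'000'000;                 // hypothetical huge frame
  int Updates = 0;
  while (Offset) {
    int64_t ThisChunk = Offset > MaxSPChunk ? MaxSPChunk : Offset;
    Offset -= ThisChunk;
    ++Updates; // the real code emits one ADD/SUB per chunk here
  }
  std::printf("%d SP updates needed\n", Updates); // prints 3
  return 0;
}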
- int Offset = 16 + mergeSPUpdates(MBB, MBBI, true); + int64_t Offset = mergeSPAdd(MBB, MBBI, 16, true); emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true); } // Pop EBP. @@ -2531,7 +2558,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // If there is an ADD32ri or SUB32ri of ESP immediately before this // instruction, merge the two instructions. if (NumBytes || MFI.hasVarSizedObjects()) - NumBytes += mergeSPUpdates(MBB, MBBI, true); + NumBytes = mergeSPAdd(MBB, MBBI, NumBytes, true); // If dynamic alloca is used, then reset esp to point to the last callee-saved // slot before popping them off! Same applies for the case, when stack was @@ -2612,11 +2639,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) { // Add the return addr area delta back since we are not tail calling. - int Offset = -1 * X86FI->getTCReturnAddrDelta(); + int64_t Offset = -1 * X86FI->getTCReturnAddrDelta(); assert(Offset >= 0 && "TCDelta should never be positive"); if (Offset) { // Check for possible merge with preceding ADD instruction. - Offset += mergeSPUpdates(MBB, Terminator, true); + Offset = mergeSPAdd(MBB, Terminator, Offset, true); emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true); } } @@ -3814,13 +3841,24 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr( // Add Amount to SP to destroy a frame, or subtract to setup. int64_t StackAdjustment = isDestroy ? Amount : -Amount; + int64_t CfaAdjustment = StackAdjustment; if (StackAdjustment) { // Merge with any previous or following adjustment instruction. Note: the // instructions merged with here do not have CFI, so their stack - // adjustments do not feed into CfaAdjustment. - StackAdjustment += mergeSPUpdates(MBB, InsertPos, true); - StackAdjustment += mergeSPUpdates(MBB, InsertPos, false); + // adjustments do not feed into CfaAdjustment + + auto CalcCfaAdjust = [&CfaAdjustment](MachineBasicBlock::iterator PI, + int64_t Offset) { + CfaAdjustment += Offset; + }; + auto CalcNewOffset = [&StackAdjustment](int64_t Offset) { + return StackAdjustment + Offset; + }; + StackAdjustment = + mergeSPUpdates(MBB, InsertPos, CalcCfaAdjust, CalcNewOffset, true); + StackAdjustment = + mergeSPUpdates(MBB, InsertPos, CalcCfaAdjust, CalcNewOffset, false); if (StackAdjustment) { if (!(F.hasMinSize() && @@ -3830,7 +3868,7 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr( } } - if (DwarfCFI && !hasFP(MF)) { + if (DwarfCFI && !hasFP(MF) && CfaAdjustment) { // If we don't have FP, but need to generate unwind information, // we need to set the correct CFA offset after the stack adjustment. // How much we adjust the CFA offset depends on whether we're emitting @@ -3838,14 +3876,11 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr( // offset to be correct at each call site, while for debugging we want // it to be more precise. - int64_t CfaAdjustment = -StackAdjustment; // TODO: When not using precise CFA, we also need to adjust for the // InternalAmt here. 
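A worked example (hypothetical numbers) of the CfaAdjustment bookkeeping in eliminateCallFramePseudoInstr above: the instructions folded in by mergeSPUpdates carry no CFI of their own, so their offsets are accumulated into CfaAdjustment through the FoundStackAdjust callback and reported in one directive.

#include <cassert>
#include <cstdint>

int main() {
  int64_t StackAdjustment = 16;            // isDestroy ? Amount : -Amount
  int64_t CfaAdjustment = StackAdjustment; // starts out equal
  int64_t MergedOffset = 32;               // say an adjacent "addq $32, %rsp" is erased
  CfaAdjustment += MergedOffset;           // what the FoundStackAdjust lambda does
  StackAdjustment += MergedOffset;         // what CalcNewOffset returns
  assert(StackAdjustment == 48 && CfaAdjustment == 48);
  // A single +48 SP update is emitted, paired with
  // .cfi_adjust_cfa_offset -48, i.e. createAdjustCfaOffset(nullptr, -CfaAdjustment).
  return 0;
}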
- if (CfaAdjustment) { - BuildCFI( - MBB, InsertPos, DL, - MCCFIInstruction::createAdjustCfaOffset(nullptr, CfaAdjustment)); - } + BuildCFI( + MBB, InsertPos, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, -CfaAdjustment)); } return I; diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h index 02fe8ee..ef41b46 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.h +++ b/llvm/lib/Target/X86/X86FrameLowering.h @@ -134,12 +134,50 @@ public: processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF, RegScavenger *RS) const override; - /// Check the instruction before/after the passed instruction. If - /// it is an ADD/SUB/LEA instruction it is deleted argument and the - /// stack adjustment is returned as a positive value for ADD/LEA and - /// a negative for SUB. - int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - bool doMergeWithPrevious) const; +private: + /// Basic Pseudocode: + /// if (instruction before/after the passed instruction is ADD/SUB/LEA) + /// Offset = instruction stack adjustment + /// ... positive value for ADD/LEA and negative for SUB + /// FoundStackAdjust(instruction, Offset) + /// erase(instruction) + /// return CalcNewOffset(Offset) + /// else + /// return CalcNewOffset(0) + /// + /// It's possible that the selected instruction is not immediately + /// before/after MBBI for large adjustments that have been split into multiple + /// instructions. + /// + /// FoundStackAdjust should have the signature: + /// void FoundStackAdjust(MachineBasicBlock::iterator PI, int64_t Offset) + /// CalcNewOffset should have the signature: + /// int64_t CalcNewOffset(int64_t Offset) + template <typename FoundT, typename CalcT> + int64_t mergeSPUpdates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + FoundT FoundStackAdjust, CalcT CalcNewOffset, + bool doMergeWithPrevious) const; + + template <typename CalcT> + int64_t mergeSPUpdates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, CalcT CalcNewOffset, + bool doMergeWithPrevious) const { + auto FoundStackAdjust = [](MachineBasicBlock::iterator MBBI, + int64_t Offset) {}; + return mergeSPUpdates(MBB, MBBI, FoundStackAdjust, CalcNewOffset, + doMergeWithPrevious); + } + +public: + /// Equivalent to: + /// mergeSPUpdates(MBB, MBBI, + /// [AddOffset](int64_t Offset) { + /// return AddOffset + Offset; + /// }, + /// doMergeWithPrevious); + int64_t mergeSPAdd(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + int64_t AddOffset, bool doMergeWithPrevious) const; /// Emit a series of instructions to increment / decrement the stack /// pointer by a constant value. diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 8f90420..6cf6061 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2944,7 +2944,7 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model CM, } /// Return true if the condition is an signed comparison operation. -static bool isX86CCSigned(unsigned X86CC) { +static bool isX86CCSigned(X86::CondCode X86CC) { switch (X86CC) { default: llvm_unreachable("Invalid integer condition!"); @@ -22975,7 +22975,7 @@ static bool isProfitableToUseFlagOp(SDValue Op) { /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. 
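The isX86CCSigned / EmitTest / EmitCmp signature changes above swap unsigned for X86::CondCode; as far as I can tell this is about type safety rather than behavior: with an enum parameter, callers can no longer pass arbitrary integers and enum-typed switches can be checked for coverage. A minimal stand-in (hypothetical enum, not the real X86::CondCode):

enum class CondCode { COND_E, COND_NE, COND_L, COND_G };

static bool isSignedSketch(CondCode CC) {
  switch (CC) { // forgetting an enumerator here earns a -Wswitch warning
  case CondCode::COND_L:
  case CondCode::COND_G:
    return true;
  case CondCode::COND_E:
  case CondCode::COND_NE:
    return false;
  }
  return false;
}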
-static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, +static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // CF and OF aren't always set the way we want. Determine which // of these we need. @@ -23085,7 +23085,7 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. -static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, +static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (isNullConstant(Op1)) @@ -23157,10 +23157,17 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, return Add.getValue(1); } - // Use SUB instead of CMP to enable CSE between SUB and CMP. + // If we already have an XOR of the ops, use that to check for equality. + // Else use SUB instead of CMP to enable CSE between SUB and CMP. + unsigned X86Opc = X86ISD::SUB; + if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && + (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) || + DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0}))) + X86Opc = X86ISD::XOR; + SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); - SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); - return Sub.getValue(1); + SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1); + return CmpOp.getValue(1); } bool X86TargetLowering::isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp index 9bcfa6c..8f97537 100644 --- a/llvm/lib/TargetParser/ARMTargetParser.cpp +++ b/llvm/lib/TargetParser/ARMTargetParser.cpp @@ -403,13 +403,12 @@ static ARM::FPUKind findSinglePrecisionFPU(ARM::FPUKind InputFPUKind) { if (!ARM::isDoublePrecision(InputFPU.Restriction)) return InputFPUKind; - // Otherwise, look for an FPU entry with all the same fields, except - // that it does not support double precision. + // Otherwise, look for an FPU entry that has the same FPUVer + // and is not Double Precision. We want to allow for changing of + // NEON Support and Restrictions so CPU's such as Cortex-R52 can + // select between SP Only and Full DP modes. for (const ARM::FPUName &CandidateFPU : ARM::FPUNames) { if (CandidateFPU.FPUVer == InputFPU.FPUVer && - CandidateFPU.NeonSupport == InputFPU.NeonSupport && - ARM::has32Regs(CandidateFPU.Restriction) == - ARM::has32Regs(InputFPU.Restriction) && !ARM::isDoublePrecision(CandidateFPU.Restriction)) { return CandidateFPU.ID; } diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index 41bc67f..34ddeeb 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -1184,22 +1184,22 @@ static std::optional<unsigned> getGVNForPHINode(OutlinableRegion &Region, for (unsigned Idx = 0, EIdx = PN->getNumIncomingValues(); Idx < EIdx; Idx++) { Incoming = PN->getIncomingValue(Idx); IncomingBlock = PN->getIncomingBlock(Idx); + // If the incoming block isn't in the region, we don't have to worry about + // this incoming value. + if (!Blocks.contains(IncomingBlock)) + continue; + // If we cannot find a GVN, and the incoming block is included in the region // this means that the input to the PHINode is not included in the region we // are trying to analyze, meaning, that if it was outlined, we would be // adding an extra input. 
We ignore this case for now, and so ignore the // region. std::optional<unsigned> OGVN = Cand.getGVN(Incoming); - if (!OGVN && Blocks.contains(IncomingBlock)) { + if (!OGVN) { Region.IgnoreRegion = true; return std::nullopt; } - // If the incoming block isn't in the region, we don't have to worry about - // this incoming value. - if (!Blocks.contains(IncomingBlock)) - continue; - // Collect the canonical numbers of the values in the PHINode. unsigned GVN = *OGVN; OGVN = Cand.getCanonicalNum(GVN); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index eab15ac..f3f2e50 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3493,11 +3493,28 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Instrument generic vector reduction intrinsics // by ORing together all their fields. + // + // The return type does not need to be the same type as the fields + // e.g., declare i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8>) void handleVectorReduceIntrinsic(IntrinsicInst &I) { IRBuilder<> IRB(&I); Value *S = IRB.CreateOrReduce(getShadow(&I, 0)); + S = CreateShadowCast(IRB, S, getShadowTy(&I)); setShadow(&I, S); - setOrigin(&I, getOrigin(&I, 0)); + setOriginForNaryOp(I); + } + + // Similar to handleVectorReduceIntrinsic but with an initial starting value. + // e.g., call float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> + // %a1) + // shadow = shadow[a0] | shadow[a1.0] | shadow[a1.1] + void handleVectorReduceWithStarterIntrinsic(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + Value *Shadow0 = getShadow(&I, 0); + Value *Shadow1 = IRB.CreateOrReduce(getShadow(&I, 1)); + Value *S = IRB.CreateOr(Shadow0, Shadow1); + setShadow(&I, S); + setOriginForNaryOp(I); } // Instrument vector.reduce.or intrinsic. @@ -4346,8 +4363,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::vector_reduce_add: case Intrinsic::vector_reduce_xor: case Intrinsic::vector_reduce_mul: + // Add reduction to scalar + case Intrinsic::aarch64_neon_faddv: + case Intrinsic::aarch64_neon_saddv: + case Intrinsic::aarch64_neon_uaddv: handleVectorReduceIntrinsic(I); break; + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: + handleVectorReduceWithStarterIntrinsic(I); + break; + case Intrinsic::x86_sse_stmxcsr: handleStmxcsr(I); break; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 539c922..558d75c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7634,6 +7634,60 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S, NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size()); } +/// Builds the arguments types vector for the given call instruction with the +/// given \p ID for the specified vector factor. 
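Referring back to the MemorySanitizer hunk above: the new handleVectorReduceWithStarterIntrinsic propagates shadow for llvm.vector.reduce.fadd/fmul by OR-ing the starter operand's shadow with the OR-reduction of the vector operand's lane shadows, since the scalar result is clean only if every input is clean. A plain-C++ sketch of that rule (illustrative 32-bit shadows, two lanes):

#include <array>
#include <cstdint>

uint32_t reduceWithStarterShadow(uint32_t StarterShadow,
                                 const std::array<uint32_t, 2> &LaneShadows) {
  uint32_t Reduced = 0;
  for (uint32_t S : LaneShadows)
    Reduced |= S;                 // IRB.CreateOrReduce(getShadow(&I, 1))
  return StarterShadow | Reduced; // IRB.CreateOr(Shadow0, Shadow1)
}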
+static SmallVector<Type *> +buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, + const unsigned VF, unsigned MinBW, + const TargetTransformInfo *TTI) { + SmallVector<Type *> ArgTys; + for (auto [Idx, Arg] : enumerate(CI->args())) { + if (ID != Intrinsic::not_intrinsic) { + if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) { + ArgTys.push_back(Arg->getType()); + continue; + } + if (MinBW > 0) { + ArgTys.push_back( + getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF)); + continue; + } + } + ArgTys.push_back(getWidenedType(Arg->getType(), VF)); + } + return ArgTys; +} + +/// Calculates the costs of vectorized intrinsic (if possible) and vectorized +/// function (if possible) calls. +static std::pair<InstructionCost, InstructionCost> +getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, + TargetTransformInfo *TTI, TargetLibraryInfo *TLI, + ArrayRef<Type *> ArgTys) { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + + // Calculate the cost of the scalar and vector calls. + FastMathFlags FMF; + if (auto *FPCI = dyn_cast<FPMathOperator>(CI)) + FMF = FPCI->getFastMathFlags(); + IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF); + auto IntrinsicCost = + TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput); + + auto Shape = VFShape::get(CI->getFunctionType(), + ElementCount::getFixed(VecTy->getNumElements()), + false /*HasGlobalPred*/); + Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); + auto LibCost = IntrinsicCost; + if (!CI->isNoBuiltin() && VecFunc) { + // Calculate the cost of the vector library call. + // If the corresponding vector call is cheaper, return its cost. + LibCost = + TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput); + } + return {IntrinsicCost, LibCost}; +} + BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( const InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder, @@ -9017,34 +9071,6 @@ bool BoUpSLP::areAllUsersVectorized( }); } -static std::pair<InstructionCost, InstructionCost> -getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, - TargetTransformInfo *TTI, TargetLibraryInfo *TLI, - ArrayRef<Type *> ArgTys) { - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - - // Calculate the cost of the scalar and vector calls. - FastMathFlags FMF; - if (auto *FPCI = dyn_cast<FPMathOperator>(CI)) - FMF = FPCI->getFastMathFlags(); - IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF); - auto IntrinsicCost = - TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput); - - auto Shape = VFShape::get(CI->getFunctionType(), - ElementCount::getFixed(VecTy->getNumElements()), - false /*HasGlobalPred*/); - Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); - auto LibCost = IntrinsicCost; - if (!CI->isNoBuiltin() && VecFunc) { - // Calculate the cost of the vector library call. - // If the corresponding vector call is cheaper, return its cost. 
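For the getVectorCallCosts helper above (moved, not changed): the two returned costs feed a cheaper-of-two decision in the SLP vectorizer which, as I read the surrounding code, lowers the bundle to a vector library call only when such a call exists and costs less than the widened intrinsic. A toy stand-in with made-up numbers:

#include <cstdio>
#include <utility>

int main() {
  // {IntrinsicCost, LibCost}, e.g. as returned by getVectorCallCosts(...)
  std::pair<int, int> Costs{8, 5};
  bool UseLibCall = Costs.second < Costs.first; // e.g. a libmvec/SLEEF match
  std::printf("lower as %s\n",
              UseLibCall ? "vector library call" : "vector intrinsic");
  return 0;
}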
- LibCost = - TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput); - } - return {IntrinsicCost, LibCost}; -} - void BoUpSLP::TreeEntry::buildAltOpShuffleMask( const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask, SmallVectorImpl<Value *> *OpScalars, @@ -11045,30 +11071,6 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const { return TTI::CastContextHint::None; } -/// Builds the arguments types vector for the given call instruction with the -/// given \p ID for the specified vector factor. -static SmallVector<Type *> -buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, - const unsigned VF, unsigned MinBW, - const TargetTransformInfo *TTI) { - SmallVector<Type *> ArgTys; - for (auto [Idx, Arg] : enumerate(CI->args())) { - if (ID != Intrinsic::not_intrinsic) { - if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) { - ArgTys.push_back(Arg->getType()); - continue; - } - if (MinBW > 0) { - ArgTys.push_back( - getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF)); - continue; - } - } - ArgTys.push_back(getWidenedType(Arg->getType(), VF)); - } - return ArgTys; -} - InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, SmallPtrSetImpl<Value *> &CheckedExtracts) { |
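To close out the SLPVectorizer hunks: buildIntrinsicArgTypes (moved earlier in the file, body unchanged) keeps an argument scalar when the intrinsic expects a scalar operand at that position, and otherwise widens it to the vector factor, optionally narrowing integers to MinBW first. A rough stand-in for that per-argument decision, using type names as strings purely for illustration:

#include <string>

// Mirrors the branch structure of buildIntrinsicArgTypes; illustrative only.
std::string widenedArgType(bool IsIntrinsic, bool ScalarOpAtArg, unsigned MinBW,
                           unsigned VF, const std::string &ScalarTy) {
  if (IsIntrinsic && ScalarOpAtArg)
    return ScalarTy; // e.g. the i1 immarg of llvm.abs stays scalar
  std::string Elt =
      (IsIntrinsic && MinBW > 0) ? "i" + std::to_string(MinBW) : ScalarTy;
  return "<" + std::to_string(VF) + " x " + Elt + ">"; // getWidenedType(...)
}
// widenedArgType(true, false, 0, 4, "i32") -> "<4 x i32>"
// widenedArgType(true, true,  0, 4, "i1")  -> "i1"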