Diffstat (limited to 'llvm/lib/Target')
17 files changed, 375 insertions, 88 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
index 137ff89..f13554f 100644
--- a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
+++ b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
@@ -47,6 +47,8 @@ public:
   StringRef getPassName() const override { return AARCH64_BRANCH_TARGETS_NAME; }
 
 private:
+  const AArch64Subtarget *Subtarget;
+
   void addBTI(MachineBasicBlock &MBB, bool CouldCall, bool CouldJump,
               bool NeedsWinCFI);
 };
@@ -75,6 +77,8 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) {
                     << "********** Function: " << MF.getName() << '\n');
   const Function &F = MF.getFunction();
 
+  Subtarget = &MF.getSubtarget<AArch64Subtarget>();
+
   // LLVM does not consider basic blocks which are the targets of jump tables
   // to be address-taken (the address can't escape anywhere else), but they are
   // used for indirect branches, so need BTI instructions.
@@ -100,9 +104,8 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) {
     // a BTI, and pointing the indirect branch at that. For non-ELF targets we
     // can't rely on that, so we assume that `CouldCall` is _always_ true due
     // to the risk of long-branch thunks at link time.
-    if (&MBB == &*MF.begin() &&
-        (!MF.getSubtarget<AArch64Subtarget>().isTargetELF() ||
-         (F.hasAddressTaken() || !F.hasLocalLinkage())))
+    if (&MBB == &*MF.begin() && (!Subtarget->isTargetELF() ||
+                                 (F.hasAddressTaken() || !F.hasLocalLinkage())))
       CouldCall = true;
 
     // If the block itself is address-taken, it could be indirectly branched
@@ -132,9 +135,6 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall,
                     << (CouldCall ? "c" : "") << " to " << MBB.getName() << "\n");
 
-  const AArch64InstrInfo *TII = static_cast<const AArch64InstrInfo *>(
-      MBB.getParent()->getSubtarget().getInstrInfo());
-
   unsigned HintNum = 32;
   if (CouldCall)
     HintNum |= 2;
@@ -162,6 +162,8 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall,
        MBBI->getOpcode() == AArch64::PACIBSP))
     return;
 
+  const AArch64InstrInfo *TII = Subtarget->getInstrInfo();
+
   // Insert BTI exactly at the first executable instruction.
   const DebugLoc DL = MBB.findDebugLoc(MBBI);
   MachineInstr *BTI = BuildMI(MBB, MBBI, DL, TII->get(AArch64::HINT))
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 1e607f4..f63981b 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1871,7 +1871,7 @@ bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
 }
 
 bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
-  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  TII = MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
 
   bool Modified = false;
   for (auto &MBB : MF)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index c76689f..0f7b34c 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -644,10 +644,10 @@ bool AArch64FrameLowering::hasReservedCallFrame(
 MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
     MachineFunction &MF, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator I) const {
-  const AArch64InstrInfo *TII =
-      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
-  const AArch64TargetLowering *TLI =
-      MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
+
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64InstrInfo *TII = Subtarget.getInstrInfo();
+  const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
   [[maybe_unused]] MachineFrameInfo &MFI = MF.getFrameInfo();
   DebugLoc DL = I->getDebugLoc();
   unsigned Opc = I->getOpcode();
@@ -1319,8 +1319,8 @@ StackOffset AArch64FrameLowering::getStackOffset(const MachineFunction &MF,
 // TODO: This function currently does not work for scalable vectors.
 int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
                                                  int FI) const {
-  const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
-      MF.getSubtarget().getRegisterInfo());
+  const AArch64RegisterInfo *RegInfo =
+      MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
   int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
   return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
              ? getFPOffset(MF, ObjectOffset).getFixed()
@@ -1343,10 +1343,9 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
     TargetStackID::Value StackID, Register &FrameReg, bool PreferFP,
     bool ForSimm) const {
   const auto &MFI = MF.getFrameInfo();
-  const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
-      MF.getSubtarget().getRegisterInfo());
-  const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
 
   int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
   int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
@@ -1466,7 +1465,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
     return FPOffset;
   }
 
   FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
-                                         : (unsigned)AArch64::SP;
+                                         : MCRegister(AArch64::SP);
   return SPOffset;
 }
@@ -1589,8 +1588,8 @@ static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
 namespace {
 
 struct RegPairInfo {
-  unsigned Reg1 = AArch64::NoRegister;
-  unsigned Reg2 = AArch64::NoRegister;
+  Register Reg1;
+  Register Reg2;
   int FrameIdx;
   int Offset;
   enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type;
@@ -1598,21 +1597,21 @@ struct RegPairInfo {
 
   RegPairInfo() = default;
 
-  bool isPaired() const { return Reg2 != AArch64::NoRegister; }
+  bool isPaired() const { return Reg2.isValid(); }
 
   bool isScalable() const { return Type == PPR || Type == ZPR; }
 };
 
 } // end anonymous namespace
 
-unsigned findFreePredicateReg(BitVector &SavedRegs) {
+MCRegister findFreePredicateReg(BitVector &SavedRegs) {
   for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) {
     if (SavedRegs.test(PReg)) {
       unsigned PNReg = PReg - AArch64::P0 + AArch64::PN0;
-      return PNReg;
+      return MCRegister(PNReg);
     }
   }
-  return AArch64::NoRegister;
+  return MCRegister();
 }
 
 // The multivector LD/ST are available only for SME or SVE2p1 targets
@@ -1930,8 +1929,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
   }
   bool PTrueCreated = false;
   for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
-    unsigned Reg1 = RPI.Reg1;
-    unsigned Reg2 = RPI.Reg2;
+    Register Reg1 = RPI.Reg1;
+    Register Reg2 = RPI.Reg2;
     unsigned StrOpc;
 
     // Issue sequence of spills for cs regs. The first spill may be converted
@@ -1967,7 +1966,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       break;
     }
 
-    unsigned X0Scratch = AArch64::NoRegister;
+    Register X0Scratch;
     auto RestoreX0 = make_scope_exit([&] {
       if (X0Scratch != AArch64::NoRegister)
         BuildMI(MBB, MI, DL, TII.get(TargetOpcode::COPY), AArch64::X0)
@@ -2009,11 +2008,15 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       }
     }
 
-    LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
-               if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
-               dbgs() << ") -> fi#(" << RPI.FrameIdx;
-               if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
-               dbgs() << ")\n");
+    LLVM_DEBUG({
+      dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
+      if (RPI.isPaired())
+        dbgs() << ", " << printReg(Reg2, TRI);
+      dbgs() << ") -> fi#(" << RPI.FrameIdx;
+      if (RPI.isPaired())
+        dbgs() << ", " << RPI.FrameIdx + 1;
+      dbgs() << ")\n";
+    });
 
     assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
            "Windows unwdinding requires a consecutive (FP,LR) pair");
@@ -2143,8 +2146,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
 
   bool PTrueCreated = false;
   for (const RegPairInfo &RPI : RegPairs) {
-    unsigned Reg1 = RPI.Reg1;
-    unsigned Reg2 = RPI.Reg2;
+    Register Reg1 = RPI.Reg1;
+    Register Reg2 = RPI.Reg2;
 
     // Issue sequence of restores for cs regs. The last restore may be converted
     // to a post-increment load later by emitEpilogue if the callee-save stack
@@ -2176,11 +2179,15 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     case RegPairInfo::VG:
       continue;
     }
-    LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
-               if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
-               dbgs() << ") -> fi#(" << RPI.FrameIdx;
-               if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
-               dbgs() << ")\n");
+    LLVM_DEBUG({
+      dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
+      if (RPI.isPaired())
+        dbgs() << ", " << printReg(Reg2, TRI);
+      dbgs() << ") -> fi#(" << RPI.FrameIdx;
+      if (RPI.isPaired())
+        dbgs() << ", " << RPI.FrameIdx + 1;
+      dbgs() << ")\n";
+    });
 
     // Windows unwind codes require consecutive registers if registers are
     // paired. Make the switch here, so that the code below will save (x,x+1)
@@ -2435,8 +2442,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
 
   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
-  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
-      MF.getSubtarget().getRegisterInfo());
+  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   unsigned UnspilledCSGPR = AArch64::NoRegister;
   unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
@@ -2444,9 +2450,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
   MachineFrameInfo &MFI = MF.getFrameInfo();
   const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
-  unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
-                                ? RegInfo->getBaseRegister()
-                                : (unsigned)AArch64::NoRegister;
+  MCRegister BasePointerReg =
+      RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() : MCRegister();
 
   unsigned ExtraCSSpill = 0;
   bool HasUnpairedGPR64 = false;
@@ -2456,7 +2461,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
 
   // Figure out which callee-saved registers to save/restore.
   for (unsigned i = 0; CSRegs[i]; ++i) {
-    const unsigned Reg = CSRegs[i];
+    const MCRegister Reg = CSRegs[i];
 
     // Add the base pointer register to SavedRegs if it is callee-save.
     if (Reg == BasePointerReg)
@@ -2470,7 +2475,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
     }
 
     bool RegUsed = SavedRegs.test(Reg);
-    unsigned PairedReg = AArch64::NoRegister;
+    MCRegister PairedReg;
     const bool RegIsGPR64 = AArch64::GPR64RegClass.contains(Reg);
     if (RegIsGPR64 || AArch64::FPR64RegClass.contains(Reg) ||
         AArch64::FPR128RegClass.contains(Reg)) {
@@ -2522,8 +2527,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
     AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
     // Find a suitable predicate register for the multi-vector spill/fill
     // instructions.
-    unsigned PnReg = findFreePredicateReg(SavedRegs);
-    if (PnReg != AArch64::NoRegister)
+    MCRegister PnReg = findFreePredicateReg(SavedRegs);
+    if (PnReg.isValid())
       AFI->setPredicateRegForFillSpill(PnReg);
     // If no free callee-save has been found assign one.
     if (!AFI->getPredicateRegForFillSpill() &&
@@ -2558,7 +2563,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
   unsigned PPRCSStackSize = 0;
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   for (unsigned Reg : SavedRegs.set_bits()) {
-    auto *RC = TRI->getMinimalPhysRegClass(Reg);
+    auto *RC = TRI->getMinimalPhysRegClass(MCRegister(Reg));
     assert(RC && "expected register class!");
     auto SpillSize = TRI->getSpillSize(*RC);
     bool IsZPR = AArch64::ZPRRegClass.contains(Reg);
@@ -2600,7 +2605,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
   LLVM_DEBUG({
     dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
     for (unsigned Reg : SavedRegs.set_bits())
-      dbgs() << ' ' << printReg(Reg, RegInfo);
+      dbgs() << ' ' << printReg(MCRegister(Reg), RegInfo);
     dbgs() << "\n";
   });
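The AArch64 hunks above consistently migrate raw `unsigned` register values with `AArch64::NoRegister` sentinels to `Register`/`MCRegister`, whose default-constructed value is already the invalid register. A minimal standalone C++ sketch of that idiom (illustrative only, not part of the patch):

    #include "llvm/CodeGen/Register.h"

    using namespace llvm;

    // Before: a raw unsigned, with AArch64::NoRegister (0) as the sentinel.
    // After: a default-constructed MCRegister is the invalid value, so
    // isValid() replaces explicit sentinel comparisons.
    MCRegister pickBaseReg(bool HasBasePointer, MCRegister BaseReg) {
      return HasBasePointer ? BaseReg : MCRegister(); // MCRegister() is invalid
    }

    bool haveBaseReg(MCRegister R) {
      return R.isValid(); // was: R != AArch64::NoRegister
    }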
diff --git a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
index d67182d..03dd1cd 100644
--- a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
@@ -649,7 +649,7 @@ bool AArch64LowerHomogeneousPE::runOnMBB(MachineBasicBlock &MBB) {
 }
 
 bool AArch64LowerHomogeneousPE::runOnMachineFunction(MachineFunction &MF) {
-  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  TII = MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
 
   bool Modified = false;
   for (auto &MBB : MF)
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 79975b0..5bfb19d9 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -620,7 +620,7 @@ AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
   return RC;
 }
 
-unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; }
+MCRegister AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; }
 
 bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 47d76f3..3b0f4f6 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -124,7 +124,7 @@ public:
   bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
 
   bool hasBasePointer(const MachineFunction &MF) const;
-  unsigned getBaseRegister() const;
+  MCRegister getBaseRegister() const;
 
   bool isArgumentRegister(const MachineFunction &MF,
                           MCRegister Reg) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
index d695f26..b4a4f4c 100644
--- a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
@@ -33,6 +33,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
@@ -49,8 +50,8 @@
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCSchedule.h"
 #include "llvm/Pass.h"
-#include <unordered_map>
 #include <map>
+#include <unordered_map>
 
 using namespace llvm;
 
@@ -67,7 +68,7 @@ namespace {
 struct AArch64SIMDInstrOpt : public MachineFunctionPass {
   static char ID;
 
-  const TargetInstrInfo *TII;
+  const AArch64InstrInfo *TII;
   MachineRegisterInfo *MRI;
   TargetSchedModel SchedModel;
 
@@ -694,13 +695,9 @@ bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
 
-  TII = MF.getSubtarget().getInstrInfo();
   MRI = &MF.getRegInfo();
-  const TargetSubtargetInfo &ST = MF.getSubtarget();
-  const AArch64InstrInfo *AAII =
-      static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
-  if (!AAII)
-    return false;
+  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
+  TII = ST.getInstrInfo();
   SchedModel.init(&ST);
   if (!SchedModel.hasInstrSchedModel())
     return false;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index 5c3e26e..4cd51d6 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -1114,7 +1114,6 @@ bool AArch64InstPrinter::printSyslAlias(const MCInst *MI,
   } else
     return false;
 
-  std::string Str;
   llvm::transform(Name, Name.begin(), ::tolower);
 
   O << '\t' << Ins << '\t' << Reg.str() << ", " << Name;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 02c5390..6214f4d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -740,7 +740,7 @@ static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
   return "r600";
 }
 
-static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
+static Reloc::Model getEffectiveRelocModel() {
   // The AMDGPU toolchain only supports generating shared objects, so we
   // must always use PIC.
   return Reloc::PIC_;
@@ -754,8 +754,8 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                          CodeGenOptLevel OptLevel)
     : CodeGenTargetMachineImpl(
           T, TT.computeDataLayout(), TT, getGPUOrDefault(TT, CPU), FS, Options,
-          getEffectiveRelocModel(RM),
-          getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
+          getEffectiveRelocModel(), getEffectiveCodeModel(CM, CodeModel::Small),
+          OptLevel),
       TLOF(createTLOF(getTargetTriple())) {
   initAsmInfo();
   if (TT.isAMDGCN()) {
diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td
index 7453727..b60569e 100644
--- a/llvm/lib/Target/ARM/ARMProcessors.td
+++ b/llvm/lib/Target/ARM/ARMProcessors.td
@@ -421,6 +421,17 @@ def : ProcessorModel<"cortex-m52", CortexM55Model, [ARMv81mMainline,
                                                     FeatureMVEVectorCostFactor1,
                                                     HasMVEFloatOps]>;
 
+def : ProcessorModel<"star-mc3", CortexM55Model, [ARMv81mMainline,
+                                                  FeatureDSP,
+                                                  FeatureFPARMv8_D16,
+                                                  FeatureHasNoBranchPredictor,
+                                                  FeaturePACBTI,
+                                                  FeatureUseMISched,
+                                                  FeaturePreferBranchAlign32,
+                                                  FeatureHasSlowFPVMLx,
+                                                  FeatureMVEVectorCostFactor1,
+                                                  HasMVEFloatOps]>;
+
 def : ProcNoItin<"cortex-a32",  [ARMv8a,
                                  FeatureHWDivThumb,
                                  FeatureHWDivARM,
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index ca4a655..80c96c6 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1701,6 +1701,43 @@ lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
                      DAG.getConstant(Imm, DL, GRLenVT));
 }
 
+/// Lower VECTOR_SHUFFLE whose result is the reversed source vector.
+///
+/// It is possible to do optimization for VECTOR_SHUFFLE performing vector
+/// reverse whose mask likes:
+///   <7, 6, 5, 4, 3, 2, 1, 0>
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue
+lowerVECTOR_SHUFFLE_IsReverse(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                              SDValue V1, SelectionDAG &DAG,
+                              const LoongArchSubtarget &Subtarget) {
+  // Only vectors with i8/i16 elements which cannot match other patterns
+  // directly needs to do this.
+  if (VT != MVT::v16i8 && VT != MVT::v8i16 && VT != MVT::v32i8 &&
+      VT != MVT::v16i16)
+    return SDValue();
+
+  if (!ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
+    return SDValue();
+
+  int WidenNumElts = VT.getVectorNumElements() / 4;
+  SmallVector<int, 16> WidenMask(WidenNumElts, -1);
+  for (int i = 0; i < WidenNumElts; ++i)
+    WidenMask[i] = WidenNumElts - 1 - i;
+
+  MVT WidenVT = MVT::getVectorVT(
+      VT.getVectorElementType() == MVT::i8 ? MVT::i32 : MVT::i64, WidenNumElts);
+  SDValue NewV1 = DAG.getBitcast(WidenVT, V1);
+  SDValue WidenRev = DAG.getVectorShuffle(WidenVT, DL, NewV1,
+                                          DAG.getUNDEF(WidenVT), WidenMask);
+
+  return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT,
+                     DAG.getBitcast(VT, WidenRev),
+                     DAG.getConstant(27, DL, Subtarget.getGRLenVT()));
+}
+
 /// Lower VECTOR_SHUFFLE into VPACKEV (if possible).
 ///
 /// VPACKEV interleaves the even elements from each vector.
@@ -2004,6 +2041,9 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
     if ((Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG,
                                               Subtarget)))
       return Result;
+    if ((Result =
+             lowerVECTOR_SHUFFLE_IsReverse(DL, Mask, VT, V1, DAG, Subtarget)))
+      return Result;
 
     // TODO: This comment may be enabled in the future to better match the
     // pattern for instruction selection.
@@ -2622,6 +2662,9 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
       return Result;
     if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, Mask, VT, V1, DAG, Subtarget)))
       return Result;
+    if ((Result =
+             lowerVECTOR_SHUFFLE_IsReverse(DL, Mask, VT, V1, DAG, Subtarget)))
+      return Result;
 
     // TODO: This comment may be enabled in the future to better match the
     // pattern for instruction selection.
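Why the VSHUF4I immediate 27 above yields a full reverse: 27 = 0b00011011 encodes the per-group pattern <3, 2, 1, 0>, while the widened v4i32/v4i64 shuffle <3, 2, 1, 0> reverses the groups themselves. A standalone C++ trace of that two-step decomposition for the v16i8 case (illustrative only, not part of the patch; it assumes vshuf4i places source element (imm >> 2*j) & 3 at result position j within each group of four):

    #include <array>
    #include <cassert>
    #include <cstdint>

    // Step 1 mirrors the widened shuffle (reverse the four 32-bit lanes);
    // step 2 mirrors vshuf4i.b with imm 27 (reverse the four bytes inside
    // each lane). Composed, they reverse all 16 bytes.
    std::array<uint8_t, 16> reverse16(const std::array<uint8_t, 16> &V) {
      std::array<uint8_t, 16> Wide, Out;
      for (int Lane = 0; Lane < 4; ++Lane)   // step 1: reverse lanes
        for (int B = 0; B < 4; ++B)
          Wide[Lane * 4 + B] = V[(3 - Lane) * 4 + B];
      for (int Lane = 0; Lane < 4; ++Lane)   // step 2: reverse within lanes
        for (int B = 0; B < 4; ++B)
          Out[Lane * 4 + B] = Wide[Lane * 4 + (3 - B)];
      return Out;
    }

    int main() {
      std::array<uint8_t, 16> In;
      for (int I = 0; I < 16; ++I)
        In[I] = I;
      auto Out = reverse16(In);
      for (int I = 0; I < 16; ++I)
        assert(Out[I] == 15 - I); // matches the reverse mask <15, 14, ..., 0>
    }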
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index c5e26c1..9de4c9d 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -62,8 +62,7 @@ static cl::opt<bool>
                      cl::desc("Enable the merge base offset pass"),
                      cl::init(true), cl::Hidden);
 
-static Reloc::Model getEffectiveRelocModel(const Triple &TT,
-                                           std::optional<Reloc::Model> RM) {
+static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
   return RM.value_or(Reloc::Static);
 }
 
@@ -92,7 +91,7 @@ LoongArchTargetMachine::LoongArchTargetMachine(
     const TargetOptions &Options, std::optional<Reloc::Model> RM,
     std::optional<CodeModel::Model> CM, CodeGenOptLevel OL, bool JIT)
     : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options,
-                               getEffectiveRelocModel(TT, RM),
+                               getEffectiveRelocModel(RM),
                                getEffectiveLoongArchCodeModel(TT, CM), OL),
       TLOF(std::make_unique<TargetLoweringObjectFileELF>()) {
   initAsmInfo();
diff --git a/llvm/lib/Target/M68k/M68kTargetMachine.cpp b/llvm/lib/Target/M68k/M68kTargetMachine.cpp
index 847c27ba..f525d43 100644
--- a/llvm/lib/Target/M68k/M68kTargetMachine.cpp
+++ b/llvm/lib/Target/M68k/M68kTargetMachine.cpp
@@ -46,13 +46,9 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeM68kTarget() {
 
 namespace {
 
-Reloc::Model getEffectiveRelocModel(const Triple &TT,
-                                    std::optional<Reloc::Model> RM) {
+Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
   // If not defined we default to static
-  if (!RM.has_value())
-    return Reloc::Static;
-
-  return *RM;
+  return RM.value_or(Reloc::Static);
 }
 
 CodeModel::Model getEffectiveCodeModel(std::optional<CodeModel::Model> CM,
@@ -73,7 +69,7 @@ M68kTargetMachine::M68kTargetMachine(const Target &T, const Triple &TT,
                                      std::optional<CodeModel::Model> CM,
                                      CodeGenOptLevel OL, bool JIT)
     : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options,
                                getEffectiveRelocModel(TT, RM),
-                               getEffectiveRelocModel(TT, RM),
+                               getEffectiveRelocModel(RM),
                                ::getEffectiveCodeModel(CM, JIT), OL),
       TLOF(std::make_unique<M68kELFTargetObjectFile>()),
       Subtarget(TT, CPU, FS, *this) {
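The LoongArch, M68k, RISC-V, and WebAssembly hunks all make the same cleanup: drop the unused Triple parameter and collapse manual std::optional handling into value_or. A minimal sketch of the equivalence (illustrative only, not part of the patch; the enum here is a stand-in for Reloc::Model):

    #include <optional>

    enum class RelocModel { Static, PIC };

    // Before: explicit has_value() branching on the optional.
    RelocModel beforeCleanup(std::optional<RelocModel> RM) {
      if (!RM.has_value())
        return RelocModel::Static;
      return *RM;
    }

    // After: value_or expresses the same default in one line.
    RelocModel afterCleanup(std::optional<RelocModel> RM) {
      return RM.value_or(RelocModel::Static);
    }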
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 598735f..c923f0e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1082,6 +1082,161 @@ let Predicates = [hasPTX<70>, hasSM<80>] in {
                  "mbarrier.pending_count.b64",
                  [(set i32:$res, (int_nvvm_mbarrier_pending_count i64:$state))]>;
 }
+
+class MBAR_UTIL<string op, string scope,
+                string space = "", string sem = "",
+                bit tl = 0, bit parity = 0> {
+  // The mbarrier instructions in PTX ISA are of the general form:
+  //     mbarrier.op.semantics.scope.space.b64 arg1, arg2 ...
+  // where:
+  //     op        -> arrive, expect_tx, complete_tx, arrive.expect_tx etc.
+  //     semantics -> acquire, release, relaxed (default depends on the op)
+  //     scope     -> cta or cluster (default is cta-scope)
+  //     space     -> shared::cta or shared::cluster (default is shared::cta)
+  //
+  // The 'semantics' and 'scope' go together. If one is specified,
+  // then the other _must_ be specified. For example:
+  //     (A) mbarrier.arrive <args>             (valid, release and cta are default)
+  //     (B) mbarrier.arrive.release.cta <args> (valid, sem/scope mentioned explicitly)
+  //     (C) mbarrier.arrive.release <args>     (invalid, needs scope)
+  //     (D) mbarrier.arrive.cta <args>         (invalid, needs order)
+  //
+  // Wherever possible, we prefer form (A) to (B) since it is available
+  // from early PTX versions. In most cases, explicitly specifying the
+  // scope requires a later version of PTX.
+  string _scope_asm = !cond(
+      !eq(scope, "scope_cluster") : "cluster",
+      !eq(scope, "scope_cta") : !if(!empty(sem), "", "cta"),
+      true : scope);
+  string _space_asm = !cond(
+      !eq(space, "space_cta") : "shared",
+      !eq(space, "space_cluster") : "shared::cluster",
+      true : space);
+
+  string _parity = !if(parity, "parity", "");
+  string asm_str = StrJoin<".", ["mbarrier", op, _parity,
+                                 sem, _scope_asm, _space_asm, "b64"]>.ret;
+
+  string _intr_suffix = StrJoin<"_", [!subst(".", "_", op), _parity,
+                                      !if(tl, "tl", ""),
+                                      sem, scope, space]>.ret;
+  string intr_name = "int_nvvm_mbarrier_" # _intr_suffix;
+
+  // Predicate checks:
+  // These are used only for the "test_wait/try_wait" variants as they
+  // have evolved since sm80 and are complex. The predicates for the
+  // remaining instructions are straightforward and have already been
+  // applied directly.
+  Predicate _sm_pred = !cond(!or(
+                               !eq(op, "try_wait"),
+                               !eq(scope, "scope_cluster"),
+                               !eq(sem, "relaxed")) : hasSM<90>,
+                             true : hasSM<80>);
+  Predicate _ptx_pred = !cond(
+      !eq(sem, "relaxed") : hasPTX<86>,
+      !ne(_scope_asm, "") : hasPTX<80>,
+      !eq(op, "try_wait") : hasPTX<78>,
+      parity : hasPTX<71>,
+      true : hasPTX<70>);
+  list<Predicate> preds = [_ptx_pred, _sm_pred];
+}
+
+foreach op = ["expect_tx", "complete_tx"] in {
+  foreach scope = ["scope_cta", "scope_cluster"] in {
+    foreach space = ["space_cta", "space_cluster"] in {
+      defvar intr = !cast<Intrinsic>(MBAR_UTIL<op, scope, space>.intr_name);
+      defvar suffix = StrJoin<"_", [op, scope, space]>.ret;
+      def mbar_ # suffix : BasicNVPTXInst<(outs), (ins ADDR:$addr, B32:$tx_count),
+                                          MBAR_UTIL<op, scope, space, "relaxed">.asm_str,
+                                          [(intr addr:$addr, i32:$tx_count)]>,
+                           Requires<[hasPTX<80>, hasSM<90>]>;
+    } // space
+  } // scope
+} // op
+
+multiclass MBAR_ARR_INTR<string op, string scope, string sem,
+                         list<Predicate> pred = []> {
+  // When either of sem or scope is non-default, both have to
+  // be explicitly specified. So, explicitly state that
+  // sem is `release` when scope is `cluster`.
+  defvar asm_sem = !if(!and(!empty(sem), !eq(scope, "scope_cluster")),
+                       "release", sem);
+
+  defvar asm_cta = MBAR_UTIL<op, scope, "space_cta", asm_sem>.asm_str;
+  defvar intr_cta = !cast<Intrinsic>(MBAR_UTIL<op, scope,
+                                               "space_cta", sem>.intr_name);
+
+  defvar asm_cluster = MBAR_UTIL<op, scope, "space_cluster", asm_sem>.asm_str;
+  defvar intr_cluster = !cast<Intrinsic>(MBAR_UTIL<op, scope,
+                                                   "space_cluster", sem>.intr_name);
+
+  def _CTA : NVPTXInst<(outs B64:$state),
+                       (ins ADDR:$addr, B32:$tx_count),
+                       asm_cta # " $state, [$addr], $tx_count;",
+                       [(set i64:$state, (intr_cta addr:$addr, i32:$tx_count))]>,
+             Requires<pred>;
+  def _CLUSTER : NVPTXInst<(outs),
+                           (ins ADDR:$addr, B32:$tx_count),
+                           asm_cluster # " _, [$addr], $tx_count;",
+                           [(intr_cluster addr:$addr, i32:$tx_count)]>,
+                 Requires<pred>;
+}
+foreach op = ["arrive", "arrive.expect_tx",
+              "arrive_drop", "arrive_drop.expect_tx"] in {
+  foreach scope = ["scope_cta", "scope_cluster"] in {
+    defvar suffix = !subst(".", "_", op) # scope;
+    defm mbar_ # suffix # _release : MBAR_ARR_INTR<op, scope, "", [hasPTX<80>, hasSM<90>]>;
+    defm mbar_ # suffix # _relaxed : MBAR_ARR_INTR<op, scope, "relaxed", [hasPTX<86>, hasSM<90>]>;
+  } // scope
+} // op
+
+multiclass MBAR_WAIT_INTR<string op, string scope, string sem, bit time_limit> {
+  // When either of sem or scope is non-default, both have to
+  // be explicitly specified. So, explicitly state that the
+  // semantics is `acquire` when the scope is `cluster`.
+  defvar asm_sem = !if(!and(!empty(sem), !eq(scope, "scope_cluster")),
+                       "acquire", sem);
+
+  defvar asm_parity = MBAR_UTIL<op, scope, "space_cta", asm_sem,
+                                time_limit, 1>.asm_str;
+  defvar pred_parity = MBAR_UTIL<op, scope, "space_cta", asm_sem,
+                                 time_limit, 1>.preds;
+  defvar intr_parity = !cast<Intrinsic>(MBAR_UTIL<op, scope, "space_cta",
+                                                  sem, time_limit, 1>.intr_name);
+
+  defvar asm_state = MBAR_UTIL<op, scope, "space_cta", asm_sem,
+                               time_limit>.asm_str;
+  defvar pred_state = MBAR_UTIL<op, scope, "space_cta", asm_sem,
+                                time_limit>.preds;
+  defvar intr_state = !cast<Intrinsic>(MBAR_UTIL<op, scope, "space_cta",
+                                                 sem, time_limit>.intr_name);
+
+  defvar ins_tl_dag = !if(time_limit, (ins B32:$tl), (ins));
+  defvar tl_suffix = !if(time_limit, ", $tl;", ";");
+  defvar intr_state_dag = !con((intr_state addr:$addr, i64:$state),
+                               !if(time_limit, (intr_state i32:$tl), (intr_state)));
+  defvar intr_parity_dag = !con((intr_parity addr:$addr, i32:$phase),
+                                !if(time_limit, (intr_parity i32:$tl), (intr_parity)));
+
+  def _STATE : NVPTXInst<(outs B1:$res), !con((ins ADDR:$addr, B64:$state), ins_tl_dag),
+                         asm_state # " $res, [$addr], $state" # tl_suffix,
+                         [(set i1:$res, intr_state_dag)]>,
+               Requires<pred_state>;
+  def _PARITY : NVPTXInst<(outs B1:$res), !con((ins ADDR:$addr, B32:$phase), ins_tl_dag),
+                          asm_parity # " $res, [$addr], $phase" # tl_suffix,
+                          [(set i1:$res, intr_parity_dag)]>,
+                Requires<pred_parity>;
+}
+foreach op = ["test_wait", "try_wait"] in {
+  foreach scope = ["scope_cta", "scope_cluster"] in {
+    foreach time_limit = !if(!eq(op, "try_wait"), [true, false], [false]) in {
+      defvar suffix = StrJoin<"_", [op, scope, !if(time_limit, "tl", "")]>.ret;
+      defm mbar_ # suffix # "_acquire" : MBAR_WAIT_INTR<op, scope, "", time_limit>;
+      defm mbar_ # suffix # "_relaxed" : MBAR_WAIT_INTR<op, scope, "relaxed", time_limit>;
+    } // time_limit
+  } // scope
+} // op
+
 //-----------------------------------
 // Math Functions
 //-----------------------------------
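How MBAR_UTIL's StrJoin-based asm_str produces the short form (A): empty components are dropped from the dotted name, so leaving sem and scope empty removes them entirely. A standalone C++ mirror of that join (illustrative only, not part of the patch; it assumes StrJoin skips empty strings, which is what the (A)-(D) examples above imply):

    #include <iostream>
    #include <string>
    #include <vector>

    // Join non-empty components with '.', as StrJoin<".", [...]> does.
    std::string mbarName(const std::vector<std::string> &Parts) {
      std::string Out;
      for (const auto &P : Parts) {
        if (P.empty())
          continue;
        if (!Out.empty())
          Out += '.';
        Out += P;
      }
      return Out;
    }

    int main() {
      // Default sem/scope (form A): "mbarrier.arrive.shared.b64".
      std::cout << mbarName({"mbarrier", "arrive", "", "", "", "shared", "b64"})
                << '\n';
      // Explicit sem forces the scope too (form B, cluster variant):
      // "mbarrier.arrive.release.cluster.shared::cluster.b64".
      std::cout << mbarName({"mbarrier", "arrive", "", "release", "cluster",
                             "shared::cluster", "b64"})
                << '\n';
    }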
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index 53633ea..8198173 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -92,6 +92,8 @@ private:
   void emitFence(AtomicOrdering FenceOrdering, SyncScope::ID FenceSSID,
                  MachineIRBuilder &MIB) const;
   bool selectUnmergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const;
+  bool selectIntrinsicWithSideEffects(MachineInstr &I,
+                                      MachineIRBuilder &MIB) const;
 
   ComplexRendererFns selectShiftMask(MachineOperand &Root,
                                      unsigned ShiftWidth) const;
@@ -714,6 +716,88 @@ static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) {
   return GenericOpc;
 }
 
+bool RISCVInstructionSelector::selectIntrinsicWithSideEffects(
+    MachineInstr &I, MachineIRBuilder &MIB) const {
+  // Find the intrinsic ID.
+  unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID();
+  // Select the instruction.
+  switch (IntrinID) {
+  default:
+    return false;
+  case Intrinsic::riscv_vlm:
+  case Intrinsic::riscv_vle:
+  case Intrinsic::riscv_vle_mask:
+  case Intrinsic::riscv_vlse:
+  case Intrinsic::riscv_vlse_mask: {
+    bool IsMasked = IntrinID == Intrinsic::riscv_vle_mask ||
+                    IntrinID == Intrinsic::riscv_vlse_mask;
+    bool IsStrided = IntrinID == Intrinsic::riscv_vlse ||
+                     IntrinID == Intrinsic::riscv_vlse_mask;
+    LLT VT = MRI->getType(I.getOperand(0).getReg());
+    unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
+
+    // Result vector
+    const Register DstReg = I.getOperand(0).getReg();
+
+    // Sources
+    bool HasPassthruOperand = IntrinID != Intrinsic::riscv_vlm;
+    unsigned CurOp = 2;
+    SmallVector<SrcOp, 4> SrcOps; // Source registers.
+
+    // Passthru
+    if (HasPassthruOperand) {
+      auto PassthruReg = I.getOperand(CurOp++).getReg();
+      SrcOps.push_back(PassthruReg);
+    } else {
+      SrcOps.push_back(Register(RISCV::NoRegister));
+    }
+
+    // Base Pointer
+    auto PtrReg = I.getOperand(CurOp++).getReg();
+    SrcOps.push_back(PtrReg);
+
+    // Stride
+    if (IsStrided) {
+      auto StrideReg = I.getOperand(CurOp++).getReg();
+      SrcOps.push_back(StrideReg);
+    }
+
+    // Mask
+    if (IsMasked) {
+      auto MaskReg = I.getOperand(CurOp++).getReg();
+      SrcOps.push_back(MaskReg);
+    }
+
+    RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT));
+    const RISCV::VLEPseudo *P =
+        RISCV::getVLEPseudo(IsMasked, IsStrided, /*FF*/ false, Log2SEW,
+                            static_cast<unsigned>(LMUL));
+
+    auto PseudoMI = MIB.buildInstr(P->Pseudo, {DstReg}, SrcOps);
+
+    // Select VL
+    auto VLOpFn = renderVLOp(I.getOperand(CurOp++));
+    for (auto &RenderFn : *VLOpFn)
+      RenderFn(PseudoMI);
+
+    // SEW
+    PseudoMI.addImm(Log2SEW);
+
+    // Policy
+    uint64_t Policy = RISCVVType::MASK_AGNOSTIC;
+    if (IsMasked)
+      Policy = I.getOperand(CurOp++).getImm();
+    PseudoMI.addImm(Policy);
+
+    // Memref
+    PseudoMI.cloneMemRefs(I);
+
+    I.eraseFromParent();
+    return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI);
+  }
+  }
+}
+
 bool RISCVInstructionSelector::select(MachineInstr &MI) {
   MachineIRBuilder MIB(MI);
 
@@ -984,6 +1068,8 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) {
 
     return constrainSelectedInstRegOperands(*NewInst, TII, TRI, RBI);
   }
+  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+    return selectIntrinsicWithSideEffects(MI, MIB);
   default:
     return false;
   }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index f81b1e12..ae54ff1 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -141,8 +141,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   initializeRISCVAsmPrinterPass(*PR);
 }
 
-static Reloc::Model getEffectiveRelocModel(const Triple &TT,
-                                           std::optional<Reloc::Model> RM) {
+static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
   return RM.value_or(Reloc::Static);
 }
 
@@ -154,7 +153,7 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
                                        CodeGenOptLevel OL, bool JIT)
     : CodeGenTargetMachineImpl(
           T, TT.computeDataLayout(Options.MCOptions.getABIName()), TT, CPU, FS,
-          Options, getEffectiveRelocModel(TT, RM),
+          Options, getEffectiveRelocModel(RM),
           getEffectiveCodeModel(CM, CodeModel::Small), OL),
       TLOF(std::make_unique<RISCVELFTargetObjectFile>()) {
   initAsmInfo();
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index a9c638c..621640c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -127,16 +127,11 @@ LLVMInitializeWebAssemblyTarget() {
 // WebAssembly Lowering public interface.
 //===----------------------------------------------------------------------===//
 
-static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM,
-                                           const Triple &TT) {
-  if (!RM) {
-    // Default to static relocation model. This should always be more optimial
-    // than PIC since the static linker can determine all global addresses and
-    // assume direct function calls.
-    return Reloc::Static;
-  }
-
-  return *RM;
+static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
+  // Default to static relocation model. This should always be more optimial
+  // than PIC since the static linker can determine all global addresses and
+  // assume direct function calls.
+  return RM.value_or(Reloc::Static);
 }
 
 using WebAssembly::WasmEnableEH;
 
@@ -197,7 +192,7 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
     const TargetOptions &Options, std::optional<Reloc::Model> RM,
     std::optional<CodeModel::Model> CM, CodeGenOptLevel OL, bool JIT)
     : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options,
-                               getEffectiveRelocModel(RM, TT),
+                               getEffectiveRelocModel(RM),
                                getEffectiveCodeModel(CM, CodeModel::Large), OL),
       TLOF(new WebAssemblyTargetObjectFile()),
      UsesMultivalueABI(Options.MCOptions.getABIName() == "experimental-mv") {
