Diffstat (limited to 'llvm/lib/Target/X86')
28 files changed, 462 insertions, 226 deletions
diff --git a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp index c0a6035..c0b9339 100644 --- a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp +++ b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp @@ -69,13 +69,13 @@ public: CCValAssign::LocInfo LocInfo, const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, CCState &State) override { - bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); + bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, Info.Ty, State); StackSize = State.getStackSize(); static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7}; - if (!Info.IsFixed) + if (Flags.isVarArg()) NumXMMRegs = State.getFirstUnallocated(XMMArgRegs); return Res; @@ -363,7 +363,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, Info.CallConv, Info.IsVarArg)) return false; - bool IsFixed = Info.OrigArgs.empty() ? true : Info.OrigArgs.back().IsFixed; + bool IsFixed = + Info.OrigArgs.empty() ? true : !Info.OrigArgs.back().Flags[0].isVarArg(); if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(Info.CallConv)) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp index 817e88d..e2a1bbf3 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp @@ -36,11 +36,31 @@ void X86InstrPostProcess::setMemBarriers(std::unique_ptr<Instruction> &Inst, } } +void X86InstrPostProcess::useStackEngine(std::unique_ptr<Instruction> &Inst, + const MCInst &MCI) { + // TODO(boomanaiden154): We currently do not handle PUSHF/POPF because we + // have not done the necessary benchmarking to see if they are also + // optimized by the stack engine. + // TODO: We currently just remove all RSP writes from stack operations. This + // is not fully correct because we do not model sync uops which will + // delay subsequent rsp using non-stack instructions. + if (X86::isPOP(MCI.getOpcode()) || X86::isPUSH(MCI.getOpcode())) { + auto *StackRegisterDef = + llvm::find_if(Inst->getDefs(), [](const WriteState &State) { + return State.getRegisterID() == X86::RSP; + }); + assert( + StackRegisterDef != Inst->getDefs().end() && + "Expected push instruction to implicitly use stack pointer register."); + Inst->getDefs().erase(StackRegisterDef); + } +} + void X86InstrPostProcess::postProcessInstruction( std::unique_ptr<Instruction> &Inst, const MCInst &MCI) { - // Currently, we only modify certain instructions' IsALoadBarrier and - // IsAStoreBarrier flags. + // Set IsALoadBarrier and IsAStoreBarrier flags. setMemBarriers(Inst, MCI); + useStackEngine(Inst, MCI); } } // namespace mca diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h index 4a83ba8..c5459e4 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h @@ -28,6 +28,11 @@ class X86InstrPostProcess : public InstrPostProcess { /// as load and store barriers. void setMemBarriers(std::unique_ptr<Instruction> &Inst, const MCInst &MCI); + /// Called within X86InstrPostPorcess to remove some rsp read operands + /// on stack instructions to better simulate the stack engine. We currently + /// do not model features of the stack engine like sync uops. 
+ void useStackEngine(std::unique_ptr<Instruction> &Inst, const MCInst &MCI); + public: X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 56a4cc3..865fc0c 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -485,7 +485,16 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, if (!CanPadInst) return; - if (PendingBA && PendingBA->getNext() == OS.getCurrentFragment()) { + if (PendingBA) { + auto *NextFragment = PendingBA->getNext(); + assert(NextFragment && "NextFragment should not be null"); + if (NextFragment == OS.getCurrentFragment()) + return; + // We eagerly create an empty fragment when inserting a fragment + // with a variable-size tail. + if (NextFragment->getNext() == OS.getCurrentFragment()) + return; + // Macro fusion actually happens and there is no other fragment inserted // after the previous instruction. // diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 0dabd98a3..25fcf81 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -17,6 +17,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSymbolMachO.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" @@ -354,8 +355,7 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, unsigned Type = MachO::GENERIC_RELOC_VANILLA; // See <reloc.h>. - const MCSymbol *A = Target.getAddSym(); - + auto *A = static_cast<const MCSymbolMachO *>(Target.getAddSym()); if (!A->getFragment()) { reportError(Fixup.getLoc(), "symbol '" + A->getName() + diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 990b381..9cfe081 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -355,6 +355,9 @@ def FeatureCCMP : SubtargetFeature<"ccmp", "HasCCMP", "true", "Support conditional cmp & test instructions">; def FeatureNF : SubtargetFeature<"nf", "HasNF", "true", "Support status flags update suppression">; +// FeatureCF is not enabled by default for APXF and targets that support APXF +// due to performance reason, though it is part of APXF. Users need to enable it +// manually. 
def FeatureCF : SubtargetFeature<"cf", "HasCF", "true", "Support conditional faulting">; def FeatureZU : SubtargetFeature<"zu", "HasZU", "true", @@ -1169,7 +1172,6 @@ def ProcessorFeatures { FeaturePPX, FeatureNDD, FeatureNF, - FeatureCF, FeatureMOVRS, FeatureAMXMOVRS, FeatureAMXAVX512, @@ -1291,7 +1293,9 @@ def ProcessorFeatures { list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps, TuningPreferMovmskOverVTest, TuningFastImmVectorShift]; - list<SubtargetFeature> ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning); + list<SubtargetFeature> ADLRemoveTuning = [TuningPOPCNTFalseDeps]; + list<SubtargetFeature> ADLTuning = + !listremove(!listconcat(SKLTuning, ADLAdditionalTuning), ADLRemoveTuning); list<SubtargetFeature> ADLFeatures = !listconcat(TRMFeatures, ADLAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86CallingConv.h b/llvm/lib/Target/X86/X86CallingConv.h index 191e0fa..8e37f34 100644 --- a/llvm/lib/Target/X86/X86CallingConv.h +++ b/llvm/lib/Target/X86/X86CallingConv.h @@ -22,10 +22,10 @@ namespace llvm { bool RetCC_X86(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_X86(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State); } // End llvm namespace diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 067bd43..f007886 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -3323,6 +3323,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { return false; SmallVector<MVT, 16> OutVTs; + SmallVector<Type *, 16> ArgTys; SmallVector<Register, 16> ArgRegs; // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra @@ -3369,6 +3370,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { ArgRegs.push_back(ResultReg); OutVTs.push_back(VT); + ArgTys.push_back(Val->getType()); } // Analyze operands of the call, assigning locations to each operand. @@ -3379,7 +3381,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { if (IsWin64) CCInfo.AllocateStack(32, Align(8)); - CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); + CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, ArgTys, CC_X86); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp index d3c2392..787b71d 100644 --- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp @@ -564,8 +564,17 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) { MachineBasicBlock::iterator I; if (LastShapeMI && dominates(MBB, MI, LastShapeMI)) I = ++LastShapeMI->getIterator(); - else - I = ++MI.getIterator(); + else { + // Call can overwrite registers like rax, ensure the tile config + // instruction is sinked closer to first instruction that uses tile. 
+ auto UseIt = MI.getIterator(); + while (UseIt != MBB.end()) { + if (HasTileOperand(MRI, *UseIt)) + break; + ++UseIt; + } + I = UseIt; + } Config(*I); HasUnconfigTile = false; continue; diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 95ed590..cba7843 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/EHPersonalities.h" @@ -2678,7 +2679,7 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, // object. // We need to factor in additional offsets applied during the prologue to the // frame, base, and stack pointer depending on which is used. - int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea(); + int64_t Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea(); const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t StackSize = MFI.getStackSize(); @@ -4212,6 +4213,14 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized( // emitPrologue if it gets called and emits CFI. MF.setHasWinCFI(false); + MachineFrameInfo &MFI = MF.getFrameInfo(); + // If the frame is big enough that we might need to scavenge a register to + // handle huge offsets, reserve a stack slot for that now. + if (!isInt<32>(MFI.estimateStackSize(MF))) { + int FI = MFI.CreateStackObject(SlotSize, Align(SlotSize), false); + RS->addScavengingFrameIndex(FI); + } + // If we are using Windows x64 CFI, ensure that the stack is always 8 byte // aligned. The format doesn't support misaligned stack adjustments. if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f366094..47cea93 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2756,8 +2756,10 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { !Subtarget.hasBWI()) return TypeSplitVector; + // Since v8f16 is legal, widen anything over v4f16. if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && - !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16) + VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() && + VT.getVectorElementType() == MVT::f16) return TypeSplitVector; if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && @@ -15419,18 +15421,18 @@ static SDValue lowerShuffleAsLanePermuteAndPermute( return SDValue(); } - // Avoid returning the same shuffle operation. For example, - // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5, - // undef:v16i16 - if (CrossLaneMask == Mask || InLaneMask == Mask) - return SDValue(); - // Simplify CrossLaneMask based on the actual demanded elements. if (V1.hasOneUse()) for (int i = 0; i != NumElts; ++i) if (!DemandedCrossLane[i]) CrossLaneMask[i] = SM_SentinelUndef; + // Avoid returning the same shuffle operation. 
For example, + // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5, + // undef:v16i16 + if (CrossLaneMask == Mask || InLaneMask == Mask) + return SDValue(); + SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask); return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT), InLaneMask); @@ -21250,7 +21252,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, // the truncation then we can use PACKSS by converting the srl to a sra. // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it. if (In.getOpcode() == ISD::SRL && In->hasOneUse()) - if (std::optional<uint64_t> ShAmt = DAG.getValidShiftAmount(In)) { + if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) { if (*ShAmt == MinSignBits) { PackOpcode = X86ISD::PACKSS; return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops()); @@ -22219,9 +22221,8 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { In = DAG.getBitcast(MVT::i16, In); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = In; - Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext()); + TargetLowering::ArgListEntry Entry( + In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext())); Entry.IsSExt = false; Entry.IsZExt = true; Args.push_back(Entry); @@ -22318,9 +22319,8 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = In; - Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext()); + TargetLowering::ArgListEntry Entry( + In, EVT(SVT).getTypeForEVT(*DAG.getContext())); Entry.IsSExt = false; Entry.IsZExt = true; Args.push_back(Entry); @@ -23185,43 +23185,51 @@ static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fallback // to CMP(MOVMSK(PCMPEQB(X,Y))). -static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS, +static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC) { - assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode"); + SDValue Op = OrigLHS; - bool CmpNull = isNullConstant(RHS); - bool CmpAllOnes = isAllOnesConstant(RHS); - if (!CmpNull && !CmpAllOnes) - return SDValue(); + bool CmpNull; + APInt Mask; + if (CC == ISD::SETEQ || CC == ISD::SETNE) { + CmpNull = isNullConstant(OrigRHS); + if (!CmpNull && !isAllOnesConstant(OrigRHS)) + return SDValue(); - SDValue Op = LHS; - if (!Subtarget.hasSSE2() || !Op->hasOneUse()) - return SDValue(); + if (!Subtarget.hasSSE2() || !Op->hasOneUse()) + return SDValue(); - // Check whether we're masking/truncating an OR-reduction result, in which - // case track the masked bits. - // TODO: Add CmpAllOnes support. - APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits()); - if (CmpNull) { - switch (Op.getOpcode()) { - case ISD::TRUNCATE: { - SDValue Src = Op.getOperand(0); - Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(), - Op.getScalarValueSizeInBits()); - Op = Src; - break; - } - case ISD::AND: { - if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { - Mask = Cst->getAPIntValue(); - Op = Op.getOperand(0); + // Check whether we're masking/truncating an OR-reduction result, in which + // case track the masked bits. + // TODO: Add CmpAllOnes support. 
+ Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits()); + if (CmpNull) { + switch (Op.getOpcode()) { + case ISD::TRUNCATE: { + SDValue Src = Op.getOperand(0); + Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(), + Op.getScalarValueSizeInBits()); + Op = Src; + break; + } + case ISD::AND: { + if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + Mask = Cst->getAPIntValue(); + Op = Op.getOperand(0); + } + break; + } } - break; - } } + } else if (CC == ISD::SETGT && isAllOnesConstant(OrigRHS)) { + CC = ISD::SETEQ; + CmpNull = true; + Mask = APInt::getSignMask(Op.getScalarValueSizeInBits()); + } else { + return SDValue(); } ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND; @@ -26261,10 +26269,9 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - - if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask)) - if (MaskConst->getZExtValue() & 0x1) - return Op; + auto *MaskConst = dyn_cast<ConstantSDNode>(Mask); + if (MaskConst && (MaskConst->getZExtValue() & 0x1)) + return Op; MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); @@ -26280,6 +26287,17 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, if (PreservedSrc.isUndef()) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + + if (MaskConst) { + assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask"); + // Discard op and blend passthrough with scalar op src/dst. + SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements()); + std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0); + ShuffleMask[0] = VT.getVectorNumElements(); + return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc, + ShuffleMask); + } + return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc); } @@ -30049,7 +30067,6 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons SDValue InChain = DAG.getEntryNode(); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { EVT ArgVT = Op->getOperand(i).getValueType(); assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && @@ -30058,13 +30075,9 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); - Entry.Node = StackPtr; InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16)); - Entry.Ty = PointerType::get(*DAG.getContext(), 0); - Entry.IsSExt = false; - Entry.IsZExt = false; - Args.push_back(Entry); + Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0)); } SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), @@ -33087,13 +33100,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - - Entry.Node = Arg; - Entry.Ty = ArgTy; - Entry.IsSExt = false; - Entry.IsZExt = false; - Args.push_back(Entry); + Args.emplace_back(Arg, ArgTy); bool isF64 = ArgVT == MVT::f64; // Only optimize x86_64 for now. i386 is a bit messy. 
For f32, @@ -38679,13 +38686,11 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); if (Opc == X86ISD::VSHLI) { - Known.Zero <<= ShAmt; - Known.One <<= ShAmt; + Known <<= ShAmt; // Low bits are known zero. Known.Zero.setLowBits(ShAmt); } else if (Opc == X86ISD::VSRLI) { - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + Known >>= ShAmt; // High bits are known zero. Known.Zero.setHighBits(ShAmt); } else { @@ -44206,8 +44211,12 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } // Conversions. // TODO: Add more CVT opcodes when we have test coverage. - case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI: { + if (!Subtarget.hasVLX()) + break; + [[fallthrough]]; + } + case X86ISD::CVTTP2SI: { if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 && !Subtarget.hasVLX()) break; @@ -44517,8 +44526,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( TLO, Depth + 1)) return true; - Known.Zero <<= ShAmt; - Known.One <<= ShAmt; + Known <<= ShAmt; // Low bits known zero. Known.Zero.setLowBits(ShAmt); @@ -44548,8 +44556,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( TLO, Depth + 1)) return true; - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + Known >>= ShAmt; // High bits known zero. Known.Zero.setHighBits(ShAmt); @@ -44597,8 +44604,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( TLO, Depth + 1)) return true; - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + Known >>= ShAmt; // If the input sign bit is known to be zero, or if none of the top bits // are demanded, turn this into an unsigned shift right. @@ -44956,6 +44962,40 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( Known.Zero.setLowBits(Known2.countMinTrailingZeros()); return false; } + case X86ISD::VPMADD52L: + case X86ISD::VPMADD52H: { + KnownBits KnownOp0, KnownOp1; + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue Op2 = Op.getOperand(2); + // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of + // operand 2). + APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52); + if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0, + TLO, Depth + 1)) + return true; + + if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1, + TLO, Depth + 1)) + return true; + + KnownBits KnownMul; + KnownOp0 = KnownOp0.trunc(52); + KnownOp1 = KnownOp1.trunc(52); + KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1) + : KnownBits::mulhu(KnownOp0, KnownOp1); + KnownMul = KnownMul.zext(64); + + // lo/hi(X * Y) + Z --> C + Z + if (KnownMul.isConstant()) { + SDLoc DL(Op); + SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT); + return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2)); + } + + // TODO: Compute the known bits for VPMADD52L/VPMADD52H. + break; + } } return TargetLowering::SimplifyDemandedBitsForTargetNode( @@ -45131,6 +45171,14 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const { switch (Op.getOpcode()) { + // SSE bit logic. + case X86ISD::FAND: + case X86ISD::FOR: + case X86ISD::FXOR: + case X86ISD::FANDN: + case X86ISD::ANDNP: + case X86ISD::VPTERNLOG: + return false; // SSE vector insert/extracts use modulo indices. 
case X86ISD::PINSRB: case X86ISD::PINSRW: @@ -45163,6 +45211,14 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( case X86ISD::PCMPEQ: case X86ISD::PCMPGT: return false; + // SSE signbit extraction. + case X86ISD::MOVMSK: + return false; + // GFNI instructions. + case X86ISD::GF2P8AFFINEINVQB: + case X86ISD::GF2P8AFFINEQB: + case X86ISD::GF2P8MULB: + return false; case ISD::INTRINSIC_WO_CHAIN: switch (Op->getConstantOperandVal(0)) { case Intrinsic::x86_sse2_pmadd_wd: @@ -48345,7 +48401,7 @@ static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, // If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then // peek through and adjust the TEST bit. if (Src.getOpcode() == ISD::SHL) { - if (std::optional<uint64_t> ShiftAmt = DAG.getValidShiftAmount(Src)) { + if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) { Src = Src.getOperand(0); BitMask.lshrInPlace(*ShiftAmt); } @@ -51800,6 +51856,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, SDValue X, Y; EVT CondVT = VT.changeVectorElementType(MVT::i1); if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) && + (VT.is512BitVector() || Subtarget.hasVLX()) && + (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) && sd_match(N, m_And(m_Value(X), m_OneUse(m_SExt(m_AllOf( m_Value(Y), m_SpecificVT(CondVT), @@ -54131,10 +54189,10 @@ static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL) { assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode"); - std::optional<uint64_t> ValidSrlConst = DAG.getValidShiftAmount(N); + std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N); if (!ValidSrlConst) return SDValue(); - uint64_t SrlConstVal = *ValidSrlConst; + unsigned SrlConstVal = *ValidSrlConst; SDValue Op = N.getOperand(0); unsigned Opcode = Op.getOpcode(); @@ -55364,6 +55422,8 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, SDValue Src = N0.getOperand(0); EVT SrcVT = Src.getValueType(); if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 && + (VT.is512BitVector() || Subtarget.hasVLX()) && + (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) && TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse()) return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1, getZeroVector(VT, Subtarget, DAG, DL)); @@ -56243,7 +56303,13 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, SDValue Masked = BroadcastOp; if (N != 0) { - APInt Mask = APInt::getLowBitsSet(BroadcastOpVT.getSizeInBits(), Len); + unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits(); + unsigned NumDefinedElts = UndefElts.countTrailingZeros(); + + if (NumDefinedElts > BroadcastOpBitWidth) + return SDValue(); + + APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts); SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp, DAG.getConstant(N, DL, BroadcastOpVT)); Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue, @@ -56278,14 +56344,16 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG, Subtarget)) return V; + } - if (VT == MVT::i1) { - X86::CondCode X86CC; - if (SDValue V = - MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC)) - return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG)); - } + if (VT == MVT::i1) { + X86::CondCode X86CC; + if (SDValue V = + 
MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC)) + return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG)); + } + if (CC == ISD::SETNE || CC == ISD::SETEQ) { if (OpVT.isScalarInteger()) { // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0) // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0) @@ -60062,6 +60130,19 @@ static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Simplify VPMADD52L/VPMADD52H operations. +static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + MVT VT = N->getSimpleValueType(0); + unsigned NumEltBits = VT.getScalarSizeInBits(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits), + DCI)) + return SDValue(N, 0); + + return SDValue(); +} + static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -60699,6 +60780,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget); case X86ISD::VPMADDUBSW: case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI); + case X86ISD::VPMADD52L: + case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI); case X86ISD::KSHIFTL: case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI); case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 547b221..d888f9f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1591,7 +1591,6 @@ namespace llvm { bool useLoadStackGuardNode(const Module &M) const override; bool useStackGuardXorFP() const override; void insertSSPDeclarations(Module &M) const override; - Value *getSDagStackGuard(const Module &M) const override; Function *getSSPStackGuardCheck(const Module &M) const override; SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override; @@ -1663,14 +1662,14 @@ namespace llvm { /// instructions/intrinsics. bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, - unsigned Factor) const override; + ArrayRef<unsigned> Indices, unsigned Factor, + const APInt &GapMask) const override; /// Lower interleaved store(s) into target specific /// instructions/intrinsics. bool lowerInterleavedStore(Instruction *Store, Value *Mask, - ShuffleVectorInst *SVI, - unsigned Factor) const override; + ShuffleVectorInst *SVI, unsigned Factor, + const APInt &GapMask) const override; SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 7c594d0..1c745a3 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -632,15 +632,6 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const { TargetLowering::insertSSPDeclarations(M); } -Value *X86TargetLowering::getSDagStackGuard(const Module &M) const { - // MSVC CRT has a global variable holding security cookie. 
- if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || - Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { - return M.getGlobalVariable("__security_cookie"); - } - return TargetLowering::getSDagStackGuard(M); -} - Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { // MSVC CRT has a function to validate security cookie. if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index abf365e..5c0deeb 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4399,13 +4399,8 @@ static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) { if (STI.hasFP16()) return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr; if (Load) - return STI.hasAVX512() ? X86::VMOVSSZrm - : STI.hasAVX() ? X86::VMOVSSrm - : X86::MOVSSrm; - else - return STI.hasAVX512() ? X86::VMOVSSZmr - : STI.hasAVX() ? X86::VMOVSSmr - : X86::MOVSSmr; + return X86::MOVSHPrm; + return X86::MOVSHPmr; } static unsigned getLoadStoreRegOpcode(Register Reg, @@ -4903,6 +4898,16 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, CmpMask = ~0; CmpValue = 0; return true; + case X86::TEST64ri32: + case X86::TEST32ri: + case X86::TEST16ri: + case X86::TEST8ri: + SrcReg = MI.getOperand(0).getReg(); + SrcReg2 = 0; + // Force identical compare. + CmpMask = 0; + CmpValue = 0; + return true; } return false; } @@ -4942,6 +4947,10 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI, case X86::CMP32ri: case X86::CMP16ri: case X86::CMP8ri: + case X86::TEST64ri32: + case X86::TEST32ri: + case X86::TEST16ri: + case X86::TEST8ri: CASE_ND(SUB64ri32) CASE_ND(SUB32ri) CASE_ND(SUB16ri) @@ -6131,6 +6140,25 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { return true; } +static bool expandMOVSHP(MachineInstrBuilder &MIB, MachineInstr &MI, + const TargetInstrInfo &TII, bool HasAVX) { + unsigned NewOpc; + if (MI.getOpcode() == X86::MOVSHPrm) { + NewOpc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm; + Register Reg = MI.getOperand(0).getReg(); + if (Reg > X86::XMM15) + NewOpc = X86::VMOVSSZrm; + } else { + NewOpc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr; + Register Reg = MI.getOperand(5).getReg(); + if (Reg > X86::XMM15) + NewOpc = X86::VMOVSSZmr; + } + + MIB->setDesc(TII.get(NewOpc)); + return true; +} + bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { bool HasAVX = Subtarget.hasAVX(); MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); @@ -6203,6 +6231,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); } + case X86::MOVSHPmr: + case X86::MOVSHPrm: + return expandMOVSHP(MIB, MI, *this, Subtarget.hasAVX()); case X86::V_SETALLONES: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 1acc0cd8..b792649 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -267,6 +267,18 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop, } } +// pseudo instruction for fp16 spilling. 
+let isPseudo = 1, Predicates = [HasSSE2] in { + let mayStore = 1 in + def MOVSHPmr : I<0, Pseudo, (outs), (ins f32mem:$dst, FR16X:$src), "", + [], SSEPackedSingle>, + Sched<[WriteFStore]>; + let mayLoad = 1 in + def MOVSHPrm : I<0, Pseudo, (outs FR16X:$dst), (ins f32mem:$src), "", + [], SSEPackedSingle>, + Sched<[WriteFLoad]>; +} + defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss", SSEPackedSingle, UseSSE1>, TB, XS; defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd", diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 636b072..4188487 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -802,7 +802,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. bool X86TargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, unsigned Factor) const { + ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); @@ -812,7 +812,7 @@ bool X86TargetLowering::lowerInterleavedLoad( auto *LI = dyn_cast<LoadInst>(Load); if (!LI) return false; - assert(!Mask && "Unexpected mask on a load"); + assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load"); // Create an interleaved access group. IRBuilder<> Builder(LI); @@ -825,7 +825,8 @@ bool X86TargetLowering::lowerInterleavedLoad( bool X86TargetLowering::lowerInterleavedStore(Instruction *Store, Value *LaneMask, ShuffleVectorInst *SVI, - unsigned Factor) const { + unsigned Factor, + const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -836,7 +837,8 @@ bool X86TargetLowering::lowerInterleavedStore(Instruction *Store, auto *SI = dyn_cast<StoreInst>(Store); if (!SI) return false; - assert(!LaneMask && "Unexpected mask on store"); + assert(!LaneMask && GapMask.popcount() == Factor && + "Unexpected mask on store"); // Holds the indices of SVI that correspond to the starting index of each // interleaved shuffle. 
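Aside (not part of the patch): the interleaved-access hooks above now take a GapMask, and the X86 overrides simply assert GapMask.popcount() == Factor, i.e. they only expect fully populated groups. A minimal sketch of that gap-free check in isolation, assuming the convention suggested by the asserts that GapMask carries one bit per interleave-group member; the helper name is illustrative only.

    // Sketch, assuming bit i of GapMask is set when member i of the
    // factor-F interleave group is actually accessed.
    #include "llvm/ADT/APInt.h"

    static bool isGapFreeGroup(const llvm::APInt &GapMask, unsigned Factor) {
      // A group with no gaps has all Factor members present,
      // i.e. exactly Factor bits set in the mask.
      return GapMask.popcount() == Factor;
    }

A target that cannot lower groups with gaps could bail out on this predicate instead of asserting, which is the shape of the checks added in lowerInterleavedLoad/lowerInterleavedStore above.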
diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index cf055cf..090060e 100644 --- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -491,7 +491,7 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph( NumGadgets += GadgetCount; // Traverse CFG to build the rest of the graph - SmallSet<MachineBasicBlock *, 8> BlocksVisited; + SmallPtrSet<MachineBasicBlock *, 8> BlocksVisited; std::function<void(MachineBasicBlock *, GraphIter, unsigned)> TraverseCFG = [&](MachineBasicBlock *MBB, GraphIter GI, unsigned ParentDepth) { unsigned LoopDepth = MLI.getLoopDepth(MBB); diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp index 3b4e531..2a1c499 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -100,7 +100,7 @@ struct BBInfo { class X86PreTileConfig : public MachineFunctionPass { MachineRegisterInfo *MRI = nullptr; const MachineLoopInfo *MLI = nullptr; - SmallSet<MachineInstr *, 8> DefVisited; + SmallPtrSet<MachineInstr *, 8> DefVisited; DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo; DenseMap<MachineBasicBlock *, SmallVector<MIRef, 8>> ShapeBBs; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 83b11ee..b79e508 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -21,8 +21,8 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TileShapeInfo.h" @@ -204,15 +204,7 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, // we can still use 64-bit register as long as we know the high bits // are zeros. // Reflect that in the returned register class. - if (Is64Bit) { - // When the target also allows 64-bit frame pointer and we do have a - // frame, this is fine to use it for the address accesses as well. - const X86FrameLowering *TFI = getFrameLowering(MF); - return TFI->hasFP(MF) && TFI->Uses64BitFramePtr - ? &X86::LOW32_ADDR_ACCESS_RBPRegClass - : &X86::LOW32_ADDR_ACCESSRegClass; - } - return &X86::GR32RegClass; + return Is64Bit ? &X86::LOW32_ADDR_ACCESSRegClass : &X86::GR32RegClass; case 1: // Normal GPRs except the stack pointer (for encoding reasons). if (Subtarget.isTarget64BitLP64()) return &X86::GR64_NOSPRegClass; @@ -907,7 +899,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); // Determine base register and offset. - int FIOffset; + int64_t FIOffset; Register BasePtr; if (MI.isReturn()) { assert((!hasStackRealignment(MF) || @@ -958,11 +950,41 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } if (MI.getOperand(FIOperandNum+3).isImm()) { - // Offset is a 32-bit integer. 
- int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm()); - int Offset = FIOffset + Imm; - assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) && - "Requesting 64-bit offset in 32-bit immediate!"); + const X86InstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + int64_t Imm = MI.getOperand(FIOperandNum + 3).getImm(); + int64_t Offset = FIOffset + Imm; + bool FitsIn32Bits = isInt<32>(Offset); + // If the offset will not fit in a 32-bit displacement, then for 64-bit + // targets, scavenge a register to hold it. Otherwise... + if (Is64Bit && !FitsIn32Bits) { + assert(RS && "RegisterScavenger was NULL"); + + RS->enterBasicBlockEnd(MBB); + RS->backward(std::next(II)); + + Register ScratchReg = RS->scavengeRegisterBackwards( + X86::GR64RegClass, II, /*RestoreAfter=*/false, /*SPAdj=*/0, + /*AllowSpill=*/true); + assert(ScratchReg != 0 && "scratch reg was 0"); + RS->setRegUsed(ScratchReg); + + BuildMI(MBB, II, DL, TII->get(X86::MOV64ri), ScratchReg).addImm(Offset); + + MI.getOperand(FIOperandNum + 3).setImm(0); + MI.getOperand(FIOperandNum + 2).setReg(ScratchReg); + + return false; + } + + // ... for 32-bit targets, this is a bug! + if (!Is64Bit && !FitsIn32Bits) { + MI.emitGenericError("64-bit offset calculated but target is 32-bit"); + // Trap so that the instruction verification pass does not fail if run. + BuildMI(MBB, MBBI, DL, TII->get(X86::TRAP)); + return false; + } + if (Offset != 0 || !tryOptimizeLEAtoMOV(II)) MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset); } else { diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h index 19b409a..2f4c55c 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/llvm/lib/Target/X86/X86RegisterInfo.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H #define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #define GET_REGINFO_HEADER @@ -180,6 +181,10 @@ public: constrainRegClassToNonRex2(const TargetRegisterClass *RC) const; bool isNonRex2RegClass(const TargetRegisterClass *RC) const; + + bool requiresRegisterScavenging(const MachineFunction &MF) const override { + return true; + } }; } // End llvm namespace diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index e9ca25d..99b7910 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -716,10 +716,7 @@ def GR64_NOREX2_NOSP : RegisterClass<"X86", [i64], 64, // which we do not have right now. def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>; -// When RBP is used as a base pointer in a 32-bit addresses environment, -// this is also safe to use the full register to access addresses. -// Since RBP will never be spilled, stick to a 32 alignment to save -// on memory consumption. +// FIXME: This is unused, but deleting it results in codegen changes def LOW32_ADDR_ACCESS_RBP : RegisterClass<"X86", [i32], 32, (add LOW32_ADDR_ACCESS, RBP)>; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 8cd52e2..f15a7c7 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -70,6 +70,12 @@ def SKLPortAny : ProcResGroup<[SKLPort0, SKLPort1, SKLPort2, SKLPort3, SKLPort4, let BufferSize=60; } +// Skylake can retire up to four (potentially fused) uops per cycle. 
Set the +// limit to twice that given we do not model fused uops as only taking up one +// retirement slot. I could not find any documented sources on how many +// in-flight micro-ops can be tracked. +def SKRCU : RetireControlUnit<0, 8>; + // Integer loads are 5 cycles, so ReadAfterLd registers needn't be available until 5 // cycles after the memory operand. def : ReadAdvance<ReadAfterLd, 5>; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index 14a51d1e..2a793d0 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -70,6 +70,12 @@ def SKXPortAny : ProcResGroup<[SKXPort0, SKXPort1, SKXPort2, SKXPort3, SKXPort4, let BufferSize=60; } +// Skylake can retire up to four (potentially fused) uops per cycle. Set the +// limit to twice that given we do not model fused uops as only taking up one +// retirement slot. I could not find any documented sources on how many +// in-flight micro-ops can be tracked. +def SKXRCU : RetireControlUnit<0, 8>; + // Integer loads are 5 cycles, so ReadAfterLd registers needn't be available until 5 // cycles after the memory operand. def : ReadAdvance<ReadAfterLd, 5>; diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td index c92bc97..133c1a4 100644 --- a/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -562,14 +562,7 @@ def AtomWrite0_1_7_4 : SchedWriteRes<[AtomPort0,AtomPort1]> { let ReleaseAtCycles = [8,8]; let NumMicroOps = 4; } -def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSrr(_Int)?")>; - -def AtomWrite0_1_8_4 : SchedWriteRes<[AtomPort0,AtomPort1]> { - let Latency = 8; - let ReleaseAtCycles = [8,8]; - let NumMicroOps = 4; -} -def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSrm(_Int)?")>; +def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSr(r|m)(_Int)?")>; def AtomWrite0_1_9 : SchedWriteRes<[AtomPort0,AtomPort1]> { let Latency = 9; diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 8ad8d42..3745c1e 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -280,7 +280,7 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, } // Disable 64-bit only features in non-64-bit mode. - SmallVector<StringRef, 9> FeaturesIn64BitOnly = { + StringRef FeaturesIn64BitOnly[] = { "egpr", "push2pop2", "ppx", "ndd", "ccmp", "nf", "cf", "zu", "uintr"}; if (FullFS.find("-64bit-mode") != std::string::npos) for (StringRef F : FeaturesIn64BitOnly) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 90791fc..62f9527 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -161,19 +161,26 @@ std::optional<unsigned> X86TTIImpl::getCacheAssociativity( llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); } +enum ClassIDEnum { GPRClass = 0, VectorClass = 1, ScalarFPClass = 2 }; + +unsigned X86TTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const { + return Vector ? VectorClass + : Ty && Ty->isFloatingPointTy() ? 
ScalarFPClass + : GPRClass; +} + unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { - bool Vector = (ClassID == 1); - if (Vector && !ST->hasSSE1()) + if (ClassID == VectorClass && !ST->hasSSE1()) return 0; - if (ST->is64Bit()) { - if (Vector && ST->hasAVX512()) - return 32; - if (!Vector && ST->hasEGPR()) - return 32; - return 16; - } - return 8; + if (!ST->is64Bit()) + return 8; + + if ((ClassID == GPRClass && ST->hasEGPR()) || + (ClassID != GPRClass && ST->hasAVX512())) + return 32; + + return 16; } bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const { @@ -5488,9 +5495,10 @@ InstructionCost X86TTIImpl::getPointersChainCost( return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind); } -InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, - ScalarEvolution *SE, - const SCEV *Ptr) const { +InstructionCost +X86TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *Ptr, + TTI::TargetCostKind CostKind) const { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -5504,7 +5512,7 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, // Even in the case of (loop invariant) stride whose value is not known at // compile time, the address computation will not incur more than one extra // ADD instruction. - if (Ty->isVectorTy() && SE && !ST->hasAVX2()) { + if (PtrTy->isVectorTy() && SE && !ST->hasAVX2()) { // TODO: AVX2 is the current cut-off because we don't have correct // interleaving costs for prior ISA's. if (!BaseT::isStridedAccess(Ptr)) @@ -5513,7 +5521,7 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, return 1; } - return BaseT::getAddressComputationCost(Ty, SE, Ptr); + return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind); } InstructionCost @@ -6525,8 +6533,8 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, for (const Instruction &I : instructions(Callee)) { if (const auto *CB = dyn_cast<CallBase>(&I)) { - // Having more target features is fine for inline ASM. - if (CB->isInlineAsm()) + // Having more target features is fine for inline ASM and intrinsics. + if (CB->isInlineAsm() || CB->getIntrinsicID() != Intrinsic::not_intrinsic) continue; SmallVector<Type *, 8> Types; @@ -6542,19 +6550,9 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, if (all_of(Types, IsSimpleTy)) continue; - if (Function *NestedCallee = CB->getCalledFunction()) { - // Assume that intrinsics are always ABI compatible. - if (NestedCallee->isIntrinsic()) - continue; - - // Do a precise compatibility check. - if (!areTypesABICompatible(Caller, NestedCallee, Types)) - return false; - } else { - // We don't know the target features of the callee, - // assume it is incompatible. + // Do a precise compatibility check. 
+ if (!areTypesABICompatible(Caller, Callee, Types)) return false; - } } } return true; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index bc06c47..133b366 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -132,6 +132,7 @@ public: /// @{ unsigned getNumberOfRegisters(unsigned ClassID) const override; + unsigned getRegisterClassForType(bool Vector, Type *Ty) const override; bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const override; TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override; @@ -194,8 +195,9 @@ public: getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override; - InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, - const SCEV *Ptr) const override; + InstructionCost + getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, + TTI::TargetCostKind CostKind) const override; std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override; diff --git a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp index e9081a4..9bf0abb 100644 --- a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp +++ b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp @@ -105,6 +105,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { // Prolog information. SmallVector<int64_t> PushedRegs; bool HasStackAlloc = false; + bool HasSetFrame = false; unsigned ApproximatePrologCodeCount = 0; // Requested changes. @@ -130,15 +131,20 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { break; case X86::SEH_StackAlloc: - case X86::SEH_SetFrame: if (State != FunctionState::InProlog) - llvm_unreachable("SEH_StackAlloc or SEH_SetFrame outside of prolog"); + llvm_unreachable("SEH_StackAlloc outside of prolog"); // Assume a large alloc... - ApproximatePrologCodeCount += - (MI.getOpcode() == X86::SEH_StackAlloc) ? 3 : 1; + ApproximatePrologCodeCount += 3; HasStackAlloc = true; break; + case X86::SEH_SetFrame: + if (State != FunctionState::InProlog) + llvm_unreachable("SEH_SetFrame outside of prolog"); + ApproximatePrologCodeCount++; + HasSetFrame = true; + break; + case X86::SEH_SaveReg: case X86::SEH_SaveXMM: if (State != FunctionState::InProlog) @@ -191,6 +197,29 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { break; case X86::MOV64rr: + if (State == FunctionState::InEpilog) { + // If the prolog contains a stack allocation, then the first + // instruction in the epilog must be to adjust the stack pointer. 
+ if (!HasSetFrame) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The epilog is setting frame back, but prolog did not set it"); + if (PoppedRegCount > 0) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The epilog is setting the frame back after popping " + "registers"); + if (HasStackDealloc) + return rejectCurrentFunctionInternalError( + MF, Mode, + "Cannot set the frame back after the stack " + "allocation has been deallocated"); + } else if (State == FunctionState::FinishedEpilog) + return rejectCurrentFunctionInternalError( + MF, Mode, "Unexpected mov instruction after the epilog"); + break; + + case X86::LEA64r: case X86::ADD64ri32: if (State == FunctionState::InEpilog) { // If the prolog contains a stack allocation, then the first @@ -201,51 +230,55 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { "The epilog is deallocating a stack " "allocation, but the prolog did " "not allocate one"); - if (HasStackDealloc) + if (PoppedRegCount > 0) return rejectCurrentFunctionInternalError( MF, Mode, - "The epilog is deallocating the stack " - "allocation more than once"); - if (PoppedRegCount > 0) - llvm_unreachable( - "Should have raised an error: either popping before " - "deallocating or deallocating without an allocation"); + "The epilog is deallocating a stack allocation after popping " + "registers"); HasStackDealloc = true; } else if (State == FunctionState::FinishedEpilog) return rejectCurrentFunctionInternalError( - MF, Mode, "Unexpected mov or add instruction after the epilog"); + MF, Mode, "Unexpected lea or add instruction after the epilog"); break; case X86::POP64r: if (State == FunctionState::InEpilog) { - // After the stack pointer has been adjusted, the epilog must - // POP each register in reverse order of the PUSHes in the prolog. - PoppedRegCount++; - if (HasStackAlloc != HasStackDealloc) - return rejectCurrentFunctionInternalError( - MF, Mode, - "Cannot pop registers before the stack " - "allocation has been deallocated"); - if (PoppedRegCount > PushedRegs.size()) - return rejectCurrentFunctionInternalError( - MF, Mode, - "The epilog is popping more registers than the prolog pushed"); - if (PushedRegs[PushedRegs.size() - PoppedRegCount] != - MI.getOperand(0).getReg()) - return rejectCurrentFunctionInternalError( - MF, Mode, - "The epilog is popping a registers in " - "a different order than the " - "prolog pushed them"); - - // Unwind v2 records the size of the epilog not from where we place - // SEH_BeginEpilogue (as that contains the instruction to adjust the - // stack pointer) but from the first POP instruction (if there is - // one). - if (!UnwindV2StartLocation) { - assert(PoppedRegCount == 1); - UnwindV2StartLocation = &MI; + Register Reg = MI.getOperand(0).getReg(); + if (HasStackAlloc && (PoppedRegCount == 0) && + !llvm::is_contained(PushedRegs, Reg)) { + // If this is a pop that doesn't correspond to the set of pushed + // registers, then assume it was used to adjust the stack pointer. + HasStackDealloc = true; + } else { + // After the stack pointer has been adjusted, the epilog must + // POP each register in reverse order of the PUSHes in the prolog. 
+ PoppedRegCount++; + if (HasStackAlloc != HasStackDealloc) + return rejectCurrentFunctionInternalError( + MF, Mode, + "Cannot pop registers before the stack " + "allocation has been deallocated"); + if (PoppedRegCount > PushedRegs.size()) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The epilog is popping more registers than the prolog " + "pushed"); + if (PushedRegs[PushedRegs.size() - PoppedRegCount] != Reg.id()) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The epilog is popping a registers in " + "a different order than the " + "prolog pushed them"); + + // Unwind v2 records the size of the epilog not from where we place + // SEH_BeginEpilogue (as that contains the instruction to adjust the + // stack pointer) but from the first POP instruction (if there is + // one). + if (!UnwindV2StartLocation) { + assert(PoppedRegCount == 1); + UnwindV2StartLocation = &MI; + } } } else if (State == FunctionState::FinishedEpilog) // Unexpected instruction after the epilog. @@ -272,11 +305,8 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { } } - if (UnwindV2StartLocations.empty()) { - assert(State == FunctionState::InProlog && - "If there are no epilogs, then there should be no prolog"); + if (UnwindV2StartLocations.empty()) return false; - } MachineBasicBlock &FirstMBB = MF.front(); // Assume +1 for the "header" UOP_Epilog that contains the epilog size, and |
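Aside (not part of the patch): the epilog validation above rejects the function unless each POP restores the register saved by the matching PUSH in reverse order, via the PushedRegs[PushedRegs.size() - PoppedRegCount] comparison. A standalone sketch of that ordering rule, with hypothetical helper name and container types chosen only for illustration.

    // Sketch of the reverse-order rule: the Nth pop must restore the
    // register saved by the Nth-from-last push.
    #include <cstdint>
    #include <vector>

    static bool popsMirrorPushes(const std::vector<int64_t> &PushedRegs,
                                 const std::vector<int64_t> &PoppedRegs) {
      if (PoppedRegs.size() > PushedRegs.size())
        return false; // popping more registers than were pushed is rejected
      for (size_t N = 1; N <= PoppedRegs.size(); ++N)
        if (PoppedRegs[N - 1] != PushedRegs[PushedRegs.size() - N])
          return false; // restore out of order relative to the prolog
      return true;
    }

The real pass performs this check incrementally while walking the epilog, interleaved with the stack-deallocation and SEH-state tracking shown in the diff.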