Diffstat (limited to 'llvm/lib/Target/X86')
 llvm/lib/Target/X86/GISel/X86CallLowering.cpp             |   7
 llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp            |  24
 llvm/lib/Target/X86/MCA/X86CustomBehaviour.h              |   5
 llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp        |  11
 llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp  |   4
 llvm/lib/Target/X86/X86.td                                |   8
 llvm/lib/Target/X86/X86CallingConv.h                      |   4
 llvm/lib/Target/X86/X86FastISel.cpp                       |   4
 llvm/lib/Target/X86/X86FastPreTileConfig.cpp              |  13
 llvm/lib/Target/X86/X86FrameLowering.cpp                  |  11
 llvm/lib/Target/X86/X86ISelLowering.cpp                   | 243
 llvm/lib/Target/X86/X86ISelLowering.h                     |   9
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp               |   9
 llvm/lib/Target/X86/X86InstrInfo.cpp                      |  45
 llvm/lib/Target/X86/X86InstrSSE.td                        |  12
 llvm/lib/Target/X86/X86InterleavedAccess.cpp              |  10
 llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp |   2
 llvm/lib/Target/X86/X86PreTileConfig.cpp                  |   2
 llvm/lib/Target/X86/X86RegisterInfo.cpp                   |  54
 llvm/lib/Target/X86/X86RegisterInfo.h                     |   5
 llvm/lib/Target/X86/X86RegisterInfo.td                    |   5
 llvm/lib/Target/X86/X86SchedSkylakeClient.td              |   6
 llvm/lib/Target/X86/X86SchedSkylakeServer.td              |   6
 llvm/lib/Target/X86/X86ScheduleAtom.td                    |   9
 llvm/lib/Target/X86/X86Subtarget.cpp                      |   2
 llvm/lib/Target/X86/X86TargetTransformInfo.cpp            |  56
 llvm/lib/Target/X86/X86TargetTransformInfo.h              |   6
 llvm/lib/Target/X86/X86WinEHUnwindV2.cpp                  | 116
 28 files changed, 462 insertions(+), 226 deletions(-)
diff --git a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
index c0a6035..c0b9339 100644
--- a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
+++ b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
@@ -69,13 +69,13 @@ public:
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
CCState &State) override {
- bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
+ bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, Info.Ty, State);
StackSize = State.getStackSize();
static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2,
X86::XMM3, X86::XMM4, X86::XMM5,
X86::XMM6, X86::XMM7};
- if (!Info.IsFixed)
+ if (Flags.isVarArg())
NumXMMRegs = State.getFirstUnallocated(XMMArgRegs);
return Res;
@@ -363,7 +363,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
Info.CallConv, Info.IsVarArg))
return false;
- bool IsFixed = Info.OrigArgs.empty() ? true : Info.OrigArgs.back().IsFixed;
+ bool IsFixed =
+ Info.OrigArgs.empty() ? true : !Info.OrigArgs.back().Flags[0].isVarArg();
if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(Info.CallConv)) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
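
A standalone sketch (plain ISO C++, not the CallLowering API) of the ABI rule this hunk keeps honoring: on SysV x86-64, a call to a variadic function must place an upper bound on the number of vector argument registers used in %al, which is why the lowering still queries the first unallocated XMM register for vararg calls. The sum() helper below is hypothetical.

#include <cstdarg>
#include <cstdio>

static double sum(int Count, ...) {
  va_list Ap;
  va_start(Ap, Count);
  double Total = 0.0;
  for (int I = 0; I < Count; ++I)
    Total += va_arg(Ap, double); // doubles arrive in XMM0..XMM7 before spilling
  va_end(Ap);
  return Total;
}

int main() {
  // Three double arguments -> the lowering records three used XMM registers,
  // and the call site sets %al accordingly before the call.
  std::printf("%f\n", sum(3, 1.0, 2.0, 3.0));
}
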
diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
index 817e88d..e2a1bbf3 100644
--- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
+++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
@@ -36,11 +36,31 @@ void X86InstrPostProcess::setMemBarriers(std::unique_ptr<Instruction> &Inst,
}
}
+void X86InstrPostProcess::useStackEngine(std::unique_ptr<Instruction> &Inst,
+ const MCInst &MCI) {
+ // TODO(boomanaiden154): We currently do not handle PUSHF/POPF because we
+ // have not done the necessary benchmarking to see if they are also
+ // optimized by the stack engine.
+ // TODO: We currently just remove all RSP writes from stack operations. This
+ // is not fully correct because we do not model the sync uops that would
+ // delay subsequent non-stack instructions that use RSP.
+ if (X86::isPOP(MCI.getOpcode()) || X86::isPUSH(MCI.getOpcode())) {
+ auto *StackRegisterDef =
+ llvm::find_if(Inst->getDefs(), [](const WriteState &State) {
+ return State.getRegisterID() == X86::RSP;
+ });
+ assert(
+ StackRegisterDef != Inst->getDefs().end() &&
+ "Expected push instruction to implicitly use stack pointer register.");
+ Inst->getDefs().erase(StackRegisterDef);
+ }
+}
+
void X86InstrPostProcess::postProcessInstruction(
std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
- // Currently, we only modify certain instructions' IsALoadBarrier and
- // IsAStoreBarrier flags.
+ // Set IsALoadBarrier and IsAStoreBarrier flags.
setMemBarriers(Inst, MCI);
+ useStackEngine(Inst, MCI);
}
} // namespace mca
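
A toy dependency model (illustrative only, not the llvm-mca Instruction/WriteState API) of why erasing the RSP definition from PUSH/POP helps: with the explicit RSP write kept, consecutive stack operations form a serial chain through RSP; once the write is removed, the simulated instructions no longer wait on one another, roughly matching what the hardware stack engine does.

#include <cstdio>
#include <string>
#include <vector>

struct FakeInst {
  std::string Name;
  std::vector<std::string> Defs; // registers written
  std::vector<std::string> Uses; // registers read
};

// Returns true if B must wait for A (B reads a register that A writes).
static bool dependsOn(const FakeInst &A, const FakeInst &B) {
  for (const std::string &D : A.Defs)
    for (const std::string &U : B.Uses)
      if (D == U)
        return true;
  return false;
}

int main() {
  // With the explicit RSP def kept, the second push depends on the first.
  FakeInst Push1{"push rbx", {"RSP"}, {"RBX", "RSP"}};
  FakeInst Push2{"push rbp", {"RSP"}, {"RBP", "RSP"}};
  std::printf("with RSP def:    dependent = %d\n", dependsOn(Push1, Push2));

  // Dropping the RSP def (what the post-processor does) removes the chain.
  Push1.Defs.clear();
  std::printf("without RSP def: dependent = %d\n", dependsOn(Push1, Push2));
}
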
diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
index 4a83ba8..c5459e4 100644
--- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
+++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
@@ -28,6 +28,11 @@ class X86InstrPostProcess : public InstrPostProcess {
/// as load and store barriers.
void setMemBarriers(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
+ /// Called within X86InstrPostProcess to remove the RSP write operands from
+ /// stack instructions to better simulate the stack engine. We currently do
+ /// not model features of the stack engine like sync uops.
+ void useStackEngine(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
+
public:
X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
: InstrPostProcess(STI, MCII) {}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 56a4cc3..865fc0c 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -485,7 +485,16 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
if (!CanPadInst)
return;
- if (PendingBA && PendingBA->getNext() == OS.getCurrentFragment()) {
+ if (PendingBA) {
+ auto *NextFragment = PendingBA->getNext();
+ assert(NextFragment && "NextFragment should not be null");
+ if (NextFragment == OS.getCurrentFragment())
+ return;
+ // We eagerly create an empty fragment when inserting a fragment
+ // with a variable-size tail.
+ if (NextFragment->getNext() == OS.getCurrentFragment())
+ return;
+
// Macro fusion actually happens and there is no other fragment inserted
// after the previous instruction.
//
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 0dabd98a3..25fcf81 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -17,6 +17,7 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbolMachO.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
@@ -354,8 +355,7 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer,
unsigned Type = MachO::GENERIC_RELOC_VANILLA;
// See <reloc.h>.
- const MCSymbol *A = Target.getAddSym();
-
+ auto *A = static_cast<const MCSymbolMachO *>(Target.getAddSym());
if (!A->getFragment()) {
reportError(Fixup.getLoc(),
"symbol '" + A->getName() +
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 990b381..9cfe081 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -355,6 +355,9 @@ def FeatureCCMP : SubtargetFeature<"ccmp", "HasCCMP", "true",
"Support conditional cmp & test instructions">;
def FeatureNF : SubtargetFeature<"nf", "HasNF", "true",
"Support status flags update suppression">;
+// Although FeatureCF is part of APXF, it is not enabled by default for APXF or
+// for targets that support APXF, for performance reasons. Users need to enable
+// it manually.
def FeatureCF : SubtargetFeature<"cf", "HasCF", "true",
"Support conditional faulting">;
def FeatureZU : SubtargetFeature<"zu", "HasZU", "true",
@@ -1169,7 +1172,6 @@ def ProcessorFeatures {
FeaturePPX,
FeatureNDD,
FeatureNF,
- FeatureCF,
FeatureMOVRS,
FeatureAMXMOVRS,
FeatureAMXAVX512,
@@ -1291,7 +1293,9 @@ def ProcessorFeatures {
list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps,
TuningPreferMovmskOverVTest,
TuningFastImmVectorShift];
- list<SubtargetFeature> ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning);
+ list<SubtargetFeature> ADLRemoveTuning = [TuningPOPCNTFalseDeps];
+ list<SubtargetFeature> ADLTuning =
+ !listremove(!listconcat(SKLTuning, ADLAdditionalTuning), ADLRemoveTuning);
list<SubtargetFeature> ADLFeatures =
!listconcat(TRMFeatures, ADLAdditionalFeatures);
diff --git a/llvm/lib/Target/X86/X86CallingConv.h b/llvm/lib/Target/X86/X86CallingConv.h
index 191e0fa..8e37f34 100644
--- a/llvm/lib/Target/X86/X86CallingConv.h
+++ b/llvm/lib/Target/X86/X86CallingConv.h
@@ -22,10 +22,10 @@ namespace llvm {
bool RetCC_X86(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool CC_X86(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State);
} // End llvm namespace
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 067bd43..f007886 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -3323,6 +3323,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
return false;
SmallVector<MVT, 16> OutVTs;
+ SmallVector<Type *, 16> ArgTys;
SmallVector<Register, 16> ArgRegs;
// If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
@@ -3369,6 +3370,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
ArgRegs.push_back(ResultReg);
OutVTs.push_back(VT);
+ ArgTys.push_back(Val->getType());
}
// Analyze operands of the call, assigning locations to each operand.
@@ -3379,7 +3381,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (IsWin64)
CCInfo.AllocateStack(32, Align(8));
- CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
+ CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, ArgTys, CC_X86);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
index d3c2392..787b71d 100644
--- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
@@ -564,8 +564,17 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
MachineBasicBlock::iterator I;
if (LastShapeMI && dominates(MBB, MI, LastShapeMI))
I = ++LastShapeMI->getIterator();
- else
- I = ++MI.getIterator();
+ else {
+ // A call can overwrite registers like RAX, so ensure the tile config
+ // instruction is sunk closer to the first instruction that uses a tile.
+ auto UseIt = MI.getIterator();
+ while (UseIt != MBB.end()) {
+ if (HasTileOperand(MRI, *UseIt))
+ break;
+ ++UseIt;
+ }
+ I = UseIt;
+ }
Config(*I);
HasUnconfigTile = false;
continue;
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 95ed590..cba7843 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/EHPersonalities.h"
@@ -2678,7 +2679,7 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
// object.
// We need to factor in additional offsets applied during the prologue to the
// frame, base, and stack pointer depending on which is used.
- int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
+ int64_t Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
unsigned CSSize = X86FI->getCalleeSavedFrameSize();
uint64_t StackSize = MFI.getStackSize();
@@ -4212,6 +4213,14 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
// emitPrologue if it gets called and emits CFI.
MF.setHasWinCFI(false);
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ // If the frame is big enough that we might need to scavenge a register to
+ // handle huge offsets, reserve a stack slot for that now.
+ if (!isInt<32>(MFI.estimateStackSize(MF))) {
+ int FI = MFI.CreateStackObject(SlotSize, Align(SlotSize), false);
+ RS->addScavengingFrameIndex(FI);
+ }
+
// If we are using Windows x64 CFI, ensure that the stack is always 8 byte
// aligned. The format doesn't support misaligned stack adjustments.
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
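
The new slot reservation above is gated on whether the estimated frame size still fits a signed 32-bit displacement. A minimal self-contained illustration of that gate, with made-up frame sizes rather than MachineFrameInfo values:

#include <cstdint>
#include <cstdio>

static bool fitsInSigned32(int64_t V) {
  return V >= INT32_MIN && V <= INT32_MAX;
}

int main() {
  int64_t SmallFrame = 4096;            // typical frame: fits in a disp32
  int64_t HugeFrame = (1LL << 33) + 64; // > 2 GiB frame: needs a scratch register
  std::printf("%lld fits: %d\n", (long long)SmallFrame, fitsInSigned32(SmallFrame));
  std::printf("%lld fits: %d\n", (long long)HugeFrame, fitsInSigned32(HugeFrame));
}
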
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f366094..47cea93 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2756,8 +2756,10 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const {
!Subtarget.hasBWI())
return TypeSplitVector;
+ // Since v8f16 is legal, widen anything over v4f16.
if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
- !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
+ VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
+ VT.getVectorElementType() == MVT::f16)
return TypeSplitVector;
if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
@@ -15419,18 +15421,18 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
return SDValue();
}
- // Avoid returning the same shuffle operation. For example,
- // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
- // undef:v16i16
- if (CrossLaneMask == Mask || InLaneMask == Mask)
- return SDValue();
-
// Simplify CrossLaneMask based on the actual demanded elements.
if (V1.hasOneUse())
for (int i = 0; i != NumElts; ++i)
if (!DemandedCrossLane[i])
CrossLaneMask[i] = SM_SentinelUndef;
+ // Avoid returning the same shuffle operation. For example,
+ // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
+ // undef:v16i16
+ if (CrossLaneMask == Mask || InLaneMask == Mask)
+ return SDValue();
+
SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
InLaneMask);
@@ -21250,7 +21252,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
// the truncation then we can use PACKSS by converting the srl to a sra.
// SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
if (In.getOpcode() == ISD::SRL && In->hasOneUse())
- if (std::optional<uint64_t> ShAmt = DAG.getValidShiftAmount(In)) {
+ if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
if (*ShAmt == MinSignBits) {
PackOpcode = X86ISD::PACKSS;
return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
@@ -22219,9 +22221,8 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
In = DAG.getBitcast(MVT::i16, In);
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Node = In;
- Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
+ TargetLowering::ArgListEntry Entry(
+ In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()));
Entry.IsSExt = false;
Entry.IsZExt = true;
Args.push_back(Entry);
@@ -22318,9 +22319,8 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Node = In;
- Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
+ TargetLowering::ArgListEntry Entry(
+ In, EVT(SVT).getTypeForEVT(*DAG.getContext()));
Entry.IsSExt = false;
Entry.IsZExt = true;
Args.push_back(Entry);
@@ -23185,43 +23185,51 @@ static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fallback
// to CMP(MOVMSK(PCMPEQB(X,Y))).
-static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
+static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS,
ISD::CondCode CC, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG,
X86::CondCode &X86CC) {
- assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
+ SDValue Op = OrigLHS;
- bool CmpNull = isNullConstant(RHS);
- bool CmpAllOnes = isAllOnesConstant(RHS);
- if (!CmpNull && !CmpAllOnes)
- return SDValue();
+ bool CmpNull;
+ APInt Mask;
+ if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+ CmpNull = isNullConstant(OrigRHS);
+ if (!CmpNull && !isAllOnesConstant(OrigRHS))
+ return SDValue();
- SDValue Op = LHS;
- if (!Subtarget.hasSSE2() || !Op->hasOneUse())
- return SDValue();
+ if (!Subtarget.hasSSE2() || !Op->hasOneUse())
+ return SDValue();
- // Check whether we're masking/truncating an OR-reduction result, in which
- // case track the masked bits.
- // TODO: Add CmpAllOnes support.
- APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
- if (CmpNull) {
- switch (Op.getOpcode()) {
- case ISD::TRUNCATE: {
- SDValue Src = Op.getOperand(0);
- Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
- Op.getScalarValueSizeInBits());
- Op = Src;
- break;
- }
- case ISD::AND: {
- if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
- Mask = Cst->getAPIntValue();
- Op = Op.getOperand(0);
+ // Check whether we're masking/truncating an OR-reduction result, in which
+ // case track the masked bits.
+ // TODO: Add CmpAllOnes support.
+ Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
+ if (CmpNull) {
+ switch (Op.getOpcode()) {
+ case ISD::TRUNCATE: {
+ SDValue Src = Op.getOperand(0);
+ Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
+ Op.getScalarValueSizeInBits());
+ Op = Src;
+ break;
+ }
+ case ISD::AND: {
+ if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Mask = Cst->getAPIntValue();
+ Op = Op.getOperand(0);
+ }
+ break;
+ }
}
- break;
- }
}
+ } else if (CC == ISD::SETGT && isAllOnesConstant(OrigRHS)) {
+ CC = ISD::SETEQ;
+ CmpNull = true;
+ Mask = APInt::getSignMask(Op.getScalarValueSizeInBits());
+ } else {
+ return SDValue();
}
ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
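
The new SETGT path rests on a scalar identity: a signed value is greater than -1 exactly when its sign bit is clear, so testing an OR-reduction against -1 only needs the OR of the lanes' sign bits (the quantity MOVMSK extracts). A plain C++ check of that identity, independent of the DAG code:

#include <cstdint>
#include <cstdio>

// Prints {reduction > -1, all sign bits clear} for a 4-lane vector.
static void check(const int32_t (&Lanes)[4]) {
  int32_t Reduced = 0;
  uint32_t SignBits = 0;
  for (int32_t L : Lanes) {
    Reduced |= L;
    SignBits |= (uint32_t)L & 0x80000000u;
  }
  std::printf("reduction > -1: %d   sign bits clear: %d\n",
              Reduced > -1, SignBits == 0);
}

int main() {
  int32_t AllNonNegative[4] = {5, 0, 123, 42};
  int32_t OneNegative[4] = {5, -1, 123, 42};
  check(AllNonNegative); // both 1
  check(OneNegative);    // both 0
}
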
@@ -26261,10 +26269,9 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
-
- if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
- if (MaskConst->getZExtValue() & 0x1)
- return Op;
+ auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
+ if (MaskConst && (MaskConst->getZExtValue() & 0x1))
+ return Op;
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
@@ -26280,6 +26287,17 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+
+ if (MaskConst) {
+ assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
+ // Discard op and blend passthrough with scalar op src/dst.
+ SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
+ std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
+ ShuffleMask[0] = VT.getVectorNumElements();
+ return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
+ ShuffleMask);
+ }
+
return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
}
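
When the scalar mask is a known-false constant, the new code above drops the operation entirely and blends the pass-through value into lane 0 of the scalar op's source vector. A scalar-code sketch of that blend, using made-up lane values rather than SelectionDAG nodes:

#include <array>
#include <cstdio>

int main() {
  std::array<float, 4> Src = {1.0f, 2.0f, 3.0f, 4.0f};      // scalar op's src/dst vector
  std::array<float, 4> PassThru = {9.0f, 0.0f, 0.0f, 0.0f}; // preserved source

  std::array<float, 4> Result = Src;
  Result[0] = PassThru[0]; // mask bit 0 is known false: keep the pass-through lane

  for (float V : Result)
    std::printf("%.1f ", V); // 9.0 2.0 3.0 4.0
  std::printf("\n");
}
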
@@ -30049,7 +30067,6 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
SDValue InChain = DAG.getEntryNode();
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
EVT ArgVT = Op->getOperand(i).getValueType();
assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
@@ -30058,13 +30075,9 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
- Entry.Node = StackPtr;
InChain =
DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
- Entry.Ty = PointerType::get(*DAG.getContext(), 0);
- Entry.IsSExt = false;
- Entry.IsZExt = false;
- Args.push_back(Entry);
+ Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
}
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
@@ -33087,13 +33100,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
-
- Entry.Node = Arg;
- Entry.Ty = ArgTy;
- Entry.IsSExt = false;
- Entry.IsZExt = false;
- Args.push_back(Entry);
+ Args.emplace_back(Arg, ArgTy);
bool isF64 = ArgVT == MVT::f64;
// Only optimize x86_64 for now. i386 is a bit messy. For f32,
@@ -38679,13 +38686,11 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
if (Opc == X86ISD::VSHLI) {
- Known.Zero <<= ShAmt;
- Known.One <<= ShAmt;
+ Known <<= ShAmt;
// Low bits are known zero.
Known.Zero.setLowBits(ShAmt);
} else if (Opc == X86ISD::VSRLI) {
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ Known >>= ShAmt;
// High bits are known zero.
Known.Zero.setHighBits(ShAmt);
} else {
@@ -44206,8 +44211,12 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
// Conversions.
// TODO: Add more CVT opcodes when we have test coverage.
- case X86ISD::CVTTP2SI:
case X86ISD::CVTTP2UI: {
+ if (!Subtarget.hasVLX())
+ break;
+ [[fallthrough]];
+ }
+ case X86ISD::CVTTP2SI: {
if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
!Subtarget.hasVLX())
break;
@@ -44517,8 +44526,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
TLO, Depth + 1))
return true;
- Known.Zero <<= ShAmt;
- Known.One <<= ShAmt;
+ Known <<= ShAmt;
// Low bits known zero.
Known.Zero.setLowBits(ShAmt);
@@ -44548,8 +44556,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
TLO, Depth + 1))
return true;
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ Known >>= ShAmt;
// High bits known zero.
Known.Zero.setHighBits(ShAmt);
@@ -44597,8 +44604,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
TLO, Depth + 1))
return true;
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ Known >>= ShAmt;
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
@@ -44956,6 +44962,40 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
Known.Zero.setLowBits(Known2.countMinTrailingZeros());
return false;
}
+ case X86ISD::VPMADD52L:
+ case X86ISD::VPMADD52H: {
+ KnownBits KnownOp0, KnownOp1;
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+ // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
+ // operand 2).
+ APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
+ if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
+ TLO, Depth + 1))
+ return true;
+
+ if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
+ TLO, Depth + 1))
+ return true;
+
+ KnownBits KnownMul;
+ KnownOp0 = KnownOp0.trunc(52);
+ KnownOp1 = KnownOp1.trunc(52);
+ KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
+ : KnownBits::mulhu(KnownOp0, KnownOp1);
+ KnownMul = KnownMul.zext(64);
+
+ // lo/hi(X * Y) + Z --> C + Z
+ if (KnownMul.isConstant()) {
+ SDLoc DL(Op);
+ SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
+ }
+
+ // TODO: Compute the known bits for VPMADD52L/VPMADD52H.
+ break;
+ }
}
return TargetLowering::SimplifyDemandedBitsForTargetNode(
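
A self-contained arithmetic sketch of the new VPMADD52 fold: when both multiplicands are constants, the low (VPMADD52L) or high (VPMADD52H) 52 bits of the 104-bit product are a known constant per lane, so the node becomes a plain add of that constant to the accumulator operand. The lane values below are invented, and the 128-bit intermediate uses the GCC/Clang __int128 extension.

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Mask52 = (1ULL << 52) - 1;

  // Per-lane constant multiplicands (only their low 52 bits participate).
  uint64_t X = 0x000FEDCBA9876543ULL & Mask52;
  uint64_t Y = 0x0000000012345678ULL & Mask52;
  uint64_t Z = 100; // accumulator lane (operand 2)

  unsigned __int128 Prod = (unsigned __int128)X * Y; // up to 104 bits wide
  uint64_t Lo52 = (uint64_t)(Prod & Mask52);         // what VPMADD52L adds
  uint64_t Hi52 = (uint64_t)((Prod >> 52) & Mask52); // what VPMADD52H adds

  // The fold replaces the multiply-add with "constant + Z" per lane.
  std::printf("VPMADD52L lane: %llu\n", (unsigned long long)(Z + Lo52));
  std::printf("VPMADD52H lane: %llu\n", (unsigned long long)(Z + Hi52));
}
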
@@ -45131,6 +45171,14 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
switch (Op.getOpcode()) {
+ // SSE bit logic.
+ case X86ISD::FAND:
+ case X86ISD::FOR:
+ case X86ISD::FXOR:
+ case X86ISD::FANDN:
+ case X86ISD::ANDNP:
+ case X86ISD::VPTERNLOG:
+ return false;
// SSE vector insert/extracts use modulo indices.
case X86ISD::PINSRB:
case X86ISD::PINSRW:
@@ -45163,6 +45211,14 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT:
return false;
+ // SSE signbit extraction.
+ case X86ISD::MOVMSK:
+ return false;
+ // GFNI instructions.
+ case X86ISD::GF2P8AFFINEINVQB:
+ case X86ISD::GF2P8AFFINEQB:
+ case X86ISD::GF2P8MULB:
+ return false;
case ISD::INTRINSIC_WO_CHAIN:
switch (Op->getConstantOperandVal(0)) {
case Intrinsic::x86_sse2_pmadd_wd:
@@ -48345,7 +48401,7 @@ static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
// If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then
// peek through and adjust the TEST bit.
if (Src.getOpcode() == ISD::SHL) {
- if (std::optional<uint64_t> ShiftAmt = DAG.getValidShiftAmount(Src)) {
+ if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
Src = Src.getOperand(0);
BitMask.lshrInPlace(*ShiftAmt);
}
@@ -51800,6 +51856,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
SDValue X, Y;
EVT CondVT = VT.changeVectorElementType(MVT::i1);
if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
+ (VT.is512BitVector() || Subtarget.hasVLX()) &&
+ (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
sd_match(N, m_And(m_Value(X),
m_OneUse(m_SExt(m_AllOf(
m_Value(Y), m_SpecificVT(CondVT),
@@ -54131,10 +54189,10 @@ static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
const SDLoc &DL) {
assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
- std::optional<uint64_t> ValidSrlConst = DAG.getValidShiftAmount(N);
+ std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
if (!ValidSrlConst)
return SDValue();
- uint64_t SrlConstVal = *ValidSrlConst;
+ unsigned SrlConstVal = *ValidSrlConst;
SDValue Op = N.getOperand(0);
unsigned Opcode = Op.getOpcode();
@@ -55364,6 +55422,8 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
SDValue Src = N0.getOperand(0);
EVT SrcVT = Src.getValueType();
if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
+ (VT.is512BitVector() || Subtarget.hasVLX()) &&
+ (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
getZeroVector(VT, Subtarget, DAG, DL));
@@ -56243,7 +56303,13 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
SDValue Masked = BroadcastOp;
if (N != 0) {
- APInt Mask = APInt::getLowBitsSet(BroadcastOpVT.getSizeInBits(), Len);
+ unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
+ unsigned NumDefinedElts = UndefElts.countTrailingZeros();
+
+ if (NumDefinedElts > BroadcastOpBitWidth)
+ return SDValue();
+
+ APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
DAG.getConstant(N, DL, BroadcastOpVT));
Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
@@ -56278,14 +56344,16 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
Subtarget))
return V;
+ }
- if (VT == MVT::i1) {
- X86::CondCode X86CC;
- if (SDValue V =
- MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
- return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
- }
+ if (VT == MVT::i1) {
+ X86::CondCode X86CC;
+ if (SDValue V =
+ MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
+ }
+ if (CC == ISD::SETNE || CC == ISD::SETEQ) {
if (OpVT.isScalarInteger()) {
// cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
// cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
@@ -60062,6 +60130,19 @@ static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Simplify VPMADD52L/VPMADD52H operations.
+static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ MVT VT = N->getSimpleValueType(0);
+ unsigned NumEltBits = VT.getScalarSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
+ DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -60699,6 +60780,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
case X86ISD::VPMADDUBSW:
case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
+ case X86ISD::VPMADD52L:
+ case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
case X86ISD::KSHIFTL:
case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 547b221..d888f9f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1591,7 +1591,6 @@ namespace llvm {
bool useLoadStackGuardNode(const Module &M) const override;
bool useStackGuardXorFP() const override;
void insertSSPDeclarations(Module &M) const override;
- Value *getSDagStackGuard(const Module &M) const override;
Function *getSSPStackGuardCheck(const Module &M) const override;
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
const SDLoc &DL) const override;
@@ -1663,14 +1662,14 @@ namespace llvm {
/// instructions/intrinsics.
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices,
- unsigned Factor) const override;
+ ArrayRef<unsigned> Indices, unsigned Factor,
+ const APInt &GapMask) const override;
/// Lower interleaved store(s) into target specific
/// instructions/intrinsics.
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
- ShuffleVectorInst *SVI,
- unsigned Factor) const override;
+ ShuffleVectorInst *SVI, unsigned Factor,
+ const APInt &GapMask) const override;
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
int JTI, SelectionDAG &DAG) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 7c594d0..1c745a3 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -632,15 +632,6 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
TargetLowering::insertSSPDeclarations(M);
}
-Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
- // MSVC CRT has a global variable holding security cookie.
- if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
- Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
- return M.getGlobalVariable("__security_cookie");
- }
- return TargetLowering::getSDagStackGuard(M);
-}
-
Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index abf365e..5c0deeb 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4399,13 +4399,8 @@ static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
if (STI.hasFP16())
return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
if (Load)
- return STI.hasAVX512() ? X86::VMOVSSZrm
- : STI.hasAVX() ? X86::VMOVSSrm
- : X86::MOVSSrm;
- else
- return STI.hasAVX512() ? X86::VMOVSSZmr
- : STI.hasAVX() ? X86::VMOVSSmr
- : X86::MOVSSmr;
+ return X86::MOVSHPrm;
+ return X86::MOVSHPmr;
}
static unsigned getLoadStoreRegOpcode(Register Reg,
@@ -4903,6 +4898,16 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
CmpMask = ~0;
CmpValue = 0;
return true;
+ case X86::TEST64ri32:
+ case X86::TEST32ri:
+ case X86::TEST16ri:
+ case X86::TEST8ri:
+ SrcReg = MI.getOperand(0).getReg();
+ SrcReg2 = 0;
+ // Force identical compare.
+ CmpMask = 0;
+ CmpValue = 0;
+ return true;
}
return false;
}
@@ -4942,6 +4947,10 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
case X86::CMP32ri:
case X86::CMP16ri:
case X86::CMP8ri:
+ case X86::TEST64ri32:
+ case X86::TEST32ri:
+ case X86::TEST16ri:
+ case X86::TEST8ri:
CASE_ND(SUB64ri32)
CASE_ND(SUB32ri)
CASE_ND(SUB16ri)
@@ -6131,6 +6140,25 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
return true;
}
+static bool expandMOVSHP(MachineInstrBuilder &MIB, MachineInstr &MI,
+ const TargetInstrInfo &TII, bool HasAVX) {
+ unsigned NewOpc;
+ if (MI.getOpcode() == X86::MOVSHPrm) {
+ NewOpc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
+ Register Reg = MI.getOperand(0).getReg();
+ if (Reg > X86::XMM15)
+ NewOpc = X86::VMOVSSZrm;
+ } else {
+ NewOpc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
+ Register Reg = MI.getOperand(5).getReg();
+ if (Reg > X86::XMM15)
+ NewOpc = X86::VMOVSSZmr;
+ }
+
+ MIB->setDesc(TII.get(NewOpc));
+ return true;
+}
+
bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
bool HasAVX = Subtarget.hasAVX();
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
@@ -6203,6 +6231,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
}
+ case X86::MOVSHPmr:
+ case X86::MOVSHPrm:
+ return expandMOVSHP(MIB, MI, *this, Subtarget.hasAVX());
case X86::V_SETALLONES:
return Expand2AddrUndef(MIB,
get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
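
A small sketch of the opcode choice the new expandMOVSHP helper makes when lowering the FP16 spill/reload pseudos (opcode names as strings, not real MCInstrDesc handles): the EVEX-encoded VMOVSSZ form is only needed when the register is one of XMM16-XMM31.

#include <cstdio>
#include <string>

static std::string pickReloadOpcode(bool HasAVX, unsigned XmmIndex) {
  if (XmmIndex > 15)
    return "VMOVSSZrm"; // EVEX encoding required for the high registers
  return HasAVX ? "VMOVSSrm" : "MOVSSrm";
}

int main() {
  std::printf("%s\n", pickReloadOpcode(false, 3).c_str());  // MOVSSrm
  std::printf("%s\n", pickReloadOpcode(true, 3).c_str());   // VMOVSSrm
  std::printf("%s\n", pickReloadOpcode(true, 20).c_str());  // VMOVSSZrm
}
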
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 1acc0cd8..b792649 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -267,6 +267,18 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
}
}
+// pseudo instruction for fp16 spilling.
+let isPseudo = 1, Predicates = [HasSSE2] in {
+ let mayStore = 1 in
+ def MOVSHPmr : I<0, Pseudo, (outs), (ins f32mem:$dst, FR16X:$src), "",
+ [], SSEPackedSingle>,
+ Sched<[WriteFStore]>;
+ let mayLoad = 1 in
+ def MOVSHPrm : I<0, Pseudo, (outs FR16X:$dst), (ins f32mem:$src), "",
+ [], SSEPackedSingle>,
+ Sched<[WriteFLoad]>;
+}
+
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
SSEPackedSingle, UseSSE1>, TB, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 636b072..4188487 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -802,7 +802,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices, unsigned Factor) const {
+ ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
@@ -812,7 +812,7 @@ bool X86TargetLowering::lowerInterleavedLoad(
auto *LI = dyn_cast<LoadInst>(Load);
if (!LI)
return false;
- assert(!Mask && "Unexpected mask on a load");
+ assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
// Create an interleaved access group.
IRBuilder<> Builder(LI);
@@ -825,7 +825,8 @@ bool X86TargetLowering::lowerInterleavedLoad(
bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
Value *LaneMask,
ShuffleVectorInst *SVI,
- unsigned Factor) const {
+ unsigned Factor,
+ const APInt &GapMask) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -836,7 +837,8 @@ bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
auto *SI = dyn_cast<StoreInst>(Store);
if (!SI)
return false;
- assert(!LaneMask && "Unexpected mask on store");
+ assert(!LaneMask && GapMask.popcount() == Factor &&
+ "Unexpected mask on store");
// Holds the indices of SVI that correspond to the starting index of each
// interleaved shuffle.
diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
index cf055cf..090060e 100644
--- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
@@ -491,7 +491,7 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph(
NumGadgets += GadgetCount;
// Traverse CFG to build the rest of the graph
- SmallSet<MachineBasicBlock *, 8> BlocksVisited;
+ SmallPtrSet<MachineBasicBlock *, 8> BlocksVisited;
std::function<void(MachineBasicBlock *, GraphIter, unsigned)> TraverseCFG =
[&](MachineBasicBlock *MBB, GraphIter GI, unsigned ParentDepth) {
unsigned LoopDepth = MLI.getLoopDepth(MBB);
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
index 3b4e531..2a1c499 100644
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -100,7 +100,7 @@ struct BBInfo {
class X86PreTileConfig : public MachineFunctionPass {
MachineRegisterInfo *MRI = nullptr;
const MachineLoopInfo *MLI = nullptr;
- SmallSet<MachineInstr *, 8> DefVisited;
+ SmallPtrSet<MachineInstr *, 8> DefVisited;
DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo;
DenseMap<MachineBasicBlock *, SmallVector<MIRef, 8>> ShapeBBs;
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 83b11ee..b79e508 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -21,8 +21,8 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TileShapeInfo.h"
@@ -204,15 +204,7 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
// we can still use 64-bit register as long as we know the high bits
// are zeros.
// Reflect that in the returned register class.
- if (Is64Bit) {
- // When the target also allows 64-bit frame pointer and we do have a
- // frame, this is fine to use it for the address accesses as well.
- const X86FrameLowering *TFI = getFrameLowering(MF);
- return TFI->hasFP(MF) && TFI->Uses64BitFramePtr
- ? &X86::LOW32_ADDR_ACCESS_RBPRegClass
- : &X86::LOW32_ADDR_ACCESSRegClass;
- }
- return &X86::GR32RegClass;
+ return Is64Bit ? &X86::LOW32_ADDR_ACCESSRegClass : &X86::GR32RegClass;
case 1: // Normal GPRs except the stack pointer (for encoding reasons).
if (Subtarget.isTarget64BitLP64())
return &X86::GR64_NOSPRegClass;
@@ -907,7 +899,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
// Determine base register and offset.
- int FIOffset;
+ int64_t FIOffset;
Register BasePtr;
if (MI.isReturn()) {
assert((!hasStackRealignment(MF) ||
@@ -958,11 +950,41 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
if (MI.getOperand(FIOperandNum+3).isImm()) {
- // Offset is a 32-bit integer.
- int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm());
- int Offset = FIOffset + Imm;
- assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) &&
- "Requesting 64-bit offset in 32-bit immediate!");
+ const X86InstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ int64_t Imm = MI.getOperand(FIOperandNum + 3).getImm();
+ int64_t Offset = FIOffset + Imm;
+ bool FitsIn32Bits = isInt<32>(Offset);
+ // If the offset will not fit in a 32-bit displacement, then for 64-bit
+ // targets, scavenge a register to hold it. Otherwise...
+ if (Is64Bit && !FitsIn32Bits) {
+ assert(RS && "RegisterScavenger was NULL");
+
+ RS->enterBasicBlockEnd(MBB);
+ RS->backward(std::next(II));
+
+ Register ScratchReg = RS->scavengeRegisterBackwards(
+ X86::GR64RegClass, II, /*RestoreAfter=*/false, /*SPAdj=*/0,
+ /*AllowSpill=*/true);
+ assert(ScratchReg != 0 && "scratch reg was 0");
+ RS->setRegUsed(ScratchReg);
+
+ BuildMI(MBB, II, DL, TII->get(X86::MOV64ri), ScratchReg).addImm(Offset);
+
+ MI.getOperand(FIOperandNum + 3).setImm(0);
+ MI.getOperand(FIOperandNum + 2).setReg(ScratchReg);
+
+ return false;
+ }
+
+ // ... for 32-bit targets, this is a bug!
+ if (!Is64Bit && !FitsIn32Bits) {
+ MI.emitGenericError("64-bit offset calculated but target is 32-bit");
+ // Trap so that the instruction verification pass does not fail if run.
+ BuildMI(MBB, MBBI, DL, TII->get(X86::TRAP));
+ return false;
+ }
+
if (Offset != 0 || !tryOptimizeLEAtoMOV(II))
MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset);
} else {
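
Conceptually, the new 64-bit path above rewrites an un-encodable [base + disp] access into [base + index] with the offset materialized into a scavenged register. A plain-arithmetic sketch of that equivalence, assuming the usual scale of 1 and made-up addresses rather than MachineOperands:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Base = 0x7fff'0000'0000ULL;
  int64_t Offset = (1LL << 33) + 8; // too large for a signed 32-bit displacement

  // Before: [Base + disp32]  -- not encodable for this Offset.
  // After:  mov Offset -> Scratch ; [Base + 1*Scratch + 0]
  uint64_t Scratch = (uint64_t)Offset;
  uint64_t Address = Base + 1 * Scratch + 0;
  std::printf("0x%llx\n", (unsigned long long)Address);
}
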
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h
index 19b409a..2f4c55c 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.h
+++ b/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H
#define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#define GET_REGINFO_HEADER
@@ -180,6 +181,10 @@ public:
constrainRegClassToNonRex2(const TargetRegisterClass *RC) const;
bool isNonRex2RegClass(const TargetRegisterClass *RC) const;
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override {
+ return true;
+ }
};
} // End llvm namespace
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index e9ca25d..99b7910 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -716,10 +716,7 @@ def GR64_NOREX2_NOSP : RegisterClass<"X86", [i64], 64,
// which we do not have right now.
def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>;
-// When RBP is used as a base pointer in a 32-bit addresses environment,
-// this is also safe to use the full register to access addresses.
-// Since RBP will never be spilled, stick to a 32 alignment to save
-// on memory consumption.
+// FIXME: This is unused, but deleting it results in codegen changes
def LOW32_ADDR_ACCESS_RBP : RegisterClass<"X86", [i32], 32,
(add LOW32_ADDR_ACCESS, RBP)>;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 8cd52e2..f15a7c7 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -70,6 +70,12 @@ def SKLPortAny : ProcResGroup<[SKLPort0, SKLPort1, SKLPort2, SKLPort3, SKLPort4,
let BufferSize=60;
}
+// Skylake can retire up to four (potentially fused) uops per cycle. Set the
+// limit to twice that given we do not model fused uops as only taking up one
+// retirement slot. I could not find any documented sources on how many
+// in-flight micro-ops can be tracked.
+def SKRCU : RetireControlUnit<0, 8>;
+
// Integer loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
// cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 5>;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index 14a51d1e..2a793d0 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -70,6 +70,12 @@ def SKXPortAny : ProcResGroup<[SKXPort0, SKXPort1, SKXPort2, SKXPort3, SKXPort4,
let BufferSize=60;
}
+// Skylake can retire up to four (potentially fused) uops per cycle. Set the
+// limit to twice that given we do not model fused uops as only taking up one
+// retirement slot. I could not find any documented sources on how many
+// in-flight micro-ops can be tracked.
+def SKXRCU : RetireControlUnit<0, 8>;
+
// Integer loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
// cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 5>;
diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td
index c92bc97..133c1a4 100644
--- a/llvm/lib/Target/X86/X86ScheduleAtom.td
+++ b/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -562,14 +562,7 @@ def AtomWrite0_1_7_4 : SchedWriteRes<[AtomPort0,AtomPort1]> {
let ReleaseAtCycles = [8,8];
let NumMicroOps = 4;
}
-def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSrr(_Int)?")>;
-
-def AtomWrite0_1_8_4 : SchedWriteRes<[AtomPort0,AtomPort1]> {
- let Latency = 8;
- let ReleaseAtCycles = [8,8];
- let NumMicroOps = 4;
-}
-def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSrm(_Int)?")>;
+def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSr(r|m)(_Int)?")>;
def AtomWrite0_1_9 : SchedWriteRes<[AtomPort0,AtomPort1]> {
let Latency = 9;
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 8ad8d42..3745c1e 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -280,7 +280,7 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
}
// Disable 64-bit only features in non-64-bit mode.
- SmallVector<StringRef, 9> FeaturesIn64BitOnly = {
+ StringRef FeaturesIn64BitOnly[] = {
"egpr", "push2pop2", "ppx", "ndd", "ccmp", "nf", "cf", "zu", "uintr"};
if (FullFS.find("-64bit-mode") != std::string::npos)
for (StringRef F : FeaturesIn64BitOnly)
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 90791fc..62f9527 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -161,19 +161,26 @@ std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}
+enum ClassIDEnum { GPRClass = 0, VectorClass = 1, ScalarFPClass = 2 };
+
+unsigned X86TTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
+ return Vector ? VectorClass
+ : Ty && Ty->isFloatingPointTy() ? ScalarFPClass
+ : GPRClass;
+}
+
unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
- bool Vector = (ClassID == 1);
- if (Vector && !ST->hasSSE1())
+ if (ClassID == VectorClass && !ST->hasSSE1())
return 0;
- if (ST->is64Bit()) {
- if (Vector && ST->hasAVX512())
- return 32;
- if (!Vector && ST->hasEGPR())
- return 32;
- return 16;
- }
- return 8;
+ if (!ST->is64Bit())
+ return 8;
+
+ if ((ClassID == GPRClass && ST->hasEGPR()) ||
+ (ClassID != GPRClass && ST->hasAVX512()))
+ return 32;
+
+ return 16;
}
bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const {
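
A standalone mirror of the classification introduced above (the enum values match; the helper itself is a sketch, not the TTI hook): vector types, scalar floating-point types, and everything else now map to distinct register-class IDs, so register-pressure heuristics can count FP and GPR usage separately.

#include <cstdio>

enum ClassID { GPRClass = 0, VectorClass = 1, ScalarFPClass = 2 };

static ClassID classify(bool IsVector, bool IsFloatingPoint) {
  if (IsVector)
    return VectorClass;
  return IsFloatingPoint ? ScalarFPClass : GPRClass;
}

int main() {
  std::printf("%d %d %d\n",
              classify(true, false),   // vector type      -> 1
              classify(false, true),   // scalar float     -> 2
              classify(false, false)); // integer/pointer  -> 0
}
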
@@ -5488,9 +5495,10 @@ InstructionCost X86TTIImpl::getPointersChainCost(
return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
}
-InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
- ScalarEvolution *SE,
- const SCEV *Ptr) const {
+InstructionCost
+X86TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+ const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
@@ -5504,7 +5512,7 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
// Even in the case of (loop invariant) stride whose value is not known at
// compile time, the address computation will not incur more than one extra
// ADD instruction.
- if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
+ if (PtrTy->isVectorTy() && SE && !ST->hasAVX2()) {
// TODO: AVX2 is the current cut-off because we don't have correct
// interleaving costs for prior ISA's.
if (!BaseT::isStridedAccess(Ptr))
@@ -5513,7 +5521,7 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
return 1;
}
- return BaseT::getAddressComputationCost(Ty, SE, Ptr);
+ return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
}
InstructionCost
@@ -6525,8 +6533,8 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
for (const Instruction &I : instructions(Callee)) {
if (const auto *CB = dyn_cast<CallBase>(&I)) {
- // Having more target features is fine for inline ASM.
- if (CB->isInlineAsm())
+ // Having more target features is fine for inline ASM and intrinsics.
+ if (CB->isInlineAsm() || CB->getIntrinsicID() != Intrinsic::not_intrinsic)
continue;
SmallVector<Type *, 8> Types;
@@ -6542,19 +6550,9 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
if (all_of(Types, IsSimpleTy))
continue;
- if (Function *NestedCallee = CB->getCalledFunction()) {
- // Assume that intrinsics are always ABI compatible.
- if (NestedCallee->isIntrinsic())
- continue;
-
- // Do a precise compatibility check.
- if (!areTypesABICompatible(Caller, NestedCallee, Types))
- return false;
- } else {
- // We don't know the target features of the callee,
- // assume it is incompatible.
+ // Do a precise compatibility check.
+ if (!areTypesABICompatible(Caller, Callee, Types))
return false;
- }
}
}
return true;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index bc06c47..133b366 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -132,6 +132,7 @@ public:
/// @{
unsigned getNumberOfRegisters(unsigned ClassID) const override;
+ unsigned getRegisterClassForType(bool Vector, Type *Ty) const override;
bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const override;
TypeSize
getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override;
@@ -194,8 +195,9 @@ public:
getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
const TTI::PointersChainInfo &Info, Type *AccessTy,
TTI::TargetCostKind CostKind) const override;
- InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
- const SCEV *Ptr) const override;
+ InstructionCost
+ getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const override;
std::optional<Instruction *>
instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;
diff --git a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
index e9081a4..9bf0abb 100644
--- a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
+++ b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
@@ -105,6 +105,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
// Prolog information.
SmallVector<int64_t> PushedRegs;
bool HasStackAlloc = false;
+ bool HasSetFrame = false;
unsigned ApproximatePrologCodeCount = 0;
// Requested changes.
@@ -130,15 +131,20 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
break;
case X86::SEH_StackAlloc:
- case X86::SEH_SetFrame:
if (State != FunctionState::InProlog)
- llvm_unreachable("SEH_StackAlloc or SEH_SetFrame outside of prolog");
+ llvm_unreachable("SEH_StackAlloc outside of prolog");
// Assume a large alloc...
- ApproximatePrologCodeCount +=
- (MI.getOpcode() == X86::SEH_StackAlloc) ? 3 : 1;
+ ApproximatePrologCodeCount += 3;
HasStackAlloc = true;
break;
+ case X86::SEH_SetFrame:
+ if (State != FunctionState::InProlog)
+ llvm_unreachable("SEH_SetFrame outside of prolog");
+ ApproximatePrologCodeCount++;
+ HasSetFrame = true;
+ break;
+
case X86::SEH_SaveReg:
case X86::SEH_SaveXMM:
if (State != FunctionState::InProlog)
@@ -191,6 +197,29 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
break;
case X86::MOV64rr:
+ if (State == FunctionState::InEpilog) {
+ // A mov here restores the stack pointer from the frame pointer; this
+ // is only valid if the prolog set a frame, and it must come before
+ // any pops or the stack deallocation.
+ if (!HasSetFrame)
+ return rejectCurrentFunctionInternalError(
+ MF, Mode,
+ "The epilog is setting frame back, but prolog did not set it");
+ if (PoppedRegCount > 0)
+ return rejectCurrentFunctionInternalError(
+ MF, Mode,
+ "The epilog is setting the frame back after popping "
+ "registers");
+ if (HasStackDealloc)
+ return rejectCurrentFunctionInternalError(
+ MF, Mode,
+ "Cannot set the frame back after the stack "
+ "allocation has been deallocated");
+ } else if (State == FunctionState::FinishedEpilog)
+ return rejectCurrentFunctionInternalError(
+ MF, Mode, "Unexpected mov instruction after the epilog");
+ break;
+
+ case X86::LEA64r:
case X86::ADD64ri32:
if (State == FunctionState::InEpilog) {
// If the prolog contains a stack allocation, then the first
@@ -201,51 +230,55 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
"The epilog is deallocating a stack "
"allocation, but the prolog did "
"not allocate one");
- if (HasStackDealloc)
+ if (PoppedRegCount > 0)
return rejectCurrentFunctionInternalError(
MF, Mode,
- "The epilog is deallocating the stack "
- "allocation more than once");
- if (PoppedRegCount > 0)
- llvm_unreachable(
- "Should have raised an error: either popping before "
- "deallocating or deallocating without an allocation");
+ "The epilog is deallocating a stack allocation after popping "
+ "registers");
HasStackDealloc = true;
} else if (State == FunctionState::FinishedEpilog)
return rejectCurrentFunctionInternalError(
- MF, Mode, "Unexpected mov or add instruction after the epilog");
+ MF, Mode, "Unexpected lea or add instruction after the epilog");
break;
case X86::POP64r:
if (State == FunctionState::InEpilog) {
- // After the stack pointer has been adjusted, the epilog must
- // POP each register in reverse order of the PUSHes in the prolog.
- PoppedRegCount++;
- if (HasStackAlloc != HasStackDealloc)
- return rejectCurrentFunctionInternalError(
- MF, Mode,
- "Cannot pop registers before the stack "
- "allocation has been deallocated");
- if (PoppedRegCount > PushedRegs.size())
- return rejectCurrentFunctionInternalError(
- MF, Mode,
- "The epilog is popping more registers than the prolog pushed");
- if (PushedRegs[PushedRegs.size() - PoppedRegCount] !=
- MI.getOperand(0).getReg())
- return rejectCurrentFunctionInternalError(
- MF, Mode,
- "The epilog is popping a registers in "
- "a different order than the "
- "prolog pushed them");
-
- // Unwind v2 records the size of the epilog not from where we place
- // SEH_BeginEpilogue (as that contains the instruction to adjust the
- // stack pointer) but from the first POP instruction (if there is
- // one).
- if (!UnwindV2StartLocation) {
- assert(PoppedRegCount == 1);
- UnwindV2StartLocation = &MI;
+ Register Reg = MI.getOperand(0).getReg();
+ if (HasStackAlloc && (PoppedRegCount == 0) &&
+ !llvm::is_contained(PushedRegs, Reg)) {
+ // If this is a pop that doesn't correspond to the set of pushed
+ // registers, then assume it was used to adjust the stack pointer.
+ HasStackDealloc = true;
+ } else {
+ // After the stack pointer has been adjusted, the epilog must
+ // POP each register in reverse order of the PUSHes in the prolog.
+ PoppedRegCount++;
+ if (HasStackAlloc != HasStackDealloc)
+ return rejectCurrentFunctionInternalError(
+ MF, Mode,
+ "Cannot pop registers before the stack "
+ "allocation has been deallocated");
+ if (PoppedRegCount > PushedRegs.size())
+ return rejectCurrentFunctionInternalError(
+ MF, Mode,
+ "The epilog is popping more registers than the prolog "
+ "pushed");
+ if (PushedRegs[PushedRegs.size() - PoppedRegCount] != Reg.id())
+ return rejectCurrentFunctionInternalError(
+ MF, Mode,
+ "The epilog is popping a registers in "
+ "a different order than the "
+ "prolog pushed them");
+
+ // Unwind v2 records the size of the epilog not from where we place
+ // SEH_BeginEpilogue (as that contains the instruction to adjust the
+ // stack pointer) but from the first POP instruction (if there is
+ // one).
+ if (!UnwindV2StartLocation) {
+ assert(PoppedRegCount == 1);
+ UnwindV2StartLocation = &MI;
+ }
}
} else if (State == FunctionState::FinishedEpilog)
// Unexpected instruction after the epilog.
@@ -272,11 +305,8 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
}
}
- if (UnwindV2StartLocations.empty()) {
- assert(State == FunctionState::InProlog &&
- "If there are no epilogs, then there should be no prolog");
+ if (UnwindV2StartLocations.empty())
return false;
- }
MachineBasicBlock &FirstMBB = MF.front();
// Assume +1 for the "header" UOP_Epilog that contains the epilog size, and
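
A toy version (plain C++, not MachineInstr iteration) of the pop-order rule the rewritten epilog validation enforces: once the stack allocation has been released, the epilog's POPs must undo the prolog's PUSHes in reverse order, otherwise the function is rejected for Unwind v2.

#include <cstdio>
#include <string>
#include <vector>

static bool popsMatchPushes(const std::vector<std::string> &Pushed,
                            const std::vector<std::string> &Popped) {
  if (Popped.size() > Pushed.size())
    return false;
  for (size_t I = 0; I < Popped.size(); ++I)
    if (Pushed[Pushed.size() - 1 - I] != Popped[I])
      return false;
  return true;
}

int main() {
  std::vector<std::string> Pushed = {"rbp", "rbx", "rsi"};
  std::printf("%d\n", popsMatchPushes(Pushed, {"rsi", "rbx", "rbp"})); // 1: reverse order
  std::printf("%d\n", popsMatchPushes(Pushed, {"rbx", "rsi", "rbp"})); // 0: rejected
}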