Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp |  63
-rw-r--r--  llvm/lib/Target/X86/X86AsmPrinter.h                 |   1
-rw-r--r--  llvm/lib/Target/X86/X86CallingConv.cpp              |  31
-rw-r--r--  llvm/lib/Target/X86/X86CallingConv.td               |   5
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp             |  33
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h               |   5
-rw-r--r--  llvm/lib/Target/X86/X86ISelLoweringCall.cpp         |  15
-rw-r--r--  llvm/lib/Target/X86/X86InterleavedAccess.cpp        |  15
-rw-r--r--  llvm/lib/Target/X86/X86MCInstLower.cpp              | 208
9 files changed, 292 insertions, 84 deletions
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 3d060c6..e213923 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -127,7 +127,6 @@ class X86AsmBackend : public MCAsmBackend {
unsigned PrevInstOpcode = 0;
MCBoundaryAlignFragment *PendingBA = nullptr;
std::pair<MCFragment *, size_t> PrevInstPosition;
- bool IsRightAfterData = false;
uint8_t determinePaddingPrefix(const MCInst &Inst) const;
bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const;
@@ -156,10 +155,13 @@ public:
AlignBranchType = X86AlignBranchKindLoc;
if (X86PadMaxPrefixSize.getNumOccurrences())
TargetPrefixMax = X86PadMaxPrefixSize;
+
+ AllowAutoPadding =
+ AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone;
+ AllowEnhancedRelaxation =
+ AllowAutoPadding && TargetPrefixMax != 0 && X86PadForBranchAlign;
}
- bool allowAutoPadding() const override;
- bool allowEnhancedRelaxation() const override;
void emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst,
const MCSubtargetInfo &STI);
void emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst);
@@ -365,14 +367,6 @@ static bool hasVariantSymbol(const MCInst &MI) {
return false;
}
-bool X86AsmBackend::allowAutoPadding() const {
- return (AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone);
-}
-
-bool X86AsmBackend::allowEnhancedRelaxation() const {
- return allowAutoPadding() && TargetPrefixMax != 0 && X86PadForBranchAlign;
-}
-
/// X86 has certain instructions which enable interrupts exactly one
/// instruction *after* the instruction which stores to SS. Return true if the
/// given instruction may have such an interrupt delay slot.
@@ -447,7 +441,7 @@ bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const {
// semantic.
return false;
- if (IsRightAfterData)
+ if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition))
// If this instruction follows any data, there is no clear
// instruction boundary, inserting a nop/prefix would change the semantics.
return false;
@@ -484,13 +478,26 @@ bool X86AsmBackend::needAlign(const MCInst &Inst) const {
(AlignBranchType & X86::AlignBranchIndirect));
}
+void X86_MC::emitInstruction(MCObjectStreamer &S, const MCInst &Inst,
+ const MCSubtargetInfo &STI) {
+ bool AutoPadding = S.getAllowAutoPadding();
+ if (LLVM_LIKELY(!AutoPadding && !X86PadForAlign)) {
+ S.MCObjectStreamer::emitInstruction(Inst, STI);
+ return;
+ }
+
+ auto &Backend = static_cast<X86AsmBackend &>(S.getAssembler().getBackend());
+ Backend.emitInstructionBegin(S, Inst, STI);
+ S.MCObjectStreamer::emitInstruction(Inst, STI);
+ Backend.emitInstructionEnd(S, Inst);
+}
+
/// Insert BoundaryAlignFragment before instructions to align branches.
void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
const MCInst &Inst, const MCSubtargetInfo &STI) {
- // Used by canPadInst. Done here, because in emitInstructionEnd, the current
- // fragment will have changed.
- IsRightAfterData =
- isRightAfterData(OS.getCurrentFragment(), PrevInstPosition);
+ bool CanPadInst = canPadInst(Inst, OS);
+ if (CanPadInst)
+ OS.getCurrentFragment()->setAllowAutoPadding(true);
if (!canPadBranches(OS))
return;
@@ -504,7 +511,7 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
// we call canPadInst (not cheap) twice. However, in the common case, we can
// avoid unnecessary calls to that, as this is otherwise only used for
// relaxable fragments.
- if (!canPadInst(Inst, OS))
+ if (!CanPadInst)
return;
if (PendingBA && PendingBA->getNext() == OS.getCurrentFragment()) {
@@ -542,11 +549,8 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
/// Set the last fragment to be aligned for the BoundaryAlignFragment.
void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
const MCInst &Inst) {
- MCFragment *CF = OS.getCurrentFragment();
- if (CF->getKind() == MCFragment::FT_Relaxable)
- CF->setAllowAutoPadding(canPadInst(Inst, OS));
-
// Update PrevInstOpcode here, canPadInst() reads that.
+ MCFragment *CF = OS.getCurrentFragment();
PrevInstOpcode = Inst.getOpcode();
PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF));
@@ -567,11 +571,10 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
// DataFragment, so that we can get the size of instructions later in
// MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty
// DataFragment.
- OS.insert(OS.getContext().allocFragment<MCFragment>());
+ OS.newFragment();
// Update the maximum alignment on the current section if necessary.
- MCSection *Sec = OS.getCurrentSectionOnly();
- Sec->ensureMinAlignment(AlignBoundary);
+ CF->getParent()->ensureMinAlignment(AlignBoundary);
}
std::optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const {
@@ -923,13 +926,11 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
continue;
}
- const uint64_t OrigSize = Asm.computeFragmentSize(F);
-
// To keep the effects local, prefer to relax instructions closest to
// the align directive. This is purely about human understandability
// of the resulting code. If we later find a reason to expand
// particular instructions over others, we can adjust.
- unsigned RemainingSize = OrigSize;
+ unsigned RemainingSize = Asm.computeFragmentSize(F) - F.getFixedSize();
while (!Relaxable.empty() && RemainingSize != 0) {
auto &RF = *Relaxable.pop_back_val();
// Give the backend a chance to play any tricks it wishes to increase
@@ -1542,14 +1543,6 @@ public:
};
} // end anonymous namespace
-void X86_MC::emitInstruction(MCObjectStreamer &S, const MCInst &Inst,
- const MCSubtargetInfo &STI) {
- auto &Backend = static_cast<X86AsmBackend &>(S.getAssembler().getBackend());
- Backend.emitInstructionBegin(S, Inst, STI);
- S.MCObjectStreamer::emitInstruction(Inst, STI);
- Backend.emitInstructionEnd(S, Inst);
-}
-
void X86ELFStreamer::emitInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI) {
X86_MC::emitInstruction(*this, Inst, STI);
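Note on the hunks above: the cached IsRightAfterData flag is gone, and canPadInst now queries isRightAfterData() directly at the point of use. Only the call site is visible in this diff; the following is a minimal sketch of the contract that call site implies (the body is an assumption for illustration, not the actual implementation):

    static bool isRightAfterDataSketch(
        const MCFragment *Cur,
        const std::pair<MCFragment *, size_t> &PrevInstPosition) {
      // Padding is unsafe when raw data was emitted since the last
      // instruction: either the previous instruction ended in a different
      // fragment, or the current fragment has grown past the recorded end
      // of that instruction.
      return Cur != PrevInstPosition.first ||
             Cur->getFixedSize() != PrevInstPosition.second;
    }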
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h
index efb951b..e02b556 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -151,6 +151,7 @@ private:
MCSymbol *LazyPointer) override;
void emitCallInstruction(const llvm::MCInst &MCI);
+ void maybeEmitNopAfterCallForWindowsEH(const MachineInstr *MI);
// Emits a label to mark the next instruction as being relevant to Import Call
// Optimization.
diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp
index 0b4c63f..5d5a705 100644
--- a/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -374,5 +374,36 @@ static bool CC_X86_64_I128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true;
}
+/// Special handling for i128 and fp128: on x86-32, i128 and fp128 get legalized
+/// as four i32s, but fp128 must be passed on the stack with 16-byte alignment.
+/// Technically only fp128 has a specified ABI, but it makes sense to handle
+/// i128 the same until we hear differently.
+static bool CC_X86_32_I128_FP128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ assert(ValVT == MVT::i32 && "Should have i32 parts");
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+ PendingMembers.push_back(
+ CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+ if (!ArgFlags.isInConsecutiveRegsLast())
+ return true;
+
+ assert(PendingMembers.size() == 4 && "Should have four parts");
+
+ int64_t Offset = State.AllocateStack(16, Align(16));
+ PendingMembers[0].convertToMem(Offset);
+ PendingMembers[1].convertToMem(Offset + 4);
+ PendingMembers[2].convertToMem(Offset + 8);
+ PendingMembers[3].convertToMem(Offset + 12);
+
+ State.addLoc(PendingMembers[0]);
+ State.addLoc(PendingMembers[1]);
+ State.addLoc(PendingMembers[2]);
+ State.addLoc(PendingMembers[3]);
+ PendingMembers.clear();
+ return true;
+}
+
// Provides entry points of CC_X86 and RetCC_X86.
#include "X86GenCallingConv.inc"
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 823e0caa..f020e0b 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -859,6 +859,11 @@ def CC_X86_32_C : CallingConv<[
// The 'nest' parameter, if any, is passed in ECX.
CCIfNest<CCAssignToReg<[ECX]>>,
+ // i128 and fp128 need to be passed on the stack with a higher alignment than
+ // their legal types. Handle this with a custom function.
+ CCIfType<[i32],
+ CCIfConsecutiveRegs<CCCustom<"CC_X86_32_I128_FP128">>>,
+
// On swifttailcc pass swiftself in ECX.
CCIfCC<"CallingConv::SwiftTail",
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[ECX]>>>>,
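Together with the custom handler in X86CallingConv.cpp above, this rule gives fp128 (and i128) on x86-32 a single 16-byte-aligned stack slot holding the four i32 pieces at offsets 0, 4, 8, and 12. A minimal caller that would exercise the path (hypothetical test code, assuming an x86-32 target whose toolchain supports __float128):

    extern "C" void takes_fp128(__float128 X); // hypothetical callee

    void Caller() {
      // The argument is legalized to four i32 parts; the custom handler
      // places all four into one Align(16) slot rather than four
      // independently allocated 4-byte slots.
      takes_fp128(static_cast<__float128>(1.0));
    }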
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d91ea1ea..11ab8dc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1323,11 +1323,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
}
- if (Subtarget.hasGFNI()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
setOperationAction(ISD::BITREVERSE, MVT::i8, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i16, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i32, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i64, Custom);
+
+ for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
+ }
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
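For the vector types marked Custom here, GFNI can implement BITREVERSE without the usual SSSE3 nibble-shuffle sequence: a single GF2P8AFFINEQB with a bit-reversal matrix flips the bits of every byte. A sketch using intrinsics (the 0x8040201008040201 constant is the commonly cited bit-reverse affine matrix; the exact sequence the backend emits is not shown in this patch):

    #include <immintrin.h>

    // Reverse the bits within each byte of a 128-bit vector, i.e. a v16i8
    // BITREVERSE. Requires GFNI (e.g. compile with -mgfni -msse2).
    __m128i BitreverseBytes(__m128i V) {
      const __m128i M = _mm_set1_epi64x((long long)0x8040201008040201ULL);
      return _mm_gf2p8affine_epi64_epi8(V, M, 0);
    }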
@@ -4997,9 +5001,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
EVT VT = Op.getValueType();
unsigned SizeInBits = VT.getSizeInBits();
- assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
unsigned NumElts = SizeInBits / EltSizeInBits;
+ // Can't split constant.
+ if ((SizeInBits % EltSizeInBits) != 0)
+ return false;
+
// Bitcast a source array of element bits to the target size.
auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
unsigned NumSrcElts = UndefSrcElts.getBitWidth();
@@ -32694,7 +32701,8 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
if (Subtarget.hasXOP() && !VT.is512BitVector())
return LowerBITREVERSE_XOP(Op, DAG);
- assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
+ assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) &&
+ "SSSE3 or GFNI required for BITREVERSE");
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
@@ -45054,6 +45062,10 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
unsigned NumElts = DemandedElts.getBitWidth();
switch (Op.getOpcode()) {
+ case X86ISD::GlobalBaseReg:
+ case X86ISD::Wrapper:
+ case X86ISD::WrapperRIP:
+ return true;
case X86ISD::BLENDI:
case X86ISD::PSHUFD:
case X86ISD::UNPCKL:
@@ -45093,27 +45105,34 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
switch (Op.getOpcode()) {
+ // SSE vector insert/extracts use modulo indices.
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW:
+ case X86ISD::PEXTRB:
+ case X86ISD::PEXTRW:
+ return false;
// SSE vector multiplies are either inbounds or saturate.
case X86ISD::VPMADDUBSW:
case X86ISD::VPMADDWD:
+ return false;
// SSE vector shifts handle out of bounds shift amounts.
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI:
return false;
- // SSE blends.
+ // SSE blends.
case X86ISD::BLENDI:
case X86ISD::BLENDV:
return false;
- // SSE target shuffles.
+ // SSE target shuffles.
case X86ISD::PSHUFD:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VPERMILPI:
case X86ISD::VPERMV3:
return false;
- // SSE comparisons handle all icmp/fcmp cases.
- // TODO: Add CMPM/MM with test coverage.
+ // SSE comparisons handle all icmp/fcmp cases.
+ // TODO: Add CMPM/MM with test coverage.
case X86ISD::CMPP:
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT:
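The new PINSR/PEXTR cases rely on these instructions masking their lane index to the vector width, so even an out-of-range index selects a real lane rather than producing poison. A conceptual model of that behavior (illustration only, not LLVM code):

    #include <cstdint>

    // PEXTRB-style lane read: the index is reduced modulo the lane count,
    // so every input yields a defined result (never poison/UB).
    uint8_t ExtractByteModel(const uint8_t (&Lanes)[16], unsigned Idx) {
      return Lanes[Idx % 16];
    }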
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 6bcb7a3..547b221 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1661,14 +1661,15 @@ namespace llvm {
/// Lower interleaved load(s) into target specific
/// instructions/intrinsics.
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
/// Lower interleaved store(s) into target specific
/// instructions/intrinsics.
- bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+ ShuffleVectorInst *SVI,
unsigned Factor) const override;
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 9ad3553..b4639ac 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -237,9 +237,18 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg,
const DataLayout &DL) const {
- // i128 split into i64 needs to be allocated to two consecutive registers,
- // or spilled to the stack as a whole.
- return Ty->isIntegerTy(128);
+ // On x86-64 i128 is split into two i64s and needs to be allocated to two
+ // consecutive registers, or spilled to the stack as a whole. On x86-32 i128
+ // is split to four i32s and never actually passed in registers, but we use
+ // the consecutive register mark to match it in TableGen.
+ if (Ty->isIntegerTy(128))
+ return true;
+
+ // On x86-32, fp128 acts the same as i128.
+ if (Subtarget.is32Bit() && Ty->isFP128Ty())
+ return true;
+
+ return false;
}
/// Helper for getByValTypeAlignment to determine
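For context on what the consecutive-register mark buys on each target: on x86-64 an i128 argument must land in two adjacent GPRs or be spilled whole, while on x86-32 the mark exists only so the TableGen rule above can collect the four i32 pieces. A hypothetical signature, with the behavior specified by the SysV AMD64 ABI:

    extern "C" __int128 Add128(__int128 A, __int128 B); // hypothetical

    // On x86-64 SysV, A arrives in the consecutive pair RDI:RSI and B in
    // RDX:RCX; if no adjacent pair is available, the value goes to the
    // stack as a single 16-byte unit.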
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 1eb47e3..636b072 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -801,7 +801,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
// number of shuffles and ISA.
// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -809,6 +809,11 @@ bool X86TargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(Load);
+ if (!LI)
+ return false;
+ assert(!Mask && "Unexpected mask on a load");
+
// Create an interleaved access group.
IRBuilder<> Builder(LI);
X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
@@ -817,7 +822,8 @@ bool X86TargetLowering::lowerInterleavedLoad(
return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}
-bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
+ Value *LaneMask,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
@@ -827,6 +833,11 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
0 &&
"Invalid interleaved store");
+ auto *SI = dyn_cast<StoreInst>(Store);
+ if (!SI)
+ return false;
+ assert(!LaneMask && "Unexpected mask on store");
+
// Holds the indices of SVI that correspond to the starting index of each
// interleaved shuffle.
auto Mask = SVI->getShuffleMask();
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 45d596b..481a9be 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -32,6 +32,7 @@
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Mangler.h"
@@ -833,6 +834,7 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
CallInst.setOpcode(CallOpcode);
CallInst.addOperand(CallTargetMCOp);
OutStreamer->emitInstruction(CallInst, getSubtargetInfo());
+ maybeEmitNopAfterCallForWindowsEH(&MI);
}
// Record our statepoint node in the same section used by STACKMAP
@@ -1430,21 +1432,6 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
OutStreamer->emitLabel(FallthroughLabel);
}
-// Returns instruction preceding MBBI in MachineFunction.
-// If MBBI is the first instruction of the first basic block, returns null.
-static MachineBasicBlock::const_iterator
-PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
- const MachineBasicBlock *MBB = MBBI->getParent();
- while (MBBI == MBB->begin()) {
- if (MBB == &MBB->getParent()->front())
- return MachineBasicBlock::const_iterator();
- MBB = MBB->getPrevNode();
- MBBI = MBB->end();
- }
- --MBBI;
- return MBBI;
-}
-
static unsigned getSrcIdx(const MachineInstr* MI, unsigned SrcIdx) {
if (X86II::isKMasked(MI->getDesc().TSFlags)) {
// Skip mask operand.
@@ -2271,6 +2258,9 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
OutStreamer->AddComment("EVEX TO EVEX Compression ", false);
}
+ // We use this to suppress NOP padding for Windows EH.
+ bool IsTailJump = false;
+
switch (MI->getOpcode()) {
case TargetOpcode::DBG_VALUE:
llvm_unreachable("Should be handled target independently");
@@ -2325,6 +2315,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
// Lower this as normal, but add a comment.
OutStreamer->AddComment("TAILCALL");
+ IsTailJump = true;
break;
case X86::TAILJMPr:
@@ -2340,6 +2331,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
// Lower these as normal, but add some comments.
OutStreamer->AddComment("TAILCALL");
+ IsTailJump = true;
break;
case X86::TAILJMPm64_REX:
@@ -2349,6 +2341,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
}
OutStreamer->AddComment("TAILCALL");
+ IsTailJump = true;
break;
case X86::TAILJMPr64_REX: {
@@ -2361,6 +2354,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
}
OutStreamer->AddComment("TAILCALL");
+ IsTailJump = true;
break;
}
@@ -2537,26 +2531,6 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
case X86::SEH_BeginEpilogue: {
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
- // Windows unwinder will not invoke function's exception handler if IP is
- // either in prologue or in epilogue. This behavior causes a problem when a
- // call immediately precedes an epilogue, because the return address points
- // into the epilogue. To cope with that, we insert a 'nop' if it ends up
- // immediately after a CALL in the final emitted code.
- MachineBasicBlock::const_iterator MBBI(MI);
- // Check if preceded by a call and emit nop if so.
- for (MBBI = PrevCrossBBInst(MBBI);
- MBBI != MachineBasicBlock::const_iterator();
- MBBI = PrevCrossBBInst(MBBI)) {
- // Pseudo instructions that aren't a call are assumed to not emit any
- // code. If they do, we worst case generate unnecessary noops after a
- // call.
- if (MBBI->isCall() || !MBBI->isPseudo()) {
- if (MBBI->isCall())
- EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
- break;
- }
- }
-
EmitSEHInstruction(MI);
return;
}
@@ -2585,6 +2559,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
emitCallInstruction(TmpInst);
emitNop(*OutStreamer, 5, Subtarget);
+ maybeEmitNopAfterCallForWindowsEH(MI);
return;
}
@@ -2605,6 +2580,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
// For Import Call Optimization to work, we need a 3-byte nop after the
// call instruction.
emitNop(*OutStreamer, 3, Subtarget);
+ maybeEmitNopAfterCallForWindowsEH(MI);
return;
}
break;
@@ -2638,6 +2614,10 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
if (MI->isCall()) {
emitCallInstruction(TmpInst);
+ // Since tail calls transfer control without leaving a stack frame, there
+ // is never a need for NOP padding after tail calls.
+ if (!IsTailJump)
+ maybeEmitNopAfterCallForWindowsEH(MI);
return;
}
@@ -2659,6 +2639,164 @@ void X86AsmPrinter::emitCallInstruction(const llvm::MCInst &MCI) {
OutStreamer->emitInstruction(MCI, getSubtargetInfo());
}
+// Determines whether a NOP is required after a CALL, so that Windows EH
+// IP2State tables have the correct information.
+//
+// On most Windows platforms (AMD64, ARM64, ARM32, IA64, but *not* x86-32),
+// exception handling works by looking up instruction pointers in lookup
+// tables. These lookup tables are stored in .xdata sections in executables.
+// One element of the lookup tables is the "IP2State" table (Instruction
+// Pointer to State).
+//
+// If a function has any instructions that require cleanup during exception
+// unwinding, then it will have an IP2State table. Each entry in the IP2State
+// table describes a range of bytes in the function's instruction stream, and
+// associates an "EH state number" with that range of instructions. A value of
+// -1 means "the null state", which does not require any code to execute.
+// A value other than -1 is an index into the State table.
+//
+// The entries in the IP2State table contain byte offsets within the instruction
+// stream of the function. The Windows ABI requires that these offsets are
+// aligned to instruction boundaries; they are not permitted to point to a byte
+// that is not the first byte of an instruction.
+//
+// Unfortunately, CALL instructions present a problem during unwinding. CALL
+// instructions push the address of the instruction after the CALL instruction,
+// so that execution can resume after the CALL. If the CALL is the last
+// instruction within an IP2State region, then the return address (on the stack)
+// points to the *next* IP2State region. This means that the unwinder will
+// use the wrong cleanup funclet during unwinding.
+//
+// To fix this problem, the Windows AMD64 ABI requires that CALL instructions
+// are never placed at the end of an IP2State region. Stated equivalently, the
+// end of a CALL instruction cannot be aligned to an IP2State boundary. If a
+// CALL instruction would occur at the end of an IP2State region, then the
+// compiler must insert a NOP instruction after the CALL. The NOP instruction
+// is placed in the same EH region as the CALL instruction, so that the return
+// address points to the NOP and the unwinder will locate the correct region.
+//
+// NOP padding is only necessary on Windows AMD64 targets. On ARM64 and ARM32,
+// instructions have a fixed size so the unwinder knows how to "back up" by
+// one instruction.
+//
+// Interaction with Import Call Optimization (ICO):
+//
+// Import Call Optimization (ICO) is a compiler + OS feature on Windows which
+// improves the performance and security of DLL imports. ICO relies on using a
+// specific CALL idiom that can be replaced by the OS DLL loader. This removes
+// a load and indirect CALL and replaces it with a single direct CALL.
+//
+// To achieve this, ICO also inserts NOPs after the CALL instruction. If the
+// end of the CALL is aligned with an EH state transition, we *also* insert
+// a single-byte NOP. **Both forms of NOPs must be preserved.** They cannot
+// be combined into a single larger NOP; nor can the second NOP be removed.
+//
+// This is necessary because, if ICO is active and the call site is modified
+// by the loader, the loader will end up overwriting the NOPs that were inserted
+// for ICO. That means that those NOPs cannot be used for the correct
+// termination of the exception handling region (the IP2State transition),
+// so we still need an additional NOP instruction. The NOPs cannot be combined
+// into a longer NOP (which is ordinarily desirable) because then ICO would
+// split one instruction, producing a malformed instruction after the ICO call.
+void X86AsmPrinter::maybeEmitNopAfterCallForWindowsEH(const MachineInstr *MI) {
+ // We only need to insert NOPs after CALLs when targeting Windows on AMD64.
+ // (Don't let the name fool you: Itanium refers to table-based exception
+ // handling, not the Itanium architecture.)
+ if (MAI->getExceptionHandlingType() != ExceptionHandling::WinEH ||
+ MAI->getWinEHEncodingType() != WinEH::EncodingType::Itanium) {
+ return;
+ }
+
+ bool HasEHPersonality = MF->getWinEHFuncInfo() != nullptr;
+
+ // Set up MBB iterator, initially positioned on the same MBB as MI.
+ MachineFunction::const_iterator MFI(MI->getParent());
+ MachineFunction::const_iterator MFE(MF->end());
+
+ // Set up instruction iterator, positioned immediately *after* MI.
+ MachineBasicBlock::const_iterator MBBI(MI);
+ MachineBasicBlock::const_iterator MBBE = MI->getParent()->end();
+ ++MBBI; // Step over MI
+
+ // This loop iterates MBBs
+ for (;;) {
+ // This loop iterates instructions
+ for (; MBBI != MBBE; ++MBBI) {
+ // Check the instruction that follows this CALL.
+ const MachineInstr &NextMI = *MBBI;
+
+ // If there is an EH_LABEL after this CALL, then there is an EH state
+ // transition after this CALL. This is exactly the situation which
+ // requires NOP padding.
+ if (NextMI.isEHLabel()) {
+ if (HasEHPersonality) {
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ return;
+ }
+ // We actually want to continue, in case there is an SEH_BeginEpilogue
+ // instruction after the EH_LABEL. In some situations, IR is produced
+ // that contains EH_LABEL pseudo-instructions, even when we are not
+ // generating IP2State tables. We still need to insert a NOP before
+ // SEH_BeginEpilogue in that case.
+ continue;
+ }
+
+ // Somewhat similarly, if the CALL is the last instruction before the
+ // SEH prologue, then we also need a NOP. This is necessary because the
+ // Windows stack unwinder will not invoke a function's exception handler
+ // if the instruction pointer is in the function prologue or epilogue.
+ //
+ // We always emit a NOP before SEH_BeginEpilogue, even if there is no
+ // personality function (unwind info) for this frame. This is the same
+ // behavior as MSVC.
+ if (NextMI.getOpcode() == X86::SEH_BeginEpilogue) {
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ return;
+ }
+
+ if (!NextMI.isPseudo() && !NextMI.isMetaInstruction()) {
+ // We found a real instruction. During the CALL, the return IP will
+ // point to this instruction. Since this instruction has the same EH
+ // state as the call itself (because there is no intervening EH_LABEL),
+ // the IP2State table will be accurate; there is no need to insert a
+ // NOP.
+ return;
+ }
+
+ // The next instruction is a pseudo-op. Ignore it and keep searching.
+ // Because these instructions do not generate any machine code, they
+ // cannot prevent the IP2State table from pointing at the wrong
+ // instruction during a CALL.
+ }
+
+ // We've reached the end of this MBB. Find the next MBB in program order.
+ // MBB order should be finalized by this point, so falling across MBBs is
+ // expected.
+ ++MFI;
+ if (MFI == MFE) {
+ // No more blocks; we've reached the end of the function. This should
+ // only happen with no-return functions, but double-check to be sure.
+ if (HasEHPersonality) {
+ // If the CALL has no successors, then it is a noreturn function.
+ // Insert an INT3 instead of a NOP. This accomplishes the same purpose,
+ // but is more clear to read. Also, analysis tools will understand
+ // that they should not continue disassembling after the CALL (unless
+ // there are other branches to that label).
+ if (MI->getParent()->succ_empty())
+ EmitAndCountInstruction(MCInstBuilder(X86::INT3));
+ else
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ }
+ return;
+ }
+
+ // Set up iterator to scan the next basic block.
+ const MachineBasicBlock *NextMBB = &*MFI;
+ MBBI = NextMBB->instr_begin();
+ MBBE = NextMBB->instr_end();
+ }
+}
+
void X86AsmPrinter::emitLabelAndRecordForImportCallOptimization(
ImportCallKind Kind) {
assert(EnableImportCallOptimization);
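To see why the padding in maybeEmitNopAfterCallForWindowsEH matters, consider a toy model of the IP2State lookup (illustration only; the real .xdata encoding differs): ranges are half-open, the unwinder looks up the pushed return address, and a CALL whose end coincides with a region boundary therefore resolves to the wrong state unless a NOP extends the region past the return address.

    #include <cstdint>
    #include <vector>

    struct IP2StateEntry { uint64_t Begin, End; int State; }; // [Begin, End)

    int LookupState(const std::vector<IP2StateEntry> &Table, uint64_t IP) {
      for (const auto &E : Table)
        if (IP >= E.Begin && IP < E.End)
          return E.State;
      return -1; // the null state
    }

    // If a region ends exactly where a CALL ends, LookupState(ReturnIP)
    // lands in the *following* region. A one-byte NOP after the CALL keeps
    // ReturnIP inside the CALL's own region, restoring the correct mapping.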