Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r-- | llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 63
-rw-r--r-- | llvm/lib/Target/X86/X86AsmPrinter.h | 1
-rw-r--r-- | llvm/lib/Target/X86/X86CallingConv.cpp | 31
-rw-r--r-- | llvm/lib/Target/X86/X86CallingConv.td | 5
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 33
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.h | 5
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 15
-rw-r--r-- | llvm/lib/Target/X86/X86InterleavedAccess.cpp | 15
-rw-r--r-- | llvm/lib/Target/X86/X86MCInstLower.cpp | 208
9 files changed, 292 insertions, 84 deletions
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 3d060c6..e213923 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -127,7 +127,6 @@ class X86AsmBackend : public MCAsmBackend { unsigned PrevInstOpcode = 0; MCBoundaryAlignFragment *PendingBA = nullptr; std::pair<MCFragment *, size_t> PrevInstPosition; - bool IsRightAfterData = false; uint8_t determinePaddingPrefix(const MCInst &Inst) const; bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const; @@ -156,10 +155,13 @@ public: AlignBranchType = X86AlignBranchKindLoc; if (X86PadMaxPrefixSize.getNumOccurrences()) TargetPrefixMax = X86PadMaxPrefixSize; + + AllowAutoPadding = + AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone; + AllowEnhancedRelaxation = + AllowAutoPadding && TargetPrefixMax != 0 && X86PadForBranchAlign; } - bool allowAutoPadding() const override; - bool allowEnhancedRelaxation() const override; void emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst, const MCSubtargetInfo &STI); void emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst); @@ -365,14 +367,6 @@ static bool hasVariantSymbol(const MCInst &MI) { return false; } -bool X86AsmBackend::allowAutoPadding() const { - return (AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone); -} - -bool X86AsmBackend::allowEnhancedRelaxation() const { - return allowAutoPadding() && TargetPrefixMax != 0 && X86PadForBranchAlign; -} - /// X86 has certain instructions which enable interrupts exactly one /// instruction *after* the instruction which stores to SS. Return true if the /// given instruction may have such an interrupt delay slot. @@ -447,7 +441,7 @@ bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const { // semantic. return false; - if (IsRightAfterData) + if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition)) // If this instruction follows any data, there is no clear // instruction boundary, inserting a nop/prefix would change semantic. return false; @@ -484,13 +478,26 @@ bool X86AsmBackend::needAlign(const MCInst &Inst) const { (AlignBranchType & X86::AlignBranchIndirect)); } +void X86_MC::emitInstruction(MCObjectStreamer &S, const MCInst &Inst, + const MCSubtargetInfo &STI) { + bool AutoPadding = S.getAllowAutoPadding(); + if (LLVM_LIKELY(!AutoPadding && !X86PadForAlign)) { + S.MCObjectStreamer::emitInstruction(Inst, STI); + return; + } + + auto &Backend = static_cast<X86AsmBackend &>(S.getAssembler().getBackend()); + Backend.emitInstructionBegin(S, Inst, STI); + S.MCObjectStreamer::emitInstruction(Inst, STI); + Backend.emitInstructionEnd(S, Inst); +} + /// Insert BoundaryAlignFragment before instructions to align branches. void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst, const MCSubtargetInfo &STI) { - // Used by canPadInst. Done here, because in emitInstructionEnd, the current - // fragment will have changed. - IsRightAfterData = - isRightAfterData(OS.getCurrentFragment(), PrevInstPosition); + bool CanPadInst = canPadInst(Inst, OS); + if (CanPadInst) + OS.getCurrentFragment()->setAllowAutoPadding(true); if (!canPadBranches(OS)) return; @@ -504,7 +511,7 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, // we call canPadInst (not cheap) twice. However, in the common case, we can // avoid unnecessary calls to that, as this is otherwise only used for // relaxable fragments. 
- if (!canPadInst(Inst, OS)) + if (!CanPadInst) return; if (PendingBA && PendingBA->getNext() == OS.getCurrentFragment()) { @@ -542,11 +549,8 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, /// Set the last fragment to be aligned for the BoundaryAlignFragment. void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) { - MCFragment *CF = OS.getCurrentFragment(); - if (CF->getKind() == MCFragment::FT_Relaxable) - CF->setAllowAutoPadding(canPadInst(Inst, OS)); - // Update PrevInstOpcode here, canPadInst() reads that. + MCFragment *CF = OS.getCurrentFragment(); PrevInstOpcode = Inst.getOpcode(); PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF)); @@ -567,11 +571,10 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, // DataFragment, so that we can get the size of instructions later in // MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty // DataFragment. - OS.insert(OS.getContext().allocFragment<MCFragment>()); + OS.newFragment(); // Update the maximum alignment on the current section if necessary. - MCSection *Sec = OS.getCurrentSectionOnly(); - Sec->ensureMinAlignment(AlignBoundary); + CF->getParent()->ensureMinAlignment(AlignBoundary); } std::optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const { @@ -923,13 +926,11 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const { continue; } - const uint64_t OrigSize = Asm.computeFragmentSize(F); - // To keep the effects local, prefer to relax instructions closest to // the align directive. This is purely about human understandability // of the resulting code. If we later find a reason to expand // particular instructions over others, we can adjust. - unsigned RemainingSize = OrigSize; + unsigned RemainingSize = Asm.computeFragmentSize(F) - F.getFixedSize(); while (!Relaxable.empty() && RemainingSize != 0) { auto &RF = *Relaxable.pop_back_val(); // Give the backend a chance to play any tricks it wishes to increase @@ -1542,14 +1543,6 @@ public: }; } // end anonymous namespace -void X86_MC::emitInstruction(MCObjectStreamer &S, const MCInst &Inst, - const MCSubtargetInfo &STI) { - auto &Backend = static_cast<X86AsmBackend &>(S.getAssembler().getBackend()); - Backend.emitInstructionBegin(S, Inst, STI); - S.MCObjectStreamer::emitInstruction(Inst, STI); - Backend.emitInstructionEnd(S, Inst); -} - void X86ELFStreamer::emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) { X86_MC::emitInstruction(*this, Inst, STI); diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h index efb951b..e02b556 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/llvm/lib/Target/X86/X86AsmPrinter.h @@ -151,6 +151,7 @@ private: MCSymbol *LazyPointer) override; void emitCallInstruction(const llvm::MCInst &MCI); + void maybeEmitNopAfterCallForWindowsEH(const MachineInstr *MI); // Emits a label to mark the next instruction as being relevant to Import Call // Optimization. diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp index 0b4c63f..5d5a705 100644 --- a/llvm/lib/Target/X86/X86CallingConv.cpp +++ b/llvm/lib/Target/X86/X86CallingConv.cpp @@ -374,5 +374,36 @@ static bool CC_X86_64_I128(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; } +/// Special handling for i128 and fp128: on x86-32, i128 and fp128 get legalized +/// as four i32s, but fp128 must be passed on the stack with 16-byte alignment. 
+/// Technically only fp128 has a specified ABI, but it makes sense to handle +/// i128 the same until we hear differently. +static bool CC_X86_32_I128_FP128(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + assert(ValVT == MVT::i32 && "Should have i32 parts"); + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + + if (!ArgFlags.isInConsecutiveRegsLast()) + return true; + + assert(PendingMembers.size() == 4 && "Should have four parts"); + + int64_t Offset = State.AllocateStack(16, Align(16)); + PendingMembers[0].convertToMem(Offset); + PendingMembers[1].convertToMem(Offset + 4); + PendingMembers[2].convertToMem(Offset + 8); + PendingMembers[3].convertToMem(Offset + 12); + + State.addLoc(PendingMembers[0]); + State.addLoc(PendingMembers[1]); + State.addLoc(PendingMembers[2]); + State.addLoc(PendingMembers[3]); + PendingMembers.clear(); + return true; +} + // Provides entry points of CC_X86 and RetCC_X86. #include "X86GenCallingConv.inc" diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td index 823e0caa..f020e0b 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -859,6 +859,11 @@ def CC_X86_32_C : CallingConv<[ // The 'nest' parameter, if any, is passed in ECX. CCIfNest<CCAssignToReg<[ECX]>>, + // i128 and fp128 need to be passed on the stack with a higher alignment than + // their legal types. Handle this with a custom function. + CCIfType<[i32], + CCIfConsecutiveRegs<CCCustom<"CC_X86_32_I128_FP128">>>, + // On swifttailcc pass swiftself in ECX. CCIfCC<"CallingConv::SwiftTail", CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[ECX]>>>>, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d91ea1ea..11ab8dc 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1323,11 +1323,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal); } - if (Subtarget.hasGFNI()) { + if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) { setOperationAction(ISD::BITREVERSE, MVT::i8, Custom); setOperationAction(ISD::BITREVERSE, MVT::i16, Custom); setOperationAction(ISD::BITREVERSE, MVT::i32, Custom); setOperationAction(ISD::BITREVERSE, MVT::i64, Custom); + + for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { + setOperationAction(ISD::BITREVERSE, VT, Custom); + } } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { @@ -4997,9 +5001,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, EVT VT = Op.getValueType(); unsigned SizeInBits = VT.getSizeInBits(); - assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!"); unsigned NumElts = SizeInBits / EltSizeInBits; + // Can't split constant. + if ((SizeInBits % EltSizeInBits) != 0) + return false; + // Bitcast a source array of element bits to the target size. 
auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) { unsigned NumSrcElts = UndefSrcElts.getBitWidth(); @@ -32694,7 +32701,8 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, if (Subtarget.hasXOP() && !VT.is512BitVector()) return LowerBITREVERSE_XOP(Op, DAG); - assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE"); + assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) && + "SSSE3 or GFNI required for BITREVERSE"); SDValue In = Op.getOperand(0); SDLoc DL(Op); @@ -45054,6 +45062,10 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( unsigned NumElts = DemandedElts.getBitWidth(); switch (Op.getOpcode()) { + case X86ISD::GlobalBaseReg: + case X86ISD::Wrapper: + case X86ISD::WrapperRIP: + return true; case X86ISD::BLENDI: case X86ISD::PSHUFD: case X86ISD::UNPCKL: @@ -45093,27 +45105,34 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const { switch (Op.getOpcode()) { + // SSE vector insert/extracts use modulo indices. + case X86ISD::PINSRB: + case X86ISD::PINSRW: + case X86ISD::PEXTRB: + case X86ISD::PEXTRW: + return false; // SSE vector multiplies are either inbounds or saturate. case X86ISD::VPMADDUBSW: case X86ISD::VPMADDWD: + return false; // SSE vector shifts handle out of bounds shift amounts. case X86ISD::VSHLI: case X86ISD::VSRLI: case X86ISD::VSRAI: return false; - // SSE blends. + // SSE blends. case X86ISD::BLENDI: case X86ISD::BLENDV: return false; - // SSE target shuffles. + // SSE target shuffles. case X86ISD::PSHUFD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPI: case X86ISD::VPERMV3: return false; - // SSE comparisons handle all icmp/fcmp cases. - // TODO: Add CMPM/MM with test coverage. + // SSE comparisons handle all icmp/fcmp cases. + // TODO: Add CMPM/MM with test coverage. case X86ISD::CMPP: case X86ISD::PCMPEQ: case X86ISD::PCMPGT: diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 6bcb7a3..547b221 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1661,14 +1661,15 @@ namespace llvm { /// Lower interleaved load(s) into target specific /// instructions/intrinsics. - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const override; /// Lower interleaved store(s) into target specific /// instructions/intrinsics. - bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(Instruction *Store, Value *Mask, + ShuffleVectorInst *SVI, unsigned Factor) const override; SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 9ad3553..b4639ac 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -237,9 +237,18 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const { - // i128 split into i64 needs to be allocated to two consecutive registers, - // or spilled to the stack as a whole. 
- return Ty->isIntegerTy(128); + // On x86-64 i128 is split into two i64s and needs to be allocated to two + // consecutive registers, or spilled to the stack as a whole. On x86-32 i128 + // is split to four i32s and never actually passed in registers, but we use + // the consecutive register mark to match it in TableGen. + if (Ty->isIntegerTy(128)) + return true; + + // On x86-32, fp128 acts the same as i128. + if (Subtarget.is32Bit() && Ty->isFP128Ty()) + return true; + + return false; } /// Helper for getByValTypeAlignment to determine diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 1eb47e3..636b072 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -801,7 +801,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // number of shuffles and ISA. // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. bool X86TargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -809,6 +809,11 @@ bool X86TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast<LoadInst>(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load"); + // Create an interleaved access group. IRBuilder<> Builder(LI); X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget, @@ -817,7 +822,8 @@ bool X86TargetLowering::lowerInterleavedLoad( return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); } -bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, +bool X86TargetLowering::lowerInterleavedStore(Instruction *Store, + Value *LaneMask, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && @@ -827,6 +833,11 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, 0 && "Invalid interleaved store"); + auto *SI = dyn_cast<StoreInst>(Store); + if (!SI) + return false; + assert(!LaneMask && "Unexpected mask on store"); + // Holds the indices of SVI that correspond to the starting index of each // interleaved shuffle. auto Mask = SVI->getShuffleMask(); diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 45d596b..481a9be 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -32,6 +32,7 @@ #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Mangler.h" @@ -833,6 +834,7 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, CallInst.setOpcode(CallOpcode); CallInst.addOperand(CallTargetMCOp); OutStreamer->emitInstruction(CallInst, getSubtargetInfo()); + maybeEmitNopAfterCallForWindowsEH(&MI); } // Record our statepoint node in the same section used by STACKMAP @@ -1430,21 +1432,6 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, OutStreamer->emitLabel(FallthroughLabel); } -// Returns instruction preceding MBBI in MachineFunction. 
-// If MBBI is the first instruction of the first basic block, returns null. -static MachineBasicBlock::const_iterator -PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) { - const MachineBasicBlock *MBB = MBBI->getParent(); - while (MBBI == MBB->begin()) { - if (MBB == &MBB->getParent()->front()) - return MachineBasicBlock::const_iterator(); - MBB = MBB->getPrevNode(); - MBBI = MBB->end(); - } - --MBBI; - return MBBI; -} - static unsigned getSrcIdx(const MachineInstr* MI, unsigned SrcIdx) { if (X86II::isKMasked(MI->getDesc().TSFlags)) { // Skip mask operand. @@ -2271,6 +2258,9 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { OutStreamer->AddComment("EVEX TO EVEX Compression ", false); } + // We use this to suppress NOP padding for Windows EH. + bool IsTailJump = false; + switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: llvm_unreachable("Should be handled target independently"); @@ -2325,6 +2315,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { // Lower this as normal, but add a comment. OutStreamer->AddComment("TAILCALL"); + IsTailJump = true; break; case X86::TAILJMPr: @@ -2340,6 +2331,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { // Lower these as normal, but add some comments. OutStreamer->AddComment("TAILCALL"); + IsTailJump = true; break; case X86::TAILJMPm64_REX: @@ -2349,6 +2341,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { } OutStreamer->AddComment("TAILCALL"); + IsTailJump = true; break; case X86::TAILJMPr64_REX: { @@ -2361,6 +2354,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { } OutStreamer->AddComment("TAILCALL"); + IsTailJump = true; break; } @@ -2537,26 +2531,6 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { case X86::SEH_BeginEpilogue: { assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); - // Windows unwinder will not invoke function's exception handler if IP is - // either in prologue or in epilogue. This behavior causes a problem when a - // call immediately precedes an epilogue, because the return address points - // into the epilogue. To cope with that, we insert a 'nop' if it ends up - // immediately after a CALL in the final emitted code. - MachineBasicBlock::const_iterator MBBI(MI); - // Check if preceded by a call and emit nop if so. - for (MBBI = PrevCrossBBInst(MBBI); - MBBI != MachineBasicBlock::const_iterator(); - MBBI = PrevCrossBBInst(MBBI)) { - // Pseudo instructions that aren't a call are assumed to not emit any - // code. If they do, we worst case generate unnecessary noops after a - // call. - if (MBBI->isCall() || !MBBI->isPseudo()) { - if (MBBI->isCall()) - EmitAndCountInstruction(MCInstBuilder(X86::NOOP)); - break; - } - } - EmitSEHInstruction(MI); return; } @@ -2585,6 +2559,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX)); emitCallInstruction(TmpInst); emitNop(*OutStreamer, 5, Subtarget); + maybeEmitNopAfterCallForWindowsEH(MI); return; } @@ -2605,6 +2580,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { // For Import Call Optimization to work, we need a 3-byte nop after the // call instruction. 
emitNop(*OutStreamer, 3, Subtarget); + maybeEmitNopAfterCallForWindowsEH(MI); return; } break; @@ -2638,6 +2614,10 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { if (MI->isCall()) { emitCallInstruction(TmpInst); + // Since tail calls transfer control without leaving a stack frame, there is + // never a need for NOP padding tail calls. + if (!IsTailJump) + maybeEmitNopAfterCallForWindowsEH(MI); return; } @@ -2659,6 +2639,164 @@ void X86AsmPrinter::emitCallInstruction(const llvm::MCInst &MCI) { OutStreamer->emitInstruction(MCI, getSubtargetInfo()); } +// Determines whether a NOP is required after a CALL, so that Windows EH +// IP2State tables have the correct information. +// +// On most Windows platforms (AMD64, ARM64, ARM32, IA64, but *not* x86-32), +// exception handling works by looking up instruction pointers in lookup +// tables. These lookup tables are stored in .xdata sections in executables. +// One element of the lookup tables are the "IP2State" tables (Instruction +// Pointer to State). +// +// If a function has any instructions that require cleanup during exception +// unwinding, then it will have an IP2State table. Each entry in the IP2State +// table describes a range of bytes in the function's instruction stream, and +// associates an "EH state number" with that range of instructions. A value of +// -1 means "the null state", which does not require any code to execute. +// A value other than -1 is an index into the State table. +// +// The entries in the IP2State table contain byte offsets within the instruction +// stream of the function. The Windows ABI requires that these offsets are +// aligned to instruction boundaries; they are not permitted to point to a byte +// that is not the first byte of an instruction. +// +// Unfortunately, CALL instructions present a problem during unwinding. CALL +// instructions push the address of the instruction after the CALL instruction, +// so that execution can resume after the CALL. If the CALL is the last +// instruction within an IP2State region, then the return address (on the stack) +// points to the *next* IP2State region. This means that the unwinder will +// use the wrong cleanup funclet during unwinding. +// +// To fix this problem, the Windows AMD64 ABI requires that CALL instructions +// are never placed at the end of an IP2State region. Stated equivalently, the +// end of a CALL instruction cannot be aligned to an IP2State boundary. If a +// CALL instruction would occur at the end of an IP2State region, then the +// compiler must insert a NOP instruction after the CALL. The NOP instruction +// is placed in the same EH region as the CALL instruction, so that the return +// address points to the NOP and the unwinder will locate the correct region. +// +// NOP padding is only necessary on Windows AMD64 targets. On ARM64 and ARM32, +// instructions have a fixed size so the unwinder knows how to "back up" by +// one instruction. +// +// Interaction with Import Call Optimization (ICO): +// +// Import Call Optimization (ICO) is a compiler + OS feature on Windows which +// improves the performance and security of DLL imports. ICO relies on using a +// specific CALL idiom that can be replaced by the OS DLL loader. This removes +// a load and indirect CALL and replaces it with a single direct CALL. +// +// To achieve this, ICO also inserts NOPs after the CALL instruction. If the +// end of the CALL is aligned with an EH state transition, we *also* insert +// a single-byte NOP. 
**Both forms of NOPs must be preserved.** They cannot +// be combined into a single larger NOP; nor can the second NOP be removed. +// +// This is necessary because, if ICO is active and the call site is modified +// by the loader, the loader will end up overwriting the NOPs that were inserted +// for ICO. That means that those NOPs cannot be used for the correct +// termination of the exception handling region (the IP2State transition), +// so we still need an additional NOP instruction. The NOPs cannot be combined +// into a longer NOP (which is ordinarily desirable) because then ICO would +// split one instruction, producing a malformed instruction after the ICO call. +void X86AsmPrinter::maybeEmitNopAfterCallForWindowsEH(const MachineInstr *MI) { + // We only need to insert NOPs after CALLs when targeting Windows on AMD64. + // (Don't let the name fool you: Itanium refers to table-based exception + // handling, not the Itanium architecture.) + if (MAI->getExceptionHandlingType() != ExceptionHandling::WinEH || + MAI->getWinEHEncodingType() != WinEH::EncodingType::Itanium) { + return; + } + + bool HasEHPersonality = MF->getWinEHFuncInfo() != nullptr; + + // Set up MBB iterator, initially positioned on the same MBB as MI. + MachineFunction::const_iterator MFI(MI->getParent()); + MachineFunction::const_iterator MFE(MF->end()); + + // Set up instruction iterator, positioned immediately *after* MI. + MachineBasicBlock::const_iterator MBBI(MI); + MachineBasicBlock::const_iterator MBBE = MI->getParent()->end(); + ++MBBI; // Step over MI + + // This loop iterates MBBs + for (;;) { + // This loop iterates instructions + for (; MBBI != MBBE; ++MBBI) { + // Check the instruction that follows this CALL. + const MachineInstr &NextMI = *MBBI; + + // If there is an EH_LABEL after this CALL, then there is an EH state + // transition after this CALL. This is exactly the situation which + // requires NOP padding. + if (NextMI.isEHLabel()) { + if (HasEHPersonality) { + EmitAndCountInstruction(MCInstBuilder(X86::NOOP)); + return; + } + // We actually want to continue, in case there is an SEH_BeginEpilogue + // instruction after the EH_LABEL. In some situations, IR is produced + // that contains EH_LABEL pseudo-instructions, even when we are not + // generating IP2State tables. We still need to insert a NOP before + // SEH_BeginEpilogue in that case. + continue; + } + + // Somewhat similarly, if the CALL is the last instruction before the + // SEH prologue, then we also need a NOP. This is necessary because the + // Windows stack unwinder will not invoke a function's exception handler + // if the instruction pointer is in the function prologue or epilogue. + // + // We always emit a NOP before SEH_BeginEpilogue, even if there is no + // personality function (unwind info) for this frame. This is the same + // behavior as MSVC. + if (NextMI.getOpcode() == X86::SEH_BeginEpilogue) { + EmitAndCountInstruction(MCInstBuilder(X86::NOOP)); + return; + } + + if (!NextMI.isPseudo() && !NextMI.isMetaInstruction()) { + // We found a real instruction. During the CALL, the return IP will + // point to this instruction. Since this instruction has the same EH + // state as the call itself (because there is no intervening EH_LABEL), + // the IP2State table will be accurate; there is no need to insert a + // NOP. + return; + } + + // The next instruction is a pseudo-op. Ignore it and keep searching. 
+ // Because these instructions do not generate any machine code, they + // cannot prevent the IP2State table from pointing at the wrong + // instruction during a CALL. + } + + // We've reached the end of this MBB. Find the next MBB in program order. + // MBB order should be finalized by this point, so falling across MBBs is + // expected. + ++MFI; + if (MFI == MFE) { + // No more blocks; we've reached the end of the function. This should + // only happen with no-return functions, but double-check to be sure. + if (HasEHPersonality) { + // If the CALL has no successors, then it is a noreturn function. + // Insert an INT3 instead of a NOP. This accomplishes the same purpose, + // but is more clear to read. Also, analysis tools will understand + // that they should not continue disassembling after the CALL (unless + // there are other branches to that label). + if (MI->getParent()->succ_empty()) + EmitAndCountInstruction(MCInstBuilder(X86::INT3)); + else + EmitAndCountInstruction(MCInstBuilder(X86::NOOP)); + } + return; + } + + // Set up iterator to scan the next basic block. + const MachineBasicBlock *NextMBB = &*MFI; + MBBI = NextMBB->instr_begin(); + MBBE = NextMBB->instr_end(); + } +} + void X86AsmPrinter::emitLabelAndRecordForImportCallOptimization( ImportCallKind Kind) { assert(EnableImportCallOptimization); |
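
To make the new CC_X86_32_I128_FP128 handling concrete, here is a minimal sketch. It is not part of the patch: the function names are hypothetical and it assumes a 32-bit x86 target whose compiler provides __float128. It illustrates that an fp128 argument, although legalized into four i32 parts, ends up in a single 16-byte-aligned stack slot rather than in registers.

// Hypothetical illustration of the new x86-32 fp128 argument passing.
// Assumes an i386 target and a compiler that provides __float128.
__float128 callee(__float128 x);

__float128 caller(__float128 x) {
  // The four i32 parts of 'x' are marked as needing consecutive registers so
  // the custom handler sees them together; it then allocates one
  // 16-byte-aligned stack slot and places the parts at offsets 0, 4, 8, 12.
  return callee(x);
}

Per the comment in X86CallingConv.cpp, i128 on x86-32 is routed through the same handler, since only fp128 has a specified ABI and i128 is treated identically until that changes.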
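
The GFNI hunk in X86ISelLowering.cpp additionally enables custom BITREVERSE lowering for the 128-bit vector types. As a rough sketch of the idiom this class of lowering maps to (assuming GFNI intrinsics from immintrin.h and a CPU with GFNI; the 0x8040201008040201 constant is the commonly used per-byte bit-reversal matrix, not something taken from this patch), reversing the bits inside each byte takes a single GF2P8AFFINEQB:

#include <immintrin.h>

// Sketch only: reverse the bits inside each byte of a 128-bit vector.
// 0x8040201008040201 is the 8x8 bit matrix that sends bit i to bit 7-i;
// the immediate 0 is the affine constant term. For element types wider than
// i8, a full bit reversal also has to reverse the byte order of each element.
static inline __m128i reverse_bits_per_byte(__m128i v) {
  const __m128i matrix = _mm_set1_epi64x(0x8040201008040201ULL);
  return _mm_gf2p8affine_epi64_epi8(v, matrix, 0);
}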
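
Finally, the long comment above maybeEmitNopAfterCallForWindowsEH describes when a CALL needs trailing padding. A small hypothetical C++ example (assuming an x86_64-pc-windows-msvc target with C++ exception handling enabled) shows the shape of code that triggers it:

// Hypothetical example of the situation the NOP padding handles.
void may_throw();
struct Guard { ~Guard(); };   // destructor runs as a cleanup during unwinding

void f() {
  Guard g;
  may_throw();   // If this CALL is the last instruction of its IP2State range
                 // (an EH_LABEL follows it), the pushed return address would
                 // fall into the next range and the unwinder would choose the
                 // wrong state, so the backend emits a NOP after the CALL.
}

Tail jumps are exempt because they transfer control without pushing a return address, which is why the patch threads IsTailJump through emitInstruction to suppress the padding for them.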