Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r--  llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp       |  28
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp   | 101
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp    |   1
-rw-r--r--  llvm/lib/Target/X86/X86AsmPrinter.h                  |   1
-rw-r--r--  llvm/lib/Target/X86/X86CallingConv.cpp               |  31
-rw-r--r--  llvm/lib/Target/X86/X86CallingConv.td                |   5
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp              |  33
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h                |   5
-rw-r--r--  llvm/lib/Target/X86/X86ISelLoweringCall.cpp          |  15
-rw-r--r--  llvm/lib/Target/X86/X86InterleavedAccess.cpp         |  15
-rw-r--r--  llvm/lib/Target/X86/X86MCInstLower.cpp               | 208
11 files changed, 322 insertions, 121 deletions
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index b642c1c..8213e51 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1042,8 +1042,8 @@ private:
       }
       PrevState = CurrState;
     }
-    void onRParen() {
-      PrevState = State;
+    bool onRParen(StringRef &ErrMsg) {
+      IntelExprState CurrState = State;
       switch (State) {
       default:
         State = IES_ERROR;
@@ -1054,9 +1054,27 @@ private:
       case IES_RBRAC:
       case IES_RPAREN:
         State = IES_RPAREN;
+        // In the case of a multiply, onRegister has already set IndexReg
+        // directly, with appropriate scale.
+        // Otherwise if we just saw a register it has only been stored in
+        // TmpReg, so we need to store it into the state machine.
+        if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+          // If we already have a BaseReg, then assume this is the IndexReg with
+          // no explicit scale.
+          if (!BaseReg) {
+            BaseReg = TmpReg;
+          } else {
+            if (IndexReg)
+              return regsUseUpError(ErrMsg);
+            IndexReg = TmpReg;
+            Scale = 0;
+          }
+        }
         IC.pushOperator(IC_RPAREN);
         break;
       }
+      PrevState = CurrState;
+      return false;
     }
     bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID,
                   const InlineAsmIdentifierInfo &IDInfo,
@@ -2172,7 +2190,11 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
       }
       break;
    case AsmToken::LParen:  SM.onLParen(); break;
-   case AsmToken::RParen:  SM.onRParen(); break;
+   case AsmToken::RParen:
+     if (SM.onRParen(ErrMsg)) {
+       return Error(Tok.getLoc(), ErrMsg);
+     }
+     break;
    }
    if (SM.hadError())
      return Error(Tok.getLoc(), "unknown token in expression");
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 3d060c6..7f9d474 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -127,7 +127,6 @@ class X86AsmBackend : public MCAsmBackend {
   unsigned PrevInstOpcode = 0;
   MCBoundaryAlignFragment *PendingBA = nullptr;
   std::pair<MCFragment *, size_t> PrevInstPosition;
-  bool IsRightAfterData = false;

   uint8_t determinePaddingPrefix(const MCInst &Inst) const;
   bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const;
@@ -156,10 +155,13 @@ public:
       AlignBranchType = X86AlignBranchKindLoc;
     if (X86PadMaxPrefixSize.getNumOccurrences())
       TargetPrefixMax = X86PadMaxPrefixSize;
+
+    AllowAutoPadding =
+        AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone;
+    AllowEnhancedRelaxation =
+        AllowAutoPadding && TargetPrefixMax != 0 && X86PadForBranchAlign;
   }

-  bool allowAutoPadding() const override;
-  bool allowEnhancedRelaxation() const override;
   void emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst,
                             const MCSubtargetInfo &STI);
   void emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst);
@@ -365,14 +367,6 @@ static bool hasVariantSymbol(const MCInst &MI) {
   return false;
 }

-bool X86AsmBackend::allowAutoPadding() const {
-  return (AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone);
-}
-
-bool X86AsmBackend::allowEnhancedRelaxation() const {
-  return allowAutoPadding() && TargetPrefixMax != 0 && X86PadForBranchAlign;
-}
-
 /// X86 has certain instructions which enable interrupts exactly one
 /// instruction *after* the instruction which stores to SS. Return true if the
 /// given instruction may have such an interrupt delay slot.
@@ -394,36 +388,6 @@ static bool mayHaveInterruptDelaySlot(unsigned InstOpcode) {
   return false;
 }

-/// Check if the instruction to be emitted is right after any data.
-static bool
-isRightAfterData(MCFragment *CurrentFragment,
-                 const std::pair<MCFragment *, size_t> &PrevInstPosition) {
-  MCFragment *F = CurrentFragment;
-  // Since data is always emitted into a DataFragment, our check strategy is
-  // simple here.
-  //   - If the fragment is a DataFragment
-  //     - If it's empty (section start or data after align), return false.
-  //     - If it's not the fragment where the previous instruction is,
-  //       returns true.
-  //     - If it's the fragment holding the previous instruction but its
-  //       size changed since the previous instruction was emitted into
-  //       it, returns true.
-  //     - Otherwise returns false.
-  //   - If the fragment is not a DataFragment, returns false.
-  if (F->getKind() == MCFragment::FT_Data)
-    return F->getFixedSize() && (F != PrevInstPosition.first ||
-                                 F->getFixedSize() != PrevInstPosition.second);
-
-  return false;
-}
-
-/// \returns the fragment size if it has instructions, otherwise returns 0.
-static size_t getSizeForInstFragment(const MCFragment *F) {
-  if (!F || !F->hasInstructions())
-    return 0;
-  return F->getSize();
-}
-
 /// Return true if we can insert NOP or prefixes automatically before the
 /// the instruction to be emitted.
 bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const {
@@ -447,9 +411,11 @@ bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const {
     // semantic.
     return false;

-  if (IsRightAfterData)
-    // If this instruction follows any data, there is no clear
-    // instruction boundary, inserting a nop/prefix would change semantic.
+  // If this instruction follows any data, there is no clear instruction
+  // boundary, inserting a nop/prefix would change semantic.
+  auto Offset = OS.getCurFragSize();
+  if (Offset && (OS.getCurrentFragment() != PrevInstPosition.first ||
+                 Offset != PrevInstPosition.second))
     return false;

   return true;
@@ -484,13 +450,26 @@ bool X86AsmBackend::needAlign(const MCInst &Inst) const {
          (AlignBranchType & X86::AlignBranchIndirect));
 }

+void X86_MC::emitInstruction(MCObjectStreamer &S, const MCInst &Inst,
+                             const MCSubtargetInfo &STI) {
+  bool AutoPadding = S.getAllowAutoPadding();
+  if (LLVM_LIKELY(!AutoPadding && !X86PadForAlign)) {
+    S.MCObjectStreamer::emitInstruction(Inst, STI);
+    return;
+  }
+
+  auto &Backend = static_cast<X86AsmBackend &>(S.getAssembler().getBackend());
+  Backend.emitInstructionBegin(S, Inst, STI);
+  S.MCObjectStreamer::emitInstruction(Inst, STI);
+  Backend.emitInstructionEnd(S, Inst);
+}
+
 /// Insert BoundaryAlignFragment before instructions to align branches.
 void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
                                          const MCInst &Inst,
                                          const MCSubtargetInfo &STI) {
-  // Used by canPadInst. Done here, because in emitInstructionEnd, the current
-  // fragment will have changed.
-  IsRightAfterData =
-      isRightAfterData(OS.getCurrentFragment(), PrevInstPosition);
+  bool CanPadInst = canPadInst(Inst, OS);
+  if (CanPadInst)
+    OS.getCurrentFragment()->setAllowAutoPadding(true);

   if (!canPadBranches(OS))
     return;
@@ -504,7 +483,7 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
   // we call canPadInst (not cheap) twice. However, in the common case, we can
   // avoid unnecessary calls to that, as this is otherwise only used for
   // relaxable fragments.
-  if (!canPadInst(Inst, OS))
+  if (!CanPadInst)
     return;

   if (PendingBA && PendingBA->getNext() == OS.getCurrentFragment()) {
@@ -542,13 +521,10 @@
 /// Set the last fragment to be aligned for the BoundaryAlignFragment.
 void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
                                        const MCInst &Inst) {
-  MCFragment *CF = OS.getCurrentFragment();
-  if (CF->getKind() == MCFragment::FT_Relaxable)
-    CF->setAllowAutoPadding(canPadInst(Inst, OS));
-
-  // Update PrevInstOpcode here, canPadInst() reads that.
+  MCFragment *CF = OS.getCurrentFragment();
   PrevInstOpcode = Inst.getOpcode();
-  PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF));
+  PrevInstPosition = std::make_pair(CF, OS.getCurFragSize());

   if (!canPadBranches(OS))
     return;
@@ -567,11 +543,10 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
   // DataFragment, so that we can get the size of instructions later in
   // MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty
   // DataFragment.
-  OS.insert(OS.getContext().allocFragment<MCFragment>());
+  OS.newFragment();

   // Update the maximum alignment on the current section if necessary.
-  MCSection *Sec = OS.getCurrentSectionOnly();
-  Sec->ensureMinAlignment(AlignBoundary);
+  CF->getParent()->ensureMinAlignment(AlignBoundary);
 }

 std::optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const {
@@ -923,13 +898,11 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
       continue;
     }

-    const uint64_t OrigSize = Asm.computeFragmentSize(F);
-
     // To keep the effects local, prefer to relax instructions closest to
     // the align directive. This is purely about human understandability
     // of the resulting code. If we later find a reason to expand
     // particular instructions over others, we can adjust.
-    unsigned RemainingSize = OrigSize;
+    unsigned RemainingSize = Asm.computeFragmentSize(F) - F.getFixedSize();
     while (!Relaxable.empty() && RemainingSize != 0) {
       auto &RF = *Relaxable.pop_back_val();
       // Give the backend a chance to play any tricks it wishes to increase
@@ -1542,14 +1515,6 @@ public:
 };
 } // end anonymous namespace

-void X86_MC::emitInstruction(MCObjectStreamer &S, const MCInst &Inst,
-                             const MCSubtargetInfo &STI) {
-  auto &Backend = static_cast<X86AsmBackend &>(S.getAssembler().getBackend());
-  Backend.emitInstructionBegin(S, Inst, STI);
-  S.MCObjectStreamer::emitInstruction(Inst, STI);
-  Backend.emitInstructionEnd(S, Inst);
-}
-
 void X86ELFStreamer::emitInstruction(const MCInst &Inst,
                                      const MCSubtargetInfo &STI) {
   X86_MC::emitInstruction(*this, Inst, STI);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index f5eeb3b..d691538 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//

 #include "X86MCAsmInfo.h"
-#include "MCTargetDesc/X86MCExpr.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/CommandLine.h"
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h
index efb951b..e02b556 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -151,6 +151,7 @@ private:
                              MCSymbol *LazyPointer) override;

   void emitCallInstruction(const llvm::MCInst &MCI);
+  void maybeEmitNopAfterCallForWindowsEH(const MachineInstr *MI);

   // Emits a label to mark the next instruction as being relevant to Import Call
   // Optimization.
diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp
index 0b4c63f..5d5a705 100644
--- a/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -374,5 +374,36 @@ static bool CC_X86_64_I128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
   return true;
 }

+/// Special handling for i128 and fp128: on x86-32, i128 and fp128 get legalized
+/// as four i32s, but fp128 must be passed on the stack with 16-byte alignment.
+/// Technically only fp128 has a specified ABI, but it makes sense to handle
+/// i128 the same until we hear differently.
+static bool CC_X86_32_I128_FP128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                 CCValAssign::LocInfo &LocInfo,
+                                 ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  assert(ValVT == MVT::i32 && "Should have i32 parts");
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+  PendingMembers.push_back(
+      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
+
+  assert(PendingMembers.size() == 4 && "Should have four parts");
+
+  int64_t Offset = State.AllocateStack(16, Align(16));
+  PendingMembers[0].convertToMem(Offset);
+  PendingMembers[1].convertToMem(Offset + 4);
+  PendingMembers[2].convertToMem(Offset + 8);
+  PendingMembers[3].convertToMem(Offset + 12);
+
+  State.addLoc(PendingMembers[0]);
+  State.addLoc(PendingMembers[1]);
+  State.addLoc(PendingMembers[2]);
+  State.addLoc(PendingMembers[3]);
+  PendingMembers.clear();
+  return true;
+}
+
 // Provides entry points of CC_X86 and RetCC_X86.
#include "X86GenCallingConv.inc" diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td index 823e0caa..f020e0b 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -859,6 +859,11 @@ def CC_X86_32_C : CallingConv<[ // The 'nest' parameter, if any, is passed in ECX. CCIfNest<CCAssignToReg<[ECX]>>, + // i128 and fp128 need to be passed on the stack with a higher alignment than + // their legal types. Handle this with a custom function. + CCIfType<[i32], + CCIfConsecutiveRegs<CCCustom<"CC_X86_32_I128_FP128">>>, + // On swifttailcc pass swiftself in ECX. CCIfCC<"CallingConv::SwiftTail", CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[ECX]>>>>, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d91ea1ea..11ab8dc 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1323,11 +1323,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal); } - if (Subtarget.hasGFNI()) { + if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) { setOperationAction(ISD::BITREVERSE, MVT::i8, Custom); setOperationAction(ISD::BITREVERSE, MVT::i16, Custom); setOperationAction(ISD::BITREVERSE, MVT::i32, Custom); setOperationAction(ISD::BITREVERSE, MVT::i64, Custom); + + for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { + setOperationAction(ISD::BITREVERSE, VT, Custom); + } } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { @@ -4997,9 +5001,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, EVT VT = Op.getValueType(); unsigned SizeInBits = VT.getSizeInBits(); - assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!"); unsigned NumElts = SizeInBits / EltSizeInBits; + // Can't split constant. + if ((SizeInBits % EltSizeInBits) != 0) + return false; + // Bitcast a source array of element bits to the target size. auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) { unsigned NumSrcElts = UndefSrcElts.getBitWidth(); @@ -32694,7 +32701,8 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, if (Subtarget.hasXOP() && !VT.is512BitVector()) return LowerBITREVERSE_XOP(Op, DAG); - assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE"); + assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) && + "SSSE3 or GFNI required for BITREVERSE"); SDValue In = Op.getOperand(0); SDLoc DL(Op); @@ -45054,6 +45062,10 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( unsigned NumElts = DemandedElts.getBitWidth(); switch (Op.getOpcode()) { + case X86ISD::GlobalBaseReg: + case X86ISD::Wrapper: + case X86ISD::WrapperRIP: + return true; case X86ISD::BLENDI: case X86ISD::PSHUFD: case X86ISD::UNPCKL: @@ -45093,27 +45105,34 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const { switch (Op.getOpcode()) { + // SSE vector insert/extracts use modulo indices. + case X86ISD::PINSRB: + case X86ISD::PINSRW: + case X86ISD::PEXTRB: + case X86ISD::PEXTRW: + return false; // SSE vector multiplies are either inbounds or saturate. case X86ISD::VPMADDUBSW: case X86ISD::VPMADDWD: + return false; // SSE vector shifts handle out of bounds shift amounts. case X86ISD::VSHLI: case X86ISD::VSRLI: case X86ISD::VSRAI: return false; - // SSE blends. + // SSE blends. 
   case X86ISD::BLENDI:
   case X86ISD::BLENDV:
     return false;
-  // SSE target shuffles.
+  // SSE target shuffles.
   case X86ISD::PSHUFD:
   case X86ISD::UNPCKL:
   case X86ISD::UNPCKH:
   case X86ISD::VPERMILPI:
   case X86ISD::VPERMV3:
     return false;
-  // SSE comparisons handle all icmp/fcmp cases.
-  // TODO: Add CMPM/MM with test coverage.
+  // SSE comparisons handle all icmp/fcmp cases.
+  // TODO: Add CMPM/MM with test coverage.
   case X86ISD::CMPP:
   case X86ISD::PCMPEQ:
   case X86ISD::PCMPGT:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 6bcb7a3..547b221 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1661,14 +1661,15 @@ namespace llvm {

     /// Lower interleaved load(s) into target specific
     /// instructions/intrinsics.
-    bool lowerInterleavedLoad(LoadInst *LI,
+    bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                               ArrayRef<ShuffleVectorInst *> Shuffles,
                               ArrayRef<unsigned> Indices,
                               unsigned Factor) const override;

     /// Lower interleaved store(s) into target specific
     /// instructions/intrinsics.
-    bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+    bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                               ShuffleVectorInst *SVI,
                                unsigned Factor) const override;

     SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 9ad3553..b4639ac 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -237,9 +237,18 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
 bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters(
     Type *Ty, CallingConv::ID CallConv, bool isVarArg,
     const DataLayout &DL) const {
-  // i128 split into i64 needs to be allocated to two consecutive registers,
-  // or spilled to the stack as a whole.
-  return Ty->isIntegerTy(128);
+  // On x86-64 i128 is split into two i64s and needs to be allocated to two
+  // consecutive registers, or spilled to the stack as a whole. On x86-32 i128
+  // is split to four i32s and never actually passed in registers, but we use
+  // the consecutive register mark to match it in TableGen.
+  if (Ty->isIntegerTy(128))
+    return true;
+
+  // On x86-32, fp128 acts the same as i128.
+  if (Subtarget.is32Bit() && Ty->isFP128Ty())
+    return true;
+
+  return false;
 }

 /// Helper for getByValTypeAlignment to determine
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 1eb47e3..636b072 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -801,7 +801,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
 // number of shuffles and ISA.
 // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
 bool X86TargetLowering::lowerInterleavedLoad(
-    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
     ArrayRef<unsigned> Indices, unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
@@ -809,6 +809,11 @@ bool X86TargetLowering::lowerInterleavedLoad(
   assert(Shuffles.size() == Indices.size() &&
          "Unmatched number of shufflevectors and indices");

+  auto *LI = dyn_cast<LoadInst>(Load);
+  if (!LI)
+    return false;
+  assert(!Mask && "Unexpected mask on a load");
+
   // Create an interleaved access group.
   IRBuilder<> Builder(LI);
   X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
@@ -817,7 +822,8 @@
   return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
 }

-bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
+                                              Value *LaneMask,
                                               ShuffleVectorInst *SVI,
                                               unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
@@ -827,6 +833,11 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
              0 &&
          "Invalid interleaved store");

+  auto *SI = dyn_cast<StoreInst>(Store);
+  if (!SI)
+    return false;
+  assert(!LaneMask && "Unexpected mask on store");
+
   // Holds the indices of SVI that correspond to the starting index of each
   // interleaved shuffle.
   auto Mask = SVI->getShuffleMask();
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 45d596b..481a9be 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -32,6 +32,7 @@
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Mangler.h"
@@ -833,6 +834,7 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
     CallInst.setOpcode(CallOpcode);
     CallInst.addOperand(CallTargetMCOp);
     OutStreamer->emitInstruction(CallInst, getSubtargetInfo());
+    maybeEmitNopAfterCallForWindowsEH(&MI);
   }

   // Record our statepoint node in the same section used by STACKMAP
@@ -1430,21 +1432,6 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
   OutStreamer->emitLabel(FallthroughLabel);
 }

-// Returns instruction preceding MBBI in MachineFunction.
-// If MBBI is the first instruction of the first basic block, returns null.
-static MachineBasicBlock::const_iterator
-PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
-  const MachineBasicBlock *MBB = MBBI->getParent();
-  while (MBBI == MBB->begin()) {
-    if (MBB == &MBB->getParent()->front())
-      return MachineBasicBlock::const_iterator();
-    MBB = MBB->getPrevNode();
-    MBBI = MBB->end();
-  }
-  --MBBI;
-  return MBBI;
-}
-
 static unsigned getSrcIdx(const MachineInstr* MI, unsigned SrcIdx) {
   if (X86II::isKMasked(MI->getDesc().TSFlags)) {
     // Skip mask operand.
@@ -2271,6 +2258,9 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
       OutStreamer->AddComment("EVEX TO EVEX Compression ", false);
   }

+  // We use this to suppress NOP padding for Windows EH.
+  bool IsTailJump = false;
+
   switch (MI->getOpcode()) {
   case TargetOpcode::DBG_VALUE:
     llvm_unreachable("Should be handled target independently");
@@ -2325,6 +2315,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {

     // Lower this as normal, but add a comment.
     OutStreamer->AddComment("TAILCALL");
+    IsTailJump = true;
     break;

   case X86::TAILJMPr:
@@ -2340,6 +2331,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {

     // Lower these as normal, but add some comments.
     OutStreamer->AddComment("TAILCALL");
+    IsTailJump = true;
     break;

   case X86::TAILJMPm64_REX:
@@ -2349,6 +2341,7 @@
   }

     OutStreamer->AddComment("TAILCALL");
+    IsTailJump = true;
     break;

   case X86::TAILJMPr64_REX: {
@@ -2361,6 +2354,7 @@
   }

     OutStreamer->AddComment("TAILCALL");
+    IsTailJump = true;
     break;
   }

@@ -2537,26 +2531,6 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {

   case X86::SEH_BeginEpilogue: {
     assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
-    // Windows unwinder will not invoke function's exception handler if IP is
-    // either in prologue or in epilogue. This behavior causes a problem when a
-    // call immediately precedes an epilogue, because the return address points
-    // into the epilogue. To cope with that, we insert a 'nop' if it ends up
-    // immediately after a CALL in the final emitted code.
-    MachineBasicBlock::const_iterator MBBI(MI);
-    // Check if preceded by a call and emit nop if so.
-    for (MBBI = PrevCrossBBInst(MBBI);
-         MBBI != MachineBasicBlock::const_iterator();
-         MBBI = PrevCrossBBInst(MBBI)) {
-      // Pseudo instructions that aren't a call are assumed to not emit any
-      // code. If they do, we worst case generate unnecessary noops after a
-      // call.
-      if (MBBI->isCall() || !MBBI->isPseudo()) {
-        if (MBBI->isCall())
-          EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
-        break;
-      }
-    }
-
     EmitSEHInstruction(MI);
     return;
   }
@@ -2585,6 +2559,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
       EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
     emitCallInstruction(TmpInst);
     emitNop(*OutStreamer, 5, Subtarget);
+    maybeEmitNopAfterCallForWindowsEH(MI);
     return;
   }

@@ -2605,6 +2580,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
       // For Import Call Optimization to work, we need a 3-byte nop after the
       // call instruction.
       emitNop(*OutStreamer, 3, Subtarget);
+      maybeEmitNopAfterCallForWindowsEH(MI);
       return;
     }
     break;
@@ -2638,6 +2614,10 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {

   if (MI->isCall()) {
     emitCallInstruction(TmpInst);
+    // Since tail calls transfer control without leaving a stack frame, there is
+    // never a need for NOP padding tail calls.
+    if (!IsTailJump)
+      maybeEmitNopAfterCallForWindowsEH(MI);
     return;
   }

@@ -2659,6 +2639,164 @@ void X86AsmPrinter::emitCallInstruction(const llvm::MCInst &MCI) {
   OutStreamer->emitInstruction(MCI, getSubtargetInfo());
 }

+// Determines whether a NOP is required after a CALL, so that Windows EH
+// IP2State tables have the correct information.
+//
+// On most Windows platforms (AMD64, ARM64, ARM32, IA64, but *not* x86-32),
+// exception handling works by looking up instruction pointers in lookup
+// tables. These lookup tables are stored in .xdata sections in executables.
+// One element of the lookup tables are the "IP2State" tables (Instruction
+// Pointer to State).
+//
+// If a function has any instructions that require cleanup during exception
+// unwinding, then it will have an IP2State table. Each entry in the IP2State
+// table describes a range of bytes in the function's instruction stream, and
+// associates an "EH state number" with that range of instructions. A value of
+// -1 means "the null state", which does not require any code to execute.
+// A value other than -1 is an index into the State table.
+//
+// The entries in the IP2State table contain byte offsets within the instruction
+// stream of the function. The Windows ABI requires that these offsets are
+// aligned to instruction boundaries; they are not permitted to point to a byte
+// that is not the first byte of an instruction.
+//
+// Unfortunately, CALL instructions present a problem during unwinding. CALL
+// instructions push the address of the instruction after the CALL instruction,
+// so that execution can resume after the CALL. If the CALL is the last
+// instruction within an IP2State region, then the return address (on the stack)
+// points to the *next* IP2State region. This means that the unwinder will
+// use the wrong cleanup funclet during unwinding.
+//
+// To fix this problem, the Windows AMD64 ABI requires that CALL instructions
+// are never placed at the end of an IP2State region. Stated equivalently, the
+// end of a CALL instruction cannot be aligned to an IP2State boundary. If a
+// CALL instruction would occur at the end of an IP2State region, then the
+// compiler must insert a NOP instruction after the CALL. The NOP instruction
+// is placed in the same EH region as the CALL instruction, so that the return
+// address points to the NOP and the unwinder will locate the correct region.
+//
+// NOP padding is only necessary on Windows AMD64 targets. On ARM64 and ARM32,
+// instructions have a fixed size so the unwinder knows how to "back up" by
+// one instruction.
+//
+// Interaction with Import Call Optimization (ICO):
+//
+// Import Call Optimization (ICO) is a compiler + OS feature on Windows which
+// improves the performance and security of DLL imports. ICO relies on using a
+// specific CALL idiom that can be replaced by the OS DLL loader. This removes
+// a load and indirect CALL and replaces it with a single direct CALL.
+//
+// To achieve this, ICO also inserts NOPs after the CALL instruction. If the
+// end of the CALL is aligned with an EH state transition, we *also* insert
+// a single-byte NOP. **Both forms of NOPs must be preserved.** They cannot
+// be combined into a single larger NOP; nor can the second NOP be removed.
+//
+// This is necessary because, if ICO is active and the call site is modified
+// by the loader, the loader will end up overwriting the NOPs that were inserted
+// for ICO. That means that those NOPs cannot be used for the correct
+// termination of the exception handling region (the IP2State transition),
+// so we still need an additional NOP instruction. The NOPs cannot be combined
+// into a longer NOP (which is ordinarily desirable) because then ICO would
+// split one instruction, producing a malformed instruction after the ICO call.
+void X86AsmPrinter::maybeEmitNopAfterCallForWindowsEH(const MachineInstr *MI) {
+  // We only need to insert NOPs after CALLs when targeting Windows on AMD64.
+  // (Don't let the name fool you: Itanium refers to table-based exception
+  // handling, not the Itanium architecture.)
+  if (MAI->getExceptionHandlingType() != ExceptionHandling::WinEH ||
+      MAI->getWinEHEncodingType() != WinEH::EncodingType::Itanium) {
+    return;
+  }
+
+  bool HasEHPersonality = MF->getWinEHFuncInfo() != nullptr;
+
+  // Set up MBB iterator, initially positioned on the same MBB as MI.
+  MachineFunction::const_iterator MFI(MI->getParent());
+  MachineFunction::const_iterator MFE(MF->end());
+
+  // Set up instruction iterator, positioned immediately *after* MI.
+  MachineBasicBlock::const_iterator MBBI(MI);
+  MachineBasicBlock::const_iterator MBBE = MI->getParent()->end();
+  ++MBBI; // Step over MI
+
+  // This loop iterates MBBs
+  for (;;) {
+    // This loop iterates instructions
+    for (; MBBI != MBBE; ++MBBI) {
+      // Check the instruction that follows this CALL.
+      const MachineInstr &NextMI = *MBBI;
+
+      // If there is an EH_LABEL after this CALL, then there is an EH state
+      // transition after this CALL. This is exactly the situation which
+      // requires NOP padding.
+      if (NextMI.isEHLabel()) {
+        if (HasEHPersonality) {
+          EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+          return;
+        }
+        // We actually want to continue, in case there is an SEH_BeginEpilogue
+        // instruction after the EH_LABEL. In some situations, IR is produced
+        // that contains EH_LABEL pseudo-instructions, even when we are not
+        // generating IP2State tables. We still need to insert a NOP before
+        // SEH_BeginEpilogue in that case.
+        continue;
+      }
+
+      // Somewhat similarly, if the CALL is the last instruction before the
+      // SEH prologue, then we also need a NOP. This is necessary because the
+      // Windows stack unwinder will not invoke a function's exception handler
+      // if the instruction pointer is in the function prologue or epilogue.
+      //
+      // We always emit a NOP before SEH_BeginEpilogue, even if there is no
+      // personality function (unwind info) for this frame. This is the same
+      // behavior as MSVC.
+      if (NextMI.getOpcode() == X86::SEH_BeginEpilogue) {
+        EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+        return;
+      }
+
+      if (!NextMI.isPseudo() && !NextMI.isMetaInstruction()) {
+        // We found a real instruction. During the CALL, the return IP will
+        // point to this instruction. Since this instruction has the same EH
+        // state as the call itself (because there is no intervening EH_LABEL),
+        // the IP2State table will be accurate; there is no need to insert a
+        // NOP.
+        return;
+      }
+
+      // The next instruction is a pseudo-op. Ignore it and keep searching.
+      // Because these instructions do not generate any machine code, they
+      // cannot prevent the IP2State table from pointing at the wrong
+      // instruction during a CALL.
+    }
+
+    // We've reached the end of this MBB. Find the next MBB in program order.
+    // MBB order should be finalized by this point, so falling across MBBs is
+    // expected.
+    ++MFI;
+    if (MFI == MFE) {
+      // No more blocks; we've reached the end of the function. This should
+      // only happen with no-return functions, but double-check to be sure.
+      if (HasEHPersonality) {
+        // If the CALL has no successors, then it is a noreturn function.
+        // Insert an INT3 instead of a NOP. This accomplishes the same purpose,
+        // but is more clear to read. Also, analysis tools will understand
+        // that they should not continue disassembling after the CALL (unless
+        // there are other branches to that label).
+        if (MI->getParent()->succ_empty())
+          EmitAndCountInstruction(MCInstBuilder(X86::INT3));
+        else
+          EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+      }
+      return;
+    }
+
+    // Set up iterator to scan the next basic block.
+    const MachineBasicBlock *NextMBB = &*MFI;
+    MBBI = NextMBB->instr_begin();
+    MBBE = NextMBB->instr_end();
+  }
+}
+
 void X86AsmPrinter::emitLabelAndRecordForImportCallOptimization(
     ImportCallKind Kind) {
   assert(EnableImportCallOptimization);