Diffstat (limited to 'llvm/lib/Target/X86')
48 files changed, 1778 insertions, 1454 deletions
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index b7ea672..bac3692 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -2470,10 +2470,10 @@ bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID, // Report back its kind, or IOK_INVALID if does not evaluated as a known one unsigned X86AsmParser::IdentifyIntelInlineAsmOperator(StringRef Name) { return StringSwitch<unsigned>(Name) - .Cases("TYPE","type",IOK_TYPE) - .Cases("SIZE","size",IOK_SIZE) - .Cases("LENGTH","length",IOK_LENGTH) - .Default(IOK_INVALID); + .Cases({"TYPE", "type"}, IOK_TYPE) + .Cases({"SIZE", "size"}, IOK_SIZE) + .Cases({"LENGTH", "length"}, IOK_LENGTH) + .Default(IOK_INVALID); } /// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators. The LENGTH operator @@ -2516,8 +2516,8 @@ unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) { unsigned X86AsmParser::IdentifyMasmOperator(StringRef Name) { return StringSwitch<unsigned>(Name.lower()) .Case("type", MOK_TYPE) - .Cases("size", "sizeof", MOK_SIZEOF) - .Cases("length", "lengthof", MOK_LENGTHOF) + .Cases({"size", "sizeof"}, MOK_SIZEOF) + .Cases({"length", "lengthof"}, MOK_LENGTHOF) .Default(MOK_INVALID); } @@ -2581,21 +2581,21 @@ bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) { bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size, StringRef *SizeStr) { Size = StringSwitch<unsigned>(getTok().getString()) - .Cases("BYTE", "byte", 8) - .Cases("WORD", "word", 16) - .Cases("DWORD", "dword", 32) - .Cases("FLOAT", "float", 32) - .Cases("LONG", "long", 32) - .Cases("FWORD", "fword", 48) - .Cases("DOUBLE", "double", 64) - .Cases("QWORD", "qword", 64) - .Cases("MMWORD","mmword", 64) - .Cases("XWORD", "xword", 80) - .Cases("TBYTE", "tbyte", 80) - .Cases("XMMWORD", "xmmword", 128) - .Cases("YMMWORD", "ymmword", 256) - .Cases("ZMMWORD", "zmmword", 512) - .Default(0); + .Cases({"BYTE", "byte"}, 8) + .Cases({"WORD", "word"}, 16) + .Cases({"DWORD", "dword"}, 32) + .Cases({"FLOAT", "float"}, 32) + .Cases({"LONG", "long"}, 32) + .Cases({"FWORD", "fword"}, 48) + .Cases({"DOUBLE", "double"}, 64) + .Cases({"QWORD", "qword"}, 64) + .Cases({"MMWORD", "mmword"}, 64) + .Cases({"XWORD", "xword"}, 80) + .Cases({"TBYTE", "tbyte"}, 80) + .Cases({"XMMWORD", "xmmword"}, 128) + .Cases({"YMMWORD", "ymmword"}, 256) + .Cases({"ZMMWORD", "zmmword"}, 512) + .Default(0); if (Size) { if (SizeStr) *SizeStr = getTok().getString(); @@ -2886,22 +2886,22 @@ bool X86AsmParser::parseATTOperand(OperandVector &Operands) { // otherwise the EFLAGS Condition Code enumerator. 
X86::CondCode X86AsmParser::ParseConditionCode(StringRef CC) { return StringSwitch<X86::CondCode>(CC) - .Case("o", X86::COND_O) // Overflow - .Case("no", X86::COND_NO) // No Overflow - .Cases("b", "nae", X86::COND_B) // Below/Neither Above nor Equal - .Cases("ae", "nb", X86::COND_AE) // Above or Equal/Not Below - .Cases("e", "z", X86::COND_E) // Equal/Zero - .Cases("ne", "nz", X86::COND_NE) // Not Equal/Not Zero - .Cases("be", "na", X86::COND_BE) // Below or Equal/Not Above - .Cases("a", "nbe", X86::COND_A) // Above/Neither Below nor Equal - .Case("s", X86::COND_S) // Sign - .Case("ns", X86::COND_NS) // No Sign - .Cases("p", "pe", X86::COND_P) // Parity/Parity Even - .Cases("np", "po", X86::COND_NP) // No Parity/Parity Odd - .Cases("l", "nge", X86::COND_L) // Less/Neither Greater nor Equal - .Cases("ge", "nl", X86::COND_GE) // Greater or Equal/Not Less - .Cases("le", "ng", X86::COND_LE) // Less or Equal/Not Greater - .Cases("g", "nle", X86::COND_G) // Greater/Neither Less nor Equal + .Case("o", X86::COND_O) // Overflow + .Case("no", X86::COND_NO) // No Overflow + .Cases({"b", "nae"}, X86::COND_B) // Below/Neither Above nor Equal + .Cases({"ae", "nb"}, X86::COND_AE) // Above or Equal/Not Below + .Cases({"e", "z"}, X86::COND_E) // Equal/Zero + .Cases({"ne", "nz"}, X86::COND_NE) // Not Equal/Not Zero + .Cases({"be", "na"}, X86::COND_BE) // Below or Equal/Not Above + .Cases({"a", "nbe"}, X86::COND_A) // Above/Neither Below nor Equal + .Case("s", X86::COND_S) // Sign + .Case("ns", X86::COND_NS) // No Sign + .Cases({"p", "pe"}, X86::COND_P) // Parity/Parity Even + .Cases({"np", "po"}, X86::COND_NP) // No Parity/Parity Odd + .Cases({"l", "nge"}, X86::COND_L) // Less/Neither Greater nor Equal + .Cases({"ge", "nl"}, X86::COND_GE) // Greater or Equal/Not Less + .Cases({"le", "ng"}, X86::COND_LE) // Less or Equal/Not Greater + .Cases({"g", "nle"}, X86::COND_G) // Greater/Neither Less nor Equal .Default(X86::COND_INVALID); } diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index f9bd233..434a6d2 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -31,7 +31,6 @@ set(sources X86CmovConversion.cpp X86CodeGenPassBuilder.cpp X86DomainReassignment.cpp - X86DiscriminateMemOps.cpp X86LowerTileCopy.cpp X86LowerAMXType.cpp X86LowerAMXIntrinsics.cpp @@ -57,7 +56,6 @@ set(sources X86IndirectBranchTracking.cpp X86IndirectThunks.cpp X86InterleavedAccess.cpp - X86InsertPrefetch.cpp X86InstCombineIntrinsic.cpp X86InstrFMA3Info.cpp X86InstrFoldTables.cpp diff --git a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp index c0b9339..b07ce2b 100644 --- a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp +++ b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp @@ -280,8 +280,7 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, if (Arg.hasAttribute(Attribute::ByVal) || Arg.hasAttribute(Attribute::InReg) || Arg.hasAttribute(Attribute::SwiftSelf) || - Arg.hasAttribute(Attribute::SwiftError) || - Arg.hasAttribute(Attribute::Nest) || VRegs[Idx].size() > 1) + Arg.hasAttribute(Attribute::SwiftError) || VRegs[Idx].size() > 1) return false; if (Arg.hasAttribute(Attribute::StructRet)) { diff --git a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp index 53ec712..f499e6f 100644 --- a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp +++ b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp @@ -312,6 +312,53 @@ bool 
X86InstructionSelector::selectCopy(MachineInstr &I, } } + // Special case GPR16 -> XMM + if (SrcSize == 16 && SrcRegBank.getID() == X86::GPRRegBankID && + (DstRegBank.getID() == X86::VECRRegBankID)) { + + const DebugLoc &DL = I.getDebugLoc(); + + // Any extend GPR16 -> GPR32 + Register ExtReg = MRI.createVirtualRegister(&X86::GR32RegClass); + BuildMI(*I.getParent(), I, DL, TII.get(TargetOpcode::SUBREG_TO_REG), + ExtReg) + .addImm(0) + .addReg(SrcReg) + .addImm(X86::sub_16bit); + + // Copy GR32 -> XMM + BuildMI(*I.getParent(), I, DL, TII.get(TargetOpcode::COPY), DstReg) + .addReg(ExtReg); + + I.eraseFromParent(); + } + + // Special case XMM -> GR16 + if (DstSize == 16 && DstRegBank.getID() == X86::GPRRegBankID && + (SrcRegBank.getID() == X86::VECRRegBankID)) { + + const DebugLoc &DL = I.getDebugLoc(); + + // Move XMM to GR32 register. + Register Temp32 = MRI.createVirtualRegister(&X86::GR32RegClass); + BuildMI(*I.getParent(), I, DL, TII.get(TargetOpcode::COPY), Temp32) + .addReg(SrcReg); + + // Extract the lower 16 bits + if (Register Dst32 = TRI.getMatchingSuperReg(DstReg, X86::sub_16bit, + &X86::GR32RegClass)) { + // Optimization for Physical Dst (e.g. AX): Copy to EAX directly. + BuildMI(*I.getParent(), I, DL, TII.get(TargetOpcode::COPY), Dst32) + .addReg(Temp32); + } else { + // Handle if there is no super. + BuildMI(*I.getParent(), I, DL, TII.get(TargetOpcode::COPY), DstReg) + .addReg(Temp32, 0, X86::sub_16bit); + } + + I.eraseFromParent(); + } + return true; } diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index e792b1b..812fa85 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -269,6 +269,7 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, getActionDefinitionsBuilder(G_ICMP) .legalForCartesianProduct({s8}, Is64Bit ? IntTypes64 : IntTypes32) .clampScalar(0, s8, s8) + .widenScalarToNextPow2(1, /*Min=*/8) .clampScalar(1, s8, sMaxScalar); // bswap diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 74de51c..0a98331 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -195,7 +195,7 @@ public: bool padInstructionEncoding(MCFragment &RF, MCCodeEmitter &Emitter, unsigned &RemainingSize) const; - bool finishLayout(const MCAssembler &Asm) const override; + bool finishLayout() const override; unsigned getMaximumNopSize(const MCSubtargetInfo &STI) const override; @@ -850,7 +850,7 @@ bool X86AsmBackend::padInstructionEncoding(MCFragment &RF, return Changed; } -bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const { +bool X86AsmBackend::finishLayout() const { // See if we can further relax some instructions to cut down on the number of // nop bytes required for code alignment. The actual win is in reducing // instruction count, not number of bytes. Modern X86-64 can easily end up @@ -864,11 +864,11 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const { // MCSymbols and therefore different relaxation results. X86PadForAlign is // disabled by default to eliminate the -g vs non -g difference. 
DenseSet<MCFragment *> LabeledFragments; - for (const MCSymbol &S : Asm.symbols()) + for (const MCSymbol &S : Asm->symbols()) LabeledFragments.insert(S.getFragment()); bool Changed = false; - for (MCSection &Sec : Asm) { + for (MCSection &Sec : *Asm) { if (!Sec.isText()) continue; @@ -908,13 +908,13 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const { // the align directive. This is purely about human understandability // of the resulting code. If we later find a reason to expand // particular instructions over others, we can adjust. - unsigned RemainingSize = Asm.computeFragmentSize(F) - F.getFixedSize(); + unsigned RemainingSize = Asm->computeFragmentSize(F) - F.getFixedSize(); while (!Relaxable.empty() && RemainingSize != 0) { auto &RF = *Relaxable.pop_back_val(); // Give the backend a chance to play any tricks it wishes to increase // the encoding size of the given instruction. Target independent code // will try further relaxation, but target's may play further tricks. - Changed |= padInstructionEncoding(RF, Asm.getEmitter(), RemainingSize); + Changed |= padInstructionEncoding(RF, Asm->getEmitter(), RemainingSize); // If we have an instruction which hasn't been fully relaxed, we can't // skip past it and insert bytes before it. Changing its starting @@ -1391,7 +1391,7 @@ public: return CU::UNWIND_MODE_DWARF; MCRegister Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true); - SavedRegs[SavedRegIdx++] = Reg; + SavedRegs[SavedRegIdx++] = Reg.id(); StackAdjust += OffsetSize; MinAbsOffset = std::min(MinAbsOffset, std::abs(Inst.getOffset())); InstrOffset += PushInstrSize(Reg); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index 759d95e..88dd543 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -451,7 +451,7 @@ void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo, // the assembly would look something like: // "vp2intersect %zmm5, %zmm7, {%k2, %k3}" // but this can work too. - switch (MI->getOperand(OpNo).getReg()) { + switch (MI->getOperand(OpNo).getReg().id()) { case X86::K0_K1: printRegName(OS, X86::K0); return; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index af5a698..0c874b7 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -535,7 +535,7 @@ bool X86MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI, const MCRegisterClass &VR128XRC = MRI.getRegClass(X86::VR128XRegClassID); const MCRegisterClass &VR256XRC = MRI.getRegClass(X86::VR256XRegClassID); - auto ClearsSuperReg = [=](unsigned RegID) { + auto ClearsSuperReg = [=](MCRegister RegID) { // On X86-64, a general purpose integer register is viewed as a 64-bit // register internal to the processor. 
// An update to the lower 32 bits of a 64 bit integer register is diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index 1ef10928..abbb0c2 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -65,7 +65,7 @@ void X86WinCOFFStreamer::emitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) { } void X86WinCOFFStreamer::finishImpl() { - emitFrames(nullptr); + emitFrames(); emitWindowsUnwindTables(); MCWinCOFFStreamer::finishImpl(); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp index 9c44231..b722964 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp @@ -55,6 +55,7 @@ struct FPOInstruction { StackAlign, SetFrame, } Op; + // FIXME: This should be a union of MCRegister and unsigned. unsigned RegOrOffset; }; @@ -215,7 +216,7 @@ bool X86WinCOFFTargetStreamer::emitFPOSetFrame(MCRegister Reg, SMLoc L) { FPOInstruction Inst; Inst.Label = emitFPOLabel(); Inst.Op = FPOInstruction::SetFrame; - Inst.RegOrOffset = Reg; + Inst.RegOrOffset = Reg.id(); CurFPOData->Instructions.push_back(Inst); return false; } @@ -226,7 +227,7 @@ bool X86WinCOFFTargetStreamer::emitFPOPushReg(MCRegister Reg, SMLoc L) { FPOInstruction Inst; Inst.Label = emitFPOLabel(); Inst.Op = FPOInstruction::PushReg; - Inst.RegOrOffset = Reg; + Inst.RegOrOffset = Reg.id(); CurFPOData->Instructions.push_back(Inst); return false; } diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 51b540a..97848be 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86_H #define LLVM_LIB_TARGET_X86_X86_H +#include "llvm/CodeGen/MachineFunctionAnalysisManager.h" #include "llvm/IR/Analysis.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/CodeGen.h" @@ -43,7 +44,13 @@ FunctionPass *createCleanupLocalDynamicTLSPass(); /// This function returns a pass which converts floating-point register /// references and pseudo instructions into floating-point stack references and /// physical instructions. -FunctionPass *createX86FloatingPointStackifierPass(); +class X86FPStackifierPass : public PassInfoMixin<X86FPStackifierPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + +FunctionPass *createX86FPStackifierLegacyPass(); /// This pass inserts AVX vzeroupper instructions before each call to avoid /// transition penalty between functions encoded with AVX and SSE. @@ -83,7 +90,14 @@ FunctionPass *createX86AvoidStoreForwardingBlocks(); FunctionPass *createX86FlagsCopyLoweringPass(); /// Return a pass that expands DynAlloca pseudo-instructions. -FunctionPass *createX86DynAllocaExpander(); +class X86DynAllocaExpanderPass + : public PassInfoMixin<X86DynAllocaExpanderPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + +FunctionPass *createX86DynAllocaExpanderLegacyPass(); /// Return a pass that config the tile registers. FunctionPass *createX86TileConfigPass(); @@ -104,7 +118,15 @@ FunctionPass *createX86LowerTileCopyPass(); /// CALL instruction. The pass does the same for each funclet as well. This /// ensures that the open interval of function start and end PCs contains all /// return addresses for the benefit of the Windows x64 unwinder. 
-FunctionPass *createX86AvoidTrailingCallPass(); +class X86AvoidTrailingCallPass + : public PassInfoMixin<X86AvoidTrailingCallPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + +FunctionPass *createX86AvoidTrailingCallLegacyPass(); /// Return a pass that optimizes the code-size of x86 call sequences. This is /// done by replacing esp-relative movs with pushes. @@ -144,13 +166,6 @@ FunctionPass *createX86IndirectThunksPass(); /// This pass replaces ret instructions with jmp's to __x86_return thunk. FunctionPass *createX86ReturnThunksPass(); -/// This pass ensures instructions featuring a memory operand -/// have distinctive <LineNumber, Discriminator> (with respect to each other) -FunctionPass *createX86DiscriminateMemOpsPass(); - -/// This pass applies profiling information to insert cache prefetches. -FunctionPass *createX86InsertPrefetchPass(); - /// This pass insert wait instruction after X87 instructions which could raise /// fp exceptions when strict-fp enabled. FunctionPass *createX86InsertX87waitPass(); @@ -158,7 +173,16 @@ FunctionPass *createX86InsertX87waitPass(); /// This pass optimizes arithmetic based on knowledge that is only used by /// a reduction sequence and is therefore safe to reassociate in interesting /// ways. -FunctionPass *createX86PartialReductionPass(); +class X86PartialReductionPass : public PassInfoMixin<X86PartialReductionPass> { +private: + const X86TargetMachine *TM; + +public: + X86PartialReductionPass(const X86TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; + +FunctionPass *createX86PartialReductionLegacyPass(); /// // Analyzes and emits pseudos to support Win x64 Unwind V2. FunctionPass *createX86WinEHUnwindV2Pass(); @@ -179,7 +203,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass(); /// The pass transforms amx intrinsics to scalar operation if the function has /// optnone attribute or it is O0. 
-FunctionPass *createX86LowerAMXIntrinsicsPass(); +class X86LowerAMXIntrinsicsPass + : public PassInfoMixin<X86LowerAMXIntrinsicsPass> { +private: + const TargetMachine *TM; + +public: + X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + static bool isRequired() { return true; } +}; + +FunctionPass *createX86LowerAMXIntrinsicsLegacyPass(); InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &, @@ -193,7 +228,6 @@ FunctionPass *createX86ArgumentStackSlotPass(); FunctionPass *createX86SuppressAPXForRelocationPass(); void initializeCompressEVEXPassPass(PassRegistry &); -void initializeFPSPass(PassRegistry &); void initializeFixupBWInstPassPass(PassRegistry &); void initializeFixupLEAPassPass(PassRegistry &); void initializeX86ArgumentStackSlotPassPass(PassRegistry &); @@ -202,14 +236,15 @@ void initializeX86FixupInstTuningPassPass(PassRegistry &); void initializeX86FixupVectorConstantsPassPass(PassRegistry &); void initializeWinEHStatePassPass(PassRegistry &); void initializeX86AvoidSFBPassPass(PassRegistry &); -void initializeX86AvoidTrailingCallPassPass(PassRegistry &); +void initializeX86AvoidTrailingCallLegacyPassPass(PassRegistry &); void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); void initializeX86DAGToDAGISelLegacyPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); -void initializeX86DynAllocaExpanderPass(PassRegistry &); +void initializeX86DynAllocaExpanderLegacyPass(PassRegistry &); void initializeX86ExecutionDomainFixPass(PassRegistry &); void initializeX86ExpandPseudoPass(PassRegistry &); +void initializeX86FPStackifierLegacyPass(PassRegistry &); void initializeX86FastPreTileConfigPass(PassRegistry &); void initializeX86FastTileConfigPass(PassRegistry &); void initializeX86FixupSetCCPassPass(PassRegistry &); @@ -220,7 +255,7 @@ void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &); void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); void initializeX86LowerTileCopyPass(PassRegistry &); void initializeX86OptimizeLEAPassPass(PassRegistry &); -void initializeX86PartialReductionPass(PassRegistry &); +void initializeX86PartialReductionLegacyPass(PassRegistry &); void initializeX86PreTileConfigPass(PassRegistry &); void initializeX86ReturnThunksPass(PassRegistry &); void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 9e291a6..8f29a64 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -795,6 +795,8 @@ include "X86Schedule.td" include "X86InstrInfo.td" include "X86SchedPredicates.td" +defm : RemapAllTargetPseudoPointerOperands<x86_ptr_rc>; + def X86InstrInfo : InstrInfo; //===----------------------------------------------------------------------===// @@ -1334,8 +1336,18 @@ def ProcessorFeatures { !listremove(ARLSFeatures, [FeatureWIDEKL]); // Novalake + list<SubtargetFeature> NVLAdditionalFeatures = [FeatureAVX10_2, + FeatureMOVRS, + FeatureEGPR, + FeaturePush2Pop2, + FeaturePPX, + FeatureNF, + FeatureNDD, + FeatureZU, + FeatureCCMP, + FeaturePREFETCHI]; list<SubtargetFeature> NVLFeatures = - !listconcat(PTLFeatures, [FeaturePREFETCHI]); + !listconcat(PTLFeatures, NVLAdditionalFeatures); // Clearwaterforest list<SubtargetFeature> CWFAdditionalFeatures = [FeaturePREFETCHI, diff --git 
a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp index d2e3527..9473e8d 100644 --- a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp +++ b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp @@ -387,8 +387,8 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, MachineMemOperand *LMMO = *LoadInst->memoperands_begin(); MachineMemOperand *SMMO = *StoreInst->memoperands_begin(); - Register Reg1 = MRI->createVirtualRegister( - TII->getRegClass(TII->get(NLoadOpcode), 0, TRI)); + Register Reg1 = + MRI->createVirtualRegister(TII->getRegClass(TII->get(NLoadOpcode), 0)); MachineInstr *NewLoad = BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode), Reg1) @@ -553,7 +553,7 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) { } unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) { - const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI); + const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0); return TRI->getRegSizeInBits(*TRC) / 8; } diff --git a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp index 2ecf493..ebd4284 100644 --- a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp +++ b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp @@ -37,6 +37,8 @@ #include "X86Subtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/IR/Analysis.h" +#include "llvm/IR/PassManager.h" #define AVOIDCALL_DESC "X86 avoid trailing call pass" #define AVOIDCALL_NAME "x86-avoid-trailing-call" @@ -46,9 +48,9 @@ using namespace llvm; namespace { -class X86AvoidTrailingCallPass : public MachineFunctionPass { +class X86AvoidTrailingCallLegacyPass : public MachineFunctionPass { public: - X86AvoidTrailingCallPass() : MachineFunctionPass(ID) {} + X86AvoidTrailingCallLegacyPass() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -59,13 +61,14 @@ private: }; } // end anonymous namespace -char X86AvoidTrailingCallPass::ID = 0; +char X86AvoidTrailingCallLegacyPass::ID = 0; -FunctionPass *llvm::createX86AvoidTrailingCallPass() { - return new X86AvoidTrailingCallPass(); +FunctionPass *llvm::createX86AvoidTrailingCallLegacyPass() { + return new X86AvoidTrailingCallLegacyPass(); } -INITIALIZE_PASS(X86AvoidTrailingCallPass, AVOIDCALL_NAME, AVOIDCALL_DESC, false, false) +INITIALIZE_PASS(X86AvoidTrailingCallLegacyPass, AVOIDCALL_NAME, AVOIDCALL_DESC, + false, false) // A real instruction is a non-meta, non-pseudo instruction. Some pseudos // expand to nothing, and some expand to code. 
This logic conservatively assumes @@ -79,7 +82,7 @@ static bool isCallInstruction(const MachineInstr &MI) { return MI.isCall() && !MI.isReturn(); } -bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) { +bool UpdatedOnX86AvoidTrailingCallPass(MachineFunction &MF) { const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); const X86InstrInfo &TII = *STI.getInstrInfo(); assert(STI.isTargetWin64() && "pass only runs on Win64"); @@ -134,3 +137,19 @@ bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) { return Changed; } + +bool X86AvoidTrailingCallLegacyPass::runOnMachineFunction(MachineFunction &MF) { + return UpdatedOnX86AvoidTrailingCallPass(MF); +} + +PreservedAnalyses +X86AvoidTrailingCallPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + bool Changed = UpdatedOnX86AvoidTrailingCallPass(MF); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp index c0c7f5a..0f55c19 100644 --- a/llvm/lib/Target/X86/X86CompressEVEX.cpp +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -15,6 +15,7 @@ // c. NDD (EVEX) -> non-NDD (legacy) // d. NF_ND (EVEX) -> NF (EVEX) // e. NonNF (EVEX) -> NF (EVEX) +// f. SETZUCCm (EVEX) -> SETCCm (legacy) // // Compression a, b and c can always reduce code size, with some exceptions // such as promoted 16-bit CRC32 which is as long as the legacy version. @@ -216,14 +217,15 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB, // memory form: broadcast // // APX: - // MAP4: NDD + // MAP4: NDD, ZU // // For AVX512 cases, EVEX prefix is needed in order to carry this information // thus preventing the transformation to VEX encoding. bool IsND = X86II::hasNewDataDest(TSFlags); - if (TSFlags & X86II::EVEX_B && !IsND) - return false; unsigned Opc = MI.getOpcode(); + bool IsSetZUCCm = Opc == X86::SETZUCCm; + if (TSFlags & X86II::EVEX_B && !IsND && !IsSetZUCCm) + return false; // MOVBE*rr is special because it has semantic of NDD but not set EVEX_B. bool IsNDLike = IsND || Opc == X86::MOVBE32rr || Opc == X86::MOVBE64rr; bool IsRedundantNDD = IsNDLike ? IsRedundantNewDataDest(Opc) : false; @@ -272,7 +274,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB, const MachineOperand &Src2 = MI.getOperand(2); bool Is32BitReg = Opc == X86::ADD32ri_ND || Opc == X86::ADD32rr_ND; const MCInstrDesc &NewDesc = - ST.getInstrInfo()->get(Is32BitReg ? X86::LEA32r : X86::LEA64r); + ST.getInstrInfo()->get(Is32BitReg ? X86::LEA64_32r : X86::LEA64r); if (Is32BitReg) Src1 = getX86SubSuperRegister(Src1, 64); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), NewDesc, Dst) @@ -339,7 +341,7 @@ bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) { } #endif const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); - if (!ST.hasAVX512() && !ST.hasEGPR() && !ST.hasNDD()) + if (!ST.hasAVX512() && !ST.hasEGPR() && !ST.hasNDD() && !ST.hasZU()) return false; bool Changed = false; diff --git a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp b/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp deleted file mode 100644 index bd151a4..0000000 --- a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp +++ /dev/null @@ -1,184 +0,0 @@ -//===- X86DiscriminateMemOps.cpp - Unique IDs for Mem Ops -----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// This pass aids profile-driven cache prefetch insertion by ensuring all -/// instructions that have a memory operand are distinguishible from each other. -/// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86Subtarget.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/ProfileData/SampleProf.h" -#include "llvm/ProfileData/SampleProfReader.h" -#include "llvm/Support/Debug.h" -#include <optional> -using namespace llvm; - -#define DEBUG_TYPE "x86-discriminate-memops" - -static cl::opt<bool> EnableDiscriminateMemops( - DEBUG_TYPE, cl::init(false), - cl::desc("Generate unique debug info for each instruction with a memory " - "operand. Should be enabled for profile-driven cache prefetching, " - "both in the build of the binary being profiled, as well as in " - "the build of the binary consuming the profile."), - cl::Hidden); - -static cl::opt<bool> BypassPrefetchInstructions( - "x86-bypass-prefetch-instructions", cl::init(true), - cl::desc("When discriminating instructions with memory operands, ignore " - "prefetch instructions. This ensures the other memory operand " - "instructions have the same identifiers after inserting " - "prefetches, allowing for successive insertions."), - cl::Hidden); - -namespace { - -using Location = std::pair<StringRef, unsigned>; - -Location diToLocation(const DILocation *Loc) { - return std::make_pair(Loc->getFilename(), Loc->getLine()); -} - -/// Ensure each instruction having a memory operand has a distinct <LineNumber, -/// Discriminator> pair. -void updateDebugInfo(MachineInstr *MI, const DILocation *Loc) { - DebugLoc DL(Loc); - MI->setDebugLoc(DL); -} - -class X86DiscriminateMemOps : public MachineFunctionPass { - bool runOnMachineFunction(MachineFunction &MF) override; - StringRef getPassName() const override { - return "X86 Discriminate Memory Operands"; - } - -public: - static char ID; - - /// Default construct and initialize the pass. - X86DiscriminateMemOps(); -}; - -bool IsPrefetchOpcode(unsigned Opcode) { - return Opcode == X86::PREFETCHNTA || Opcode == X86::PREFETCHT0 || - Opcode == X86::PREFETCHT1 || Opcode == X86::PREFETCHT2 || - Opcode == X86::PREFETCHIT0 || Opcode == X86::PREFETCHIT1 || - Opcode == X86::PREFETCHRST2; -} -} // end anonymous namespace - -//===----------------------------------------------------------------------===// -// Implementation -//===----------------------------------------------------------------------===// - -char X86DiscriminateMemOps::ID = 0; - -/// Default construct and initialize the pass. -X86DiscriminateMemOps::X86DiscriminateMemOps() : MachineFunctionPass(ID) {} - -bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) { - if (!EnableDiscriminateMemops) - return false; - - DISubprogram *FDI = MF.getFunction().getSubprogram(); - if (!FDI || !FDI->getUnit()->getDebugInfoForProfiling()) - return false; - - // Have a default DILocation, if we find instructions with memops that don't - // have any debug info. 
- const DILocation *ReferenceDI = - DILocation::get(FDI->getContext(), FDI->getLine(), 0, FDI); - assert(ReferenceDI && "ReferenceDI should not be nullptr"); - DenseMap<Location, unsigned> MemOpDiscriminators; - MemOpDiscriminators[diToLocation(ReferenceDI)] = 0; - - // Figure out the largest discriminator issued for each Location. When we - // issue new discriminators, we can thus avoid issuing discriminators - // belonging to instructions that don't have memops. This isn't a requirement - // for the goals of this pass, however, it avoids unnecessary ambiguity. - for (auto &MBB : MF) { - for (auto &MI : MBB) { - const auto &DI = MI.getDebugLoc(); - if (!DI) - continue; - if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode)) - continue; - Location Loc = diToLocation(DI); - unsigned &Disc = MemOpDiscriminators[Loc]; - Disc = std::max(Disc, DI->getBaseDiscriminator()); - } - } - - // Keep track of the discriminators seen at each Location. If an instruction's - // DebugInfo has a Location and discriminator we've already seen, replace its - // discriminator with a new one, to guarantee uniqueness. - DenseMap<Location, DenseSet<unsigned>> Seen; - - bool Changed = false; - for (auto &MBB : MF) { - for (auto &MI : MBB) { - if (X86II::getMemoryOperandNo(MI.getDesc().TSFlags) < 0) - continue; - if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode)) - continue; - const DILocation *DI = MI.getDebugLoc(); - bool HasDebug = DI; - if (!HasDebug) { - DI = ReferenceDI; - } - Location L = diToLocation(DI); - DenseSet<unsigned> &Set = Seen[L]; - const std::pair<DenseSet<unsigned>::iterator, bool> TryInsert = - Set.insert(DI->getBaseDiscriminator()); - if (!TryInsert.second || !HasDebug) { - unsigned BF, DF, CI = 0; - DILocation::decodeDiscriminator(DI->getDiscriminator(), BF, DF, CI); - std::optional<unsigned> EncodedDiscriminator = - DILocation::encodeDiscriminator(MemOpDiscriminators[L] + 1, DF, CI); - - if (!EncodedDiscriminator) { - // FIXME(mtrofin): The assumption is that this scenario is infrequent/OK - // not to support. If evidence points otherwise, we can explore synthesizeing - // unique DIs by adding fake line numbers, or by constructing 64 bit - // discriminators. - LLVM_DEBUG(dbgs() << "Unable to create a unique discriminator " - "for instruction with memory operand in: " - << DI->getFilename() << " Line: " << DI->getLine() - << " Column: " << DI->getColumn() - << ". This is likely due to a large macro expansion. \n"); - continue; - } - // Since we were able to encode, bump the MemOpDiscriminators. - ++MemOpDiscriminators[L]; - DI = DI->cloneWithDiscriminator(*EncodedDiscriminator); - assert(DI && "DI should not be nullptr"); - updateDebugInfo(&MI, DI); - Changed = true; - std::pair<DenseSet<unsigned>::iterator, bool> MustInsert = - Set.insert(DI->getBaseDiscriminator()); - (void)MustInsert; // Silence warning in release build. - assert(MustInsert.second && "New discriminator shouldn't be present in set"); - } - - // Bump the reference DI to avoid cramming discriminators on line 0. - // FIXME(mtrofin): pin ReferenceDI on blocks or first instruction with DI - // in a block. It's more consistent than just relying on the last memop - // instruction we happened to see. 
- ReferenceDI = DI; - } - } - return Changed; -} - -FunctionPass *llvm::createX86DiscriminateMemOpsPass() { - return new X86DiscriminateMemOps(); -} diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp index 5d19011..2047a53 100644 --- a/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -174,8 +174,8 @@ public: MachineBasicBlock *MBB = MI->getParent(); const DebugLoc &DL = MI->getDebugLoc(); - Register Reg = MRI->createVirtualRegister( - TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo())); + Register Reg = + MRI->createVirtualRegister(TII->getRegClass(TII->get(DstOpcode), 0)); MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg); for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) Bld.add(MO); diff --git a/llvm/lib/Target/X86/X86DynAllocaExpander.cpp b/llvm/lib/Target/X86/X86DynAllocaExpander.cpp index c2a06ef..10f46f7 100644 --- a/llvm/lib/Target/X86/X86DynAllocaExpander.cpp +++ b/llvm/lib/Target/X86/X86DynAllocaExpander.cpp @@ -20,22 +20,22 @@ #include "X86Subtarget.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/MachineFunctionAnalysisManager.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/Function.h" using namespace llvm; namespace { -class X86DynAllocaExpander : public MachineFunctionPass { +class X86DynAllocaExpander { public: - X86DynAllocaExpander() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override; + bool run(MachineFunction &MF); private: /// Strategies for lowering a DynAlloca. @@ -61,22 +61,30 @@ private: unsigned SlotSize = 0; int64_t StackProbeSize = 0; bool NoStackArgProbe = false; +}; + +class X86DynAllocaExpanderLegacy : public MachineFunctionPass { +public: + X86DynAllocaExpanderLegacy() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; +private: StringRef getPassName() const override { return "X86 DynAlloca Expander"; } public: static char ID; }; -char X86DynAllocaExpander::ID = 0; +char X86DynAllocaExpanderLegacy::ID = 0; } // end anonymous namespace -INITIALIZE_PASS(X86DynAllocaExpander, "x86-dyn-alloca-expander", +INITIALIZE_PASS(X86DynAllocaExpanderLegacy, "x86-dyn-alloca-expander", "X86 DynAlloca Expander", false, false) -FunctionPass *llvm::createX86DynAllocaExpander() { - return new X86DynAllocaExpander(); +FunctionPass *llvm::createX86DynAllocaExpanderLegacyPass() { + return new X86DynAllocaExpanderLegacy(); } /// Return the allocation amount for a DynAlloca instruction, or -1 if unknown. 
@@ -277,7 +285,7 @@ void X86DynAllocaExpander::lower(MachineInstr *MI, Lowering L) { AmountDef->eraseFromParent(); } -bool X86DynAllocaExpander::runOnMachineFunction(MachineFunction &MF) { +bool X86DynAllocaExpander::run(MachineFunction &MF) { if (!MF.getInfo<X86MachineFunctionInfo>()->hasDynAlloca()) return false; @@ -299,3 +307,19 @@ bool X86DynAllocaExpander::runOnMachineFunction(MachineFunction &MF) { return true; } + +bool X86DynAllocaExpanderLegacy::runOnMachineFunction(MachineFunction &MF) { + return X86DynAllocaExpander().run(MF); +} + +PreservedAnalyses +X86DynAllocaExpanderPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + bool Changed = X86DynAllocaExpander().run(MF); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index e3c44c0..6a18086 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -608,40 +608,40 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1); break; case X86::PTCVTROWD2PSrreV: - Opc = X86::TCVTROWD2PSrre; + Opc = X86::TCVTROWD2PSrte; break; case X86::PTCVTROWD2PSrriV: - Opc = X86::TCVTROWD2PSrri; + Opc = X86::TCVTROWD2PSrti; break; case X86::PTCVTROWPS2BF16HrreV: - Opc = X86::TCVTROWPS2BF16Hrre; + Opc = X86::TCVTROWPS2BF16Hrte; break; case X86::PTCVTROWPS2BF16HrriV: - Opc = X86::TCVTROWPS2BF16Hrri; + Opc = X86::TCVTROWPS2BF16Hrti; break; case X86::PTCVTROWPS2BF16LrreV: - Opc = X86::TCVTROWPS2BF16Lrre; + Opc = X86::TCVTROWPS2BF16Lrte; break; case X86::PTCVTROWPS2BF16LrriV: - Opc = X86::TCVTROWPS2BF16Lrri; + Opc = X86::TCVTROWPS2BF16Lrti; break; case X86::PTCVTROWPS2PHHrreV: - Opc = X86::TCVTROWPS2PHHrre; + Opc = X86::TCVTROWPS2PHHrte; break; case X86::PTCVTROWPS2PHHrriV: - Opc = X86::TCVTROWPS2PHHrri; + Opc = X86::TCVTROWPS2PHHrti; break; case X86::PTCVTROWPS2PHLrreV: - Opc = X86::TCVTROWPS2PHLrre; + Opc = X86::TCVTROWPS2PHLrte; break; case X86::PTCVTROWPS2PHLrriV: - Opc = X86::TCVTROWPS2PHLrri; + Opc = X86::TCVTROWPS2PHLrti; break; case X86::PTILEMOVROWrreV: - Opc = X86::TILEMOVROWrre; + Opc = X86::TILEMOVROWrte; break; case X86::PTILEMOVROWrriV: - Opc = X86::TILEMOVROWrri; + Opc = X86::TILEMOVROWrti; break; default: llvm_unreachable("Unexpected Opcode"); diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp index 06f729a..25799f4 100644 --- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp @@ -206,8 +206,7 @@ void X86FastPreTileConfig::spill(MachineBasicBlock::iterator Before, const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); // Don't need shape information for tile store, becasue it is adjacent to // the tile def instruction. 
- TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, TRI, - Register()); + TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, Register()); ++NumStores; // TODO: update DBG_VALUEs diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp index 6274cb4..6e0a0f6 100644 --- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp +++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp @@ -202,7 +202,8 @@ Register FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI) const { MCRegUnitIterator I = Range.begin(), E = Range.end(); for (MCRegUnit S : TRI->regunits(SuperDestReg)) { I = std::lower_bound(I, E, S); - if ((I == E || *I > S) && LiveUnits.getBitVector().test(S)) { + if ((I == E || *I > S) && + LiveUnits.getBitVector().test(static_cast<unsigned>(S))) { SuperIsLive = true; break; } diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index ab6e6d0..b3bf37a 100644 --- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -50,7 +50,6 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include <algorithm> #include <cassert> #include <iterator> #include <utility> diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp index 9f88fda..6af2050 100644 --- a/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -31,6 +31,8 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/EdgeBundles.h" #include "llvm/CodeGen/LiveRegUnits.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionAnalysisManager.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -38,6 +40,7 @@ #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Config/llvm-config.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/InlineAsm.h" #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" @@ -48,265 +51,272 @@ #include <bitset> using namespace llvm; -#define DEBUG_TYPE "x86-codegen" +#define DEBUG_TYPE "x86-fp-stackifier" STATISTIC(NumFXCH, "Number of fxch instructions inserted"); -STATISTIC(NumFP , "Number of floating point instructions"); +STATISTIC(NumFP, "Number of floating point instructions"); namespace { - const unsigned ScratchFPReg = 7; - - struct FPS : public MachineFunctionPass { - static char ID; - FPS() : MachineFunctionPass(ID) { - // This is really only to keep valgrind quiet. - // The logic in isLive() is too much for it. - memset(Stack, 0, sizeof(Stack)); - memset(RegMap, 0, sizeof(RegMap)); - } +const unsigned ScratchFPReg = 7; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<EdgeBundlesWrapperLegacy>(); - AU.addPreservedID(MachineLoopInfoID); - AU.addPreservedID(MachineDominatorsID); - MachineFunctionPass::getAnalysisUsage(AU); - } +class FPS { +public: + bool shouldRun(MachineFunction &MF); + bool run(MachineFunction &MF, EdgeBundles *EdgeBundles); - bool runOnMachineFunction(MachineFunction &MF) override; +private: + const TargetInstrInfo *TII = nullptr; // Machine instruction info. - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().setNoVRegs(); - } + // Two CFG edges are related if they leave the same block, or enter the same + // block. 
The transitive closure of an edge under this relation is a + // LiveBundle. It represents a set of CFG edges where the live FP stack + // registers must be allocated identically in the x87 stack. + // + // A LiveBundle is usually all the edges leaving a block, or all the edges + // entering a block, but it can contain more edges if critical edges are + // present. + // + // The set of live FP registers in a LiveBundle is calculated by bundleCFG, + // but the exact mapping of FP registers to stack slots is fixed later. + struct LiveBundle { + // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c. + unsigned Mask = 0; - StringRef getPassName() const override { return "X86 FP Stackifier"; } + // Number of pre-assigned live registers in FixStack. This is 0 when the + // stack order has not yet been fixed. + unsigned FixCount = 0; - private: - const TargetInstrInfo *TII = nullptr; // Machine instruction info. + // Assigned stack order for live-in registers. + // FixStack[i] == getStackEntry(i) for all i < FixCount. + unsigned char FixStack[8]; - // Two CFG edges are related if they leave the same block, or enter the same - // block. The transitive closure of an edge under this relation is a - // LiveBundle. It represents a set of CFG edges where the live FP stack - // registers must be allocated identically in the x87 stack. - // - // A LiveBundle is usually all the edges leaving a block, or all the edges - // entering a block, but it can contain more edges if critical edges are - // present. - // - // The set of live FP registers in a LiveBundle is calculated by bundleCFG, - // but the exact mapping of FP registers to stack slots is fixed later. - struct LiveBundle { - // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c. - unsigned Mask = 0; - - // Number of pre-assigned live registers in FixStack. This is 0 when the - // stack order has not yet been fixed. - unsigned FixCount = 0; - - // Assigned stack order for live-in registers. - // FixStack[i] == getStackEntry(i) for all i < FixCount. - unsigned char FixStack[8]; - - LiveBundle() = default; - - // Have the live registers been assigned a stack order yet? - bool isFixed() const { return !Mask || FixCount; } - }; - - // Numbered LiveBundle structs. LiveBundles[0] is used for all CFG edges - // with no live FP registers. - SmallVector<LiveBundle, 8> LiveBundles; - - // The edge bundle analysis provides indices into the LiveBundles vector. - EdgeBundles *Bundles = nullptr; - - // Return a bitmask of FP registers in block's live-in list. - static unsigned calcLiveInMask(MachineBasicBlock *MBB, bool RemoveFPs) { - unsigned Mask = 0; - for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(); - I != MBB->livein_end(); ) { - MCPhysReg Reg = I->PhysReg; - static_assert(X86::FP6 - X86::FP0 == 6, "sequential regnums"); - if (Reg >= X86::FP0 && Reg <= X86::FP6) { - Mask |= 1 << (Reg - X86::FP0); - if (RemoveFPs) { - I = MBB->removeLiveIn(I); - continue; - } + LiveBundle() = default; + + // Have the live registers been assigned a stack order yet? + bool isFixed() const { return !Mask || FixCount; } + }; + + // Numbered LiveBundle structs. LiveBundles[0] is used for all CFG edges + // with no live FP registers. + SmallVector<LiveBundle, 8> LiveBundles; + + // The edge bundle analysis provides indices into the LiveBundles vector. + EdgeBundles *Bundles = nullptr; + + // Return a bitmask of FP registers in block's live-in list. 
+ static unsigned calcLiveInMask(MachineBasicBlock *MBB, bool RemoveFPs) { + unsigned Mask = 0; + for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(); + I != MBB->livein_end();) { + MCPhysReg Reg = I->PhysReg; + static_assert(X86::FP6 - X86::FP0 == 6, "sequential regnums"); + if (Reg >= X86::FP0 && Reg <= X86::FP6) { + Mask |= 1 << (Reg - X86::FP0); + if (RemoveFPs) { + I = MBB->removeLiveIn(I); + continue; } - ++I; } - return Mask; + ++I; } + return Mask; + } - // Partition all the CFG edges into LiveBundles. - void bundleCFGRecomputeKillFlags(MachineFunction &MF); + // Partition all the CFG edges into LiveBundles. + void bundleCFGRecomputeKillFlags(MachineFunction &MF); - MachineBasicBlock *MBB = nullptr; // Current basic block + MachineBasicBlock *MBB = nullptr; // Current basic block - // The hardware keeps track of how many FP registers are live, so we have - // to model that exactly. Usually, each live register corresponds to an - // FP<n> register, but when dealing with calls, returns, and inline - // assembly, it is sometimes necessary to have live scratch registers. - unsigned Stack[8]; // FP<n> Registers in each stack slot... - unsigned StackTop = 0; // The current top of the FP stack. + // The hardware keeps track of how many FP registers are live, so we have + // to model that exactly. Usually, each live register corresponds to an + // FP<n> register, but when dealing with calls, returns, and inline + // assembly, it is sometimes necessary to have live scratch registers. + unsigned Stack[8] = {}; // FP<n> Registers in each stack slot... + unsigned StackTop = 0; // The current top of the FP stack. - enum { - NumFPRegs = 8 // Including scratch pseudo-registers. - }; + enum { + NumFPRegs = 8 // Including scratch pseudo-registers. + }; - // For each live FP<n> register, point to its Stack[] entry. - // The first entries correspond to FP0-FP6, the rest are scratch registers - // used when we need slightly different live registers than what the - // register allocator thinks. - unsigned RegMap[NumFPRegs]; + // For each live FP<n> register, point to its Stack[] entry. + // The first entries correspond to FP0-FP6, the rest are scratch registers + // used when we need slightly different live registers than what the + // register allocator thinks. + unsigned RegMap[NumFPRegs] = {}; - // Set up our stack model to match the incoming registers to MBB. - void setupBlockStack(); + // Set up our stack model to match the incoming registers to MBB. + void setupBlockStack(); - // Shuffle live registers to match the expectations of successor blocks. - void finishBlockStack(); + // Shuffle live registers to match the expectations of successor blocks. + void finishBlockStack(); #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - void dumpStack() const { - dbgs() << "Stack contents:"; - for (unsigned i = 0; i != StackTop; ++i) { - dbgs() << " FP" << Stack[i]; - assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!"); - } + void dumpStack() const { + dbgs() << "Stack contents:"; + for (unsigned i = 0; i != StackTop; ++i) { + dbgs() << " FP" << Stack[i]; + assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!"); } + } #endif - /// getSlot - Return the stack slot number a particular register number is - /// in. - unsigned getSlot(unsigned RegNo) const { - assert(RegNo < NumFPRegs && "Regno out of range!"); - return RegMap[RegNo]; - } + /// getSlot - Return the stack slot number a particular register number is + /// in. 
+ unsigned getSlot(unsigned RegNo) const { + assert(RegNo < NumFPRegs && "Regno out of range!"); + return RegMap[RegNo]; + } - /// isLive - Is RegNo currently live in the stack? - bool isLive(unsigned RegNo) const { - unsigned Slot = getSlot(RegNo); - return Slot < StackTop && Stack[Slot] == RegNo; - } + /// isLive - Is RegNo currently live in the stack? + bool isLive(unsigned RegNo) const { + unsigned Slot = getSlot(RegNo); + return Slot < StackTop && Stack[Slot] == RegNo; + } - /// getStackEntry - Return the X86::FP<n> register in register ST(i). - unsigned getStackEntry(unsigned STi) const { - if (STi >= StackTop) - report_fatal_error("Access past stack top!"); - return Stack[StackTop-1-STi]; - } + /// getStackEntry - Return the X86::FP<n> register in register ST(i). + unsigned getStackEntry(unsigned STi) const { + if (STi >= StackTop) + report_fatal_error("Access past stack top!"); + return Stack[StackTop - 1 - STi]; + } - /// getSTReg - Return the X86::ST(i) register which contains the specified - /// FP<RegNo> register. - unsigned getSTReg(unsigned RegNo) const { - return StackTop - 1 - getSlot(RegNo) + X86::ST0; - } + /// getSTReg - Return the X86::ST(i) register which contains the specified + /// FP<RegNo> register. + unsigned getSTReg(unsigned RegNo) const { + return StackTop - 1 - getSlot(RegNo) + X86::ST0; + } - // pushReg - Push the specified FP<n> register onto the stack. - void pushReg(unsigned Reg) { - assert(Reg < NumFPRegs && "Register number out of range!"); - if (StackTop >= 8) - report_fatal_error("Stack overflow!"); - Stack[StackTop] = Reg; - RegMap[Reg] = StackTop++; - } + // pushReg - Push the specified FP<n> register onto the stack. + void pushReg(unsigned Reg) { + assert(Reg < NumFPRegs && "Register number out of range!"); + if (StackTop >= 8) + report_fatal_error("Stack overflow!"); + Stack[StackTop] = Reg; + RegMap[Reg] = StackTop++; + } - // popReg - Pop a register from the stack. - void popReg() { - if (StackTop == 0) - report_fatal_error("Cannot pop empty stack!"); - RegMap[Stack[--StackTop]] = ~0; // Update state - } + // popReg - Pop a register from the stack. + void popReg() { + if (StackTop == 0) + report_fatal_error("Cannot pop empty stack!"); + RegMap[Stack[--StackTop]] = ~0; // Update state + } - bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; } - void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) { - DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc(); - if (isAtTop(RegNo)) return; + bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop - 1; } + void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) { + DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc(); + if (isAtTop(RegNo)) + return; - unsigned STReg = getSTReg(RegNo); - unsigned RegOnTop = getStackEntry(0); + unsigned STReg = getSTReg(RegNo); + unsigned RegOnTop = getStackEntry(0); - // Swap the slots the regs are in. - std::swap(RegMap[RegNo], RegMap[RegOnTop]); + // Swap the slots the regs are in. + std::swap(RegMap[RegNo], RegMap[RegOnTop]); - // Swap stack slot contents. - if (RegMap[RegOnTop] >= StackTop) - report_fatal_error("Access past stack top!"); - std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]); + // Swap stack slot contents. + if (RegMap[RegOnTop] >= StackTop) + report_fatal_error("Access past stack top!"); + std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop - 1]); - // Emit an fxch to update the runtime processors version of the state. 
- BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg); - ++NumFXCH; - } + // Emit an fxch to update the runtime processors version of the state. + BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg); + ++NumFXCH; + } - void duplicateToTop(unsigned RegNo, unsigned AsReg, - MachineBasicBlock::iterator I) { - DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc(); - unsigned STReg = getSTReg(RegNo); - pushReg(AsReg); // New register on top of stack + void duplicateToTop(unsigned RegNo, unsigned AsReg, + MachineBasicBlock::iterator I) { + DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc(); + unsigned STReg = getSTReg(RegNo); + pushReg(AsReg); // New register on top of stack - BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg); - } + BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg); + } - /// popStackAfter - Pop the current value off of the top of the FP stack - /// after the specified instruction. - void popStackAfter(MachineBasicBlock::iterator &I); - - /// freeStackSlotAfter - Free the specified register from the register - /// stack, so that it is no longer in a register. If the register is - /// currently at the top of the stack, we just pop the current instruction, - /// otherwise we store the current top-of-stack into the specified slot, - /// then pop the top of stack. - void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg); - - /// freeStackSlotBefore - Just the pop, no folding. Return the inserted - /// instruction. - MachineBasicBlock::iterator - freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo); - - /// Adjust the live registers to be the set in Mask. - void adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I); - - /// Shuffle the top FixCount stack entries such that FP reg FixStack[0] is - /// st(0), FP reg FixStack[1] is st(1) etc. - void shuffleStackTop(const unsigned char *FixStack, unsigned FixCount, - MachineBasicBlock::iterator I); - - bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB); - - void handleCall(MachineBasicBlock::iterator &I); - void handleReturn(MachineBasicBlock::iterator &I); - void handleZeroArgFP(MachineBasicBlock::iterator &I); - void handleOneArgFP(MachineBasicBlock::iterator &I); - void handleOneArgFPRW(MachineBasicBlock::iterator &I); - void handleTwoArgFP(MachineBasicBlock::iterator &I); - void handleCompareFP(MachineBasicBlock::iterator &I); - void handleCondMovFP(MachineBasicBlock::iterator &I); - void handleSpecialFP(MachineBasicBlock::iterator &I); - - // Check if a COPY instruction is using FP registers. - static bool isFPCopy(MachineInstr &MI) { - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - - return X86::RFP80RegClass.contains(DstReg) || - X86::RFP80RegClass.contains(SrcReg); - } + /// popStackAfter - Pop the current value off of the top of the FP stack + /// after the specified instruction. + void popStackAfter(MachineBasicBlock::iterator &I); + + /// freeStackSlotAfter - Free the specified register from the register + /// stack, so that it is no longer in a register. If the register is + /// currently at the top of the stack, we just pop the current instruction, + /// otherwise we store the current top-of-stack into the specified slot, + /// then pop the top of stack. + void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg); + + /// freeStackSlotBefore - Just the pop, no folding. Return the inserted + /// instruction. 
+ MachineBasicBlock::iterator freeStackSlotBefore(MachineBasicBlock::iterator I, + unsigned FPRegNo); + + /// Adjust the live registers to be the set in Mask. + void adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I); + + /// Shuffle the top FixCount stack entries such that FP reg FixStack[0] is + /// st(0), FP reg FixStack[1] is st(1) etc. + void shuffleStackTop(const unsigned char *FixStack, unsigned FixCount, + MachineBasicBlock::iterator I); + + bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB); + + void handleCall(MachineBasicBlock::iterator &I); + void handleReturn(MachineBasicBlock::iterator &I); + void handleZeroArgFP(MachineBasicBlock::iterator &I); + void handleOneArgFP(MachineBasicBlock::iterator &I); + void handleOneArgFPRW(MachineBasicBlock::iterator &I); + void handleTwoArgFP(MachineBasicBlock::iterator &I); + void handleCompareFP(MachineBasicBlock::iterator &I); + void handleCondMovFP(MachineBasicBlock::iterator &I); + void handleSpecialFP(MachineBasicBlock::iterator &I); + + // Check if a COPY instruction is using FP registers. + static bool isFPCopy(MachineInstr &MI) { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + + return X86::RFP80RegClass.contains(DstReg) || + X86::RFP80RegClass.contains(SrcReg); + } - void setKillFlags(MachineBasicBlock &MBB) const; - }; -} + void setKillFlags(MachineBasicBlock &MBB) const; +}; -char FPS::ID = 0; +class X86FPStackifierLegacy : public MachineFunctionPass { +public: + X86FPStackifierLegacy() : MachineFunctionPass(ID) {} -INITIALIZE_PASS_BEGIN(FPS, DEBUG_TYPE, "X86 FP Stackifier", + static char ID; + +private: + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<EdgeBundlesWrapperLegacy>(); + AU.addPreservedID(MachineLoopInfoID); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().setNoVRegs(); + } + + StringRef getPassName() const override { return "X86 FP Stackifier"; } +}; +} // namespace + +char X86FPStackifierLegacy::ID = 0; + +INITIALIZE_PASS_BEGIN(X86FPStackifierLegacy, DEBUG_TYPE, "X86 FP Stackifier", false, false) INITIALIZE_PASS_DEPENDENCY(EdgeBundlesWrapperLegacy) -INITIALIZE_PASS_END(FPS, DEBUG_TYPE, "X86 FP Stackifier", +INITIALIZE_PASS_END(X86FPStackifierLegacy, DEBUG_TYPE, "X86 FP Stackifier", false, false) -FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); } +FunctionPass *llvm::createX86FPStackifierLegacyPass() { + return new X86FPStackifierLegacy(); +} /// getFPReg - Return the X86::FPx register number for the specified operand. /// For example, this returns 3 for X86::FP3. @@ -317,26 +327,25 @@ static unsigned getFPReg(const MachineOperand &MO) { return Reg - X86::FP0; } -/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP -/// register references into FP stack references. -/// -bool FPS::runOnMachineFunction(MachineFunction &MF) { +bool FPS::shouldRun(MachineFunction &MF) { // We only need to run this pass if there are any FP registers used in this // function. If it is all integer, there is nothing for us to do! 
- bool FPIsUsed = false; - - static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!"); + static_assert(X86::FP6 == X86::FP0 + 6, + "Register enums aren't sorted right!"); const MachineRegisterInfo &MRI = MF.getRegInfo(); - for (unsigned i = 0; i <= 6; ++i) - if (!MRI.reg_nodbg_empty(X86::FP0 + i)) { - FPIsUsed = true; - break; + for (unsigned I = 0; I <= 6; ++I) + if (!MRI.reg_nodbg_empty(X86::FP0 + I)) { + return true; } - // Early exit. - if (!FPIsUsed) return false; + return false; +} - Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles(); +/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP +/// register references into FP stack references. +/// +bool FPS::run(MachineFunction &MF, EdgeBundles *FunctionBundles) { + Bundles = FunctionBundles; TII = MF.getSubtarget().getInstrInfo(); // Prepare cross-MBB liveness. @@ -346,16 +355,17 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { // Process the function in depth first order so that we process at least one // of the predecessors for every reachable block in the function. - df_iterator_default_set<MachineBasicBlock*> Processed; + df_iterator_default_set<MachineBasicBlock *> Processed; MachineBasicBlock *Entry = &MF.front(); LiveBundle &Bundle = - LiveBundles[Bundles->getBundle(Entry->getNumber(), false)]; + LiveBundles[Bundles->getBundle(Entry->getNumber(), false)]; // In regcall convention, some FP registers may not be passed through // the stack, so they will need to be assigned to the stack first if ((Entry->getParent()->getFunction().getCallingConv() == - CallingConv::X86_RegCall) && (Bundle.Mask && !Bundle.FixCount)) { + CallingConv::X86_RegCall) && + (Bundle.Mask && !Bundle.FixCount)) { // In the register calling convention, up to one FP argument could be // saved in the first FP register. // If bundle.mask is non-zero and Bundle.FixCount is zero, it means @@ -363,7 +373,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { // The actual value is passed in FP0. // Here we fix the stack and mark FP0 as pre-assigned register. assert((Bundle.Mask & 0xFE) == 0 && - "Only FP0 could be passed as an argument"); + "Only FP0 could be passed as an argument"); Bundle.FixCount = 1; Bundle.FixStack[0] = 0; } @@ -450,13 +460,13 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { } if (FPInstClass == X86II::NotFP) - continue; // Efficiently ignore non-fp insts! + continue; // Efficiently ignore non-fp insts! 
MachineInstr *PrevMI = nullptr; if (I != BB.begin()) PrevMI = &*std::prev(I); - ++NumFP; // Keep track of # of pseudo instrs + ++NumFP; // Keep track of # of pseudo instrs LLVM_DEBUG(dbgs() << "\nFPInst:\t" << MI); // Get dead variables list now because the MI pointer may be deleted as part @@ -467,14 +477,29 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { DeadRegs.push_back(MO.getReg()); switch (FPInstClass) { - case X86II::ZeroArgFP: handleZeroArgFP(I); break; - case X86II::OneArgFP: handleOneArgFP(I); break; // fstp ST(0) - case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0)) - case X86II::TwoArgFP: handleTwoArgFP(I); break; - case X86II::CompareFP: handleCompareFP(I); break; - case X86II::CondMovFP: handleCondMovFP(I); break; - case X86II::SpecialFP: handleSpecialFP(I); break; - default: llvm_unreachable("Unknown FP Type!"); + case X86II::ZeroArgFP: + handleZeroArgFP(I); + break; + case X86II::OneArgFP: + handleOneArgFP(I); + break; // fstp ST(0) + case X86II::OneArgFPRW: + handleOneArgFPRW(I); + break; // ST(0) = fsqrt(ST(0)) + case X86II::TwoArgFP: + handleTwoArgFP(I); + break; + case X86II::CompareFP: + handleCompareFP(I); + break; + case X86II::CondMovFP: + handleCondMovFP(I); + break; + case X86II::SpecialFP: + handleSpecialFP(I); + break; + default: + llvm_unreachable("Unknown FP Type!"); } // Check to see if any of the values defined by this instruction are dead @@ -483,9 +508,9 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { // Check if Reg is live on the stack. An inline-asm register operand that // is in the clobber list and marked dead might not be live on the stack. static_assert(X86::FP7 - X86::FP0 == 7, "sequential FP regnumbers"); - if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) { + if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg - X86::FP0)) { LLVM_DEBUG(dbgs() << "Register FP#" << Reg - X86::FP0 << " is dead!\n"); - freeStackSlotAfter(I, Reg-X86::FP0); + freeStackSlotAfter(I, Reg - X86::FP0); } } @@ -524,7 +549,7 @@ void FPS::setupBlockStack() { StackTop = 0; // Get the live-in bundle for MBB. const LiveBundle &Bundle = - LiveBundles[Bundles->getBundle(MBB->getNumber(), false)]; + LiveBundles[Bundles->getBundle(MBB->getNumber(), false)]; if (!Bundle.Mask) { LLVM_DEBUG(dbgs() << "Block has no FP live-ins.\n"); @@ -538,7 +563,7 @@ void FPS::setupBlockStack() { for (unsigned i = Bundle.FixCount; i > 0; --i) { LLVM_DEBUG(dbgs() << "Live-in st(" << (i - 1) << "): %fp" << unsigned(Bundle.FixStack[i - 1]) << '\n'); - pushReg(Bundle.FixStack[i-1]); + pushReg(Bundle.FixStack[i - 1]); } // Kill off unwanted live-ins. This can happen with a critical edge. 
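Reviewer note, not part of the patch: the reflowed FPS helpers above (getSlot/isLive/getSTReg/pushReg/popReg/moveToTop) all preserve one invariant -- Stack[] lists the live FP registers from bottom to top and RegMap[] is its inverse, which is what lets getSTReg() answer "which ST(i) holds FP<n>?" in constant time. A small standalone sketch of that bookkeeping (hypothetical names, plain asserts instead of report_fatal_error, prints instead of BuildMI):

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <iterator>
#include <utility>

struct PseudoFPStack {
  static constexpr unsigned NumFPRegs = 8;
  unsigned Stack[NumFPRegs];  // Stack[0] = bottom, Stack[Top - 1] = ST(0).
  unsigned RegMap[NumFPRegs]; // RegMap[FPReg] = slot in Stack, ~0u when dead.
  unsigned Top = 0;

  PseudoFPStack() { std::fill(std::begin(RegMap), std::end(RegMap), ~0u); }

  bool isLive(unsigned Reg) const {
    unsigned Slot = RegMap[Reg];
    return Slot < Top && Stack[Slot] == Reg;
  }
  // Which ST(i) currently holds FP<Reg>.
  unsigned stIndex(unsigned Reg) const { return Top - 1 - RegMap[Reg]; }

  void push(unsigned Reg) {
    assert(Top < NumFPRegs && "x87 stack overflow");
    Stack[Top] = Reg;
    RegMap[Reg] = Top++;
  }
  void pop() {
    assert(Top > 0 && "pop of empty stack");
    RegMap[Stack[--Top]] = ~0u;
  }
  // Bring FP<Reg> to ST(0); the real pass emits an FXCH at this point.
  void moveToTop(unsigned Reg) {
    unsigned STIdx = stIndex(Reg);
    if (STIdx == 0)
      return;
    unsigned RegOnTop = Stack[Top - 1];
    std::swap(RegMap[Reg], RegMap[RegOnTop]);
    std::swap(Stack[RegMap[RegOnTop]], Stack[Top - 1]);
    std::printf("fxch st(%u)\n", STIdx);
  }
};

int main() {
  PseudoFPStack S;
  S.push(3);      // FP3 -> ST(0)
  S.push(5);      // FP5 -> ST(0), FP3 -> ST(1)
  S.moveToTop(3); // prints "fxch st(1)"
  assert(S.stIndex(3) == 0 && S.stIndex(5) == 1);
  S.pop();        // kills FP3, which is now on top
  assert(!S.isLive(3) && S.isLive(5));
}

The real pass records the FXCH in the instruction stream and counts it in NumFXCH; the sketch only prints where the exchange would be emitted.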
@@ -589,24 +614,23 @@ void FPS::finishBlockStack() { } } - //===----------------------------------------------------------------------===// // Efficient Lookup Table Support //===----------------------------------------------------------------------===// namespace { - struct TableEntry { - uint16_t from; - uint16_t to; - bool operator<(const TableEntry &TE) const { return from < TE.from; } - friend bool operator<(const TableEntry &TE, unsigned V) { - return TE.from < V; - } - [[maybe_unused]] friend bool operator<(unsigned V, const TableEntry &TE) { - return V < TE.from; - } - }; -} +struct TableEntry { + uint16_t from; + uint16_t to; + bool operator<(const TableEntry &TE) const { return from < TE.from; } + friend bool operator<(const TableEntry &TE, unsigned V) { + return TE.from < V; + } + [[maybe_unused]] friend bool operator<(unsigned V, const TableEntry &TE) { + return V < TE.from; + } +}; +} // namespace static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) { const TableEntry *I = llvm::lower_bound(Table, Opcode); @@ -638,168 +662,168 @@ static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) { // concrete X86 instruction which uses the register stack. // static const TableEntry OpcodeTable[] = { - { X86::ABS_Fp32 , X86::ABS_F }, - { X86::ABS_Fp64 , X86::ABS_F }, - { X86::ABS_Fp80 , X86::ABS_F }, - { X86::ADD_Fp32m , X86::ADD_F32m }, - { X86::ADD_Fp64m , X86::ADD_F64m }, - { X86::ADD_Fp64m32 , X86::ADD_F32m }, - { X86::ADD_Fp80m32 , X86::ADD_F32m }, - { X86::ADD_Fp80m64 , X86::ADD_F64m }, - { X86::ADD_FpI16m32 , X86::ADD_FI16m }, - { X86::ADD_FpI16m64 , X86::ADD_FI16m }, - { X86::ADD_FpI16m80 , X86::ADD_FI16m }, - { X86::ADD_FpI32m32 , X86::ADD_FI32m }, - { X86::ADD_FpI32m64 , X86::ADD_FI32m }, - { X86::ADD_FpI32m80 , X86::ADD_FI32m }, - { X86::CHS_Fp32 , X86::CHS_F }, - { X86::CHS_Fp64 , X86::CHS_F }, - { X86::CHS_Fp80 , X86::CHS_F }, - { X86::CMOVBE_Fp32 , X86::CMOVBE_F }, - { X86::CMOVBE_Fp64 , X86::CMOVBE_F }, - { X86::CMOVBE_Fp80 , X86::CMOVBE_F }, - { X86::CMOVB_Fp32 , X86::CMOVB_F }, - { X86::CMOVB_Fp64 , X86::CMOVB_F }, - { X86::CMOVB_Fp80 , X86::CMOVB_F }, - { X86::CMOVE_Fp32 , X86::CMOVE_F }, - { X86::CMOVE_Fp64 , X86::CMOVE_F }, - { X86::CMOVE_Fp80 , X86::CMOVE_F }, - { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F }, - { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F }, - { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F }, - { X86::CMOVNB_Fp32 , X86::CMOVNB_F }, - { X86::CMOVNB_Fp64 , X86::CMOVNB_F }, - { X86::CMOVNB_Fp80 , X86::CMOVNB_F }, - { X86::CMOVNE_Fp32 , X86::CMOVNE_F }, - { X86::CMOVNE_Fp64 , X86::CMOVNE_F }, - { X86::CMOVNE_Fp80 , X86::CMOVNE_F }, - { X86::CMOVNP_Fp32 , X86::CMOVNP_F }, - { X86::CMOVNP_Fp64 , X86::CMOVNP_F }, - { X86::CMOVNP_Fp80 , X86::CMOVNP_F }, - { X86::CMOVP_Fp32 , X86::CMOVP_F }, - { X86::CMOVP_Fp64 , X86::CMOVP_F }, - { X86::CMOVP_Fp80 , X86::CMOVP_F }, - { X86::COM_FpIr32 , X86::COM_FIr }, - { X86::COM_FpIr64 , X86::COM_FIr }, - { X86::COM_FpIr80 , X86::COM_FIr }, - { X86::COM_Fpr32 , X86::COM_FST0r }, - { X86::COM_Fpr64 , X86::COM_FST0r }, - { X86::COM_Fpr80 , X86::COM_FST0r }, - { X86::DIVR_Fp32m , X86::DIVR_F32m }, - { X86::DIVR_Fp64m , X86::DIVR_F64m }, - { X86::DIVR_Fp64m32 , X86::DIVR_F32m }, - { X86::DIVR_Fp80m32 , X86::DIVR_F32m }, - { X86::DIVR_Fp80m64 , X86::DIVR_F64m }, - { X86::DIVR_FpI16m32, X86::DIVR_FI16m}, - { X86::DIVR_FpI16m64, X86::DIVR_FI16m}, - { X86::DIVR_FpI16m80, X86::DIVR_FI16m}, - { X86::DIVR_FpI32m32, X86::DIVR_FI32m}, - { X86::DIVR_FpI32m64, X86::DIVR_FI32m}, - { X86::DIVR_FpI32m80, X86::DIVR_FI32m}, - { X86::DIV_Fp32m , 
X86::DIV_F32m }, - { X86::DIV_Fp64m , X86::DIV_F64m }, - { X86::DIV_Fp64m32 , X86::DIV_F32m }, - { X86::DIV_Fp80m32 , X86::DIV_F32m }, - { X86::DIV_Fp80m64 , X86::DIV_F64m }, - { X86::DIV_FpI16m32 , X86::DIV_FI16m }, - { X86::DIV_FpI16m64 , X86::DIV_FI16m }, - { X86::DIV_FpI16m80 , X86::DIV_FI16m }, - { X86::DIV_FpI32m32 , X86::DIV_FI32m }, - { X86::DIV_FpI32m64 , X86::DIV_FI32m }, - { X86::DIV_FpI32m80 , X86::DIV_FI32m }, - { X86::ILD_Fp16m32 , X86::ILD_F16m }, - { X86::ILD_Fp16m64 , X86::ILD_F16m }, - { X86::ILD_Fp16m80 , X86::ILD_F16m }, - { X86::ILD_Fp32m32 , X86::ILD_F32m }, - { X86::ILD_Fp32m64 , X86::ILD_F32m }, - { X86::ILD_Fp32m80 , X86::ILD_F32m }, - { X86::ILD_Fp64m32 , X86::ILD_F64m }, - { X86::ILD_Fp64m64 , X86::ILD_F64m }, - { X86::ILD_Fp64m80 , X86::ILD_F64m }, - { X86::ISTT_Fp16m32 , X86::ISTT_FP16m}, - { X86::ISTT_Fp16m64 , X86::ISTT_FP16m}, - { X86::ISTT_Fp16m80 , X86::ISTT_FP16m}, - { X86::ISTT_Fp32m32 , X86::ISTT_FP32m}, - { X86::ISTT_Fp32m64 , X86::ISTT_FP32m}, - { X86::ISTT_Fp32m80 , X86::ISTT_FP32m}, - { X86::ISTT_Fp64m32 , X86::ISTT_FP64m}, - { X86::ISTT_Fp64m64 , X86::ISTT_FP64m}, - { X86::ISTT_Fp64m80 , X86::ISTT_FP64m}, - { X86::IST_Fp16m32 , X86::IST_F16m }, - { X86::IST_Fp16m64 , X86::IST_F16m }, - { X86::IST_Fp16m80 , X86::IST_F16m }, - { X86::IST_Fp32m32 , X86::IST_F32m }, - { X86::IST_Fp32m64 , X86::IST_F32m }, - { X86::IST_Fp32m80 , X86::IST_F32m }, - { X86::IST_Fp64m32 , X86::IST_FP64m }, - { X86::IST_Fp64m64 , X86::IST_FP64m }, - { X86::IST_Fp64m80 , X86::IST_FP64m }, - { X86::LD_Fp032 , X86::LD_F0 }, - { X86::LD_Fp064 , X86::LD_F0 }, - { X86::LD_Fp080 , X86::LD_F0 }, - { X86::LD_Fp132 , X86::LD_F1 }, - { X86::LD_Fp164 , X86::LD_F1 }, - { X86::LD_Fp180 , X86::LD_F1 }, - { X86::LD_Fp32m , X86::LD_F32m }, - { X86::LD_Fp32m64 , X86::LD_F32m }, - { X86::LD_Fp32m80 , X86::LD_F32m }, - { X86::LD_Fp64m , X86::LD_F64m }, - { X86::LD_Fp64m80 , X86::LD_F64m }, - { X86::LD_Fp80m , X86::LD_F80m }, - { X86::MUL_Fp32m , X86::MUL_F32m }, - { X86::MUL_Fp64m , X86::MUL_F64m }, - { X86::MUL_Fp64m32 , X86::MUL_F32m }, - { X86::MUL_Fp80m32 , X86::MUL_F32m }, - { X86::MUL_Fp80m64 , X86::MUL_F64m }, - { X86::MUL_FpI16m32 , X86::MUL_FI16m }, - { X86::MUL_FpI16m64 , X86::MUL_FI16m }, - { X86::MUL_FpI16m80 , X86::MUL_FI16m }, - { X86::MUL_FpI32m32 , X86::MUL_FI32m }, - { X86::MUL_FpI32m64 , X86::MUL_FI32m }, - { X86::MUL_FpI32m80 , X86::MUL_FI32m }, - { X86::SQRT_Fp32 , X86::SQRT_F }, - { X86::SQRT_Fp64 , X86::SQRT_F }, - { X86::SQRT_Fp80 , X86::SQRT_F }, - { X86::ST_Fp32m , X86::ST_F32m }, - { X86::ST_Fp64m , X86::ST_F64m }, - { X86::ST_Fp64m32 , X86::ST_F32m }, - { X86::ST_Fp80m32 , X86::ST_F32m }, - { X86::ST_Fp80m64 , X86::ST_F64m }, - { X86::ST_FpP80m , X86::ST_FP80m }, - { X86::SUBR_Fp32m , X86::SUBR_F32m }, - { X86::SUBR_Fp64m , X86::SUBR_F64m }, - { X86::SUBR_Fp64m32 , X86::SUBR_F32m }, - { X86::SUBR_Fp80m32 , X86::SUBR_F32m }, - { X86::SUBR_Fp80m64 , X86::SUBR_F64m }, - { X86::SUBR_FpI16m32, X86::SUBR_FI16m}, - { X86::SUBR_FpI16m64, X86::SUBR_FI16m}, - { X86::SUBR_FpI16m80, X86::SUBR_FI16m}, - { X86::SUBR_FpI32m32, X86::SUBR_FI32m}, - { X86::SUBR_FpI32m64, X86::SUBR_FI32m}, - { X86::SUBR_FpI32m80, X86::SUBR_FI32m}, - { X86::SUB_Fp32m , X86::SUB_F32m }, - { X86::SUB_Fp64m , X86::SUB_F64m }, - { X86::SUB_Fp64m32 , X86::SUB_F32m }, - { X86::SUB_Fp80m32 , X86::SUB_F32m }, - { X86::SUB_Fp80m64 , X86::SUB_F64m }, - { X86::SUB_FpI16m32 , X86::SUB_FI16m }, - { X86::SUB_FpI16m64 , X86::SUB_FI16m }, - { X86::SUB_FpI16m80 , X86::SUB_FI16m }, - { X86::SUB_FpI32m32 , X86::SUB_FI32m 
}, - { X86::SUB_FpI32m64 , X86::SUB_FI32m }, - { X86::SUB_FpI32m80 , X86::SUB_FI32m }, - { X86::TST_Fp32 , X86::TST_F }, - { X86::TST_Fp64 , X86::TST_F }, - { X86::TST_Fp80 , X86::TST_F }, - { X86::UCOM_FpIr32 , X86::UCOM_FIr }, - { X86::UCOM_FpIr64 , X86::UCOM_FIr }, - { X86::UCOM_FpIr80 , X86::UCOM_FIr }, - { X86::UCOM_Fpr32 , X86::UCOM_Fr }, - { X86::UCOM_Fpr64 , X86::UCOM_Fr }, - { X86::UCOM_Fpr80 , X86::UCOM_Fr }, - { X86::XAM_Fp32 , X86::XAM_F }, - { X86::XAM_Fp64 , X86::XAM_F }, - { X86::XAM_Fp80 , X86::XAM_F }, + {X86::ABS_Fp32, X86::ABS_F}, + {X86::ABS_Fp64, X86::ABS_F}, + {X86::ABS_Fp80, X86::ABS_F}, + {X86::ADD_Fp32m, X86::ADD_F32m}, + {X86::ADD_Fp64m, X86::ADD_F64m}, + {X86::ADD_Fp64m32, X86::ADD_F32m}, + {X86::ADD_Fp80m32, X86::ADD_F32m}, + {X86::ADD_Fp80m64, X86::ADD_F64m}, + {X86::ADD_FpI16m32, X86::ADD_FI16m}, + {X86::ADD_FpI16m64, X86::ADD_FI16m}, + {X86::ADD_FpI16m80, X86::ADD_FI16m}, + {X86::ADD_FpI32m32, X86::ADD_FI32m}, + {X86::ADD_FpI32m64, X86::ADD_FI32m}, + {X86::ADD_FpI32m80, X86::ADD_FI32m}, + {X86::CHS_Fp32, X86::CHS_F}, + {X86::CHS_Fp64, X86::CHS_F}, + {X86::CHS_Fp80, X86::CHS_F}, + {X86::CMOVBE_Fp32, X86::CMOVBE_F}, + {X86::CMOVBE_Fp64, X86::CMOVBE_F}, + {X86::CMOVBE_Fp80, X86::CMOVBE_F}, + {X86::CMOVB_Fp32, X86::CMOVB_F}, + {X86::CMOVB_Fp64, X86::CMOVB_F}, + {X86::CMOVB_Fp80, X86::CMOVB_F}, + {X86::CMOVE_Fp32, X86::CMOVE_F}, + {X86::CMOVE_Fp64, X86::CMOVE_F}, + {X86::CMOVE_Fp80, X86::CMOVE_F}, + {X86::CMOVNBE_Fp32, X86::CMOVNBE_F}, + {X86::CMOVNBE_Fp64, X86::CMOVNBE_F}, + {X86::CMOVNBE_Fp80, X86::CMOVNBE_F}, + {X86::CMOVNB_Fp32, X86::CMOVNB_F}, + {X86::CMOVNB_Fp64, X86::CMOVNB_F}, + {X86::CMOVNB_Fp80, X86::CMOVNB_F}, + {X86::CMOVNE_Fp32, X86::CMOVNE_F}, + {X86::CMOVNE_Fp64, X86::CMOVNE_F}, + {X86::CMOVNE_Fp80, X86::CMOVNE_F}, + {X86::CMOVNP_Fp32, X86::CMOVNP_F}, + {X86::CMOVNP_Fp64, X86::CMOVNP_F}, + {X86::CMOVNP_Fp80, X86::CMOVNP_F}, + {X86::CMOVP_Fp32, X86::CMOVP_F}, + {X86::CMOVP_Fp64, X86::CMOVP_F}, + {X86::CMOVP_Fp80, X86::CMOVP_F}, + {X86::COM_FpIr32, X86::COM_FIr}, + {X86::COM_FpIr64, X86::COM_FIr}, + {X86::COM_FpIr80, X86::COM_FIr}, + {X86::COM_Fpr32, X86::COM_FST0r}, + {X86::COM_Fpr64, X86::COM_FST0r}, + {X86::COM_Fpr80, X86::COM_FST0r}, + {X86::DIVR_Fp32m, X86::DIVR_F32m}, + {X86::DIVR_Fp64m, X86::DIVR_F64m}, + {X86::DIVR_Fp64m32, X86::DIVR_F32m}, + {X86::DIVR_Fp80m32, X86::DIVR_F32m}, + {X86::DIVR_Fp80m64, X86::DIVR_F64m}, + {X86::DIVR_FpI16m32, X86::DIVR_FI16m}, + {X86::DIVR_FpI16m64, X86::DIVR_FI16m}, + {X86::DIVR_FpI16m80, X86::DIVR_FI16m}, + {X86::DIVR_FpI32m32, X86::DIVR_FI32m}, + {X86::DIVR_FpI32m64, X86::DIVR_FI32m}, + {X86::DIVR_FpI32m80, X86::DIVR_FI32m}, + {X86::DIV_Fp32m, X86::DIV_F32m}, + {X86::DIV_Fp64m, X86::DIV_F64m}, + {X86::DIV_Fp64m32, X86::DIV_F32m}, + {X86::DIV_Fp80m32, X86::DIV_F32m}, + {X86::DIV_Fp80m64, X86::DIV_F64m}, + {X86::DIV_FpI16m32, X86::DIV_FI16m}, + {X86::DIV_FpI16m64, X86::DIV_FI16m}, + {X86::DIV_FpI16m80, X86::DIV_FI16m}, + {X86::DIV_FpI32m32, X86::DIV_FI32m}, + {X86::DIV_FpI32m64, X86::DIV_FI32m}, + {X86::DIV_FpI32m80, X86::DIV_FI32m}, + {X86::ILD_Fp16m32, X86::ILD_F16m}, + {X86::ILD_Fp16m64, X86::ILD_F16m}, + {X86::ILD_Fp16m80, X86::ILD_F16m}, + {X86::ILD_Fp32m32, X86::ILD_F32m}, + {X86::ILD_Fp32m64, X86::ILD_F32m}, + {X86::ILD_Fp32m80, X86::ILD_F32m}, + {X86::ILD_Fp64m32, X86::ILD_F64m}, + {X86::ILD_Fp64m64, X86::ILD_F64m}, + {X86::ILD_Fp64m80, X86::ILD_F64m}, + {X86::ISTT_Fp16m32, X86::ISTT_FP16m}, + {X86::ISTT_Fp16m64, X86::ISTT_FP16m}, + {X86::ISTT_Fp16m80, X86::ISTT_FP16m}, + {X86::ISTT_Fp32m32, 
X86::ISTT_FP32m}, + {X86::ISTT_Fp32m64, X86::ISTT_FP32m}, + {X86::ISTT_Fp32m80, X86::ISTT_FP32m}, + {X86::ISTT_Fp64m32, X86::ISTT_FP64m}, + {X86::ISTT_Fp64m64, X86::ISTT_FP64m}, + {X86::ISTT_Fp64m80, X86::ISTT_FP64m}, + {X86::IST_Fp16m32, X86::IST_F16m}, + {X86::IST_Fp16m64, X86::IST_F16m}, + {X86::IST_Fp16m80, X86::IST_F16m}, + {X86::IST_Fp32m32, X86::IST_F32m}, + {X86::IST_Fp32m64, X86::IST_F32m}, + {X86::IST_Fp32m80, X86::IST_F32m}, + {X86::IST_Fp64m32, X86::IST_FP64m}, + {X86::IST_Fp64m64, X86::IST_FP64m}, + {X86::IST_Fp64m80, X86::IST_FP64m}, + {X86::LD_Fp032, X86::LD_F0}, + {X86::LD_Fp064, X86::LD_F0}, + {X86::LD_Fp080, X86::LD_F0}, + {X86::LD_Fp132, X86::LD_F1}, + {X86::LD_Fp164, X86::LD_F1}, + {X86::LD_Fp180, X86::LD_F1}, + {X86::LD_Fp32m, X86::LD_F32m}, + {X86::LD_Fp32m64, X86::LD_F32m}, + {X86::LD_Fp32m80, X86::LD_F32m}, + {X86::LD_Fp64m, X86::LD_F64m}, + {X86::LD_Fp64m80, X86::LD_F64m}, + {X86::LD_Fp80m, X86::LD_F80m}, + {X86::MUL_Fp32m, X86::MUL_F32m}, + {X86::MUL_Fp64m, X86::MUL_F64m}, + {X86::MUL_Fp64m32, X86::MUL_F32m}, + {X86::MUL_Fp80m32, X86::MUL_F32m}, + {X86::MUL_Fp80m64, X86::MUL_F64m}, + {X86::MUL_FpI16m32, X86::MUL_FI16m}, + {X86::MUL_FpI16m64, X86::MUL_FI16m}, + {X86::MUL_FpI16m80, X86::MUL_FI16m}, + {X86::MUL_FpI32m32, X86::MUL_FI32m}, + {X86::MUL_FpI32m64, X86::MUL_FI32m}, + {X86::MUL_FpI32m80, X86::MUL_FI32m}, + {X86::SQRT_Fp32, X86::SQRT_F}, + {X86::SQRT_Fp64, X86::SQRT_F}, + {X86::SQRT_Fp80, X86::SQRT_F}, + {X86::ST_Fp32m, X86::ST_F32m}, + {X86::ST_Fp64m, X86::ST_F64m}, + {X86::ST_Fp64m32, X86::ST_F32m}, + {X86::ST_Fp80m32, X86::ST_F32m}, + {X86::ST_Fp80m64, X86::ST_F64m}, + {X86::ST_FpP80m, X86::ST_FP80m}, + {X86::SUBR_Fp32m, X86::SUBR_F32m}, + {X86::SUBR_Fp64m, X86::SUBR_F64m}, + {X86::SUBR_Fp64m32, X86::SUBR_F32m}, + {X86::SUBR_Fp80m32, X86::SUBR_F32m}, + {X86::SUBR_Fp80m64, X86::SUBR_F64m}, + {X86::SUBR_FpI16m32, X86::SUBR_FI16m}, + {X86::SUBR_FpI16m64, X86::SUBR_FI16m}, + {X86::SUBR_FpI16m80, X86::SUBR_FI16m}, + {X86::SUBR_FpI32m32, X86::SUBR_FI32m}, + {X86::SUBR_FpI32m64, X86::SUBR_FI32m}, + {X86::SUBR_FpI32m80, X86::SUBR_FI32m}, + {X86::SUB_Fp32m, X86::SUB_F32m}, + {X86::SUB_Fp64m, X86::SUB_F64m}, + {X86::SUB_Fp64m32, X86::SUB_F32m}, + {X86::SUB_Fp80m32, X86::SUB_F32m}, + {X86::SUB_Fp80m64, X86::SUB_F64m}, + {X86::SUB_FpI16m32, X86::SUB_FI16m}, + {X86::SUB_FpI16m64, X86::SUB_FI16m}, + {X86::SUB_FpI16m80, X86::SUB_FI16m}, + {X86::SUB_FpI32m32, X86::SUB_FI32m}, + {X86::SUB_FpI32m64, X86::SUB_FI32m}, + {X86::SUB_FpI32m80, X86::SUB_FI32m}, + {X86::TST_Fp32, X86::TST_F}, + {X86::TST_Fp64, X86::TST_F}, + {X86::TST_Fp80, X86::TST_F}, + {X86::UCOM_FpIr32, X86::UCOM_FIr}, + {X86::UCOM_FpIr64, X86::UCOM_FIr}, + {X86::UCOM_FpIr80, X86::UCOM_FIr}, + {X86::UCOM_Fpr32, X86::UCOM_Fr}, + {X86::UCOM_Fpr64, X86::UCOM_Fr}, + {X86::UCOM_Fpr80, X86::UCOM_Fr}, + {X86::XAM_Fp32, X86::XAM_F}, + {X86::XAM_Fp64, X86::XAM_F}, + {X86::XAM_Fp80, X86::XAM_F}, }; static unsigned getConcreteOpcode(unsigned Opcode) { @@ -817,31 +841,25 @@ static unsigned getConcreteOpcode(unsigned Opcode) { // element is an instruction, the second is the version which pops. 
// static const TableEntry PopTable[] = { - { X86::ADD_FrST0 , X86::ADD_FPrST0 }, + {X86::ADD_FrST0, X86::ADD_FPrST0}, - { X86::COMP_FST0r, X86::FCOMPP }, - { X86::COM_FIr , X86::COM_FIPr }, - { X86::COM_FST0r , X86::COMP_FST0r }, + {X86::COMP_FST0r, X86::FCOMPP}, {X86::COM_FIr, X86::COM_FIPr}, + {X86::COM_FST0r, X86::COMP_FST0r}, - { X86::DIVR_FrST0, X86::DIVR_FPrST0 }, - { X86::DIV_FrST0 , X86::DIV_FPrST0 }, + {X86::DIVR_FrST0, X86::DIVR_FPrST0}, {X86::DIV_FrST0, X86::DIV_FPrST0}, - { X86::IST_F16m , X86::IST_FP16m }, - { X86::IST_F32m , X86::IST_FP32m }, + {X86::IST_F16m, X86::IST_FP16m}, {X86::IST_F32m, X86::IST_FP32m}, - { X86::MUL_FrST0 , X86::MUL_FPrST0 }, + {X86::MUL_FrST0, X86::MUL_FPrST0}, - { X86::ST_F32m , X86::ST_FP32m }, - { X86::ST_F64m , X86::ST_FP64m }, - { X86::ST_Frr , X86::ST_FPrr }, + {X86::ST_F32m, X86::ST_FP32m}, {X86::ST_F64m, X86::ST_FP64m}, + {X86::ST_Frr, X86::ST_FPrr}, - { X86::SUBR_FrST0, X86::SUBR_FPrST0 }, - { X86::SUB_FrST0 , X86::SUB_FPrST0 }, + {X86::SUBR_FrST0, X86::SUBR_FPrST0}, {X86::SUB_FrST0, X86::SUB_FPrST0}, - { X86::UCOM_FIr , X86::UCOM_FIPr }, + {X86::UCOM_FIr, X86::UCOM_FIPr}, - { X86::UCOM_FPr , X86::UCOM_FPPr }, - { X86::UCOM_Fr , X86::UCOM_FPr }, + {X86::UCOM_FPr, X86::UCOM_FPPr}, {X86::UCOM_Fr, X86::UCOM_FPr}, }; static bool doesInstructionSetFPSW(MachineInstr &MI) { @@ -883,7 +901,7 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) { if (Opcode == X86::FCOMPP || Opcode == X86::UCOM_FPPr) I->removeOperand(0); MI.dropDebugNumber(); - } else { // Insert an explicit pop + } else { // Insert an explicit pop // If this instruction sets FPSW, which is read in following instruction, // insert pop after that reader. if (doesInstructionSetFPSW(MI)) { @@ -901,7 +919,7 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) { /// of the stack, we just pop the current instruction, otherwise we store the /// current top-of-stack into the specified slot, then pop the top of stack. void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) { - if (getStackEntry(0) == FPRegNo) { // already at the top of stack? easy. + if (getStackEntry(0) == FPRegNo) { // already at the top of stack? easy. popStackAfter(I); return; } @@ -916,12 +934,12 @@ void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) { /// folding. MachineBasicBlock::iterator FPS::freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo) { - unsigned STReg = getSTReg(FPRegNo); - unsigned OldSlot = getSlot(FPRegNo); - unsigned TopReg = Stack[StackTop-1]; - Stack[OldSlot] = TopReg; - RegMap[TopReg] = OldSlot; - RegMap[FPRegNo] = ~0; + unsigned STReg = getSTReg(FPRegNo); + unsigned OldSlot = getSlot(FPRegNo); + unsigned TopReg = Stack[StackTop - 1]; + Stack[OldSlot] = TopReg; + RegMap[TopReg] = OldSlot; + RegMap[FPRegNo] = ~0; Stack[--StackTop] = ~0; return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr)) .addReg(STReg) @@ -978,7 +996,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { } // Load zeros for all the imp-defs. - while(Defs) { + while (Defs) { unsigned DReg = llvm::countr_zero(Defs); LLVM_DEBUG(dbgs() << "Defining %fp" << DReg << " as 0\n"); BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0)); @@ -994,8 +1012,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { /// shuffleStackTop - emit fxch instructions before I to shuffle the top /// FixCount entries into the order given by FixStack. /// FIXME: Is there a better algorithm than insertion sort? 
-void FPS::shuffleStackTop(const unsigned char *FixStack, - unsigned FixCount, +void FPS::shuffleStackTop(const unsigned char *FixStack, unsigned FixCount, MachineBasicBlock::iterator I) { // Move items into place, starting from the desired stack bottom. while (FixCount--) { @@ -1013,7 +1030,6 @@ void FPS::shuffleStackTop(const unsigned char *FixStack, LLVM_DEBUG(dumpStack()); } - //===----------------------------------------------------------------------===// // Instruction transformation implementation //===----------------------------------------------------------------------===// @@ -1122,7 +1138,8 @@ void FPS::handleReturn(MachineBasicBlock::iterator &I) { // We may have been carrying spurious live-ins, so make sure only the // returned registers are left live. adjustLiveRegs(LiveMask, MI); - if (!LiveMask) return; // Quick check to see if any are possible. + if (!LiveMask) + return; // Quick check to see if any are possible. // There are only four possibilities here: // 1) we are returning a single FP value. In this case, it has to be in @@ -1144,7 +1161,7 @@ void FPS::handleReturn(MachineBasicBlock::iterator &I) { // 2) If returning the same value for both, we only have one thing in the FP // stack. Consider: RET FP1, FP1 if (StackTop == 1) { - assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&& + assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0) && "Stack misconfiguration for RET!"); // Duplicate the TOS so that we return it twice. Just pick some other FPx @@ -1222,7 +1239,7 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) { MI.getOpcode() == X86::ST_FpP80m)) { duplicateToTop(Reg, ScratchFPReg, I); } else { - moveToTop(Reg, I); // Move to the top of the stack... + moveToTop(Reg, I); // Move to the top of the stack... } // Convert from the pseudo instruction to the concrete instruction. @@ -1244,7 +1261,6 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) { MI.dropDebugNumber(); } - /// handleOneArgFPRW: Handle instructions that read from the top of stack and /// replace the value with a newly computed value. These instructions may have /// non-fp operands after their FP operands. 
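Reviewer note: the large OpcodeTable/PopTable reflow above is mechanical, but it is worth keeping in mind why the layout matters -- Lookup() runs llvm::lower_bound over each table, so every table has to stay sorted by its first field. A standalone sketch of the same lookup scheme, with invented numeric opcodes rather than real X86 ones:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>

// Illustrative copy of the {pseudo, concrete} remap idea; values are made up.
struct TableEntry {
  uint16_t from;
  uint16_t to;
};

// Binary-search a table sorted by 'from'; return -1 when the opcode is absent.
static int lookup(const TableEntry *Begin, const TableEntry *End,
                  unsigned Opcode) {
  const TableEntry *I = std::lower_bound(
      Begin, End, Opcode,
      [](const TableEntry &TE, unsigned V) { return TE.from < V; });
  if (I != End && I->from == Opcode)
    return I->to;
  return -1;
}

int main() {
  static const TableEntry Table[] = {
      {100, 10}, {105, 11}, {110, 12}, {200, 20}, // sorted by 'from'
  };
  assert(lookup(std::begin(Table), std::end(Table), 105) == 11);
  assert(lookup(std::begin(Table), std::end(Table), 106) == -1);
  return 0;
}

An out-of-order entry would silently break the binary search, which is why the original TableEntry overloads operator< for the heterogeneous comparison and the ASSERT_SORTED checks exist.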
@@ -1285,76 +1301,62 @@ void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) { MI.dropDebugNumber(); } - //===----------------------------------------------------------------------===// // Define tables of various ways to map pseudo instructions // // ForwardST0Table - Map: A = B op C into: ST(0) = ST(0) op ST(i) static const TableEntry ForwardST0Table[] = { - { X86::ADD_Fp32 , X86::ADD_FST0r }, - { X86::ADD_Fp64 , X86::ADD_FST0r }, - { X86::ADD_Fp80 , X86::ADD_FST0r }, - { X86::DIV_Fp32 , X86::DIV_FST0r }, - { X86::DIV_Fp64 , X86::DIV_FST0r }, - { X86::DIV_Fp80 , X86::DIV_FST0r }, - { X86::MUL_Fp32 , X86::MUL_FST0r }, - { X86::MUL_Fp64 , X86::MUL_FST0r }, - { X86::MUL_Fp80 , X86::MUL_FST0r }, - { X86::SUB_Fp32 , X86::SUB_FST0r }, - { X86::SUB_Fp64 , X86::SUB_FST0r }, - { X86::SUB_Fp80 , X86::SUB_FST0r }, + {X86::ADD_Fp32, X86::ADD_FST0r}, {X86::ADD_Fp64, X86::ADD_FST0r}, + {X86::ADD_Fp80, X86::ADD_FST0r}, {X86::DIV_Fp32, X86::DIV_FST0r}, + {X86::DIV_Fp64, X86::DIV_FST0r}, {X86::DIV_Fp80, X86::DIV_FST0r}, + {X86::MUL_Fp32, X86::MUL_FST0r}, {X86::MUL_Fp64, X86::MUL_FST0r}, + {X86::MUL_Fp80, X86::MUL_FST0r}, {X86::SUB_Fp32, X86::SUB_FST0r}, + {X86::SUB_Fp64, X86::SUB_FST0r}, {X86::SUB_Fp80, X86::SUB_FST0r}, }; // ReverseST0Table - Map: A = B op C into: ST(0) = ST(i) op ST(0) static const TableEntry ReverseST0Table[] = { - { X86::ADD_Fp32 , X86::ADD_FST0r }, // commutative - { X86::ADD_Fp64 , X86::ADD_FST0r }, // commutative - { X86::ADD_Fp80 , X86::ADD_FST0r }, // commutative - { X86::DIV_Fp32 , X86::DIVR_FST0r }, - { X86::DIV_Fp64 , X86::DIVR_FST0r }, - { X86::DIV_Fp80 , X86::DIVR_FST0r }, - { X86::MUL_Fp32 , X86::MUL_FST0r }, // commutative - { X86::MUL_Fp64 , X86::MUL_FST0r }, // commutative - { X86::MUL_Fp80 , X86::MUL_FST0r }, // commutative - { X86::SUB_Fp32 , X86::SUBR_FST0r }, - { X86::SUB_Fp64 , X86::SUBR_FST0r }, - { X86::SUB_Fp80 , X86::SUBR_FST0r }, + {X86::ADD_Fp32, X86::ADD_FST0r}, // commutative + {X86::ADD_Fp64, X86::ADD_FST0r}, // commutative + {X86::ADD_Fp80, X86::ADD_FST0r}, // commutative + {X86::DIV_Fp32, X86::DIVR_FST0r}, + {X86::DIV_Fp64, X86::DIVR_FST0r}, + {X86::DIV_Fp80, X86::DIVR_FST0r}, + {X86::MUL_Fp32, X86::MUL_FST0r}, // commutative + {X86::MUL_Fp64, X86::MUL_FST0r}, // commutative + {X86::MUL_Fp80, X86::MUL_FST0r}, // commutative + {X86::SUB_Fp32, X86::SUBR_FST0r}, + {X86::SUB_Fp64, X86::SUBR_FST0r}, + {X86::SUB_Fp80, X86::SUBR_FST0r}, }; // ForwardSTiTable - Map: A = B op C into: ST(i) = ST(0) op ST(i) static const TableEntry ForwardSTiTable[] = { - { X86::ADD_Fp32 , X86::ADD_FrST0 }, // commutative - { X86::ADD_Fp64 , X86::ADD_FrST0 }, // commutative - { X86::ADD_Fp80 , X86::ADD_FrST0 }, // commutative - { X86::DIV_Fp32 , X86::DIVR_FrST0 }, - { X86::DIV_Fp64 , X86::DIVR_FrST0 }, - { X86::DIV_Fp80 , X86::DIVR_FrST0 }, - { X86::MUL_Fp32 , X86::MUL_FrST0 }, // commutative - { X86::MUL_Fp64 , X86::MUL_FrST0 }, // commutative - { X86::MUL_Fp80 , X86::MUL_FrST0 }, // commutative - { X86::SUB_Fp32 , X86::SUBR_FrST0 }, - { X86::SUB_Fp64 , X86::SUBR_FrST0 }, - { X86::SUB_Fp80 , X86::SUBR_FrST0 }, + {X86::ADD_Fp32, X86::ADD_FrST0}, // commutative + {X86::ADD_Fp64, X86::ADD_FrST0}, // commutative + {X86::ADD_Fp80, X86::ADD_FrST0}, // commutative + {X86::DIV_Fp32, X86::DIVR_FrST0}, + {X86::DIV_Fp64, X86::DIVR_FrST0}, + {X86::DIV_Fp80, X86::DIVR_FrST0}, + {X86::MUL_Fp32, X86::MUL_FrST0}, // commutative + {X86::MUL_Fp64, X86::MUL_FrST0}, // commutative + {X86::MUL_Fp80, X86::MUL_FrST0}, // commutative + {X86::SUB_Fp32, X86::SUBR_FrST0}, + {X86::SUB_Fp64, 
X86::SUBR_FrST0}, + {X86::SUB_Fp80, X86::SUBR_FrST0}, }; // ReverseSTiTable - Map: A = B op C into: ST(i) = ST(i) op ST(0) static const TableEntry ReverseSTiTable[] = { - { X86::ADD_Fp32 , X86::ADD_FrST0 }, - { X86::ADD_Fp64 , X86::ADD_FrST0 }, - { X86::ADD_Fp80 , X86::ADD_FrST0 }, - { X86::DIV_Fp32 , X86::DIV_FrST0 }, - { X86::DIV_Fp64 , X86::DIV_FrST0 }, - { X86::DIV_Fp80 , X86::DIV_FrST0 }, - { X86::MUL_Fp32 , X86::MUL_FrST0 }, - { X86::MUL_Fp64 , X86::MUL_FrST0 }, - { X86::MUL_Fp80 , X86::MUL_FrST0 }, - { X86::SUB_Fp32 , X86::SUB_FrST0 }, - { X86::SUB_Fp64 , X86::SUB_FrST0 }, - { X86::SUB_Fp80 , X86::SUB_FrST0 }, + {X86::ADD_Fp32, X86::ADD_FrST0}, {X86::ADD_Fp64, X86::ADD_FrST0}, + {X86::ADD_Fp80, X86::ADD_FrST0}, {X86::DIV_Fp32, X86::DIV_FrST0}, + {X86::DIV_Fp64, X86::DIV_FrST0}, {X86::DIV_Fp80, X86::DIV_FrST0}, + {X86::MUL_Fp32, X86::MUL_FrST0}, {X86::MUL_Fp64, X86::MUL_FrST0}, + {X86::MUL_Fp80, X86::MUL_FrST0}, {X86::SUB_Fp32, X86::SUB_FrST0}, + {X86::SUB_Fp64, X86::SUB_FrST0}, {X86::SUB_Fp80, X86::SUB_FrST0}, }; - /// handleTwoArgFP - Handle instructions like FADD and friends which are virtual /// instructions which need to be simplified and possibly transformed. /// @@ -1364,8 +1366,10 @@ static const TableEntry ReverseSTiTable[] = { /// ST(i) = fsubr ST(0), ST(i) /// void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { - ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table); - ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable); + ASSERT_SORTED(ForwardST0Table); + ASSERT_SORTED(ReverseST0Table); + ASSERT_SORTED(ForwardSTiTable); + ASSERT_SORTED(ReverseSTiTable); MachineInstr &MI = *I; unsigned NumOperands = MI.getDesc().getNumOperands(); @@ -1381,12 +1385,12 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { // One of our operands must be on the top of the stack. If neither is yet, we // need to move one. - if (Op0 != TOS && Op1 != TOS) { // No operand at TOS? + if (Op0 != TOS && Op1 != TOS) { // No operand at TOS? // We can choose to move either operand to the top of the stack. If one of // the operands is killed by this instruction, we want that one so that we // can update right on top of the old version. if (KillsOp0) { - moveToTop(Op0, I); // Move dead operand to TOS. + moveToTop(Op0, I); // Move dead operand to TOS. TOS = Op0; } else if (KillsOp1) { moveToTop(Op1, I); @@ -1449,15 +1453,15 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { // overwriting the other one. if (KillsOp0 && KillsOp1 && Op0 != Op1) { assert(!updateST0 && "Should have updated other operand!"); - popStackAfter(I); // Pop the top of stack + popStackAfter(I); // Pop the top of stack } // Update stack information so that we know the destination register is now on // the stack. unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS); assert(UpdatedSlot < StackTop && Dest < 7); - Stack[UpdatedSlot] = Dest; - RegMap[Dest] = UpdatedSlot; + Stack[UpdatedSlot] = Dest; + RegMap[Dest] = UpdatedSlot; MBB->getParent()->deleteMachineInstr(&MI); // Remove the old instruction } @@ -1485,8 +1489,10 @@ void FPS::handleCompareFP(MachineBasicBlock::iterator &I) { MI.dropDebugNumber(); // If any of the operands are killed by this instruction, free them. - if (KillsOp0) freeStackSlotAfter(I, Op0); - if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1); + if (KillsOp0) + freeStackSlotAfter(I, Op0); + if (KillsOp1 && Op0 != Op1) + freeStackSlotAfter(I, Op1); } /// handleCondMovFP - Handle two address conditional move instructions. 
These @@ -1518,7 +1524,6 @@ void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) { } } - /// handleSpecialFP - Handle special instructions which behave unlike other /// floating point instructions. This is primarily intended for use by pseudo /// instructions. @@ -1537,7 +1542,8 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { } switch (MI.getOpcode()) { - default: llvm_unreachable("Unknown SpecialFP instruction!"); + default: + llvm_unreachable("Unknown SpecialFP instruction!"); case TargetOpcode::COPY: { // We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP. const MachineOperand &MO1 = MI.getOperand(1); @@ -1770,7 +1776,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { } } - Inst = MBB->erase(Inst); // Remove the pseudo instruction + Inst = MBB->erase(Inst); // Remove the pseudo instruction // We want to leave I pointing to the previous instruction, but what if we // just erased the first instruction? @@ -1819,3 +1825,29 @@ void FPS::setKillFlags(MachineBasicBlock &MBB) const { LPR.stepBackward(MI); } } + +bool X86FPStackifierLegacy::runOnMachineFunction(MachineFunction &MF) { + FPS Impl; + if (!Impl.shouldRun(MF)) + return false; + + EdgeBundles *Bundles = + &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles(); + return FPS().run(MF, Bundles); +} + +PreservedAnalyses +X86FPStackifierPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + FPS Impl; + if (!Impl.shouldRun(MF)) + return PreservedAnalyses::all(); + + EdgeBundles *Bundles = &MFAM.getResult<EdgeBundlesAnalysis>(MF); + bool Changed = Impl.run(MF, Bundles); + if (!Changed) + return PreservedAnalyses::all(); + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index a66a321..8bca634 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -3093,8 +3093,8 @@ bool X86FrameLowering::spillCalleeSavedRegisters( MBB.addLiveIn(Reg); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI, - Register(), MachineInstr::FrameSetup); + TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, Register(), + MachineInstr::FrameSetup); } return true; @@ -3166,8 +3166,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters( VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, TRI, - Register()); + TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, Register()); } // Clear the stack slot for spill base pointer register. 
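Reviewer note on the X86FloatingPoint.cpp restructuring above: FPS is now a plain implementation class exposing shouldRun()/run(), and both the legacy X86FPStackifierLegacy pass and the new-PM X86FPStackifierPass are thin wrappers that fetch EdgeBundles from their respective analysis managers and delegate. A framework-free sketch of that wrapper shape, with all types and names as hypothetical stand-ins for the LLVM pass plumbing:

#include <iostream>

// Hypothetical stand-ins for MachineFunction / EdgeBundles.
struct Function { bool UsesX87 = false; };
struct EdgeBundles {};

// Shared implementation, analogous to the FPS class in the patch.
class StackifierImpl {
public:
  bool shouldRun(const Function &F) const { return F.UsesX87; }
  bool run(Function &, EdgeBundles *) {
    std::cout << "rewriting FP virtual registers onto the x87 stack\n";
    return true; // pretend something changed
  }
};

// Legacy-PM-style wrapper: returns "did I modify the function?".
class LegacyWrapper {
public:
  bool runOnFunction(Function &F) {
    StackifierImpl Impl;
    if (!Impl.shouldRun(F))
      return false;        // cheap early exit, as in the patch
    EdgeBundles Bundles;   // in LLVM: getAnalysis<EdgeBundlesWrapperLegacy>()
    return Impl.run(F, &Bundles);
  }
};

// New-PM-style wrapper: reports which analyses survive instead of a bool.
enum class Preserved { All, CFGOnly };
class NewPMWrapper {
public:
  Preserved run(Function &F) {
    StackifierImpl Impl;
    if (!Impl.shouldRun(F))
      return Preserved::All;
    EdgeBundles Bundles;   // in LLVM: MFAM.getResult<EdgeBundlesAnalysis>()
    if (!Impl.run(F, &Bundles))
      return Preserved::All;
    return Preserved::CFGOnly; // the pass rewrites code but keeps the CFG
  }
};

int main() {
  Function F{/*UsesX87=*/true};
  LegacyWrapper().runOnFunction(F); // both wrappers drive the same Impl
  NewPMWrapper().run(F);
}

Keeping the analysis fetch in the wrappers is what lets one implementation serve both pass managers, mirroring X86FPStackifierLegacy::runOnMachineFunction() and X86FPStackifierPass::run() above.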
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index d4418c8..e7903a7 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1004,7 +1004,9 @@ void X86DAGToDAGISel::PreprocessISelDAG() { if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) { APInt SplatVal; - if (X86::isConstantSplat(N->getOperand(1), SplatVal) && + if (!ISD::isBuildVectorOfConstantSDNodes( + peekThroughBitcasts(N->getOperand(0)).getNode()) && + X86::isConstantSplat(N->getOperand(1), SplatVal) && SplatVal.isOne()) { SDLoc DL(N); @@ -4728,9 +4730,9 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) { if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() && ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) { - SDValue InnerOp = Op->getOperand(0); + SDValue InnerOp = getFoldableLogicOp(Op->getOperand(0)); - if (!getFoldableLogicOp(InnerOp)) + if (!InnerOp) return SDValue(); N0 = InnerOp.getOperand(0); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 007074c..fbd875a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -130,7 +130,7 @@ static cl::opt<bool> MulConstantOptimization( X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) - : TargetLowering(TM), Subtarget(STI) { + : TargetLowering(TM, STI), Subtarget(STI) { bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); @@ -635,6 +635,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FROUNDEVEN, VT, Action); setOperationAction(ISD::FTRUNC, VT, Action); setOperationAction(ISD::FLDEXP, VT, Action); + setOperationAction(ISD::FSINCOSPI, VT, Action); }; if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { @@ -2072,8 +2073,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasVBMI2()) { for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) { - setOperationAction(ISD::FSHL, VT, Custom); - setOperationAction(ISD::FSHR, VT, Custom); + setOperationAction(ISD::FSHL, VT, Legal); + setOperationAction(ISD::FSHR, VT, Legal); } setOperationAction(ISD::ROTL, MVT::v32i16, Custom); @@ -2088,8 +2089,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) { for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32, MVT::v4i64}) { - setOperationAction(ISD::FSHL, VT, Custom); - setOperationAction(ISD::FSHR, VT, Custom); + setOperationAction(ISD::FSHL, VT, Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::FSHR, VT, Subtarget.hasVLX() ? Legal : Custom); } } @@ -2097,9 +2098,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // pre-AVX512 equivalents. Without VLX we use 512-bit operations for // narrower widths. if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { + for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32, + MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16, + MVT::v16f32, MVT::v8f64}) + setOperationAction(ISD::FLDEXP, VT, Custom); + // These operations are handled on non-VLX by artificially widening in // isel patterns. 
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom); @@ -2150,6 +2155,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (Subtarget.hasCDI()) { + for (auto VT : {MVT::i256, MVT::i512}) { + if (VT == MVT::i512 && !Subtarget.useAVX512Regs()) + continue; + setOperationAction(ISD::CTLZ, VT, Custom); + setOperationAction(ISD::CTTZ, VT, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); + } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::CTLZ, VT, Legal); } @@ -2572,8 +2585,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Combine sin / cos into _sincos_stret if it is available. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); @@ -2655,6 +2668,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::AVGFLOORU, ISD::BITREVERSE, ISD::ADD, + ISD::SADDSAT, + ISD::SSUBSAT, ISD::FADD, ISD::FSUB, ISD::FNEG, @@ -2694,6 +2709,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::STRICT_FP_EXTEND, ISD::FP_ROUND, ISD::STRICT_FP_ROUND, + ISD::FSHL, + ISD::FSHR, ISD::INTRINSIC_VOID, ISD::INTRINSIC_WO_CHAIN, ISD::INTRINSIC_W_CHAIN}); @@ -2871,6 +2888,8 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::VPERMV: case X86ISD::VPERMV3: case X86ISD::VZEXT_MOVL: + case X86ISD::COMPRESS: + case X86ISD::EXPAND: return true; } } @@ -3087,7 +3106,7 @@ static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) { } bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, + const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const { Info.flags = MachineMemOperand::MONone; @@ -3454,6 +3473,12 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, isTypeLegal(LoadVT) && isTypeLegal(BitcastVT)) return true; + // If we have a large vector type (even if illegal), don't bitcast to large + // (illegal) scalar types. Better to load fewer vectors and extract. 
+ if (LoadVT.isVector() && !BitcastVT.isVector() && LoadVT.isInteger() && + BitcastVT.isInteger() && (LoadVT.getSizeInBits() % 128) == 0) + return false; + return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO); } @@ -5358,12 +5383,12 @@ bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) { int getRoundingModeX86(unsigned RM) { switch (static_cast<::llvm::RoundingMode>(RM)) { // clang-format off - case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest; break; - case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward; break; - case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward; break; - case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero; break; - default: - return X86::rmInvalid; // Invalid rounding mode + case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest; + case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward; + case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward; + case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero; + default: return X86::rmInvalid; + // clang-format on } } @@ -5816,6 +5841,48 @@ static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, } return false; } + case X86ISD::COMPRESS: { + SDValue CmpVec = N.getOperand(0); + SDValue PassThru = N.getOperand(1); + SDValue CmpMask = N.getOperand(2); + APInt UndefElts; + SmallVector<APInt> EltBits; + if (!getTargetConstantBitsFromNode(CmpMask, 1, UndefElts, EltBits)) + return false; + assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems && + "Illegal compression mask"); + for (unsigned I = 0; I != NumElems; ++I) { + if (!EltBits[I].isZero()) + Mask.push_back(I); + } + while (Mask.size() != NumElems) { + Mask.push_back(NumElems + Mask.size()); + } + Ops.push_back(CmpVec); + Ops.push_back(PassThru); + return true; + } + case X86ISD::EXPAND: { + SDValue ExpVec = N.getOperand(0); + SDValue PassThru = N.getOperand(1); + SDValue ExpMask = N.getOperand(2); + APInt UndefElts; + SmallVector<APInt> EltBits; + if (!getTargetConstantBitsFromNode(ExpMask, 1, UndefElts, EltBits)) + return false; + assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems && + "Illegal expansion mask"); + unsigned ExpIndex = 0; + for (unsigned I = 0; I != NumElems; ++I) { + if (EltBits[I].isZero()) + Mask.push_back(I + NumElems); + else + Mask.push_back(ExpIndex++); + } + Ops.push_back(ExpVec); + Ops.push_back(PassThru); + return true; + } default: llvm_unreachable("unknown target shuffle node"); } @@ -7270,7 +7337,10 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, - bool IsAfterLegalize) { + bool IsAfterLegalize, + unsigned Depth = 0) { + if (Depth >= SelectionDAG::MaxRecursionDepth) + return SDValue(); // Limit search depth. 
if ((VT.getScalarSizeInBits() % 8) != 0) return SDValue(); @@ -7444,7 +7514,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems); SDValue HalfLD = EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL, - DAG, Subtarget, IsAfterLegalize); + DAG, Subtarget, IsAfterLegalize, Depth + 1); if (HalfLD) return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), HalfLD, DAG.getVectorIdxConstant(0, DL)); @@ -7521,7 +7591,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, VT.getSizeInBits() / ScalarSize); if (TLI.isTypeLegal(BroadcastVT)) { if (SDValue RepeatLoad = EltsFromConsecutiveLoads( - RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) { + RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize, + Depth + 1)) { SDValue Broadcast = RepeatLoad; if (RepeatSize > ScalarSize) { while (Broadcast.getValueSizeInBits() < VT.getSizeInBits()) @@ -7542,6 +7613,20 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, } } + // REVERSE - attempt to match the loads in reverse and then shuffle back. + // TODO: Do this for any permute or mismatching element counts. + if (Depth == 0 && ZeroMask.isZero() && UndefMask.isZero() && + TLI.isTypeLegal(VT) && VT.isVector() && + NumElems == VT.getVectorNumElements()) { + SmallVector<SDValue, 16> ReverseElts(Elts.rbegin(), Elts.rend()); + if (SDValue RevLd = EltsFromConsecutiveLoads( + VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) { + SmallVector<int, 16> ReverseMask(NumElems); + std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0); + return DAG.getVectorShuffle(VT, DL, RevLd, DAG.getUNDEF(VT), ReverseMask); + } + } + return SDValue(); } @@ -7948,7 +8033,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, for (unsigned i = 0; i != NumElems; ++i) { unsigned Opc = Op.getOperand(i).getOpcode(); - if (Opc == ISD::UNDEF) + if (Opc == ISD::POISON || Opc == ISD::UNDEF) continue; if (Opc != ISD::EXTRACT_VECTOR_ELT) { @@ -7991,7 +8076,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, if (!VecIn1.getNode()) return SDValue(); - VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); + VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getPOISON(VT); SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask); for (unsigned Idx : InsertIndices) @@ -8115,6 +8200,8 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, case X86ISD::FHSUB: case X86ISD::HADD: case X86ISD::HSUB: + case X86ISD::HADDS: + case X86ISD::HSUBS: return true; } return false; @@ -8426,9 +8513,7 @@ static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, // DAGCombiner::visitFADDForFMACombine. It would be good to have one // function that would answer if it is Ok to fuse MUL + ADD to FMADD // or MUL + ADDSUB to FMADDSUB. - const TargetOptions &Options = DAG.getTarget().Options; bool AllowFusion = - Options.AllowFPOpFusion == FPOpFusion::Fast || (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract()); if (!AllowFusion) return false; @@ -8856,6 +8941,56 @@ static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, return SDValue(); } +/// Widen a BUILD_VECTOR if the scalar operands are freely mergeable. 
+static SDValue widenBuildVector(BuildVectorSDNode *BVOp, SDLoc const &DL, + X86Subtarget const &Subtarget, + SelectionDAG &DAG) { + using namespace SDPatternMatch; + MVT VT = BVOp->getSimpleValueType(0); + MVT SVT = VT.getScalarType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltBits = SVT.getSizeInBits(); + + if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32) + return SDValue(); + + unsigned WideBits = 2 * EltBits; + MVT WideSVT = MVT::getIntegerVT(WideBits); + MVT WideVT = MVT::getVectorVT(WideSVT, NumElts / 2); + if (!DAG.getTargetLoweringInfo().isTypeLegal(WideSVT)) + return SDValue(); + + SmallVector<SDValue, 8> WideOps; + for (unsigned I = 0; I != NumElts; I += 2) { + SDValue Op0 = BVOp->getOperand(I + 0); + SDValue Op1 = BVOp->getOperand(I + 1); + + if (Op0.isUndef() && Op1.isUndef()) { + WideOps.push_back(DAG.getUNDEF(WideSVT)); + continue; + } + + // TODO: Constant repacking? + + // Merge scalars that have been split from the same source. + SDValue X, Y; + if (sd_match(Op0, m_Trunc(m_Value(X))) && + sd_match(Op1, m_Trunc(m_Srl(m_Value(Y), m_SpecificInt(EltBits)))) && + peekThroughTruncates(X) == peekThroughTruncates(Y) && + X.getValueType().bitsGE(WideSVT)) { + if (X.getValueType().bitsGT(WideSVT)) + X = DAG.getNode(ISD::TRUNCATE, DL, WideSVT, X); + WideOps.push_back(X); + continue; + } + + return SDValue(); + } + + assert(WideOps.size() == (NumElts / 2) && "Failed to widen build vector"); + return DAG.getBitcast(VT, DAG.getBuildVector(WideVT, DL, WideOps)); +} + /// Create a vector constant without a load. SSE/AVX provide the bare minimum /// functionality to do this, so it's all zeros, all ones, or some derivation /// that is cheap to calculate. @@ -9326,6 +9461,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return BitOp; if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG)) return Blend; + if (SDValue WideBV = widenBuildVector(BV, dl, Subtarget, DAG)) + return WideBV; unsigned NumZero = ZeroMask.popcount(); unsigned NumNonZero = NonZeroMask.popcount(); @@ -18370,16 +18507,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget, SmallVector<int> Mask(OrigMask); // Canonicalize the shuffle with any horizontal ops inputs. + // Don't attempt this if the shuffle can still be widened as we may lose + // whole lane shuffle patterns. // NOTE: This may update Ops and Mask. - if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp( - Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget)) - return DAG.getBitcast(VT, HOp); + if (!canWidenShuffleElements(Mask)) { + if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp( + Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget)) + return DAG.getBitcast(VT, HOp); - V1 = DAG.getBitcast(VT, Ops[0]); - V2 = DAG.getBitcast(VT, Ops[1]); - assert(NumElements == (int)Mask.size() && - "canonicalizeShuffleMaskWithHorizOp " - "shouldn't alter the shuffle mask size"); + V1 = DAG.getBitcast(VT, Ops[0]); + V2 = DAG.getBitcast(VT, Ops[1]); + assert(NumElements == (int)Mask.size() && + "canonicalizeShuffleMaskWithHorizOp " + "shouldn't alter the shuffle mask size"); + } // Canonicalize zeros/ones/fp splat constants to ensure no undefs. // These will be materialized uniformly anyway, so make splat matching easier. 
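Reviewer note: the new X86ISD::COMPRESS / X86ISD::EXPAND cases added to getTargetShuffleMask() earlier in this file treat a constant-mask compress/expand as a two-input shuffle of {source, passthru}, with indices >= NumElts selecting the pass-through operand. The index construction is easy to check in isolation; this standalone sketch (not LLVM code) applies the same rule to a boolean mask:

#include <cassert>
#include <vector>

// COMPRESS: selected source lanes are packed at the front; the remaining
// output lanes read the pass-through operand (indices >= NumElts).
std::vector<int> compressMask(const std::vector<bool> &Keep) {
  int NumElts = static_cast<int>(Keep.size());
  std::vector<int> Mask;
  for (int I = 0; I != NumElts; ++I)
    if (Keep[I])
      Mask.push_back(I);
  while (static_cast<int>(Mask.size()) != NumElts)
    Mask.push_back(NumElts + static_cast<int>(Mask.size()));
  return Mask;
}

// EXPAND: selected output lanes consume the source in order; unselected lanes
// keep the corresponding pass-through element.
std::vector<int> expandMask(const std::vector<bool> &Write) {
  int NumElts = static_cast<int>(Write.size());
  std::vector<int> Mask;
  int SrcIdx = 0;
  for (int I = 0; I != NumElts; ++I)
    Mask.push_back(Write[I] ? SrcIdx++ : I + NumElts);
  return Mask;
}

int main() {
  // 4 lanes, k-mask 0b0101: lanes 0 and 2 are selected.
  std::vector<bool> K = {true, false, true, false};
  assert((compressMask(K) == std::vector<int>{0, 2, 6, 7}));
  assert((expandMask(K) == std::vector<int>{0, 5, 1, 7}));
  return 0;
}

Exposing constant-mask VPCOMPRESS/VPEXPAND as ordinary shuffle masks is presumably the point of the change: the generic shuffle combining above can then see through these nodes.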
@@ -19142,6 +19283,72 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, return SDValue(); } +static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + SDValue X = Op.getOperand(0); + MVT XTy = X.getSimpleValueType(); + SDValue Exp = Op.getOperand(1); + + switch (XTy.SimpleTy) { + default: + return SDValue(); + case MVT::f16: + if (!Subtarget.hasFP16()) + X = DAG.getFPExtendOrRound(X, DL, MVT::f32); + [[fallthrough]]; + case MVT::f32: + case MVT::f64: { + MVT VT = MVT::getVectorVT(X.getSimpleValueType(), + 128 / X.getSimpleValueType().getSizeInBits()); + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp); + SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, X); + SDValue VExp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Exp); + SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VT, VX, VExp); + SDValue Final = DAG.getExtractVectorElt(DL, X.getValueType(), Scalefs, 0); + return DAG.getFPExtendOrRound(Final, DL, XTy); + } + case MVT::v4f32: + case MVT::v2f64: + case MVT::v8f32: + case MVT::v4f64: + case MVT::v16f32: + case MVT::v8f64: + if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX()) { + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); + return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp); + } + break; + case MVT::v8f16: + case MVT::v16f16: + if (Subtarget.hasFP16()) { + if (Subtarget.hasVLX()) { + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); + return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp); + } + break; + } + X = DAG.getFPExtendOrRound(X, DL, XTy.changeVectorElementType(MVT::f32)); + Exp = DAG.getSExtOrTrunc(Exp, DL, + X.getSimpleValueType().changeTypeToInteger()); + break; + case MVT::v32f16: + if (Subtarget.hasFP16()) { + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); + return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp); + } + return splitVectorOp(Op, DAG, DL); + } + SDValue WideX = widenSubVector(X, true, Subtarget, DAG, DL, 512); + SDValue WideExp = widenSubVector(Exp, true, Subtarget, DAG, DL, 512); + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, WideExp.getSimpleValueType(), Exp); + SDValue Scalef = + DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), WideX, WideExp); + SDValue Final = + DAG.getExtractSubvector(DL, X.getSimpleValueType(), Scalef, 0); + return DAG.getFPExtendOrRound(Final, DL, XTy); +} + static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -22861,6 +23068,13 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, if (!OpVT.isScalarInteger() || OpSize < 128) return SDValue(); + // Don't do this if we're not supposed to use the FPU. + bool NoImplicitFloatOps = + DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + if (Subtarget.useSoftFloat() || NoImplicitFloatOps) + return SDValue(); + // Ignore a comparison with zero because that gets special treatment in // EmitTest(). But make an exception for the special case of a pair of // logically-combined vector-sized operands compared to zero. This pattern may @@ -22883,13 +23097,9 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands. // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands. // Otherwise use PCMPEQ (plus AND) and mask testing. 
- bool NoImplicitFloatOps = - DAG.getMachineFunction().getFunction().hasFnAttribute( - Attribute::NoImplicitFloat); - if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps && - ((OpSize == 128 && Subtarget.hasSSE2()) || - (OpSize == 256 && Subtarget.hasAVX()) || - (OpSize == 512 && Subtarget.useAVX512Regs()))) { + if ((OpSize == 128 && Subtarget.hasSSE2()) || + (OpSize == 256 && Subtarget.hasAVX()) || + (OpSize == 512 && Subtarget.useAVX512Regs())) { bool HasPT = Subtarget.hasSSE41(); // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened @@ -29565,9 +29775,9 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, } if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) { SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT)); - SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B); SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B); - SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo); + SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, DAG.getBitcast(ExVT, A), + DAG.getBitcast(ExVT, B)); SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi); RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask); RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi, @@ -29583,26 +29793,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, SDValue Undef = DAG.getUNDEF(VT); SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef)); SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef)); - - SDValue BLo, BHi; - if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) { - // If the RHS is a constant, manually unpackl/unpackh. - SmallVector<SDValue, 16> LoOps, HiOps; - for (unsigned i = 0; i != NumElts; i += 16) { - for (unsigned j = 0; j != 8; ++j) { - LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl, - MVT::i16)); - HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl, - MVT::i16)); - } - } - - BLo = DAG.getBuildVector(ExVT, dl, LoOps); - BHi = DAG.getBuildVector(ExVT, dl, HiOps); - } else { - BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef)); - BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef)); - } + SDValue BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef)); + SDValue BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef)); // Multiply, mask the lower 8bits of the lo/hi results and pack. SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); @@ -30905,6 +31097,63 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR); } + if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) { + // On AVX512BW, we can use variable 16-bit shifts to implement variable + // 8-bit shifts. For this, we split the input into two vectors, RLo and RHi. + // The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane + // of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors + // can efficiently be merged together using a masked move. + MVT ExtVT = MVT::v32i16; + + SDValue RLo, RHi; + // Isolate lower and upper lanes of Amt by masking odd lanes in AmtLo and + // right shifting AmtHi. 
+ SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), + DAG.getConstant(0x00ff, dl, ExtVT)); + SDValue AmtHi = getTargetVShiftByConstNode( + X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG); + switch (Opc) { + case ISD::SHL: + // Because we shift left, no bits from the high half can influence the low + // half, so we don't need to mask RLo. We do however need to mask RHi, to + // prevent high bits of an even lane overflowing into low bits of an odd + // lane. + RLo = DAG.getBitcast(ExtVT, R); + RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo, + DAG.getConstant(0xff00, dl, ExtVT)); + break; + case ISD::SRL: + // Same idea as above, but this time we need to make sure no low bits of + // an odd lane can overflow into high bits of an even lane. + RHi = DAG.getBitcast(ExtVT, R); + RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi, + DAG.getConstant(0x00ff, dl, ExtVT)); + break; + case ISD::SRA: + // For arithmetic right shifts, we want to sign extend each even lane of R + // such that the upper half of the corresponding lane of RLo is 0 or -1 + // depending on the sign bit of the original lane. We do this using 2 + // immediate shifts. + RHi = DAG.getBitcast(ExtVT, R); + RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG); + RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG); + break; + default: + llvm_unreachable("Unexpected Shift Op"); + } + + SDValue ShiftedLo = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo)); + SDValue ShiftedHi = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi)); + + // To merge the shifted vectors back together, we select even lanes + // from ShiftedLo and odd lanes from ShiftedHi. + SDValue SelectMask = DAG.getBitcast( + MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64)); + return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi); + } + if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) { @@ -31124,19 +31373,15 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt); unsigned NumElts = VT.getVectorNumElements(); - if (Subtarget.hasVBMI2() && EltSizeInBits > 8) { - - if (IsCstSplat) { - if (IsFSHR) - std::swap(Op0, Op1); - uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits); - SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8); - return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, - {Op0, Op1, Imm}, DAG, Subtarget); - } + // For non-VLX VBMI2 targets, widen 128/256-bit to 512-bit so + // the rest of the lowering/isel can select the VBMI2 forms. + // Only Custom types (v8i16, v4i32, v2i64, v16i16, v8i32, v4i64) can + // reach LowerFunnelShift with VBMI2 but no VLX, so no type check needed. + if (Subtarget.hasVBMI2() && !Subtarget.hasVLX() && EltSizeInBits > 8) { return getAVX512Node(IsFSHR ? 
ISD::FSHR : ISD::FSHL, DL, VT, {Op0, Op1, Amt}, DAG, Subtarget); } + assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 || VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 || VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) && @@ -33001,60 +33246,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - bool isF64 = ArgVT == MVT::f64; - - RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = TLI.getLibcallName(LC); - if (!LibcallName) - return SDValue(); - - assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); - - // For MacOSX, we want to call an alternative entry point: __sincos_stret, - // which returns the values as { float, float } (in XMM0) or - // { double, double } (which is returned in XMM0, XMM1). - SDLoc dl(Op); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - TargetLowering::ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - // Only optimize x86_64 for now. i386 is a bit messy. For f32, - // the small struct {f32, f32} is returned in (eax, edx). For f64, - // the results are returned via SRet in memory. - SDValue Callee = - DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); - - Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) - : (Type *)FixedVectorType::get(ArgTy, 4); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); - - std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); - - if (isF64) - // Returned in xmm0 and xmm1. - return CallResult.first; - - // Returned in bits 0:31 and 32:64 xmm0. - SDValue SinVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(0, dl)); - SDValue CosVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(1, dl)); - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); -} - /// Widen a vector input to a vector of NVT. The /// input vector must have the same element type as NVT. 
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, @@ -33659,7 +33850,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ABDS: case ISD::ABDU: return LowerABD(Op, Subtarget, DAG); case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); @@ -33669,7 +33859,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG); case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG); case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG); - // clang-format on + case ISD::FLDEXP: return LowerFLDEXP(Op, Subtarget, DAG); + // clang-format on } } @@ -33753,6 +33944,59 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } return; } + case ISD::CTLZ: + case ISD::CTTZ: + case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_UNDEF: { + // Fold i256/i512 CTLZ/CTTZ patterns to make use of AVX512 + // vXi64 CTLZ/CTTZ and VECTOR_COMPRESS. + // Compute the CTLZ/CTTZ of each element, add the element's bit offset, + // compress the result to remove all zero elements (passthru is set to + // scalar bitwidth if all elements are zero) and extract the lowest + // compressed element. + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + assert(Subtarget.hasCDI() && "AVX512CD required"); + assert((VT == MVT::i256 || VT == MVT::i512) && "Unexpected VT!"); + if (VT == MVT::i256 && !X86::mayFoldLoad(N0, Subtarget)) + return; + + unsigned SizeInBits = VT.getSizeInBits(); + MVT VecVT = MVT::getVectorVT(MVT::i64, SizeInBits / 64); + MVT BoolVT = VecVT.changeVectorElementType(MVT::i1); + SDValue Vec = DAG.getBitcast(VecVT, N0); + + SmallVector<int, 8> RevMask; + SmallVector<SDValue, 8> Offsets; + for (unsigned I = 0, E = VecVT.getVectorNumElements(); I != E; ++I) { + RevMask.push_back((int)((E - 1) - I)); + Offsets.push_back(DAG.getConstant(I * 64, dl, MVT::i64)); + } + + // CTLZ - reverse the elements as we want the top non-zero element at the + // bottom for compression. 
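// Illustrative scalar model (not from this patch; the helper name is made up)
// of the i256 CTTZ case above, with the value split into little-endian i64
// elements: each element contributes its own count plus its bit offset,
// compression picks the first non-zero element, and for the non-ZERO_UNDEF
// forms an all-zero input yields the pass-through value (the bit width).
// CTLZ walks the elements in reverse, which is what the shuffle below does.
#include <cstdint>
unsigned cttz256Model(const uint64_t Elts[4]) {
  for (unsigned I = 0; I != 4; ++I)
    if (Elts[I] != 0)
      return I * 64 + (unsigned)__builtin_ctzll(Elts[I]); // count + element offset
  return 256; // PassThrough when every element is zero
}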
+ unsigned VecOpc = ISD::CTTZ; + if (Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF) { + VecOpc = ISD::CTLZ; + Vec = DAG.getVectorShuffle(VecVT, dl, Vec, Vec, RevMask); + } + + SDValue PassThrough = DAG.getUNDEF(VecVT); + if (Opc == ISD::CTLZ || Opc == ISD::CTTZ) + PassThrough = DAG.getConstant(SizeInBits, dl, VecVT); + + SDValue IsNonZero = DAG.getSetCC(dl, BoolVT, Vec, + DAG.getConstant(0, dl, VecVT), ISD::SETNE); + SDValue Cnt = DAG.getNode(VecOpc, dl, VecVT, Vec); + Cnt = DAG.getNode(ISD::ADD, dl, VecVT, Cnt, + DAG.getBuildVector(VecVT, dl, Offsets)); + Cnt = DAG.getNode(ISD::VECTOR_COMPRESS, dl, VecVT, Cnt, IsNonZero, + PassThrough); + Cnt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cnt, + DAG.getVectorIdxConstant(0, dl)); + Results.push_back(DAG.getZExtOrTrunc(Cnt, dl, VT)); + return; + } case ISD::MUL: { EVT VT = N->getValueType(0); assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && @@ -34928,6 +35172,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BLENDV) NODE_NAME_CASE(HADD) NODE_NAME_CASE(HSUB) + NODE_NAME_CASE(HADDS) + NODE_NAME_CASE(HSUBS) NODE_NAME_CASE(FHADD) NODE_NAME_CASE(FHSUB) NODE_NAME_CASE(CONFLICT) @@ -38165,22 +38411,22 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, default: llvm_unreachable("Unexpected instruction!"); case X86::PTCVTROWD2PSrri: - Opc = X86::TCVTROWD2PSrri; + Opc = X86::TCVTROWD2PSrti; break; case X86::PTCVTROWPS2BF16Hrri: - Opc = X86::TCVTROWPS2BF16Hrri; + Opc = X86::TCVTROWPS2BF16Hrti; break; case X86::PTCVTROWPS2PHHrri: - Opc = X86::TCVTROWPS2PHHrri; + Opc = X86::TCVTROWPS2PHHrti; break; case X86::PTCVTROWPS2BF16Lrri: - Opc = X86::TCVTROWPS2BF16Lrri; + Opc = X86::TCVTROWPS2BF16Lrti; break; case X86::PTCVTROWPS2PHLrri: - Opc = X86::TCVTROWPS2PHLrri; + Opc = X86::TCVTROWPS2PHLrti; break; case X86::PTILEMOVROWrri: - Opc = X86::TILEMOVROWrri; + Opc = X86::TILEMOVROWrti; break; } MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); @@ -38203,22 +38449,22 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, default: llvm_unreachable("Unexpected instruction!"); case X86::PTCVTROWD2PSrre: - Opc = X86::TCVTROWD2PSrre; + Opc = X86::TCVTROWD2PSrte; break; case X86::PTCVTROWPS2BF16Hrre: - Opc = X86::TCVTROWPS2BF16Hrre; + Opc = X86::TCVTROWPS2BF16Hrte; break; case X86::PTCVTROWPS2BF16Lrre: - Opc = X86::TCVTROWPS2BF16Lrre; + Opc = X86::TCVTROWPS2BF16Lrte; break; case X86::PTCVTROWPS2PHHrre: - Opc = X86::TCVTROWPS2PHHrre; + Opc = X86::TCVTROWPS2PHHrte; break; case X86::PTCVTROWPS2PHLrre: - Opc = X86::TCVTROWPS2PHLrre; + Opc = X86::TCVTROWPS2PHLrte; break; case X86::PTILEMOVROWrre: - Opc = X86::TILEMOVROWrre; + Opc = X86::TILEMOVROWrte; break; } MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); @@ -40704,8 +40950,9 @@ static SDValue canonicalizeShuffleMaskWithHorizOp( })) return SDValue(); - bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD || - Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB); + bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::FHSUB || + Opcode0 == X86ISD::HADD || Opcode0 == X86ISD::HSUB || + Opcode0 == X86ISD::HADDS || Opcode0 == X86ISD::HSUBS); bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS); if (!isHoriz && !isPack) return SDValue(); @@ -45011,11 +45258,16 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( case X86ISD::INSERTPS: case X86ISD::BLENDI: case X86ISD::PSHUFB: + case X86ISD::VZEXT_MOVL: case X86ISD::PSHUFD: + case 
X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFP: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPV: case X86ISD::VPERMILPI: + case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: { SmallVector<int, 8> Mask; @@ -45041,6 +45293,16 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( } break; } + case X86ISD::VBROADCAST: { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + if (SrcVT.isVector()) { + APInt DemandedSrc = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0); + return DAG.isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrc, PoisonOnly, + Depth + 1); + } + return DAG.isGuaranteedNotToBeUndefOrPoison(Src, PoisonOnly, Depth + 1); + } } return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( Op, DemandedElts, DAG, PoisonOnly, Depth); @@ -45085,13 +45347,19 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( // SSE target shuffles. case X86ISD::INSERTPS: case X86ISD::PSHUFB: + case X86ISD::VZEXT_MOVL: case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFP: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPV: case X86ISD::VPERMILPI: + case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: + case X86ISD::VBROADCAST: return false; // SSE comparisons handle all icmp/fcmp cases. // TODO: Add CMPM/MM with test coverage. @@ -53304,18 +53572,48 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, if (Mst->isCompressingStore()) return SDValue(); - EVT VT = Mst->getValue().getValueType(); + if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget)) + return ScalarStore; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc DL(N); - if (Mst->isTruncatingStore()) - return SDValue(); + SDValue Mask = Mst->getMask(); + SDValue Value = Mst->getValue(); + EVT MemVT = Mst->getMemoryVT(); + EVT VT = Value.getValueType(); - if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget)) - return ScalarStore; + // See if the truncating store can be a saturating truncated store. + if (Mst->isTruncatingStore()) { + if (VT.isVector() && MemVT.isVector() && VT.getScalarType().isInteger() && + MemVT.getScalarType().isInteger() && + VT.getVectorNumElements() == MemVT.getVectorNumElements() && + Subtarget.hasBWI() && Subtarget.hasVLX()) { + + SDValue SatSrc; + unsigned Opc; + if (SDValue SVal = detectSSatPattern(Value, MemVT)) { + SatSrc = SVal; + Opc = X86ISD::VMTRUNCSTORES; + } else if (SDValue UVal = detectUSatPattern(Value, MemVT, DAG, DL)) { + SatSrc = UVal; + Opc = X86ISD::VMTRUNCSTOREUS; + } else { + return SDValue(); + } + + SDVTList VTs = DAG.getVTList(MVT::Other); + SDValue Ops[] = {Mst->getChain(), SatSrc, Mst->getBasePtr(), Mask}; + MachineMemOperand *MMO = Mst->getMemOperand(); + return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO); + } + + // Otherwise don't combine if this store already truncates. + return SDValue(); + } // If the mask value has been legalized to a non-boolean vector, try to // simplify ops leading up to it. We only demand the MSB of each lane. 
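// Illustrative per-element semantics (not from this patch; helper names are
// made up) of the saturating truncating masked store formed above, shown for
// a signed and a common unsigned i16 -> i8 case; detectSSatPattern /
// detectUSatPattern look for this clamp-then-truncate shape in the stored
// value.
#include <algorithm>
#include <cstdint>
int8_t ssatTruncModel(int16_t V) {
  return (int8_t)std::clamp<int16_t>(V, INT8_MIN, INT8_MAX); // smin/smax, then trunc
}
uint8_t usatTruncModel(int16_t V) {
  return (uint8_t)std::clamp<int16_t>(V, 0, UINT8_MAX); // clamp to [0, 255], then trunc
}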
- SDValue Mask = Mst->getMask(); if (Mask.getScalarValueSizeInBits() != 1) { APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) { @@ -53331,14 +53629,12 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, Mst->getAddressingMode()); } - SDValue Value = Mst->getValue(); if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && - TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), - Mst->getMemoryVT())) { - return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), - Mst->getBasePtr(), Mst->getOffset(), Mask, - Mst->getMemoryVT(), Mst->getMemOperand(), - Mst->getAddressingMode(), true); + TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), MemVT)) { + return DAG.getMaskedStore(Mst->getChain(), DL, Value.getOperand(0), + Mst->getBasePtr(), Mst->getOffset(), Mask, MemVT, + Mst->getMemOperand(), Mst->getAddressingMode(), + true); } return SDValue(); @@ -53349,23 +53645,14 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, // i32 sub value. static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { using namespace SDPatternMatch; - - // Only handle normal stores and its chain was a matching normal load. - auto *Ld = dyn_cast<LoadSDNode>(St->getChain()); - if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld || - !ISD::isNormalLoad(Ld) || !Ld->isSimple() || - Ld->getBasePtr() != St->getBasePtr() || - Ld->getOffset() != St->getOffset()) - return SDValue(); - - SDValue LoadVal(Ld, 0); SDValue StoredVal = St->getValue(); EVT VT = StoredVal.getValueType(); - // Only narrow larger than legal scalar integers. - if (!VT.isScalarInteger() || + // Only narrow normal stores of larger than legal scalar integers. + if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() || VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32)) return SDValue(); @@ -53374,18 +53661,25 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, // BTC: X ^ (1 << ShAmt) // // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) - SDValue InsertBit, ShAmt; - if (!StoredVal.hasOneUse() || - !(sd_match(StoredVal, m_And(m_Specific(LoadVal), + SDValue SrcVal, InsertBit, ShAmt; + if (!(sd_match(StoredVal, m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || sd_match(StoredVal, - m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || + m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || sd_match(StoredVal, - m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || - sd_match(StoredVal, - m_Or(m_And(m_Specific(LoadVal), - m_Not(m_Shl(m_One(), m_Value(ShAmt)))), - m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) + m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match( + StoredVal, + m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))), + m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) + return SDValue(); + + // SrcVal must be a matching normal load further up the chain. + auto *Ld = dyn_cast<LoadSDNode>(peekThroughBitcasts(SrcVal)); + if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() || + Ld->getBasePtr() != St->getBasePtr() || + Ld->getOffset() != St->getOffset() || + !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1))) return SDValue(); // Ensure the shift amount is in bounds. 
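// Illustrative memory-level model (not from this patch; the helper name is
// made up, little-endian layout assumed) of the narrowing for the BTS-style
// pattern: setting bit ShAmt of a wide in-memory integer only changes the
// aligned 32-bit block containing that bit, so the wide store can be replaced
// by a 32-bit store of just that block at base + (ShAmt / 32) * 4.
#include <cstdint>
void setBitNarrowedModel(uint32_t *WideVal, unsigned ShAmt) {
  uint32_t *BlockPtr = WideVal + ShAmt / 32; // NewPtr: base plus aligned offset
  *BlockPtr |= (uint32_t)1 << (ShAmt % 32);  // BTS confined to one i32 block
}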
@@ -53419,7 +53713,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SDNodeFlags::NoUnsignedWrap); // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store. - SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); + SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt); X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32, @@ -53439,8 +53733,21 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); } - return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), - Align(), St->getMemOperand()->getFlags()); + SDValue NewStore = + DAG.getStore(St->getChain(), DL, Res, NewPtr, + MachinePointerInfo(St->getPointerInfo().getAddrSpace()), + Align(), St->getMemOperand()->getFlags()); + + // If there are other uses of StoredVal, replace with a new load of the + // whole (updated) value. + if (!StoredVal.hasOneUse()) { + SDValue NewLoad = + DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand()); + for (SDNode *User : StoredVal->users()) + DCI.AddToWorklist(User); + DAG.ReplaceAllUsesWith(StoredVal, NewLoad); + } + return NewStore; } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, @@ -53669,7 +53976,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } } - if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget)) + if (SDValue R = narrowBitOpRMW(St, dl, DAG, DCI, Subtarget)) return R; // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC) @@ -54006,7 +54313,9 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); unsigned Opcode = N->getOpcode(); - bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD); + bool IsAdd = + (Opcode == ISD::FADD) || (Opcode == ISD::ADD) || (Opcode == ISD::SADDSAT); + bool IsSat = (Opcode == ISD::SADDSAT) || (Opcode == ISD::SSUBSAT); SmallVector<int, 8> PostShuffleMask; auto MergableHorizOp = [N](unsigned HorizOpcode) { @@ -54036,11 +54345,17 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, break; case ISD::ADD: case ISD::SUB: - if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 || - VT == MVT::v16i16 || VT == MVT::v8i32)) { + case ISD::SADDSAT: + case ISD::SSUBSAT: + if (!Subtarget.hasSSSE3()) + break; + if (VT == MVT::v8i16 || VT == MVT::v16i16 || + (!IsSat && (VT == MVT::v4i32 || VT == MVT::v8i32))) { + SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB; + auto HorizOpcode = IsSat ? (IsAdd ? X86ISD::HADDS : X86ISD::HSUBS) + : (IsAdd ? 
X86ISD::HADD : X86ISD::HSUB); if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd, PostShuffleMask, MergableHorizOp(HorizOpcode))) { auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL, @@ -54117,11 +54432,6 @@ static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, // FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A) static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - auto AllowContract = [&DAG](const SDNodeFlags &Flags) { - return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast || - Flags.hasAllowContract(); - }; - auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) { return DAG.getTarget().Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros(); @@ -54134,7 +54444,7 @@ static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, }; if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() || - !AllowContract(N->getFlags())) + !N->getFlags().hasAllowContract()) return SDValue(); EVT VT = N->getValueType(0); @@ -54145,14 +54455,13 @@ static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(1); bool IsConj; SDValue FAddOp1, MulOp0, MulOp1; - auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract, - &IsVectorAllNegativeZero, + auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &IsVectorAllNegativeZero, &HasNoSignedZero](SDValue N) -> bool { if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST) return false; SDValue Op0 = N.getOperand(0); unsigned Opcode = Op0.getOpcode(); - if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) { + if (Op0.hasOneUse() && Op0->getFlags().hasAllowContract()) { if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) { MulOp0 = Op0.getOperand(0); MulOp1 = Op0.getOperand(1); @@ -54614,11 +54923,14 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); // Check the shift amount is byte aligned. // Check the truncation doesn't use any shifted in (zero) top bits. - // Check the shift amount doesn't depend on the original load. + // Check the shift amount doesn't depend on the original load chain. 
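// Illustrative memory picture (not from this patch; the helper name is made
// up, little-endian layout assumed) of the combine guarded below: truncating
// a byte-aligned right shift of a wide load is the same as loading the
// narrower type at base + ShAmt / 8, which is why the shift amount must be
// byte aligned and must not depend on the load being replaced.
#include <cstdint>
#include <cstring>
uint32_t truncOfShiftedLoadModel(const uint8_t *Base, unsigned ShAmtBits) {
  uint32_t Narrow;
  std::memcpy(&Narrow, Base + ShAmtBits / 8, sizeof(Narrow)); // load at NewPtr
  return Narrow;
}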
if (KnownAmt.countMinTrailingZeros() >= 3 && KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() - VT.getSizeInBits()) && - !Ld->isPredecessorOf(ShAmt.getNode())) { + none_of(Ld->uses(), [&ShAmt](SDUse &Use) { + return Use.getResNo() == 1 && + Use.getUser()->isPredecessorOf(ShAmt.getNode()); + })) { EVT PtrVT = Ld->getBasePtr().getValueType(); SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT); SDValue PtrByteOfs = @@ -54627,10 +54939,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, SDValue NewPtr = DAG.getMemBasePlusOffset( Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap); SDValue NewLoad = - DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(), + DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, + MachinePointerInfo(Ld->getPointerInfo().getAddrSpace()), Align(), Ld->getMemOperand()->getFlags()); - DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1), - NewLoad.getValue(1)); + DAG.makeEquivalentMemoryOrdering(Ld, NewLoad); return NewLoad; } } @@ -57400,6 +57712,40 @@ static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Combiner: turn uniform-constant splat funnel shifts into VSHLD/VSHRD +static SDValue combineFunnelShift(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + SDLoc DL(N); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDValue Amt = N->getOperand(2); + EVT VT = Op0.getValueType(); + + if (!VT.isVector()) + return SDValue(); + + // Only combine if the operation is legal for this type. + // This ensures we don't try to convert types that need to be + // widened/promoted. + if (!DAG.getTargetLoweringInfo().isOperationLegal(N->getOpcode(), VT)) + return SDValue(); + + unsigned EltSize = VT.getScalarSizeInBits(); + APInt ShiftVal; + if (!X86::isConstantSplat(Amt, ShiftVal)) + return SDValue(); + + uint64_t ModAmt = ShiftVal.urem(EltSize); + SDValue Imm = DAG.getTargetConstant(ModAmt, DL, MVT::i8); + bool IsFSHR = N->getOpcode() == ISD::FSHR; + + if (IsFSHR) + std::swap(Op0, Op1); + unsigned Opcode = IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD; + return DAG.getNode(Opcode, DL, VT, {Op0, Op1, Imm}); +} + static bool needCarryOrOverflowFlag(SDValue Flags) { assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!"); @@ -59086,7 +59432,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, case X86ISD::ANDNP: // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2. if (!IsSplat && (VT.is256BitVector() || - (VT.is512BitVector() && Subtarget.useAVX512Regs()))) { + (VT.is512BitVector() && Subtarget.useAVX512Regs()) || + (EltSizeInBits == 1 && TLI.isTypeLegal(VT)))) { // Don't concatenate root AVX1 NOT patterns. // TODO: Allow NOT folding if Concat0 succeeds. if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() && @@ -59096,7 +59443,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, break; SDValue Concat0 = CombineSubOperand(VT, Ops, 0); SDValue Concat1 = CombineSubOperand(VT, Ops, 1); - if (Concat0 || Concat1 || Subtarget.useAVX512Regs()) + if (Concat0 || Concat1 || + (EltSizeInBits != 1 && Subtarget.useAVX512Regs())) return DAG.getNode(Opcode, DL, VT, Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0), Concat1 ? 
Concat1 : ConcatSubOperand(VT, Ops, 1)); @@ -59156,6 +59504,31 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } } break; + case ISD::SETCC: + if (!IsSplat && EltSizeInBits == 1 && + llvm::all_of(Ops, [Op0](SDValue Op) { + return Op0.getOperand(0).getValueType() == + Op.getOperand(0).getValueType() && + Op0.getOperand(2) == Op.getOperand(2); + })) { + EVT SrcVT = Op0.getOperand(0).getValueType(); + EVT NewSrcVT = EVT::getVectorVT(Ctx, SrcVT.getScalarType(), + NumOps * SrcVT.getVectorNumElements()); + unsigned SrcSizeInBits = SrcVT.getScalarSizeInBits(); + if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(NewSrcVT) && + (NewSrcVT.is256BitVector() || + (NewSrcVT.is512BitVector() && Subtarget.useAVX512Regs() && + (SrcSizeInBits >= 32 || Subtarget.useBWIRegs())))) { + SDValue LHS = CombineSubOperand(NewSrcVT.getSimpleVT(), Ops, 0); + SDValue RHS = CombineSubOperand(NewSrcVT.getSimpleVT(), Ops, 1); + if (LHS || RHS) + return DAG.getNode(Opcode, DL, VT, + LHS ? LHS : ConcatSubOperand(NewSrcVT, Ops, 0), + RHS ? RHS : ConcatSubOperand(NewSrcVT, Ops, 1), + Op0.getOperand(2)); + } + } + break; case ISD::CTPOP: case ISD::CTTZ: case ISD::CTLZ: @@ -59219,6 +59592,36 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ConcatSubOperand(VT, Ops, 1)); } break; + case ISD::FSQRT: + case ISD::FCEIL: + case ISD::FTRUNC: + case ISD::FRINT: + case ISD::FNEARBYINT: + case ISD::FROUND: + case ISD::FROUNDEVEN: + case ISD::FFLOOR: + if (!IsSplat && (VT.is256BitVector() || + (VT.is512BitVector() && Subtarget.useAVX512Regs()))) { + return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0)); + } + break; + case X86ISD::FRCP: + case X86ISD::FRSQRT: + if (!IsSplat && VT.is256BitVector()) { + return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0)); + } + break; + case X86ISD::VRNDSCALE: + if (!IsSplat && + (VT.is256BitVector() || + (VT.is512BitVector() && Subtarget.useAVX512Regs())) && + llvm::all_of(Ops, [Op0](SDValue Op) { + return Op0.getOperand(1) == Op.getOperand(1); + })) { + return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0), + Op0.getOperand(1)); + } + break; case X86ISD::HADD: case X86ISD::HSUB: case X86ISD::FHADD: @@ -59350,8 +59753,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT, *FirstLd->getMemOperand(), &Fast) && Fast) { - if (SDValue Ld = - EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false)) + if (SDValue Ld = EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, + false, Depth + 1)) return Ld; } } @@ -59490,6 +59893,17 @@ static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, } } + // Attempt to merge comparison/logic ops if the type is legal. + if (TLI.isTypeLegal(VT) && + (all_of(Ops, [](SDValue Op) { return Op.getOpcode() == ISD::SETCC; }) || + all_of(Ops, [](SDValue Op) { + return ISD::isBitwiseLogicOp(Op.getOpcode()); + }))) { + if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, + DAG, Subtarget)) + return R; + } + // Don't do anything else for i1 vectors. 
return SDValue(); } @@ -60830,6 +61244,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget); case X86ISD::ADD: case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget); + case ISD::SADDSAT: + case ISD::SSUBSAT: return combineToHorizontalAddSub(N, DAG, Subtarget); case X86ISD::CLOAD: case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG); case X86ISD::SBB: return combineSBB(N, DAG); @@ -60953,6 +61369,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VPERM2X128: case X86ISD::SHUF128: case X86ISD::VZEXT_MOVL: + case X86ISD::COMPRESS: + case X86ISD::EXPAND: case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget); case X86ISD::FMADD_RND: case X86ISD::FMSUB: @@ -61000,6 +61418,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI); case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget); + case ISD::FSHL: + case ISD::FSHR: return combineFunnelShift(N, DAG, DCI, Subtarget); // clang-format on } @@ -61554,8 +61974,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, if (auto *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) { - Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), - Op.getValueType()); + Result = DAG.getSignedTargetConstant(C->getSExtValue(), SDLoc(Op), + Op.getValueType()); break; } } @@ -61593,7 +62013,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getSExtValue())) { // Widen to 64 bits here to get it sign extended. - Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); + Result = + DAG.getSignedTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); break; } // FIXME gcc accepts some relocatable values here too, but only in certain @@ -61642,9 +62063,11 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, BooleanContent BCont = getBooleanContents(MVT::i64); ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont) : ISD::SIGN_EXTEND; - int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue() - : CST->getSExtValue(); - Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64); + SDLoc DL(Op); + Result = + ExtOpc == ISD::ZERO_EXTEND + ? DAG.getTargetConstant(CST->getZExtValue(), DL, MVT::i64) + : DAG.getSignedTargetConstant(CST->getSExtValue(), DL, MVT::i64); break; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index b7151f6..848fe4b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -270,6 +270,10 @@ namespace llvm { HADD, HSUB, + /// Integer horizontal saturating add/sub. + HADDS, + HSUBS, + /// Floating point horizontal add/sub. FHADD, FHSUB, @@ -1478,7 +1482,7 @@ namespace llvm { /// to a MemIntrinsicNode (touches memory). If this is the case, it returns /// true and stores the intrinsic information into the IntrinsicInfo that was /// passed to the function. 
- bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index a61bbe5..8db3e50 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -553,7 +553,7 @@ static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { static Constant* SegmentOffset(IRBuilderBase &IRB, int Offset, unsigned AddressSpace) { return ConstantExpr::getIntToPtr( - ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), + ConstantInt::getSigned(Type::getInt32Ty(IRB.getContext()), Offset), IRB.getPtrTy(AddressSpace)); } diff --git a/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/llvm/lib/Target/X86/X86InsertPrefetch.cpp deleted file mode 100644 index 953b755..0000000 --- a/llvm/lib/Target/X86/X86InsertPrefetch.cpp +++ /dev/null @@ -1,259 +0,0 @@ -//===------- X86InsertPrefetch.cpp - Insert cache prefetch hints ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass applies cache prefetch instructions based on a profile. The pass -// assumes DiscriminateMemOps ran immediately before, to ensure debug info -// matches the one used at profile generation time. The profile is encoded in -// afdo format (text or binary). It contains prefetch hints recommendations. -// Each recommendation is made in terms of debug info locations, a type (i.e. -// nta, t{0|1|2}) and a delta. The debug info identifies an instruction with a -// memory operand (see X86DiscriminateMemOps). The prefetch will be made for -// a location at that memory operand + the delta specified in the -// recommendation. -// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86Subtarget.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/IR/Module.h" -#include "llvm/ProfileData/SampleProf.h" -#include "llvm/ProfileData/SampleProfReader.h" -#include "llvm/Support/VirtualFileSystem.h" -#include "llvm/Transforms/IPO/SampleProfile.h" -using namespace llvm; -using namespace sampleprof; - -static cl::opt<std::string> - PrefetchHintsFile("prefetch-hints-file", - cl::desc("Path to the prefetch hints profile. 
See also " - "-x86-discriminate-memops"), - cl::Hidden); -namespace { - -class X86InsertPrefetch : public MachineFunctionPass { - void getAnalysisUsage(AnalysisUsage &AU) const override; - bool doInitialization(Module &) override; - - bool runOnMachineFunction(MachineFunction &MF) override; - struct PrefetchInfo { - unsigned InstructionID; - int64_t Delta; - }; - typedef SmallVectorImpl<PrefetchInfo> Prefetches; - bool findPrefetchInfo(const FunctionSamples *Samples, const MachineInstr &MI, - Prefetches &prefetches) const; - -public: - static char ID; - X86InsertPrefetch(const std::string &PrefetchHintsFilename); - StringRef getPassName() const override { - return "X86 Insert Cache Prefetches"; - } - -private: - std::string Filename; - std::unique_ptr<SampleProfileReader> Reader; -}; - -using PrefetchHints = SampleRecord::CallTargetMap; - -// Return any prefetching hints for the specified MachineInstruction. The hints -// are returned as pairs (name, delta). -ErrorOr<const PrefetchHints &> -getPrefetchHints(const FunctionSamples *TopSamples, const MachineInstr &MI) { - if (const auto &Loc = MI.getDebugLoc()) - if (const auto *Samples = TopSamples->findFunctionSamples(Loc)) - return Samples->findCallTargetMapAt(FunctionSamples::getOffset(Loc), - Loc->getBaseDiscriminator()); - return std::error_code(); -} - -// The prefetch instruction can't take memory operands involving vector -// registers. -bool IsMemOpCompatibleWithPrefetch(const MachineInstr &MI, int Op) { - Register BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg(); - Register IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg(); - return (BaseReg == 0 || - X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) || - X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg)) && - (IndexReg == 0 || - X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) || - X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)); -} - -} // end anonymous namespace - -//===----------------------------------------------------------------------===// -// Implementation -//===----------------------------------------------------------------------===// - -char X86InsertPrefetch::ID = 0; - -X86InsertPrefetch::X86InsertPrefetch(const std::string &PrefetchHintsFilename) - : MachineFunctionPass(ID), Filename(PrefetchHintsFilename) {} - -/// Return true if the provided MachineInstruction has cache prefetch hints. In -/// that case, the prefetch hints are stored, in order, in the Prefetches -/// vector. -bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples, - const MachineInstr &MI, - Prefetches &Prefetches) const { - assert(Prefetches.empty() && - "Expected caller passed empty PrefetchInfo vector."); - - // There is no point to match prefetch hints if the profile is using MD5. - if (FunctionSamples::UseMD5) - return false; - - static constexpr std::pair<StringLiteral, unsigned> HintTypes[] = { - {"_nta_", X86::PREFETCHNTA}, - {"_t0_", X86::PREFETCHT0}, - {"_t1_", X86::PREFETCHT1}, - {"_t2_", X86::PREFETCHT2}, - }; - static const char *SerializedPrefetchPrefix = "__prefetch"; - - auto T = getPrefetchHints(TopSamples, MI); - if (!T) - return false; - int16_t max_index = -1; - // Convert serialized prefetch hints into PrefetchInfo objects, and populate - // the Prefetches vector. 
- for (const auto &S_V : *T) { - StringRef Name = S_V.first.stringRef(); - if (Name.consume_front(SerializedPrefetchPrefix)) { - int64_t D = static_cast<int64_t>(S_V.second); - unsigned IID = 0; - for (const auto &HintType : HintTypes) { - if (Name.consume_front(HintType.first)) { - IID = HintType.second; - break; - } - } - if (IID == 0) - return false; - uint8_t index = 0; - Name.consumeInteger(10, index); - - if (index >= Prefetches.size()) - Prefetches.resize(index + 1); - Prefetches[index] = {IID, D}; - max_index = std::max(max_index, static_cast<int16_t>(index)); - } - } - assert(max_index + 1 >= 0 && - "Possible overflow: max_index + 1 should be positive."); - assert(static_cast<size_t>(max_index + 1) == Prefetches.size() && - "The number of prefetch hints received should match the number of " - "PrefetchInfo objects returned"); - return !Prefetches.empty(); -} - -bool X86InsertPrefetch::doInitialization(Module &M) { - if (Filename.empty()) - return false; - - LLVMContext &Ctx = M.getContext(); - // TODO: Propagate virtual file system into LLVM targets. - auto FS = vfs::getRealFileSystem(); - ErrorOr<std::unique_ptr<SampleProfileReader>> ReaderOrErr = - SampleProfileReader::create(Filename, Ctx, *FS); - if (std::error_code EC = ReaderOrErr.getError()) { - std::string Msg = "Could not open profile: " + EC.message(); - Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg, - DiagnosticSeverity::DS_Warning)); - return false; - } - Reader = std::move(ReaderOrErr.get()); - Reader->read(); - return true; -} - -void X86InsertPrefetch::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); -} - -bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) { - if (!Reader) - return false; - const FunctionSamples *Samples = Reader->getSamplesFor(MF.getFunction()); - if (!Samples) - return false; - - bool Changed = false; - - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - SmallVector<PrefetchInfo, 4> Prefetches; - for (auto &MBB : MF) { - for (auto MI = MBB.instr_begin(); MI != MBB.instr_end();) { - auto Current = MI; - ++MI; - - int Offset = X86II::getMemoryOperandNo(Current->getDesc().TSFlags); - if (Offset < 0) - continue; - unsigned Bias = X86II::getOperandBias(Current->getDesc()); - int MemOpOffset = Offset + Bias; - // FIXME(mtrofin): ORE message when the recommendation cannot be taken. - if (!IsMemOpCompatibleWithPrefetch(*Current, MemOpOffset)) - continue; - Prefetches.clear(); - if (!findPrefetchInfo(Samples, *Current, Prefetches)) - continue; - assert(!Prefetches.empty() && - "The Prefetches vector should contain at least a value if " - "findPrefetchInfo returned true."); - for (auto &PrefInfo : Prefetches) { - unsigned PFetchInstrID = PrefInfo.InstructionID; - int64_t Delta = PrefInfo.Delta; - const MCInstrDesc &Desc = TII->get(PFetchInstrID); - MachineInstr *PFetch = - MF.CreateMachineInstr(Desc, Current->getDebugLoc(), true); - MachineInstrBuilder MIB(MF, PFetch); - - static_assert(X86::AddrBaseReg == 0 && X86::AddrScaleAmt == 1 && - X86::AddrIndexReg == 2 && X86::AddrDisp == 3 && - X86::AddrSegmentReg == 4, - "Unexpected change in X86 operand offset order."); - - // This assumes X86::AddBaseReg = 0, {...}ScaleAmt = 1, etc. - // FIXME(mtrofin): consider adding a: - // MachineInstrBuilder::set(unsigned offset, op). 
- MIB.addReg(Current->getOperand(MemOpOffset + X86::AddrBaseReg).getReg()) - .addImm( - Current->getOperand(MemOpOffset + X86::AddrScaleAmt).getImm()) - .addReg( - Current->getOperand(MemOpOffset + X86::AddrIndexReg).getReg()) - .addImm(Current->getOperand(MemOpOffset + X86::AddrDisp).getImm() + - Delta) - .addReg(Current->getOperand(MemOpOffset + X86::AddrSegmentReg) - .getReg()); - - if (!Current->memoperands_empty()) { - MachineMemOperand *CurrentOp = *(Current->memoperands_begin()); - MIB.addMemOperand(MF.getMachineMemOperand( - CurrentOp, CurrentOp->getOffset() + Delta, CurrentOp->getSize())); - } - - // Insert before Current. This is because Current may clobber some of - // the registers used to describe the input memory operand. - MBB.insert(Current, PFetch); - Changed = true; - } - } - } - return Changed; -} - -FunctionPass *llvm::createX86InsertPrefetchPass() { - return new X86InsertPrefetch(PrefetchHintsFile); -} diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 522782a..6b8b8f7 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -370,11 +370,11 @@ let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { multiclass m_tcvtrowd2ps { let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in { let SchedRW = [WriteSystem] in { - def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst), + def rti : Ii8<0x7, MRMSrcReg, (outs VR512:$dst), (ins TILE:$src1, i32u8imm:$src2), "tcvtrowd2ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, TA,XS, EVEX, EVEX_V512; - def rre : I<0x4A, MRMSrcReg4VOp3, (outs VR512:$dst), + def rte : I<0x4A, MRMSrcReg4VOp3, (outs VR512:$dst), (ins TILE:$src1, GR32:$src2), "tcvtrowd2ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, T8,XS, EVEX, VVVV, EVEX_V512; @@ -450,12 +450,12 @@ multiclass AMXAVX512_BASE<bits<8> Opcode1, bits<8> Opcode2, string Opstr, Prefix P1, Prefix P2> { let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode], SchedRW = [WriteSystem] in { let OpPrefix = P1 in - def rre : I<Opcode1, MRMSrcReg4VOp3, (outs VR512:$dst), + def rte : I<Opcode1, MRMSrcReg4VOp3, (outs VR512:$dst), (ins TILE:$src1, GR32:$src2), !strconcat(Opstr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX, VVVV, EVEX_V512, T8; let OpPrefix = P2 in - def rri : Ii8<Opcode2, MRMSrcReg, (outs VR512:$dst), + def rti : Ii8<Opcode2, MRMSrcReg, (outs VR512:$dst), (ins TILE:$src1, i32u8imm:$src2), !strconcat(Opstr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX, EVEX_V512, TA; @@ -475,22 +475,22 @@ defm TCVTROWPS2PHL : AMXAVX512_BASE<0x6d, 0x77, "tcvtrowps2phl", PD, XD>; defm TCVTROWPS2BF16H : AMXAVX512_BASE<0x6d, 0x07, "tcvtrowps2bf16h", XD, XD>; defm TCVTROWPS2BF16L : AMXAVX512_BASE<0x6d, 0x77, "tcvtrowps2bf16l", XS, XS>; -multiclass m_tilemovrow { +multiclass AMXAVX512_TILEMOVE<bits<8> Opcode1, bits<8> Opcode2, string Opstr> { let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in { let SchedRW = [WriteSystem] in { - def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst), + def rti : Ii8<Opcode1, MRMSrcReg, (outs VR512:$dst), (ins TILE:$src1, u8imm:$src2), - "tilemovrow\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, TA,PD, EVEX, EVEX_V512; - def rre : I<0x4A, MRMSrcReg4VOp3, (outs VR512:$dst), + !strconcat(Opstr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, TA, PD, EVEX, EVEX_V512; + def rte : I<Opcode2, MRMSrcReg4VOp3, (outs VR512:$dst), (ins TILE:$src1, GR32:$src2), - "tilemovrow\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, T8,PD, EVEX, VVVV, EVEX_V512; + 
!strconcat(Opstr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, T8, PD, EVEX, VVVV, EVEX_V512; } } // HasAMXAVX512, HasAVX10_2, In64BitMode } -defm TILEMOVROW : m_tilemovrow; +defm TILEMOVROW : AMXAVX512_TILEMOVE<0x07, 0x4A, "tilemovrow">; let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in { let SchedRW = [WriteSystem] in { diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 1b748b7..e8fda82 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -300,6 +300,12 @@ def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "", [(set VR512:$dst, (v16i32 immAllZerosV))]>; def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "", [(set VR512:$dst, (v16i32 immAllOnesV))]>; +let AddedComplexity = 1, Predicates = [HasVLX] in { + def AVX512_128_SETALLONES : I<0, Pseudo, (outs VR128X:$dst), (ins), + "", [(set VR128X:$dst, (v4i32 immAllOnesV))]>; + def AVX512_256_SETALLONES : I<0, Pseudo, (outs VR256X:$dst), (ins), + "", [(set VR256X:$dst, (v8i32 immAllOnesV))]>; +} } let Predicates = [HasAVX512] in { @@ -3161,6 +3167,12 @@ multiclass avx512_mask_setop_w<SDPatternOperator Val> { defm KSET0 : avx512_mask_setop_w<immAllZerosV>; defm KSET1 : avx512_mask_setop_w<immAllOnesV>; +// 8-bit mask set operations for AVX512DQ +let Predicates = [HasDQI] in { + defm KSET0B : avx512_mask_setop<VK8, v8i1, immAllZerosV>; + defm KSET1B : avx512_mask_setop<VK8, v8i1, immAllOnesV>; +} + // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. let Predicates = [HasAVX512] in { def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; @@ -3173,6 +3185,34 @@ let Predicates = [HasAVX512] in { def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>; } +// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper +// bits +let Predicates = [HasDQI] in { + def : Pat<(v8i1 immAllZerosV), (KSET0B)>; + def : Pat<(v8i1 immAllOnesV), (KSET1B)>; +} + +// Optimize bitconvert of all-ones constants to use kxnor instructions +let Predicates = [HasDQI] in { + def : Pat<(v8i1(bitconvert(i8 255))), (KSET1B)>; + def : Pat<(v16i1(bitconvert(i16 255))), (COPY_TO_REGCLASS(KSET1B), VK16)>; +} +let Predicates = [HasBWI] in { + def : Pat<(v32i1(bitconvert(i32 -1))), (KSET1D)>; + def : Pat<(v64i1(bitconvert(i64 -1))), (KSET1Q)>; +} +// Submask patterns: lower N bits set in larger mask registers +let Predicates = [HasBWI, HasDQI] in { + // v32i1 submasks + def : Pat<(v32i1(bitconvert(i32 255))), (COPY_TO_REGCLASS(KSET1B), VK32)>; + def : Pat<(v32i1(bitconvert(i32 65535))), (COPY_TO_REGCLASS(KSET1W), VK32)>; + // v64i1 submasks + def : Pat<(v64i1(bitconvert(i64 255))), (COPY_TO_REGCLASS(KSET1B), VK64)>; + def : Pat<(v64i1(bitconvert(i64 65535))), (COPY_TO_REGCLASS(KSET1W), VK64)>; + def : Pat<(v64i1(bitconvert(i64 4294967295))), (COPY_TO_REGCLASS(KSET1D), + VK64)>; +} + // Patterns for kmask insert_subvector/extract_subvector to/from index=0 multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subVT, RegisterClass RC, ValueType VT> { diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td index 7d5d7cf..b1599f2 100644 --- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -150,7 +150,7 @@ let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1 in { // SetZUCC and promoted SetCC instructions. 
let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1, - hasSideEffects = 0, Predicates = [In64BitMode], Predicates = [HasNDD] in { + hasSideEffects = 0, Predicates = [In64BitMode] in { def SETZUCCr : I<0x40, MRMXrCC, (outs GR8:$dst), (ins ccode:$cond), "setzu${cond}\t$dst", []>, XD, ZU, NoCD8, Sched<[WriteSETCC]>; @@ -167,6 +167,10 @@ let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1, } } +let Predicates = [HasZU] in + def : Pat<(i32 (zext (X86setcc timm:$cond, EFLAGS))), + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), (SETZUCCr ccode:$cond), sub_8bit)>; + // SALC is an undocumented instruction. Information for this instruction can be found // here http://www.rcollins.org/secrets/opcodes/SALC.html // Set AL if carry. diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 5321ecf..0803a49 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -71,6 +71,8 @@ def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>; def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; +def X86hadds : SDNode<"X86ISD::HADDS", SDTIntBinOp>; +def X86hsubs : SDNode<"X86ISD::HSUBS", SDTIntBinOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86FCmp>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86FCmp>; def X86comi512 : SDNode<"X86ISD::COMX", SDTX86FCmp>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 6b2a7a4..ebed733 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -85,7 +85,7 @@ static cl::opt<unsigned> UndefRegClearance( void X86InstrInfo::anchor() {} X86InstrInfo::X86InstrInfo(const X86Subtarget &STI) - : X86GenInstrInfo(STI, + : X86GenInstrInfo(STI, RI, (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 @@ -93,10 +93,9 @@ X86InstrInfo::X86InstrInfo(const X86Subtarget &STI) X86::CATCHRET, (STI.is64Bit() ? X86::RET64 : X86::RET32)), Subtarget(STI), RI(STI.getTargetTriple()) {} -const TargetRegisterClass * -X86InstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { - auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum, TRI); +const TargetRegisterClass *X86InstrInfo::getRegClass(const MCInstrDesc &MCID, + unsigned OpNum) const { + auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum); // If the target does not have egpr, then r16-r31 will be resereved for all // instructions. 
if (!RC || !Subtarget.hasEGPR()) @@ -779,6 +778,8 @@ bool X86InstrInfo::isReMaterializableImpl( case X86::AVX512_128_SET0: case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: + case X86::AVX512_128_SETALLONES: + case X86::AVX512_256_SETALLONES: case X86::AVX512_512_SETALLONES: case X86::AVX512_FsFLD0SD: case X86::AVX512_FsFLD0SH: @@ -789,9 +790,11 @@ bool X86InstrInfo::isReMaterializableImpl( case X86::FsFLD0SS: case X86::FsFLD0SH: case X86::FsFLD0F128: + case X86::KSET0B: case X86::KSET0D: case X86::KSET0Q: case X86::KSET0W: + case X86::KSET1B: case X86::KSET1D: case X86::KSET1Q: case X86::KSET1W: @@ -958,8 +961,7 @@ bool X86InstrInfo::isReMaterializableImpl( void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const { + const MachineInstr &Orig) const { bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI); if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) != MachineBasicBlock::LQR_Dead) { @@ -4294,10 +4296,11 @@ static unsigned CopyToFromAsymmetricReg(Register DestReg, Register SrcReg, if (X86::VR128XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg)) - // Copy from a VR128 register to a VR128 register. + // Copy from a GR32 register to a VR128 register. return HasAVX512 ? X86::VMOVDI2PDIZrr : HasAVX ? X86::VMOVDI2PDIrr : X86::MOVDI2PDIrr; + return 0; } @@ -4366,6 +4369,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (X86::VK16RegClass.contains(DestReg, SrcReg)) Opc = Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVQkk) : (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVWkk); + if (!Opc) Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget); @@ -4782,14 +4786,14 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, void X86InstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, - MachineInstr::MIFlag Flags) const { + + Register VReg, MachineInstr::MIFlag Flags) const { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); - assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && + assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) && "Stack slot too small for store"); - unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); + unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx)); @@ -4803,15 +4807,17 @@ void X86InstrInfo::storeRegToStackSlot( .setMIFlag(Flags); } -void X86InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + Register DestReg, int FrameIdx, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); - assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && + assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) && "Load size exceeds stack slot"); - unsigned Alignment = 
std::max<uint32_t>(TRI->getSpillSize(*RC), 16); + unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx)); @@ -5553,7 +5559,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, return false; ShouldUpdateCC = true; } else if (ImmDelta != 0) { - unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg)); + unsigned BitWidth = RI.getRegSizeInBits(*MRI->getRegClass(SrcReg)); // Shift amount for min/max constants to adjust for 8/16/32 instruction // sizes. switch (OldCC) { @@ -6244,9 +6250,31 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf); return true; } + case X86::AVX512_128_SETALLONES: + case X86::AVX512_256_SETALLONES: case X86::AVX512_512_SETALLONES: { Register Reg = MIB.getReg(0); - MIB->setDesc(get(X86::VPTERNLOGDZrri)); + unsigned Opc; + switch (MI.getOpcode()) { + case X86::AVX512_128_SETALLONES: { + if (X86::VR128RegClass.contains(Reg)) + return Expand2AddrUndef(MIB, get(X86::VPCMPEQDrr)); + + Opc = X86::VPTERNLOGDZ128rri; + break; + } + case X86::AVX512_256_SETALLONES: { + if (X86::VR256RegClass.contains(Reg)) + return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); + + Opc = X86::VPTERNLOGDZ256rri; + break; + } + case X86::AVX512_512_SETALLONES: + Opc = X86::VPTERNLOGDZrri; + break; + } + MIB->setDesc(get(Opc)); // VPTERNLOGD needs 3 register inputs and an immediate. // 0xff will return 1s for any input. MIB.addReg(Reg, RegState::Undef) @@ -6352,12 +6380,16 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { // registers, since it is not usable as a write mask. // FIXME: A more advanced approach would be to choose the best input mask // register based on context. 
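// Illustrative model (not from this patch; the helper name is made up) of why
// the KSET1B expansion below can use KXNORBkk on K0: a register XNORed with
// itself is always all ones, just as a register XORed with itself (KSET0B) is
// always zero.
#include <cstdint>
uint8_t kset1bModel(uint8_t K) {
  return (uint8_t)~(K ^ K); // 0xff regardless of the input value
}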
+ case X86::KSET0B: + return Expand2AddrKreg(MIB, get(X86::KXORBkk), X86::K0); case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0); case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0); case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0); + case X86::KSET1B: + return Expand2AddrKreg(MIB, get(X86::KXNORBkk), X86::K0); case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0); case X86::KSET1D: @@ -7235,7 +7267,6 @@ static void updateOperandRegConstraints(MachineFunction &MF, MachineInstr &NewMI, const TargetInstrInfo &TII) { MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) { MachineOperand &MO = NewMI.getOperand(Idx); @@ -7247,7 +7278,7 @@ static void updateOperandRegConstraints(MachineFunction &MF, continue; auto *NewRC = - MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI)); + MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx)); if (!NewRC) { LLVM_DEBUG( dbgs() << "WARNING: Unable to update register constraint for operand " @@ -7345,7 +7376,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( unsigned SrcIdx = (Imm >> 6) & 3; const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && (MI.getOpcode() != X86::INSERTPSrri || Alignment >= Align(4))) { @@ -7370,7 +7401,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( // TODO: In most cases AVX doesn't have a 8-byte alignment requirement. if (OpNum == 2) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) { unsigned NewOpCode = @@ -7389,7 +7420,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( // table twice. if (OpNum == 2) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) { MachineInstr *NewMI = @@ -7524,7 +7555,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( bool NarrowToMOV32rm = false; if (Size) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. 
@@ -8118,9 +8149,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( RC == &X86::VK32WMRegClass || RC == &X86::VK64WMRegClass; }; - if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1, &RI))) + if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1))) MaskReg = Op1.getReg(); - else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2, &RI))) + else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2))) MaskReg = Op2.getReg(); if (MaskReg) { @@ -8185,6 +8216,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX1_SETALLONES: case X86::AVX_SET0: case X86::AVX512_256_SET0: + case X86::AVX512_256_SETALLONES: Alignment = Align(32); break; case X86::V_SET0: @@ -8192,6 +8224,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX512_128_SET0: case X86::FsFLD0F128: case X86::AVX512_FsFLD0F128: + case X86::AVX512_128_SETALLONES: Alignment = Align(16); break; case X86::MMX_SET0: @@ -8250,6 +8283,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX512_128_SET0: case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: + case X86::AVX512_128_SETALLONES: + case X86::AVX512_256_SETALLONES: case X86::AVX512_512_SETALLONES: case X86::FsFLD0SH: case X86::AVX512_FsFLD0SH: @@ -8310,6 +8345,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( break; case X86::AVX1_SETALLONES: case X86::AVX2_SETALLONES: + case X86::AVX512_256_SETALLONES: IsAllOnes = true; [[fallthrough]]; case X86::AVX512_256_SET0: @@ -8323,6 +8359,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( 2); break; case X86::V_SETALLONES: + case X86::AVX512_128_SETALLONES: IsAllOnes = true; [[fallthrough]]; case X86::V_SET0: @@ -8524,7 +8561,7 @@ bool X86InstrInfo::unfoldMemoryOperand( const MCInstrDesc &MCID = get(Opc); - const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI); + const TargetRegisterClass *RC = getRegClass(MCID, Index); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); // TODO: Check if 32-byte or greater accesses are slow too? if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass && @@ -8635,7 +8672,7 @@ bool X86InstrInfo::unfoldMemoryOperand( // Emit the store instruction. 
if (UnfoldStore) { - const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI); + const TargetRegisterClass *DstRC = getRegClass(MCID, 0); auto MMOs = extractStoreMMOs(MI.memoperands(), MF); unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16); bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; @@ -8667,7 +8704,7 @@ bool X86InstrInfo::unfoldMemoryOperand( const MCInstrDesc &MCID = get(Opc); MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI); + const TargetRegisterClass *RC = getRegClass(MCID, Index); unsigned NumDefs = MCID.NumDefs; std::vector<SDValue> AddrOps; std::vector<SDValue> BeforeOps; @@ -8718,7 +8755,7 @@ bool X86InstrInfo::unfoldMemoryOperand( std::vector<EVT> VTs; const TargetRegisterClass *DstRC = nullptr; if (MCID.getNumDefs() > 0) { - DstRC = getRegClass(MCID, 0, &RI); + DstRC = getRegClass(MCID, 0); VTs.push_back(*TRI.legalclasstypes_begin(*DstRC)); } for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 5f75559..a547fcd 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -246,9 +246,8 @@ public: /// GR*RegClass (definition in TD file) /// -> /// GR*_NOREX2RegClass (Returned register class) - const TargetRegisterClass * - getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const override; + const TargetRegisterClass *getRegClass(const MCInstrDesc &MCID, + unsigned OpNum) const override; /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should @@ -343,8 +342,7 @@ public: bool isReMaterializableImpl(const MachineInstr &MI) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const override; + const MachineInstr &Orig) const override; /// Given an operand within a MachineInstr, insert preceding code to put it /// into the right format for a particular kind of LEA instruction. 
This may @@ -469,14 +467,14 @@ public: bool RenamableSrc = false) const override; void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadStoreTileReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 806b02b9..e4aaa1e 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -4864,12 +4864,12 @@ let isCommutable = 0 in { defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", int_x86_ssse3_psign_d_128, SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG; - defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", - int_x86_ssse3_phadd_sw_128, - SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG; - defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", - int_x86_ssse3_phsub_sw_128, - SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG; + defm VPHADDSW : SS3I_binop_rm<0x03, "vphaddsw", X86hadds, v8i16, v8i16, VR128, + load, i128mem, + SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG; + defm VPHSUBSW : SS3I_binop_rm<0x07, "vphsubsw", X86hsubs, v8i16, v8i16, VR128, + load, i128mem, + SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG; } } @@ -4907,12 +4907,12 @@ let isCommutable = 0 in { SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG; defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d, SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG; - defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", - int_x86_avx2_phadd_sw, - SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG; - defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", - int_x86_avx2_phsub_sw, - SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG; + defm VPHADDSWY : SS3I_binop_rm<0x03, "vphaddsw", X86hadds, v16i16, v16i16, + VR256, load, i256mem, + SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG; + defm VPHSUBSWY : SS3I_binop_rm<0x07, "vphsubsw", X86hsubs, v16i16, v16i16, + VR256, load, i256mem, + SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG; } } @@ -4935,12 +4935,10 @@ let isCommutable = 0 in { SchedWriteVecALU.XMM, memop>; defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128, memop, i128mem, SchedWriteVarShuffle.XMM>; - defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", - int_x86_ssse3_phadd_sw_128, - SchedWritePHAdd.XMM, memop>; - defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", - int_x86_ssse3_phsub_sw_128, - SchedWritePHAdd.XMM, memop>; + defm PHADDSW : SS3I_binop_rm<0x03, "phaddsw", X86hadds, v8i16, v8i16, VR128, + memop, i128mem, SchedWritePHAdd.XMM>; + defm PHSUBSW : SS3I_binop_rm<0x07, "phsubsw", X86hsubs, v8i16, v8i16, VR128, + memop, i128mem, SchedWritePHAdd.XMM>; defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16, v16i8, VR128, memop, i128mem, SchedWriteVecIMul.XMM>; diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 0f725a8..88ade87 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -534,7 +534,7 @@ static const IntrinsicData 
IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx10_mask_vcvttpd2qqs_round_512, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2SIS, X86ISD::CVTTP2SIS_SAE), X86_INTRINSIC_DATA(avx10_mask_vcvttpd2udqs_128, CVTPD2DQ_MASK, - X86ISD::CVTTP2UIS, X86ISD::MCVTTP2SIS), + X86ISD::CVTTP2UIS, X86ISD::MCVTTP2UIS), X86_INTRINSIC_DATA(avx10_mask_vcvttpd2udqs_256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2UIS, 0), X86_INTRINSIC_DATA(avx10_mask_vcvttpd2udqs_round_512, INTR_TYPE_1OP_MASK, @@ -724,8 +724,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(avx2_phadd_sw, INTR_TYPE_2OP, X86ISD::HADDS, 0), X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(avx2_phsub_sw, INTR_TYPE_2OP, X86ISD::HSUBS, 0), X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0), X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0), X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0), @@ -2017,11 +2019,13 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(ssse3_phadd_d, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(ssse3_phadd_sw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_phadd_sw_128, INTR_TYPE_2OP, X86ISD::HADDS, 0), X86_INTRINSIC_DATA(ssse3_phadd_w, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(ssse3_phsub_d, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), X86_INTRINSIC_DATA(ssse3_phsub_sw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_phsub_sw_128, INTR_TYPE_2OP, X86ISD::HSUBS, 0), X86_INTRINSIC_DATA(ssse3_phsub_w, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw, INTR_TYPE_CAST_MMX, 0, 0), diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index 090060e..3b96e70 100644 --- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -115,9 +115,9 @@ struct MachineGadgetGraph : ImmutableGraph<MachineInstr *, int> { static constexpr MachineInstr *const ArgNodeSentinel = nullptr; using GraphT = ImmutableGraph<MachineInstr *, int>; - using Node = typename GraphT::Node; - using Edge = typename GraphT::Edge; - using size_type = typename GraphT::size_type; + using Node = GraphT::Node; + using Edge = GraphT::Edge; + using size_type = GraphT::size_type; MachineGadgetGraph(std::unique_ptr<Node[]> Nodes, std::unique_ptr<Edge[]> Edges, size_type NodesSize, size_type EdgesSize, int NumFences = 0, int NumGadgets = 0) @@ -191,10 +191,10 @@ template <> struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits { using GraphType = MachineGadgetGraph; using Traits = llvm::GraphTraits<GraphType *>; - using NodeRef = typename Traits::NodeRef; - using EdgeRef = typename Traits::EdgeRef; - using ChildIteratorType = typename Traits::ChildIteratorType; - using ChildEdgeIteratorType = typename Traits::ChildEdgeIteratorType; + using NodeRef = Traits::NodeRef; + using EdgeRef = Traits::EdgeRef; + using ChildIteratorType = 
Traits::ChildIteratorType; + using ChildEdgeIteratorType = Traits::ChildEdgeIteratorType; DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} @@ -227,9 +227,6 @@ struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits { } // end namespace llvm -constexpr MachineInstr *MachineGadgetGraph::ArgNodeSentinel; -constexpr int MachineGadgetGraph::GadgetEdgeSentinel; - char X86LoadValueInjectionLoadHardeningPass::ID = 0; void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage( @@ -335,7 +332,7 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph( L.computePhiInfo(); GraphBuilder Builder; - using GraphIter = typename GraphBuilder::BuilderNodeRef; + using GraphIter = GraphBuilder::BuilderNodeRef; DenseMap<MachineInstr *, GraphIter> NodeMap; int FenceCount = 0, GadgetCount = 0; auto MaybeAddNode = [&NodeMap, &Builder](MachineInstr *MI) { diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp index 7f33939..662aec2 100644 --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -23,12 +23,15 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -40,7 +43,7 @@ using namespace llvm; using namespace PatternMatch; -#define DEBUG_TYPE "lower-amx-intrinsics" +#define DEBUG_TYPE "x86-lower-amx-intrinsics" #ifndef NDEBUG static bool isV256I32Ty(Type *Ty) { @@ -627,6 +630,37 @@ bool X86LowerAMXIntrinsics::visit() { } namespace { +bool shouldRunLowerAMXIntrinsics(const Function &F, const TargetMachine *TM) { + return X86ScalarizeAMX && (F.hasFnAttribute(Attribute::OptimizeNone) || + TM->getOptLevel() == CodeGenOptLevel::None); +} + +bool runLowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) { + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + + X86LowerAMXIntrinsics LAT(F, DTU, LI); + return LAT.visit(); +} +} // namespace + +PreservedAnalyses X86LowerAMXIntrinsicsPass::run(Function &F, + FunctionAnalysisManager &FAM) { + if (!shouldRunLowerAMXIntrinsics(F, TM)) + return PreservedAnalyses::all(); + + DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); + LoopInfo &LI = FAM.getResult<LoopAnalysis>(F); + bool Changed = runLowerAMXIntrinsics(F, &DT, &LI); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + return PA; +} + +namespace { class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { public: static char ID; @@ -634,21 +668,15 @@ public: X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {} bool runOnFunction(Function &F) override { - if (!X86ScalarizeAMX) - return false; TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); - if (!F.hasFnAttribute(Attribute::OptimizeNone) && - TM->getOptLevel() != CodeGenOptLevel::None) + if (!shouldRunLowerAMXIntrinsics(F, TM)) return false; auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); auto *DT = DTWP ? 
&DTWP->getDomTree() : nullptr; auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - - X86LowerAMXIntrinsics LAT(F, DTU, LI); - return LAT.visit(); + return runLowerAMXIntrinsics(F, DT, LI); } StringRef getPassName() const override { return "Lower AMX intrinsics"; } @@ -668,6 +696,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName, false, false) -FunctionPass *llvm::createX86LowerAMXIntrinsicsPass() { +FunctionPass *llvm::createX86LowerAMXIntrinsicsLegacyPass() { return new X86LowerAMXIntrinsicsLegacyPass(); } diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp index 167bed1..c964605 100644 --- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp @@ -359,7 +359,7 @@ bool X86OptimizeLEAPass::chooseBestLEA( // example MOV8mr_NOREX. We could constrain the register class of the LEA // def to suit MI, however since this case is very rare and hard to // reproduce in a test it's just more reliable to skip the LEA. - if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI) != + if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg) != MRI->getRegClass(DefMI->getOperand(0).getReg())) continue; diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index a25e4e0..898c83c 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -16,10 +16,12 @@ #include "X86TargetMachine.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" @@ -30,39 +32,44 @@ using namespace llvm; namespace { -class X86PartialReduction : public FunctionPass { +class X86PartialReduction { + const X86TargetMachine *TM; const DataLayout *DL = nullptr; const X86Subtarget *ST = nullptr; public: + X86PartialReduction(const X86TargetMachine *TM) : TM(TM) {} + bool run(Function &F); + +private: + bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB); + bool trySADReplacement(Instruction *Op); +}; + +class X86PartialReductionLegacy : public FunctionPass { +public: static char ID; // Pass identification, replacement for typeid. 
- X86PartialReduction() : FunctionPass(ID) { } + X86PartialReductionLegacy() : FunctionPass(ID) {} - bool runOnFunction(Function &Fn) override; + bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } - StringRef getPassName() const override { - return "X86 Partial Reduction"; - } - -private: - bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB); - bool trySADReplacement(Instruction *Op); + StringRef getPassName() const override { return "X86 Partial Reduction"; } }; } -FunctionPass *llvm::createX86PartialReductionPass() { - return new X86PartialReduction(); +FunctionPass *llvm::createX86PartialReductionLegacyPass() { + return new X86PartialReductionLegacy(); } -char X86PartialReduction::ID = 0; +char X86PartialReductionLegacy::ID = 0; -INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE, - "X86 Partial Reduction", false, false) +INITIALIZE_PASS(X86PartialReductionLegacy, DEBUG_TYPE, "X86 Partial Reduction", + false, false) // This function should be aligned with detectExtMul() in X86ISelLowering.cpp. static bool matchVPDPBUSDPattern(const X86Subtarget *ST, BinaryOperator *Mul, @@ -494,17 +501,8 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) { } } -bool X86PartialReduction::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); - if (!TPC) - return false; - - auto &TM = TPC->getTM<X86TargetMachine>(); - ST = TM.getSubtargetImpl(F); - +bool X86PartialReduction::run(Function &F) { + ST = TM->getSubtargetImpl(F); DL = &F.getDataLayout(); bool MadeChange = false; @@ -540,3 +538,25 @@ bool X86PartialReduction::runOnFunction(Function &F) { return MadeChange; } + +bool X86PartialReductionLegacy::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + return X86PartialReduction(&TPC->getTM<X86TargetMachine>()).run(F); +} + +PreservedAnalyses X86PartialReductionPass::run(Function &F, + FunctionAnalysisManager &FAM) { + bool Changed = X86PartialReduction(TM).run(F); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def index fc25d55..b80ad38 100644 --- a/llvm/lib/Target/X86/X86PassRegistry.def +++ b/llvm/lib/Target/X86/X86PassRegistry.def @@ -15,20 +15,23 @@ #ifndef FUNCTION_PASS #define FUNCTION_PASS(NAME, CREATE_PASS) #endif +FUNCTION_PASS("x86-lower-amx-intrinsics", X86LowerAMXIntrinsicsPass(this)) FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this)) +FUNCTION_PASS("x86-partial-reduction", X86PartialReductionPass(this)) #undef FUNCTION_PASS #ifndef DUMMY_FUNCTION_PASS #define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS) #endif -DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this)) -DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction()) DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass()) #undef DUMMY_FUNCTION_PASS #ifndef MACHINE_FUNCTION_PASS #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) #endif +MACHINE_FUNCTION_PASS("x86-avoid-trailing-call", X86AvoidTrailingCallPass()) +MACHINE_FUNCTION_PASS("x86-dyn-alloca-expander", X86DynAllocaExpanderPass()) +MACHINE_FUNCTION_PASS("x86-fp-stackifier", X86FPStackifierPass()) MACHINE_FUNCTION_PASS("x86-isel", X86ISelDAGToDAGPass(*this)) #undef 
MACHINE_FUNCTION_PASS @@ -36,13 +39,10 @@ MACHINE_FUNCTION_PASS("x86-isel", X86ISelDAGToDAGPass(*this)) #define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME) #endif DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-SFB", X86AvoidSFBPass()) -DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-trailing-call", X86AvoidTrailingCallPass()) DUMMY_MACHINE_FUNCTION_PASS("x86-cf-opt", X86CallFrameOptimization()) DUMMY_MACHINE_FUNCTION_PASS("x86-cmov-conversion", X86CmovConverterPass()) -DUMMY_MACHINE_FUNCTION_PASS("x86-codege", FPS()) DUMMY_MACHINE_FUNCTION_PASS("x86-compress-evex", CompressEVEXPass()) DUMMY_MACHINE_FUNCTION_PASS("x86-domain-reassignment", X86DomainReassignment()) -DUMMY_MACHINE_FUNCTION_PASS("x86-dyn-alloca-expander", X86DynAllocaExpander()) DUMMY_MACHINE_FUNCTION_PASS("x86-execution-domain-fix", X86ExecutionDomainFix()) DUMMY_MACHINE_FUNCTION_PASS("fastpretileconfig", X86FastPreTileConfig()) DUMMY_MACHINE_FUNCTION_PASS("fasttileconfig", X86FastTileConfig()) diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp index e0b3b61..829a32e 100644 --- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -54,7 +54,6 @@ #include <cassert> #include <iterator> #include <optional> -#include <utility> using namespace llvm; @@ -841,7 +840,7 @@ getRegClassForUnfoldedLoad(const X86InstrInfo &TII, unsigned Opcode) { unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold( Opcode, /*UnfoldLoad*/ true, /*UnfoldStore*/ false, &Index); const MCInstrDesc &MCID = TII.get(UnfoldedOpc); - return TII.getRegClass(MCID, Index, &TII.getRegisterInfo()); + return TII.getRegClass(MCID, Index); } void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads( diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 66d9e74..61f288f 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -32,7 +32,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/TargetParser/Triple.h" diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 9a76abc..713df63 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -50,7 +50,6 @@ #include "llvm/Transforms/CFGuard.h" #include <memory> #include <optional> -#include <string> using namespace llvm; @@ -77,7 +76,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() { initializeFixupBWInstPassPass(PR); initializeCompressEVEXPassPass(PR); initializeFixupLEAPassPass(PR); - initializeFPSPass(PR); + initializeX86FPStackifierLegacyPass(PR); initializeX86FixupSetCCPassPass(PR); initializeX86CallFrameOptimizationPass(PR); initializeX86CmovConverterPassPass(PR); @@ -90,14 +89,14 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() { initializeX86ExecutionDomainFixPass(PR); initializeX86DomainReassignmentPass(PR); initializeX86AvoidSFBPassPass(PR); - initializeX86AvoidTrailingCallPassPass(PR); + initializeX86AvoidTrailingCallLegacyPassPass(PR); initializeX86SpeculativeLoadHardeningPassPass(PR); initializeX86SpeculativeExecutionSideEffectSuppressionPass(PR); initializeX86FlagsCopyLoweringPassPass(PR); initializeX86LoadValueInjectionLoadHardeningPassPass(PR); initializeX86LoadValueInjectionRetHardeningPassPass(PR); initializeX86OptimizeLEAPassPass(PR); - 
initializeX86PartialReductionPass(PR); + initializeX86PartialReductionLegacyPass(PR); initializePseudoProbeInserterPass(PR); initializeX86ReturnThunksPass(PR); initializeX86DAGToDAGISelLegacyPass(PR); @@ -105,7 +104,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() { initializeX86AsmPrinterPass(PR); initializeX86FixupInstTuningPassPass(PR); initializeX86FixupVectorConstantsPassPass(PR); - initializeX86DynAllocaExpanderPass(PR); + initializeX86DynAllocaExpanderLegacyPass(PR); initializeX86SuppressAPXForRelocationPassPass(PR); initializeX86WinEHUnwindV2Pass(PR); } @@ -422,14 +421,14 @@ void X86PassConfig::addIRPasses() { // We add both pass anyway and when these two passes run, we skip the pass // based on the option level and option attribute. - addPass(createX86LowerAMXIntrinsicsPass()); + addPass(createX86LowerAMXIntrinsicsLegacyPass()); addPass(createX86LowerAMXTypeLegacyPass()); TargetPassConfig::addIRPasses(); if (TM->getOptLevel() != CodeGenOptLevel::None) { addPass(createInterleavedAccessPass()); - addPass(createX86PartialReductionPass()); + addPass(createX86PartialReductionLegacyPass()); } // Add passes that handle indirect branch removal and insertion of a retpoline @@ -517,7 +516,7 @@ void X86PassConfig::addPreRegAlloc() { addPass(createX86SpeculativeLoadHardeningPass()); addPass(createX86FlagsCopyLoweringPass()); - addPass(createX86DynAllocaExpander()); + addPass(createX86DynAllocaExpanderLegacyPass()); if (getOptLevel() != CodeGenOptLevel::None) addPass(createX86PreTileConfigPass()); @@ -532,7 +531,7 @@ void X86PassConfig::addMachineSSAOptimization() { void X86PassConfig::addPostRegAlloc() { addPass(createX86LowerTileCopyPass()); - addPass(createX86FloatingPointStackifierPass()); + addPass(createX86FPStackifierLegacyPass()); // When -O0 is enabled, the Load Value Injection Hardening pass will fall back // to using the Speculative Execution Side Effect Suppression pass for // mitigation. This is to prevent slow downs due to @@ -564,8 +563,6 @@ void X86PassConfig::addPreEmitPass() { addPass(createX86FixupVectorConstants()); } addPass(createX86CompressEVEXPass()); - addPass(createX86DiscriminateMemOpsPass()); - addPass(createX86InsertPrefetchPass()); addPass(createX86InsertX87waitPass()); } @@ -589,7 +586,7 @@ void X86PassConfig::addPreEmitPass2() { // Insert extra int3 instructions after trailing call instructions to avoid // issues in the unwinder. 
if (TT.isOSWindows() && TT.isX86_64()) - addPass(createX86AvoidTrailingCallPass()); + addPass(createX86AvoidTrailingCallLegacyPass()); // Verify basic block incoming and outgoing cfa offset and register values and // correct CFA calculation rule where needed by inserting appropriate CFI diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 3d8d0a23..9fb9791 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -5411,9 +5411,28 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } InstructionCost -X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, - unsigned AddressSpace, +X86TTIImpl::getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const { + switch (MICA.getID()) { + case Intrinsic::masked_scatter: + case Intrinsic::masked_gather: + return getGatherScatterOpCost(MICA, CostKind); + case Intrinsic::masked_load: + case Intrinsic::masked_store: + return getMaskedMemoryOpCost(MICA, CostKind); + } + return BaseT::getMemIntrinsicInstrCost(MICA, CostKind); +} + +InstructionCost +X86TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load + : Instruction::Store; + Type *SrcTy = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned AddressSpace = MICA.getAddressSpace(); + bool IsLoad = (Instruction::Load == Opcode); bool IsStore = (Instruction::Store == Opcode); @@ -6253,10 +6272,15 @@ InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, } /// Calculate the cost of Gather / Scatter operation -InstructionCost X86TTIImpl::getGatherScatterOpCost( - unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask, - Align Alignment, TTI::TargetCostKind CostKind, - const Instruction *I = nullptr) const { +InstructionCost +X86TTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const { + bool IsLoad = MICA.getID() == Intrinsic::masked_gather || + MICA.getID() == Intrinsic::vp_gather; + unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store; + Type *SrcVTy = MICA.getDataType(); + const Value *Ptr = MICA.getPointer(); + Align Alignment = MICA.getAlignment(); if ((Opcode == Instruction::Load && (!isLegalMaskedGather(SrcVTy, Align(Alignment)) || forceScalarizeMaskedGather(cast<VectorType>(SrcVTy), @@ -6265,8 +6289,7 @@ InstructionCost X86TTIImpl::getGatherScatterOpCost( (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) || forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy), Align(Alignment))))) - return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask, - Alignment, CostKind, I); + return BaseT::getMemIntrinsicInstrCost(MICA, CostKind); assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); @@ -6317,7 +6340,8 @@ static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST) { } bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment, - unsigned AddressSpace) const { + unsigned AddressSpace, + TTI::MaskKind MaskKind) const { Type *ScalarTy = DataTy->getScalarType(); // The backend can't handle a single element vector w/o CFCMOV. 
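The cost-model hunks above collapse the separate masked load/store and gather/scatter hooks into one getMemIntrinsicInstrCost entry point that switches on the intrinsic ID carried by MemIntrinsicCostAttributes. A self-contained sketch of that dispatch shape; the struct, enum, and cost numbers below are stand-ins, not the real TTI interface:

#include <cstdint>
#include <iostream>

enum class MemIntrinsic { MaskedLoad, MaskedStore, MaskedGather, MaskedScatter };

struct MemIntrinsicCostAttrs {   // models MemIntrinsicCostAttributes: ID plus operands
  MemIntrinsic Kind;
  const char *DataType;          // e.g. "<8 x i32>"
  unsigned Alignment;            // bytes
  unsigned AddressSpace;
};

// Contiguous masked ops: cheap when naturally aligned (made-up numbers).
uint64_t maskedMemoryOpCost(const MemIntrinsicCostAttrs &A) {
  return A.Alignment >= 4 ? 2 : 4;
}

// Per-lane addressed ops: markedly more expensive (made-up number).
uint64_t gatherScatterOpCost(const MemIntrinsicCostAttrs &) { return 16; }

uint64_t memIntrinsicCost(const MemIntrinsicCostAttrs &A) {
  switch (A.Kind) {
  case MemIntrinsic::MaskedLoad:
  case MemIntrinsic::MaskedStore:
    return maskedMemoryOpCost(A);     // route to the masked load/store model
  case MemIntrinsic::MaskedGather:
  case MemIntrinsic::MaskedScatter:
    return gatherScatterOpCost(A);    // route to the gather/scatter model
  }
  return 1;                           // anything else: defer to a base model
}

int main() {
  std::cout << memIntrinsicCost({MemIntrinsic::MaskedLoad, "<8 x i32>", 32, 0}) << "\n"
            << memIntrinsicCost({MemIntrinsic::MaskedGather, "<8 x i32>", 4, 0}) << "\n";
}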
@@ -6330,7 +6354,8 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment, } bool X86TTIImpl::isLegalMaskedStore(Type *DataTy, Align Alignment, - unsigned AddressSpace) const { + unsigned AddressSpace, + TTI::MaskKind MaskKind) const { Type *ScalarTy = DataTy->getScalarType(); // The backend can't handle a single element vector w/o CFCMOV. @@ -6562,7 +6587,7 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, bool X86TTIImpl::areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const { + ArrayRef<Type *> Types) const { if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) return false; @@ -6647,10 +6672,12 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( LegalVT.getVectorNumElements()); InstructionCost MemOpCost; bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps; - if (UseMaskedMemOp) - MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment, - AddressSpace, CostKind); - else + if (UseMaskedMemOp) { + unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load + : Intrinsic::masked_store; + MemOpCost = getMaskedMemoryOpCost( + {IID, SingleMemOpTy, Alignment, AddressSpace}, CostKind); + } else MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace, CostKind); @@ -7223,3 +7250,19 @@ bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I, return false; } + +bool X86TTIImpl::useFastCCForInternalCall(Function &F) const { + bool HasEGPR = ST->hasEGPR(); + const TargetMachine &TM = getTLI()->getTargetMachine(); + + for (User *U : F.users()) { + CallBase *CB = dyn_cast<CallBase>(U); + if (!CB || CB->getCalledOperand() != &F) + continue; + Function *CallerFunc = CB->getFunction(); + if (TM.getSubtarget<X86Subtarget>(*CallerFunc).hasEGPR() != HasEGPR) + return false; + } + + return true; +} diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 133b366..4f67279 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -183,14 +183,12 @@ public: TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I = nullptr) const override; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, - TTI::TargetCostKind CostKind) const override; - InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, - const Value *Ptr, bool VariableMask, - Align Alignment, - TTI::TargetCostKind CostKind, - const Instruction *I) const override; + getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const override; + InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const; + InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const; InstructionCost getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, @@ -268,10 +266,14 @@ public: bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override; bool canMacroFuseCmp() const override; - bool isLegalMaskedLoad(Type *DataType, Align Alignment, - unsigned AddressSpace) const override; - bool isLegalMaskedStore(Type *DataType, Align Alignment, - unsigned AddressSpace) const override; + bool + isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace, + 
TTI::MaskKind MaskKind = + TTI::MaskKind::VariableOrConstantMask) const override; + bool + isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddressSpace, + TTI::MaskKind MaskKind = + TTI::MaskKind::VariableOrConstantMask) const override; bool isLegalNTLoad(Type *DataType, Align Alignment) const override; bool isLegalNTStore(Type *DataType, Align Alignment) const override; bool isLegalBroadcastLoad(Type *ElementTy, @@ -296,7 +298,7 @@ public: bool areInlineCompatible(const Function *Caller, const Function *Callee) const override; bool areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Type) const override; + ArrayRef<Type *> Type) const override; uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override { return ST->getMaxInlineSizeThreshold(); } @@ -319,6 +321,8 @@ public: unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const override; + bool useFastCCForInternalCall(Function &F) const override; + private: bool supportsGather() const; InstructionCost getGSVectorCost(unsigned Opcode, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp index f6f7e92..2f28ab3 100644 --- a/llvm/lib/Target/X86/X86VZeroUpper.cpp +++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -66,7 +66,7 @@ namespace { MachineBasicBlock &MBB); void addDirtySuccessor(MachineBasicBlock &MBB); - using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY }; + enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY }; static const char* getBlockExitStateName(BlockExitState ST);
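useFastCCForInternalCall (implemented in the X86TargetTransformInfo.cpp hunk above, declared here) only opts an internal function into the fast calling convention when every direct caller's subtarget agrees with the callee about EGPR availability, since a mismatch would change which registers the convention can assume. A standalone sketch of that caller-consistency walk, using a hypothetical Func type in place of llvm::Function and X86Subtarget:

#include <iostream>
#include <vector>

struct Func {
  bool HasEGPR;                       // does this function's subtarget enable EGPR?
  std::vector<const Func *> Callers;  // functions holding direct calls to this one
};

bool useFastCCForInternalCall(const Func &F) {
  for (const Func *Caller : F.Callers)
    if (Caller->HasEGPR != F.HasEGPR)
      return false;                   // subtargets disagree: keep the default CC
  return true;
}

int main() {
  Func A{true, {}}, B{false, {}};
  Func Callee{true, {&A}};
  std::cout << useFastCCForInternalCall(Callee) << "\n";  // 1: all callers match
  Callee.Callers.push_back(&B);
  std::cout << useFastCCForInternalCall(Callee) << "\n";  // 0: mixed EGPR support
}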

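The X86PassRegistry.def hunk above is an X-macro table: each include site defines FUNCTION_PASS / MACHINE_FUNCTION_PASS to extract whichever view of the registered passes it needs, and moving an entry out of a DUMMY_* section (as done here for x86-lower-amx-intrinsics and x86-partial-reduction) is what makes a newly ported pass visible to the new-pass-manager parser. A generic, self-contained illustration of the pattern; the table below is a made-up three-entry stand-in, kept in a macro instead of a .def file so the sketch fits in one translation unit:

#include <iostream>

#define X86_PASS_TABLE                                           \
  FUNCTION_PASS("x86-lower-amx-intrinsics", LowerAMXIntrinsics)  \
  FUNCTION_PASS("x86-lower-amx-type", LowerAMXType)              \
  FUNCTION_PASS("x86-partial-reduction", PartialReduction)

int main() {
  // View 1: print every registered pass name.
#define FUNCTION_PASS(NAME, CREATE_PASS) std::cout << NAME << "\n";
  X86_PASS_TABLE
#undef FUNCTION_PASS

  // View 2: count the entries without naming them.
#define FUNCTION_PASS(NAME, CREATE_PASS) +1
  int NumPasses = 0 X86_PASS_TABLE;
#undef FUNCTION_PASS
  std::cout << NumPasses << " function passes\n";
}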