Diffstat (limited to 'llvm/lib/Target/X86')
 llvm/lib/Target/X86/X86ISelLowering.cpp     |  38
 llvm/lib/Target/X86/X86ISelLowering.h       |   1
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp |   9
 llvm/lib/Target/X86/X86InstrAVX512.td       |   2
 llvm/lib/Target/X86/X86ScheduleZnver4.td    | 110
 5 files changed, 92 insertions(+), 68 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b5f8ee5..410f20e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2632,6 +2632,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(Op, MVT::f32, Promote);
   }
 
+  setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
+  setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
+  setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
+
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
                        ISD::SCALAR_TO_VECTOR,
@@ -20558,7 +20562,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
 
   // NOTE: By using fsub of a positive constant instead of fadd of a negative
-  // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
+  // constant, we avoid reassociation in MachineCombiner when reassoc is
   // enabled. See PR24512.
   SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
   // TODO: Are there any fast-math-flags to propagate here?
@@ -29516,11 +29520,8 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
   if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
     return MinMax;
 
-  if (DAG.isKnownNeverNaN(NewX))
-    NewX = NewY;
-
-  SDValue IsNaN =
-      DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
+  SDValue NaNSrc = IsNum ? MinMax : NewX;
+  SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NaNSrc, NaNSrc, ISD::SETUO);
   return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
 }
 
@@ -57616,10 +57617,10 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
   }
 
   // Fold any similar generic ADD/SUB opcodes to reuse this node.
-  auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
+  auto MatchGeneric = [&](unsigned Opc, SDValue N0, SDValue N1, bool Negate) {
     SDValue Ops[] = {N0, N1};
     SDVTList VTs = DAG.getVTList(N->getValueType(0));
-    if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
+    if (SDNode *GenericAddSub = DAG.getNodeIfExists(Opc, VTs, Ops)) {
       SDValue Op(N, 0);
       if (Negate) {
         // Bail if this is only used by a user of the x86 add/sub.
@@ -57631,8 +57632,25 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
       DCI.CombineTo(GenericAddSub, Op);
     }
   };
-  MatchGeneric(LHS, RHS, false);
-  MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
+  MatchGeneric(GenericOpc, LHS, RHS, false);
+  MatchGeneric(GenericOpc, RHS, LHS, X86ISD::SUB == N->getOpcode());
+
+  if (auto *Const = dyn_cast<ConstantSDNode>(RHS)) {
+    SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT);
+    if (X86ISD::SUB == N->getOpcode()) {
+      // Fold generic add(LHS, -C) to X86ISD::SUB(LHS, C).
+      MatchGeneric(ISD::ADD, LHS, NegC, false);
+    } else {
+      // Negate X86ISD::ADD(LHS, C) and replace generic sub(-C, LHS).
+      MatchGeneric(ISD::SUB, NegC, LHS, true);
+    }
+  } else if (auto *Const = dyn_cast<ConstantSDNode>(LHS)) {
+    if (X86ISD::SUB == N->getOpcode()) {
+      SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT);
+      // Negate X86ISD::SUB(C, RHS) and replace generic add(RHS, -C).
+      MatchGeneric(ISD::ADD, RHS, NegC, true);
+    }
+  }
 
   // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
   // EFLAGS result doesn't change.
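The new constant branches in combineX86AddSub lean on the two's-complement identity x - C == x + (-C): a generic node that computes the same value as the flag-producing X86 node, just with the constant negated, can still be folded into it. A minimal standalone sketch of that identity (illustrative names, not code from the patch):

#include <cassert>
#include <cstdint>

// x - C and x + (-C) produce identical bit patterns in two's complement,
// wraparound included; unsigned arithmetic keeps the wraparound
// well-defined in C++.
static uint64_t subConst(uint64_t X, uint64_t C) { return X - C; }
static uint64_t addNegConst(uint64_t X, uint64_t C) { return X + (0 - C); }

int main() {
  const uint64_t Xs[] = {0, 1, 42, ~0ULL, 1ULL << 63};
  const uint64_t Cs[] = {0, 1, 5, 1000, ~0ULL};
  for (uint64_t X : Xs)
    for (uint64_t C : Cs)
      assert(subConst(X, C) == addNegConst(X, C));
  return 0;
}

This is the reason the combine may replace a generic add(LHS, -C) with an existing X86ISD::SUB(LHS, C), and handle the swapped-operand cases with one extra negation of the X86 result.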
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index e28b9c1..b7151f6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1592,7 +1592,6 @@ namespace llvm {
     bool useLoadStackGuardNode(const Module &M) const override;
     bool useStackGuardXorFP() const override;
     void insertSSPDeclarations(Module &M) const override;
-    Function *getSSPStackGuardCheck(const Module &M) const override;
 
     SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
                                 const SDLoc &DL) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 37d7772..a61bbe5 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -640,15 +640,6 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
   TargetLowering::insertSSPDeclarations(M);
 }
 
-Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
-  // MSVC CRT has a function to validate security cookie.
-  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
-      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
-    return M.getFunction("__security_check_cookie");
-  }
-  return TargetLowering::getSSPStackGuardCheck(M);
-}
-
 Value *
 X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
   // Android provides a fixed TLS slot for the SafeStack pointer. See the
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 83bd6ac..1b748b7 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -5519,7 +5519,7 @@ defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86any_fmin, X86fmins,
                                X86fminSAEs, SchedWriteFCmpSizes, 0>;
 defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86any_fmax, X86fmaxs,
                                X86fmaxSAEs, SchedWriteFCmpSizes, 0>;
 
-// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
+// MIN/MAX nodes are commutable under (nnan + ninf). In this case we use
 // X86fminc and X86fmaxc instead of X86fmin and X86fmax
 multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
                                     X86VectorVTInfo _, SDNode OpNode,
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index cc30054..ac4d31d 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -15,7 +15,7 @@
 //===----------------------------------------------------------------------===//
 
 def Znver4Model : SchedMachineModel {
-  // AMD SOG Zen4, 2.9.6 Dispatch
+  // AMD SOG Zen4, 2.9.8 Dispatch
   // The processor may dispatch up to 6 macro ops per cycle
   // into the execution engine.
   let IssueWidth = 6;
@@ -46,8 +46,9 @@ def Znver4Model : SchedMachineModel {
   int VecLoadLatency = 7;
   // Latency of a simple store operation.
   int StoreLatency = 1;
-  // FIXME:
-  let HighLatency = 25; // FIXME: any better choice?
+  // Mean and median value for all instructions with latencies >6
+  // Source: Zen4 Instruction Latencies spreadsheet (included with SOG)
+  let HighLatency = 13;
   // AMD SOG Zen4, 2.8 Optimizing Branching
   // The branch misprediction penalty is in the range from 11 to 18 cycles,
   // <...>. The common case penalty is 13 cycles.
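The replacement HighLatency value above is described as the mean and median of all instruction latencies above 6 cycles from AMD's spreadsheet. A rough sketch of that derivation, with a made-up latency table standing in for the real data:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Stand-in per-instruction latencies; the real input is the Zen4
  // instruction-latency spreadsheet distributed with the SOG.
  std::vector<int> Lat = {1, 1, 2, 3, 4, 7, 8, 10, 12, 13, 13, 14, 18, 25};

  std::vector<int> High;
  for (int L : Lat)
    if (L > 6) // keep only the high-latency population
      High.push_back(L);

  double Mean = 0;
  for (int L : High)
    Mean += L;
  Mean /= High.size();

  std::sort(High.begin(), High.end());
  int Median = High[High.size() / 2];

  // Per the comment in the patch, both statistics come out at 13 for the
  // real spreadsheet, hence HighLatency = 13.
  std::printf("mean=%.1f median=%d\n", Mean, Median);
  return 0;
}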
@@ -612,6 +613,7 @@ def Zn4WriteLEA : SchedWriteVariant<[
 
 def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
 
+// values from uops.info
 def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
   let Latency = 2; // FIXME: not from llvm-exegesis
   let ReleaseAtCycles = [4];
@@ -659,15 +661,15 @@ def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
 
 def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
   let Latency = 3; // FIXME: not from llvm-exegesis
-  let ReleaseAtCycles = [24];
-  let NumMicroOps = 19;
+  let ReleaseAtCycles = [20];
+  let NumMicroOps = 15;
 }
 def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
 
 def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
-  let Latency = 4; // FIXME: not from llvm-exegesis
-  let ReleaseAtCycles = [59];
-  let NumMicroOps = 28;
+  let Latency = 2; // FIXME: not from llvm-exegesis
+  let ReleaseAtCycles = [40];
+  let NumMicroOps = 26;
 }
 def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
@@ -681,7 +683,7 @@ def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16a
 def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
   let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
   let ReleaseAtCycles = [1, 1, 2];
-  let NumMicroOps = 5;
+  let NumMicroOps = 2;
 }
 def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
@@ -693,19 +695,17 @@ def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]>
 def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
 
 // Integer division.
-// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
-// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
-defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
-defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
-defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
-defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;
-
-defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
-defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
+defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 9, [9], 2>;
+defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 10, [10], 2>;
+defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 12, [12], 2>;
+defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 18, [18], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 9, [9], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 10, [10], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 12, [12], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 18, [18], 2>;
+
+defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan forward.
+defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan reverse.
 
 defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.
@@ -725,12 +725,12 @@ def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
 }
 def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;
 
-defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.
+defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 1, [1], 1>; // Trailing zero count.
 def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
-  let Latency = 2;
-  let ReleaseAtCycles = [4];
-  let NumMicroOps = 2;
+  let Latency = 1;
+  let ReleaseAtCycles = [1];
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;
@@ -1109,15 +1109,31 @@ def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
 }
 def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;
 
-def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
-  // TODO: All align instructions are expected to be of 4 cycle latency
-  let Latency = 4;
+// 128-bit VALIGN
+def Zn4WriteXMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+  let Latency = 2;
   let ReleaseAtCycles = [1];
   let NumMicroOps = 1;
 }
-def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
-                                            VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
-            >;
+
+// 256-bit VALIGN
+def Zn4WriteYMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+  let Latency = 3;
+  let ReleaseAtCycles = [1];
+  let NumMicroOps = 1;
+}
+
+// 512-bit VALIGN
+def Zn4WriteZMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+  let Latency = 4;
+  let ReleaseAtCycles = [2];
+  let NumMicroOps = 1;
+}
+
+def : InstRW<[Zn4WriteXMMVecALU2Slow], (instrs VALIGNDZ128rri, VALIGNQZ128rri)>;
+def : InstRW<[Zn4WriteYMMVecALU2Slow], (instrs VALIGNDZ256rri, VALIGNQZ256rri)>;
+def : InstRW<[Zn4WriteZMMVecALU2Slow], (instrs VALIGNDZrri, VALIGNQZrri)>;
 
 defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
 
 def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
@@ -1326,9 +1342,9 @@ def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
 
 // Strings instructions.
 // Packed Compare Implicit Length Strings, Return Mask
-defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
+defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 7, [8], 3, /*LoadUOps=*/1>;
 // Packed Compare Explicit Length Strings, Return Mask
-defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
+defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 7, [12], 7, /*LoadUOps=*/5>;
 // Packed Compare Implicit Length Strings, Return Index
 defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
 // Packed Compare Explicit Length Strings, Return Index
@@ -1340,7 +1356,7 @@ defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn
 defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.
 
 // Carry-less multiplication instructions.
-defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
+defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [3], 4>;
 
 // EMMS/FEMMS
 defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
@@ -1386,44 +1402,44 @@ def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
 def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rmi)>;
 
 def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
-  let Latency = 7;
+  let Latency = 4;
   let ReleaseAtCycles = [1];
-  let NumMicroOps = 2;
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
 
 def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
   let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMPSYrr.Latency);
-  let ReleaseAtCycles = [1, 1, 2];
-  let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
+  let ReleaseAtCycles = [1, 1, 1];
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
 
 def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
-  let Latency = 6;
+  let Latency = 4;
   let ReleaseAtCycles = [1];
-  let NumMicroOps = 2;
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
 
 def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
   let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMYri.Latency);
-  let ReleaseAtCycles = [1, 1, 2];
-  let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
+  let ReleaseAtCycles = [1, 1, 1];
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
 
 def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
-  let Latency = 5;
+  let Latency = 4;
   let ReleaseAtCycles = [1];
-  let NumMicroOps = 2;
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
 
 def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
   let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMDYrr.Latency);
-  let ReleaseAtCycles = [1, 1, 2];
-  let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
+  let ReleaseAtCycles = [1, 1, 1];
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;
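Many of the scheduler edits above change ReleaseAtCycles rather than Latency. In LLVM's machine model that field sets how long each issued op keeps its resource group busy, so an entry's reciprocal throughput is bounded by its most contended group: busy cycles divided by the number of units in the group. A back-of-the-envelope sketch (the unit counts here are assumptions for illustration, not values from the patch):

#include <algorithm>
#include <cstdio>

struct ResourceUse {
  const char *Group;   // scheduler resource group
  int ReleaseAtCycles; // cycles the group stays busy per issued op
  int NumUnits;        // pipes in the group -- assumed, not from the patch
};

// Rough reciprocal throughput: the most contended group dominates.
double rThroughput(const ResourceUse *Uses, int N) {
  double R = 0.0;
  for (int I = 0; I < N; ++I)
    R = std::max(R, double(Uses[I].ReleaseAtCycles) / Uses[I].NumUnits);
  return R;
}

int main() {
  // WriteCLMul after the patch: ReleaseAtCycles [4] -> [3] on Zn4FPCLM01,
  // treated here as a 2-unit group (pipes 0 and 1).
  ResourceUse CLMul[] = {{"Zn4FPCLM01", 3, 2}};
  std::printf("carry-less multiply rthroughput ~%.1f cycles\n",
              rThroughput(CLMul, 1));
  return 0;
}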
