Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r--  llvm/lib/Target/X86/X86FixupInstTuning.cpp      54
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp          32
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h             3
-rw-r--r--  llvm/lib/Target/X86/X86InstrAVX512.td            48
-rw-r--r--  llvm/lib/Target/X86/X86InstrArithmetic.td        23
-rw-r--r--  llvm/lib/Target/X86/X86InstrFragmentsSIMD.td     10
6 files changed, 101 insertions, 69 deletions
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 33dc0a2..a1d4e0b 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -277,6 +277,22 @@ bool X86FixupInstTuningPass::processInstruction(
     return true;
   };

+  // Is ADD(X,X) more efficient than SHL(X,1)?
+  auto ProcessShiftLeftToAdd = [&](unsigned AddOpc) -> bool {
+    if (MI.getOperand(NumOperands - 1).getImm() != 1)
+      return false;
+    if (!NewOpcPreferable(AddOpc, /*ReplaceInTie*/ true))
+      return false;
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      MI.setDesc(TII->get(AddOpc));
+      MI.removeOperand(NumOperands - 1);
+      MI.addOperand(MI.getOperand(NumOperands - 2));
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
+    return false;
+  };
+
   switch (Opc) {
   case X86::BLENDPDrri:
     return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
@@ -563,6 +579,44 @@ bool X86FixupInstTuningPass::processInstruction(
     return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz);
   case X86::VUNPCKHPSZrmkz:
     return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz);
+
+  case X86::PSLLWri:
+    return ProcessShiftLeftToAdd(X86::PADDWrr);
+  case X86::VPSLLWri:
+    return ProcessShiftLeftToAdd(X86::VPADDWrr);
+  case X86::VPSLLWYri:
+    return ProcessShiftLeftToAdd(X86::VPADDWYrr);
+  case X86::VPSLLWZ128ri:
+    return ProcessShiftLeftToAdd(X86::VPADDWZ128rr);
+  case X86::VPSLLWZ256ri:
+    return ProcessShiftLeftToAdd(X86::VPADDWZ256rr);
+  case X86::VPSLLWZri:
+    return ProcessShiftLeftToAdd(X86::VPADDWZrr);
+  case X86::PSLLDri:
+    return ProcessShiftLeftToAdd(X86::PADDDrr);
+  case X86::VPSLLDri:
+    return ProcessShiftLeftToAdd(X86::VPADDDrr);
+  case X86::VPSLLDYri:
+    return ProcessShiftLeftToAdd(X86::VPADDDYrr);
+  case X86::VPSLLDZ128ri:
+    return ProcessShiftLeftToAdd(X86::VPADDDZ128rr);
+  case X86::VPSLLDZ256ri:
+    return ProcessShiftLeftToAdd(X86::VPADDDZ256rr);
+  case X86::VPSLLDZri:
+    return ProcessShiftLeftToAdd(X86::VPADDDZrr);
+  case X86::PSLLQri:
+    return ProcessShiftLeftToAdd(X86::PADDQrr);
+  case X86::VPSLLQri:
+    return ProcessShiftLeftToAdd(X86::VPADDQrr);
+  case X86::VPSLLQYri:
+    return ProcessShiftLeftToAdd(X86::VPADDQYrr);
+  case X86::VPSLLQZ128ri:
+    return ProcessShiftLeftToAdd(X86::VPADDQZ128rr);
+  case X86::VPSLLQZ256ri:
+    return ProcessShiftLeftToAdd(X86::VPADDQZ256rr);
+  case X86::VPSLLQZri:
+    return ProcessShiftLeftToAdd(X86::VPADDQZrr);
+
   default:
     return false;
   }
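A standalone sketch of the identity the new ProcessShiftLeftToAdd hook relies on (not part of the patch; it uses standard SSE2 intrinsics and shows only the 16-bit case): shifting every lane left by one produces exactly the same bits as adding the register to itself, so the by-1 PSLL/VPSLL forms can be retuned to PADD/VPADD whenever NewOpcPreferable reports the add as at least as cheap on the scheduler model.

// shl1_vs_add.cpp - illustration only, not part of the LLVM change.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <emmintrin.h> // SSE2: _mm_slli_epi16 / _mm_add_epi16

int main() {
  alignas(16) uint16_t lanes[8] = {0, 1, 2, 0x7FFF, 0x8000, 0xFFFF, 0x1234, 0xAAAA};
  __m128i x = _mm_load_si128(reinterpret_cast<const __m128i *>(lanes));

  __m128i shifted = _mm_slli_epi16(x, 1); // what PSLLW $1, %xmm computes
  __m128i added = _mm_add_epi16(x, x);    // what PADDW %xmm, %xmm computes

  alignas(16) uint16_t s[8], a[8];
  _mm_store_si128(reinterpret_cast<__m128i *>(s), shifted);
  _mm_store_si128(reinterpret_cast<__m128i *>(a), added);
  std::printf("lanes identical: %s\n",
              std::memcmp(s, a, sizeof(s)) == 0 ? "yes" : "no");
  return 0;
}

The two results match in every lane, wrapping lanes included, which is why the rewrite is purely a tuning decision rather than a correctness transform.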
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index efeddd7..cd04ff5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4456,8 +4456,8 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
                          bool AllowAVX512 = true) {
   assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
   unsigned NumSubs = 1;
-  if ((CheckBWI && Subtarget.useBWIRegs()) ||
-      (!CheckBWI && AllowAVX512 && Subtarget.useAVX512Regs())) {
+  if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
+                      (!CheckBWI && Subtarget.useAVX512Regs()))) {
     if (VT.getSizeInBits() > 512) {
       NumSubs = VT.getSizeInBits() / 512;
       assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
@@ -30313,22 +30313,8 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,

   uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();

-  if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
-    // Hardware support for vector shifts is sparse which makes us scalarize the
-    // vector operations in many cases. Also, on sandybridge ADD is faster than
-    // shl: (shl V, 1) -> (add (freeze V), (freeze V))
-    if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
-      // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
-      // must be 0). (add undef, undef) however can be any value. To make this
-      // safe, we must freeze R to ensure that register allocation uses the same
-      // register for an undefined value. This ensures that the result will
-      // still be even and preserves the original semantics.
-      R = DAG.getFreeze(R);
-      return DAG.getNode(ISD::ADD, dl, VT, R, R);
-    }
-
+  if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
     return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
-  }

   // i64 SRA needs to be performed as partial shifts.
   if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
@@ -31229,16 +31215,16 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
   unsigned NumElts = VT.getVectorNumElements();

   if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
-    if (IsFSHR)
-      std::swap(Op0, Op1);
     if (IsCstSplat) {
+      if (IsFSHR)
+        std::swap(Op0, Op1);
       uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
       SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
       return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
                            {Op0, Op1, Imm}, DAG, Subtarget);
     }
-    return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
+    return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT,
                          {Op0, Op1, Amt}, DAG, Subtarget);
   }

   assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
@@ -35153,8 +35139,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(VALIGN)
   NODE_NAME_CASE(VSHLD)
   NODE_NAME_CASE(VSHRD)
-  NODE_NAME_CASE(VSHLDV)
-  NODE_NAME_CASE(VSHRDV)
   NODE_NAME_CASE(PSHUFD)
   NODE_NAME_CASE(PSHUFHW)
   NODE_NAME_CASE(PSHUFLW)
@@ -45185,6 +45169,7 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
   case X86ISD::Wrapper:
   case X86ISD::WrapperRIP:
     return true;
+  case X86ISD::INSERTPS:
   case X86ISD::BLENDI:
   case X86ISD::PSHUFB:
   case X86ISD::PSHUFD:
@@ -45255,6 +45240,7 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
   case X86ISD::BLENDV:
     return false;
   // SSE target shuffles.
+  case X86ISD::INSERTPS:
   case X86ISD::PSHUFB:
   case X86ISD::PSHUFD:
   case X86ISD::UNPCKL:
@@ -46211,7 +46197,7 @@ static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,

   SDValue Zero = DAG.getConstant(0, DL, DpVT);
   return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
-                          DpBuilder, false);
+                          DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI());
 }

 // Create a PSADBW given two sources representable as zexts of vXi8.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 8ab8c66..b55556a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -471,8 +471,7 @@ namespace llvm {
     // VBMI2 Concat & Shift.
     VSHLD,
     VSHRD,
-    VSHLDV,
-    VSHRDV,
+
     // Shuffle Packed Values at 128-bit granularity.
     SHUF128,
     MOVDDUP,
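The DAG combine removed above in LowerShiftByScalarImmediate justified its freeze with the evenness argument quoted in the deleted comment: (shl R, 1) always has a clear low bit, while (add undef, undef) may not unless both uses are frozen to the same value. A small standalone illustration of that argument (not from the patch; a and b merely stand in for two independent uses of an unfrozen undef):

// evenness.cpp - illustration only, not part of the LLVM change.
#include <cstdint>
#include <cstdio>

int main() {
  uint16_t a = 0x1234, b = 0x4321; // two independent "readings" of an undef value
  std::printf("a << 1 even: %d\n", ((uint16_t)(a << 1) & 1) == 0); // always true
  std::printf("a + a  even: %d\n", ((uint16_t)(a + a) & 1) == 0);  // always true
  std::printf("a + b  even: %d\n", ((uint16_t)(a + b) & 1) == 0);  // can be false
  return 0;
}

Doing the SHL-to-ADD rewrite in X86FixupInstTuning instead, after instruction selection, sidesteps the question: both ADD operands are then reads of the same machine register, so the result stays even without any freeze.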
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 2371ed4..564810c 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -12300,72 +12300,76 @@ defm : vpclmulqdq_aliases<"VPCLMULQDQZ256", VR256X, i256mem>;
 // VBMI2
 //===----------------------------------------------------------------------===//

-multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
                               X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
   let Constraints = "$src1 = $dst",
       ExeDomain = VTI.ExeDomain in {
     defm r: AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst),
                 (ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
                 "$src3, $src2", "$src2, $src3",
-                (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>,
+                !if(SwapLR,
+                    (VTI.VT (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src3))),
+                    (VTI.VT (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src3))))>,
                 T8, PD, EVEX, VVVV, Sched<[sched]>;
     defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
                 (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
                 "$src3, $src2", "$src2, $src3",
-                (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
-                        (VTI.VT (VTI.LdFrag addr:$src3))))>,
+                !if(SwapLR,
+                    (VTI.VT (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.LdFrag addr:$src3)))),
+                    (VTI.VT (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT (VTI.LdFrag addr:$src3)))))>,
                 T8, PD, EVEX, VVVV,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }

-multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
                                X86FoldableSchedWrite sched, X86VectorVTInfo VTI>
-         : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched, VTI> {
+         : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched, VTI> {
   let Constraints = "$src1 = $dst",
       ExeDomain = VTI.ExeDomain in
   defm mb: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
               (ins VTI.RC:$src2, VTI.ScalarMemOp:$src3), OpStr,
               "${src3}"#VTI.BroadcastStr#", $src2",
               "$src2, ${src3}"#VTI.BroadcastStr,
-              (OpNode VTI.RC:$src1, VTI.RC:$src2,
-               (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
+              !if(SwapLR,
+                  (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.BroadcastLdFrag addr:$src3))),
+                  (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT (VTI.BroadcastLdFrag addr:$src3))))>,
              T8, PD, EVEX, VVVV, EVEX_B,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
 }

-multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
                                      X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
   let Predicates = [HasVBMI2] in
-    defm Z      : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.ZMM, VTI.info512>,
+    defm Z      : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.ZMM, VTI.info512>,
                   EVEX_V512;
   let Predicates = [HasVBMI2, HasVLX] in {
-    defm Z256   : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.YMM, VTI.info256>,
+    defm Z256   : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.YMM, VTI.info256>,
                   EVEX_V256;
-    defm Z128   : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.XMM, VTI.info128>,
+    defm Z128   : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.XMM, VTI.info128>,
                   EVEX_V128;
   }
 }

-multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
                                       X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
   let Predicates = [HasVBMI2] in
-    defm Z      : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.ZMM, VTI.info512>,
+    defm Z      : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.ZMM, VTI.info512>,
                   EVEX_V512;
   let Predicates = [HasVBMI2, HasVLX] in {
-    defm Z256   : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.YMM, VTI.info256>,
+    defm Z256   : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.YMM, VTI.info256>,
                   EVEX_V256;
-    defm Z128   : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.XMM, VTI.info128>,
+    defm Z128   : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.XMM, VTI.info128>,
                   EVEX_V128;
   }
 }

 multiclass VBMI2_shift_var<bits<8> wOp, bits<8> dqOp, string Prefix,
-                           SDNode OpNode, X86SchedWriteWidths sched> {
-  defm W : VBMI2_shift_var_rm_common<wOp, Prefix#"w", OpNode, sched,
+                           SDNode OpNode, bit SwapLR, X86SchedWriteWidths sched> {
+  defm W : VBMI2_shift_var_rm_common<wOp, Prefix#"w", OpNode, SwapLR, sched,
              avx512vl_i16_info>, REX_W, EVEX_CD8<16, CD8VF>;
-  defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix#"d", OpNode, sched,
+  defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix#"d", OpNode, SwapLR, sched,
             avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
-  defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix#"q", OpNode, sched,
+  defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix#"q", OpNode, SwapLR, sched,
             avx512vl_i64_info>, REX_W, EVEX_CD8<64, CD8VF>;
 }

@@ -12381,8 +12385,8 @@ multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix,
 }

 // Concat & Shift
-defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", X86VShldv, SchedWriteVecIMul>;
-defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", X86VShrdv, SchedWriteVecIMul>;
+defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", fshl, 0, SchedWriteVecIMul>;
+defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", fshr, 1, SchedWriteVecIMul>;
 defm VPSHLD : VBMI2_shift_imm<0x70, 0x71, "vpshld", X86VShld, SchedWriteVecIMul>;
 defm VPSHRD : VBMI2_shift_imm<0x72, 0x73, "vpshrd", X86VShrd, SchedWriteVecIMul>;
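With SwapLR threaded through, the VPSHLDV/VPSHRDV patterns now select directly from the generic fshl/fshr nodes rather than the dedicated X86ISD::VSHLDV/VSHRDV nodes; SwapLR = 1 encodes that VPSHRDV takes its two data sources in the opposite order from ISD::FSHR. A scalar reference of the funnel-shift semantics being matched, for the 16-bit element case (standalone sketch, not from the patch):

// funnel16.cpp - illustration only, not part of the LLVM change.
#include <cstdint>
#include <cstdio>

// fshl returns the high 16 bits of the 32-bit concatenation hi:lo shifted left;
// fshr returns the low 16 bits of hi:lo shifted right. The shift amount is
// taken modulo the element width, matching the LLVM funnel-shift intrinsics.
uint16_t fshl16(uint16_t hi, uint16_t lo, unsigned amt) {
  amt %= 16;
  return amt ? (uint16_t)((hi << amt) | (lo >> (16 - amt))) : hi;
}

uint16_t fshr16(uint16_t hi, uint16_t lo, unsigned amt) {
  amt %= 16;
  return amt ? (uint16_t)((lo >> amt) | (hi << (16 - amt))) : lo;
}

int main() {
  std::printf("%04x %04x\n", (unsigned)fshl16(0xABCD, 0x1234, 4),
              (unsigned)fshr16(0xABCD, 0x1234, 4)); // prints "bcd1 d123"
  return 0;
}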
diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td
index b4768590..031fdc1 100644
--- a/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -25,18 +25,12 @@ let SchedRW = [WriteLEA] in {
                    [(set GR32:$dst, lea32addr:$src)]>,
                    OpSize32, Requires<[Not64BitMode]>;

-  let Predicates = [HasNDD], isCodeGenOnly = 1 in {
-    def LEA64_8r : I<0x8D, MRMSrcMem, (outs GR8:$dst), (ins lea64_8mem:$src),
-                     "lea{b}\t{$src|$dst}, {$dst|$src}",
-                     [(set GR8:$dst, lea64_iaddr:$src)]>,
-                     OpSize16,
-                     Requires<[In64BitMode]>;
-
-    def LEA64_16r : I<0x8D, MRMSrcMem, (outs GR16:$dst), (ins lea64_16mem:$src),
-                      "lea{w}\t{$src|$dst}, {$dst|$src}",
-                      [(set GR16:$dst, lea64_iaddr:$src)]>,
-                      OpSize16,
-                      Requires<[In64BitMode]>;
+  let isCodeGenOnly = 1 in {
+    def LEA64_8r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_8mem:$src),
+                     "lea{l}\t{$src|$dst}, {$dst|$src}", []>, OpSize32;
+
+    def LEA64_16r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_16mem:$src),
+                      "lea{l}\t{$src|$dst}, {$dst|$src}", []>, OpSize32;
   }

   def LEA64_32r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_32mem:$src),
@@ -51,6 +45,11 @@ let SchedRW = [WriteLEA] in {
                    [(set GR64:$dst, lea64addr:$src)]>;
 } // SchedRW

+let Predicates = [HasNDD] in {
+  def : Pat<(i8 lea64_iaddr:$src), (EXTRACT_SUBREG (LEA64_8r lea64_8mem:$src), sub_8bit)>;
+  def : Pat<(i16 lea64_iaddr:$src), (EXTRACT_SUBREG (LEA64_16r lea64_16mem:$src), sub_16bit)>;
+}
+
 // Pseudo instruction for lea that prevent optimizer from eliminating
 // the instruction.
 let SchedRW = [WriteLEA], isPseudo = true, hasSideEffects = 1 in {
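With this change the 8- and 16-bit LEAs become pattern-free 32-bit definitions, and the HasNDD-only patterns added after the SchedRW block select an i8/i16 LEA-shaped address computation as LEA64_8r/LEA64_16r plus an EXTRACT_SUBREG of the low 8 or 16 bits. A hypothetical source-level sketch of the kind of computation such patterns target (not from the patch; whether a given expression actually matches lea64_iaddr is up to the address-mode matcher):

// lea_shape.cpp - illustration only, not part of the LLVM change.
#include <cstdint>

// base + index*scale + displacement is the canonical LEA shape; under the new
// patterns a HasNDD target may compute it with a 32-bit LEA and then use only
// the sub_8bit portion of the result.
uint8_t index8(uint8_t base, uint8_t idx) {
  return static_cast<uint8_t>(base + idx * 4 + 7);
}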
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 0c20ffe..5321ecf 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -406,16 +406,6 @@ def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;

 def X86VShld : SDNode<"X86ISD::VSHLD", SDTShuff3OpI>;
 def X86VShrd : SDNode<"X86ISD::VSHRD", SDTShuff3OpI>;
-def X86VShldv : SDNode<"X86ISD::VSHLDV",
-                       SDTypeProfile<1, 3, [SDTCisVec<0>,
-                                            SDTCisSameAs<0,1>,
-                                            SDTCisSameAs<0,2>,
-                                            SDTCisSameAs<0,3>]>>;
-def X86VShrdv : SDNode<"X86ISD::VSHRDV",
-                       SDTypeProfile<1, 3, [SDTCisVec<0>,
-                                            SDTCisSameAs<0,1>,
-                                            SDTCisSameAs<0,2>,
-                                            SDTCisSameAs<0,3>]>>;

 def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>;