diff options
author | Sanjay Patel <spatel@rotateright.com> | 2021-09-24 11:31:11 -0400 |
---|---|---|
committer | Sanjay Patel <spatel@rotateright.com> | 2021-09-24 11:38:19 -0400 |
commit | 09e71c367af36a07cce04a812dcd9c62bebfbcdf (patch) | |
tree | 39695eab5a33e5244e11e2a0b9b283c344d08b0e /llvm/lib | |
parent | 638a4147fc6400b928080c6d5656fbcee707a793 (diff) | |
download | llvm-09e71c367af36a07cce04a812dcd9c62bebfbcdf.zip llvm-09e71c367af36a07cce04a812dcd9c62bebfbcdf.tar.gz llvm-09e71c367af36a07cce04a812dcd9c62bebfbcdf.tar.bz2 |
[x86] convert logic-of-FP-compares to FP logic-of-vector-compares
This is motivated by the examples and discussion in:
https://llvm.org/PR51245
...and related bugs.
By using vector compares and vector logic, we can convert 2 'set'
instructions into 1 'movd' or 'movmsk' and generally improve
throughput/reduce instructions.
Unfortunately, we don't have a complete vector compare ISA before
AVX, so I left SSE-only out of this patch. I.e., we'd need extra logic
ops to simulate the missing predicates for SSE 'cmpp*', so it's not
as clearly a win.
Differential Revision: https://reviews.llvm.org/D110342
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 48 |
1 files changed, 38 insertions, 10 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 95c80e5..c1e8531 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -45449,9 +45449,9 @@ static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) { return FPOpcode; } -/// If both input operands of a logic op are being cast from floating point -/// types, try to convert this into a floating point logic node to avoid -/// unnecessary moves from SSE to integer registers. +/// If both input operands of a logic op are being cast from floating-point +/// types or FP compares, try to convert this into a floating-point logic node +/// to avoid unnecessary moves from SSE to integer registers. static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -45460,10 +45460,8 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, SDValue N1 = N->getOperand(1); SDLoc DL(N); - if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST) - return SDValue(); - - if (DCI.isBeforeLegalizeOps()) + if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) || + (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC))) return SDValue(); SDValue N00 = N0.getOperand(0); @@ -45477,9 +45475,39 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, (Subtarget.hasFP16() && N00Type == MVT::f16))) return SDValue(); - unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode()); - SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); - return DAG.getBitcast(VT, FPLogic); + if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) { + unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode()); + SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); + return DAG.getBitcast(VT, FPLogic); + } + + // The vector ISA for FP predicates is incomplete before 
AVX, so converting + // COMIS* to CMPS* may not be a win before AVX. + // TODO: Check types/predicates to see if they are available with SSE/SSE2. + if (!Subtarget.hasAVX() || VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || + !N0.hasOneUse() || !N1.hasOneUse()) + return SDValue(); + + // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*) + // and vector logic: + // logic (setcc N00, N01), (setcc N10, N11) --> + // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0 + unsigned NumElts = 128 / N00Type.getSizeInBits(); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts); + EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts); + SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL); + SDValue N01 = N0.getOperand(1); + SDValue N11 = N1.getOperand(1); + SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00); + SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01); + SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10); + SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11); + SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, + cast<CondCodeSDNode>(N0.getOperand(2))->get()); + SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, + cast<CondCodeSDNode>(N1.getOperand(2))->get()); + SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex); } // Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y)) |