[x86] convert logic-of-FP-compares to FP logic-of-vector-compares

This is motivated by the examples and discussion in: https://llvm.org/PR51245 ...and related bugs. By using vector compares and vector logic, we can convert 2 'set' instructions into 1 'movd' or 'movmsk' and generally improve throughput/reduce instructions. Unfortunately, we don't have a complete vector compare ISA before AVX, so I left SSE-only out of this patch. Ie, we'd need extra logic ops to simulate the missing predicates for SSE 'cmpp*', so it's not as clearly a win. Differential Revision: https://reviews.llvm.org/D110342
author: Sanjay Patel <spatel@rotateright.com> 2021-09-24 11:31:11 -0400
committer: Sanjay Patel <spatel@rotateright.com> 2021-09-24 11:38:19 -0400
commit: 09e71c367af36a07cce04a812dcd9c62bebfbcdf (patch)
tree: 39695eab5a33e5244e11e2a0b9b283c344d08b0e /llvm/lib
parent: 638a4147fc6400b928080c6d5656fbcee707a793 (diff)
download: llvm-09e71c367af36a07cce04a812dcd9c62bebfbcdf.zip
llvm-09e71c367af36a07cce04a812dcd9c62bebfbcdf.tar.gz
llvm-09e71c367af36a07cce04a812dcd9c62bebfbcdf.tar.bz2
1 files changed, 38 insertions, 10 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 95c80e5..c1e8531 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45449,9 +45449,9 @@ static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
   return FPOpcode;
 }
 
-/// If both input operands of a logic op are being cast from floating point
-/// types, try to convert this into a floating point logic node to avoid
-/// unnecessary moves from SSE to integer registers.
+/// If both input operands of a logic op are being cast from floating-point
+/// types or FP compares, try to convert this into a floating-point logic node
+/// to avoid unnecessary moves from SSE to integer registers.
 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const X86Subtarget &Subtarget) {
@@ -45460,10 +45460,8 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
   SDValue N1 = N->getOperand(1);
   SDLoc DL(N);
 
-  if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
-    return SDValue();
-
-  if (DCI.isBeforeLegalizeOps())
+  if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
+        (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
     return SDValue();
 
   SDValue N00 = N0.getOperand(0);
@@ -45477,9 +45475,39 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
                               (Subtarget.hasFP16() && N00Type == MVT::f16)))
     return SDValue();
 
-  unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
-  SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
-  return DAG.getBitcast(VT, FPLogic);
+  if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
+    unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
+    SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
+    return DAG.getBitcast(VT, FPLogic);
+  }
+
+  // The vector ISA for FP predicates is incomplete before AVX, so converting
+  // COMIS* to CMPS* may not be a win before AVX.
+  // TODO: Check types/predicates to see if they are available with SSE/SSE2.
+  if (!Subtarget.hasAVX() || VT != MVT::i1 || N0.getOpcode() != ISD::SETCC ||
+      !N0.hasOneUse() || !N1.hasOneUse())
+    return SDValue();
+
+  // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
+  // and vector logic:
+  // logic (setcc N00, N01), (setcc N10, N11) -->
+  // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
+  unsigned NumElts = 128 / N00Type.getSizeInBits();
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
+  EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
+  SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
+  SDValue N01 = N0.getOperand(1);
+  SDValue N11 = N1.getOperand(1);
+  SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
+  SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
+  SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
+  SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
+  SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01,
+                                cast<CondCodeSDNode>(N0.getOperand(2))->get());
+  SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11,
+                                cast<CondCodeSDNode>(N1.getOperand(2))->get());
+  SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
 }
 
 // Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
author	Sanjay Patel <spatel@rotateright.com>	2021-09-24 11:31:11 -0400
committer	Sanjay Patel <spatel@rotateright.com>	2021-09-24 11:38:19 -0400
commit	09e71c367af36a07cce04a812dcd9c62bebfbcdf (patch)
tree	39695eab5a33e5244e11e2a0b9b283c344d08b0e /llvm/lib
parent	638a4147fc6400b928080c6d5656fbcee707a793 (diff)
download	llvm-09e71c367af36a07cce04a812dcd9c62bebfbcdf.zip llvm-09e71c367af36a07cce04a812dcd9c62bebfbcdf.tar.gz llvm-09e71c367af36a07cce04a812dcd9c62bebfbcdf.tar.bz2