author    Simon Pilgrim <llvm-dev@redking.me.uk>  2024-04-19 17:52:39 +0100
committer Simon Pilgrim <llvm-dev@redking.me.uk>  2024-04-19 19:02:41 +0100
commit    afc8ad0d938b3fa74e92f1d066d28e64a7f7f905 (patch)
tree      e1798bd774da1f1eb06e9c5cffb002665809bebc
parent    a6a4d4a0949fa7aab93429754704f28505d56d3f (diff)
[X86] LowerFunnelShift - improve handling of vXi8 constant splat funnel shifts
This patch moves the promotion to vXi16 shifts, and the masking of the upper/lower bits of each element, into LowerFunnelShift for targets that have a bit-select instruction (XOP's VPCMOV and AVX512's VPTERNLOG). This prevents the regressions in #89115 caused by the masking of the ((X << V) | (Y >> (8-V))) expansion of vXi8 shifts.
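As intuition for the new lowering, here is a minimal scalar sketch of the identity it relies on. The helper below is hypothetical (written for this note, not taken from LLVM); it packs two i8 lanes into a uint16_t, the same way the patch bitcasts vXi8 to vXi16:

#include <cassert>
#include <cstdint>

// Hypothetical sketch: fshl(x, y, c) per i8 lane, with c a constant splat
// amount (0 < c < 8), so fshl(x, y, c) == ((x << c) | (y >> (8 - c))).
// Two i8 lanes are packed into one uint16_t; a single wide shift moves
// both lanes at once, and the bits that cross the byte boundary are
// cleared by a per-byte mask - the same shape as the patch's vXi16
// SHL/SRL followed by AND at the original width.
uint16_t fshl_v2i8_sketch(uint16_t x2, uint16_t y2, unsigned c) {
  assert(c > 0 && c < 8 && "expect a non-trivial splat amount");
  uint16_t maskX = uint16_t((0xFFu << c) & 0xFFu); // high (8 - c) bits per byte
  maskX |= uint16_t(maskX << 8);                   // splat the mask to both lanes
  uint16_t maskY = uint16_t(0xFFu >> (8 - c));     // low c bits per byte
  maskY |= uint16_t(maskY << 8);
  uint16_t shX = uint16_t(x2 << c) & maskX;        // vXi16 SHL then AND(MaskX)
  uint16_t shY = uint16_t(y2 >> (8 - c)) & maskY;  // vXi16 SRL then AND(MaskY)
  return uint16_t(shX | shY);
}

For FSHR the two amounts swap roles (ShXAmt = 8 - c, ShYAmt = c), which is what the IsFSHR selects in the diff below choose.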
 llvm/lib/Target/X86/X86ISelLowering.cpp  | 25
 llvm/test/CodeGen/X86/vector-fshl-128.ll |  6
 llvm/test/CodeGen/X86/vector-fshl-256.ll | 20
 llvm/test/CodeGen/X86/vector-fshr-128.ll |  6
 llvm/test/CodeGen/X86/vector-fshr-256.ll | 20
 5 files changed, 48 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bedec0c..3a51c7c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29830,6 +29830,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
  if (VT.isVector()) {
    APInt APIntShiftAmt;
    bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
+    unsigned NumElts = VT.getVectorNumElements();

    if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
      if (IsFSHR)
@@ -29858,6 +29859,29 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
      uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
      uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
      uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
+      assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
+
+      if (EltSizeInBits == 8 && ShXAmt > 1 &&
+          (Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT))) {
+        // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
+        // bit-select - lower using vXi16 shifts and then perform the bitmask at
+        // the original vector width to handle cases where we split.
+        MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+        APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
+        APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
+        SDValue ShX =
+            DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
+                        DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
+        SDValue ShY =
+            DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
+                        DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
+        ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
+                          DAG.getConstant(MaskX, DL, VT));
+        ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
+                          DAG.getConstant(MaskY, DL, VT));
+        return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
+      }
+
      SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
                                DAG.getShiftAmountConstant(ShXAmt, VT, DL));
      SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
@@ -29874,7 +29898,6 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
      return SDValue();

    unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
-    unsigned NumElts = VT.getVectorNumElements();
    MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
    MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);

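The test updates that follow show why the transform is gated on XOP/AVX512: MaskX and MaskY cover complementary bits of each byte, so the (ShX & MaskX) | (ShY & MaskY) tree above is a bit-select against a constant mask, and VPCMOV/VPTERNLOG each perform that in one instruction. A minimal sketch of the operation (an illustrative helper, not LLVM code):

#include <cstdint>

// Per-bit select: result bit = mask ? a : b. XOP's VPCMOV computes this
// directly; AVX512's VPTERNLOG reaches it as one of its 256 programmable
// three-input truth tables (e.g. imm8 0xCA encodes A ? B : C).
uint64_t bitselect_sketch(uint64_t a, uint64_t b, uint64_t mask) {
  return (a & mask) | (b & ~mask);
}

Accordingly, the checks below replace the old vpshlb+vpshlb+vpor sequences with wide vpsllw/vpsrlw shifts feeding a single vpcmov whose constant-pool operand is the byte mask.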
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index 1addedf..0459d47 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -2453,9 +2453,9 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
;
; XOP-LABEL: splatconstant_funnnel_v16i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpsrlw $4, %xmm1, %xmm1
+; XOP-NEXT: vpsllw $4, %xmm0, %xmm0
+; XOP-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index ebcb1cb..e81b9ad 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -2344,17 +2344,15 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
;
; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; XOPAVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; XOPAVX1-NEXT: vpsllw $4, %xmm0, %xmm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX1-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 638a3cd..b839452 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -2462,9 +2462,9 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
;
; XOP-LABEL: splatconstant_funnnel_v16i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpsrlw $4, %xmm1, %xmm1
+; XOP-NEXT: vpsllw $4, %xmm0, %xmm0
+; XOP-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 3fabf72..7b6b0ea 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -2145,17 +2145,15 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
;
; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; XOPAVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; XOPAVX1-NEXT: vpsllw $4, %xmm0, %xmm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX1-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_funnnel_v32i8: