author     Simon Pilgrim <llvm-dev@redking.me.uk>   2025-09-02 09:44:15 +0100
committer  GitHub <noreply@github.com>              2025-09-02 08:44:15 +0000
commit     ba707db840516b2246c6a31ef8a96e41939deeb5 (patch)
tree       13d3bb2ee89e1497c30f26b7a0ce76ce4c266f60
parent     c4885849adf0addf8c154bfcaf143d959ffda961 (diff)
[X86] getScalarMaskingNode - if the mask is zero just return the blended passthrough and preserved source value (#153575)
We already handle the case where the mask is known to be one, so this adds the other case, where the op is replaced with a MOVSH/MOVSS/MOVSD blend. This assumes the scalar passthrough is op0. I had to adjust the test case for #98306 as AFAICT it had been over-reduced. Fixes #153570.
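As a rough illustration of the fold in the diff below, here is a standalone sketch of the shuffle-mask construction (plain std::vector stands in for LLVM's SmallVector, NumElts for VT.getVectorNumElements(), and the helper name buildZeroMaskBlend is made up for illustration). The mask selects lane 0 from the passthrough operand and every other lane from the scalar op's source vector, which is exactly a MOVSS/MOVSD/MOVSH-style blend:

#include <cstdio>
#include <numeric>
#include <vector>

// Blend mask for a known-zero scalar mask: lane 0 comes from the
// passthrough (the second shuffle operand, hence index NumElts), lanes
// 1..NumElts-1 come from the scalar op's src/dst vector (first operand).
std::vector<int> buildZeroMaskBlend(unsigned NumElts) {
  std::vector<int> ShuffleMask(NumElts);
  std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0); // 0, 1, 2, ...
  ShuffleMask[0] = NumElts; // redirect lane 0 to the passthrough operand
  return ShuffleMask;
}

int main() {
  for (int Idx : buildZeroMaskBlend(4)) // e.g. v4f32 prints: 4 1 2 3
    std::printf("%d ", Idx);
  std::printf("\n");
  return 0;
}

For v4f32 this yields <4,1,2,3>, the classic MOVSS blend pattern.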
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp            | 18
-rw-r--r--  llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll |  6
-rw-r--r--  llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll     | 16
3 files changed, 33 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dd4c608..572cfda 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -26269,10 +26269,9 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
                                     SDValue PreservedSrc,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
-
-  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
-    if (MaskConst->getZExtValue() & 0x1)
-      return Op;
+  auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
+  if (MaskConst && (MaskConst->getZExtValue() & 0x1))
+    return Op;
 
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
@@ -26288,6 +26287,17 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
   if (PreservedSrc.isUndef())
     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+
+  if (MaskConst) {
+    assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
+    // Discard op and blend passthrough with scalar op src/dst.
+    SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
+    std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
+    ShuffleMask[0] = VT.getVectorNumElements();
+    return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
+                                ShuffleMask);
+  }
+
   return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
 }
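To see why discarding the op is safe here, consider a behavioural model of a masked scalar op (a sketch with a hypothetical helper, float lanes standing in for f16/f32/f64; this is not LLVM API): with the mask bit clear, lane 0 of the result is the passthrough's lane 0 and the upper lanes pass through from the first source, so the arithmetic result is never observed:

#include <array>
#include <cstdio>

// Behavioural model of e.g. vmulsh/vmulss {%k1}: lane 0 holds the scalar
// result when the mask bit is set, otherwise the passthrough's lane 0;
// lanes 1..3 always come from Src1. Hypothetical helper, not LLVM code.
std::array<float, 4> maskedScalarMul(std::array<float, 4> Src1,
                                     std::array<float, 4> Src2,
                                     std::array<float, 4> Passthru,
                                     unsigned Mask) {
  std::array<float, 4> Res = Src1;          // upper lanes pass through
  Res[0] = (Mask & 1) ? Src1[0] * Src2[0]   // mask set: keep the result
                      : Passthru[0];        // mask clear: take passthrough
  return Res;
}

int main() {
  std::array<float, 4> A{1.0f, 10.0f, 20.0f, 30.0f};
  std::array<float, 4> B{2.0f, 0.0f, 0.0f, 0.0f};
  std::array<float, 4> P{-0.0f, 9.0f, 9.0f, 9.0f};
  std::array<float, 4> R = maskedScalarMul(A, B, P, /*Mask=*/0);
  std::printf("%g %g %g %g\n", R[0], R[1], R[2], R[3]); // -0 10 20 30
  return 0;
}

With a constant-zero mask the multiply never influences the output, which is why the whole node can be replaced by a single vector shuffle (lowered to vmovsh/vmovss/vmovsd, as the PR153570 test below shows).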
diff --git a/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll b/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll
index e449c71..b60d7a5 100644
--- a/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll
@@ -278,14 +278,14 @@ define <4 x float> @test_int_x86_avx512fp16_maskz_cfcmadd_sh(<4 x float> %x0, <4
   ret <4 x float> %res
 }
 
-define <4 x float> @PR98306() {
+define <4 x float> @PR98306(i8 %m) {
 ; CHECK-LABEL: PR98306:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kxorw %k0, %k0, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [7.8125E-3,1.050912E+6,4.203776E+6,1.6815616E+7]
 ; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [3.2E+1,4.03288064E+8,8.0658432E+8,1.61318502E+9]
 ; CHECK-NEXT:    vfmaddcsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
-  %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float> <float 7.812500e-03, float 0x4130092000000000, float 0x4150094000000000, float 0x4170096000000000>, <4 x float> <float 2.000000e+00, float 0x4188098000000000, float 0x4198099000000000, float 0x41A809A000000000>, <4 x float> <float 3.200000e+01, float 0x41B809B000000000, float 0x41C809C000000000, float 0x41D809D000000000>, i8 0, i32 4)
+  %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float> <float 7.812500e-03, float 0x4130092000000000, float 0x4150094000000000, float 0x4170096000000000>, <4 x float> <float 2.000000e+00, float 0x4188098000000000, float 0x4198099000000000, float 0x41A809A000000000>, <4 x float> <float 3.200000e+01, float 0x41B809B000000000, float 0x41C809C000000000, float 0x41D809D000000000>, i8 %m, i32 4)
   ret <4 x float> %res
 }
diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
index 627a947..b1bacd9 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
@@ -1361,3 +1361,19 @@ define <32 x half> @test_mm512_castph256_ph512_freeze(<16 x half> %a0) nounwind
   %res = shufflevector <16 x half> %a0, <16 x half> %a1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   ret <32 x half> %res
 }
+
+define <8 x half> @PR153570(ptr %p) {
+; CHECK-LABEL: PR153570:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NEXT:    vmulsh {rn-sae}, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT:    vmovsh %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vmovaps %xmm1, (%rdi)
+; CHECK-NEXT:    retq
+  %r = tail call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00>, <8 x half> <half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000>, <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, i8 0, i32 8)
+  store <8 x half> %r, ptr %p, align 16
+  %r1 = tail call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00>, <8 x half> <half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000>, <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, i8 1, i32 8)
+  ret <8 x half> %r1
+}