author     Simon Pilgrim <llvm-dev@redking.me.uk>   2025-09-27 17:43:58 +0100
committer  GitHub <noreply@github.com>              2025-09-27 16:43:58 +0000
commit     0df3651802d35b26ae857b549de9edf73b67fb98
tree       d3857739ec322965543b9c2ead334357603cb898
parent     d70490c6940f0bca4f13be199396701249876685
If we have an AVX512 target that is capable of AVXIFMA but not AVX512IFMA, then we must split 512-bit (or larger) types into 256-bit subvectors.
Fixes #160928
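For context, the pattern involved in #160928 is of the following shape. This is a reduced sketch adapted from the test_512_combine coverage added below; the function name is illustrative, and the RUN line mirrors the new AVX512-NOIFMA run line from this patch. On a target with AVXIFMA but without AVX512IFMA there is no 512-bit form of vpmadd52luq, so the accumulate has to be lowered as two 256-bit {vex} vpmadd52luq instructions rather than a single zmm instruction.

; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma,+avx512vl
define <8 x i64> @madd52_512(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
  ; Mask both multiplicands to 26 bits so the full product is known to fit in 52 bits.
  %xm = and <8 x i64> %x, splat (i64 67108863)
  %ym = and <8 x i64> %y, splat (i64 67108863)
  %mul = mul nuw nsw <8 x i64> %xm, %ym
  %acc = add <8 x i64> %mul, %z
  ret <8 x i64> %acc
}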
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp          8
-rw-r--r--  llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll  619
2 files changed, 392 insertions, 235 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3af673d..efeddd7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4452,11 +4452,12 @@ static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
 template <typename F>
 SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
                          const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
-                         F Builder, bool CheckBWI = true) {
+                         F Builder, bool CheckBWI = true,
+                         bool AllowAVX512 = true) {
   assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
   unsigned NumSubs = 1;
   if ((CheckBWI && Subtarget.useBWIRegs()) ||
-      (!CheckBWI && Subtarget.useAVX512Regs())) {
+      (!CheckBWI && AllowAVX512 && Subtarget.useAVX512Regs())) {
     if (VT.getSizeInBits() > 512) {
       NumSubs = VT.getSizeInBits() / 512;
       assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
@@ -58076,7 +58077,8 @@ static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
   };

   return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
-                          /*CheckBWI*/ false);
+                          /*CheckBWI*/ false,
+                          /*AllowAVX512*/ Subtarget.hasIFMA());
 }

 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
index aebfc7d..3ece4be 100644
--- a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -1,25 +1,26 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma | FileCheck %s --check-prefixes=X64,AVX
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma | FileCheck %s --check-prefixes=X64,AVX,AVXIFMA
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma | FileCheck %s --check-prefixes=X64,AVX512,AVX512-NOVL
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=X64,AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma,+avx512vl | FileCheck %s --check-prefixes=X64,AVX,AVX512-NOIFMA

 ; 67108863 == (1 << 26) - 1
 ; 4503599627370496 == (1 << 52)
 ; 4503599627370495 == (1 << 52) - 1

 define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
-; AVX-LABEL: test_512_combine:
-; AVX: # %bb.0:
-; AVX-NEXT: vpbroadcastq {{.*#+}} ymm6 = [67108863,67108863,67108863,67108863]
-; AVX-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
-; AVX-NEXT: vpand %ymm6, %ymm3, %ymm0
-; AVX-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm1, %ymm5
-; AVX-NEXT: vmovdqa %ymm4, %ymm0
-; AVX-NEXT: vmovdqa %ymm5, %ymm1
-; AVX-NEXT: retq
+; AVXIFMA-LABEL: test_512_combine:
+; AVXIFMA: # %bb.0:
+; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [67108863,67108863,67108863,67108863]
+; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVXIFMA-NEXT: vpand %ymm6, %ymm0, %ymm0
+; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
+; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm0
+; AVXIFMA-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm1, %ymm5
+; AVXIFMA-NEXT: vmovdqa %ymm4, %ymm0
+; AVXIFMA-NEXT: vmovdqa %ymm5, %ymm1
+; AVXIFMA-NEXT: retq
 ;
 ; AVX512-LABEL: test_512_combine:
 ; AVX512: # %bb.0:
@@ -29,6 +30,19 @@ define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
 ; AVX512-NEXT: vpmadd52luq %zmm1, %zmm0, %zmm2
 ; AVX512-NEXT: vmovdqa64 %zmm2, 
%zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_512_combine: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863] +; AVX512-NOIFMA-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpandq %zmm3, %zmm1, %zmm1 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm3, %ymm4, %ymm5 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm2 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <8 x i64> %x, splat (i64 67108863) %y_masked = and <8 x i64> %y, splat (i64 67108863) %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked @@ -37,19 +51,19 @@ define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { } define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { -; AVX-LABEL: test_512_combine_v2: -; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm6 = [3,3,3,3] -; AVX-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1125899906842623,1125899906842623,1125899906842623,1125899906842623] -; AVX-NEXT: vpand %ymm7, %ymm0, %ymm0 -; AVX-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4 -; AVX-NEXT: vpand %ymm6, %ymm3, %ymm0 -; AVX-NEXT: vpand %ymm7, %ymm1, %ymm1 -; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm1, %ymm5 -; AVX-NEXT: vmovdqa %ymm4, %ymm0 -; AVX-NEXT: vmovdqa %ymm5, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_512_combine_v2: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [3,3,3,3] +; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1125899906842623,1125899906842623,1125899906842623,1125899906842623] +; AVXIFMA-NEXT: vpand %ymm7, %ymm0, %ymm0 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4 +; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm0 +; AVXIFMA-NEXT: vpand %ymm7, %ymm1, %ymm1 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm1, %ymm5 +; AVXIFMA-NEXT: vmovdqa %ymm4, %ymm0 +; AVXIFMA-NEXT: vmovdqa %ymm5, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_512_combine_v2: ; AVX512: # %bb.0: @@ -58,6 +72,18 @@ define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) ; AVX512-NEXT: vpmadd52luq %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_512_combine_v2: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm4, %ymm5, %ymm3 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm2 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <8 x i64> %x, splat (i64 1125899906842623) ; (1 << 50) - 1 %y_masked = and <8 x i64> %y, splat (i64 3) %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked @@ -66,32 +92,32 @@ define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) } define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { -; AVX-LABEL: test_512_no_combine: -; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm6 = 
[4503599627370495,4503599627370495,4503599627370495,4503599627370495] -; AVX-NEXT: vpand %ymm6, %ymm0, %ymm7 -; AVX-NEXT: vpand %ymm6, %ymm1, %ymm8 -; AVX-NEXT: vpand %ymm6, %ymm2, %ymm9 -; AVX-NEXT: vpand %ymm6, %ymm3, %ymm6 -; AVX-NEXT: vpsrlq $32, %ymm8, %ymm8 -; AVX-NEXT: vpmuludq %ymm3, %ymm8, %ymm8 -; AVX-NEXT: vpsrlq $32, %ymm6, %ymm6 -; AVX-NEXT: vpmuludq %ymm6, %ymm1, %ymm6 -; AVX-NEXT: vpaddq %ymm6, %ymm8, %ymm6 -; AVX-NEXT: vpsllq $32, %ymm6, %ymm6 -; AVX-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vpsrlq $32, %ymm7, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 -; AVX-NEXT: vpsrlq $32, %ymm9, %ymm7 -; AVX-NEXT: vpmuludq %ymm7, %ymm0, %ymm7 -; AVX-NEXT: vpaddq %ymm3, %ymm7, %ymm3 -; AVX-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm4, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm5, %ymm1, %ymm1 -; AVX-NEXT: vpaddq %ymm6, %ymm1, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_512_no_combine: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495] +; AVXIFMA-NEXT: vpand %ymm6, %ymm0, %ymm7 +; AVXIFMA-NEXT: vpand %ymm6, %ymm1, %ymm8 +; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm9 +; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm6 +; AVXIFMA-NEXT: vpsrlq $32, %ymm8, %ymm8 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm8, %ymm8 +; AVXIFMA-NEXT: vpsrlq $32, %ymm6, %ymm6 +; AVXIFMA-NEXT: vpmuludq %ymm6, %ymm1, %ymm6 +; AVXIFMA-NEXT: vpaddq %ymm6, %ymm8, %ymm6 +; AVXIFMA-NEXT: vpsllq $32, %ymm6, %ymm6 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 +; AVXIFMA-NEXT: vpsrlq $32, %ymm7, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpsrlq $32, %ymm9, %ymm7 +; AVXIFMA-NEXT: vpmuludq %ymm7, %ymm0, %ymm7 +; AVXIFMA-NEXT: vpaddq %ymm3, %ymm7, %ymm3 +; AVXIFMA-NEXT: vpsllq $32, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm5, %ymm1, %ymm1 +; AVXIFMA-NEXT: vpaddq %ymm6, %ymm1, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_512_no_combine: ; AVX512: # %bb.0: @@ -108,6 +134,22 @@ define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: vpaddq %zmm3, %zmm0, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_512_no_combine: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpbroadcastq {{.*#+}} zmm3 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495] +; AVX512-NOIFMA-NEXT: vpandq %zmm3, %zmm0, %zmm4 +; AVX512-NOIFMA-NEXT: vpandq %zmm3, %zmm1, %zmm3 +; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm4, %zmm4 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm4, %zmm4 +; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512-NOIFMA-NEXT: vpaddq %zmm4, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpsllq $32, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <8 x i64> %x, splat (i64 4503599627370495) %y_masked = and <8 x i64> %y, splat (i64 4503599627370495) %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked @@ -116,27 +158,27 @@ define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) } define <8 x i64> @test_512_no_combine_v2(<8 x i64> 
%x, <8 x i64> %y, <8 x i64> %z) { -; AVX-LABEL: test_512_no_combine_v2: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlq $32, %ymm1, %ymm6 -; AVX-NEXT: vpmuludq %ymm3, %ymm6, %ymm6 -; AVX-NEXT: vpsrlq $32, %ymm3, %ymm7 -; AVX-NEXT: vpmuludq %ymm7, %ymm1, %ymm7 -; AVX-NEXT: vpaddq %ymm6, %ymm7, %ymm6 -; AVX-NEXT: vpsllq $32, %ymm6, %ymm6 -; AVX-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 -; AVX-NEXT: vpsrlq $32, %ymm2, %ymm7 -; AVX-NEXT: vpmuludq %ymm7, %ymm0, %ymm7 -; AVX-NEXT: vpaddq %ymm3, %ymm7, %ymm3 -; AVX-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm4, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm5, %ymm1, %ymm1 -; AVX-NEXT: vpaddq %ymm6, %ymm1, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_512_no_combine_v2: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpsrlq $32, %ymm1, %ymm6 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm6, %ymm6 +; AVXIFMA-NEXT: vpsrlq $32, %ymm3, %ymm7 +; AVXIFMA-NEXT: vpmuludq %ymm7, %ymm1, %ymm7 +; AVXIFMA-NEXT: vpaddq %ymm6, %ymm7, %ymm6 +; AVXIFMA-NEXT: vpsllq $32, %ymm6, %ymm6 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 +; AVXIFMA-NEXT: vpsrlq $32, %ymm0, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpsrlq $32, %ymm2, %ymm7 +; AVXIFMA-NEXT: vpmuludq %ymm7, %ymm0, %ymm7 +; AVXIFMA-NEXT: vpaddq %ymm3, %ymm7, %ymm3 +; AVXIFMA-NEXT: vpsllq $32, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm5, %ymm1, %ymm1 +; AVXIFMA-NEXT: vpaddq %ymm6, %ymm1, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_512_no_combine_v2: ; AVX512: # %bb.0: @@ -150,6 +192,19 @@ define <8 x i64> @test_512_no_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: vpaddq %zmm3, %zmm0, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_512_no_combine_v2: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm0, %zmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm1, %zmm4 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm4, %zmm0, %zmm4 +; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm4, %zmm3 +; AVX512-NOIFMA-NEXT: vpsllq $32, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: retq %mul = mul <8 x i64> %x, %y %res = add <8 x i64> %mul, %z ret <8 x i64> %res @@ -255,25 +310,25 @@ define <1 x i64> @test_scalar_no_ifma(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z) ; 40-bit and 13-bit, too wide define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { -; AVX-LABEL: test_mixed_width_too_wide: -; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8191,8191,8191,8191] -; AVX-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX-NEXT: vpmovzxdq {{.*#+}} ymm6 = [2155905028,2155905036,2155905044,2155905052] -; AVX-NEXT: vpshufb %ymm6, %ymm1, %ymm7 -; AVX-NEXT: vpmuludq %ymm3, %ymm7, %ymm7 -; AVX-NEXT: vpsllq $32, %ymm7, %ymm7 -; AVX-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vpshufb %ymm6, %ymm0, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 -; AVX-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm0, %ymm4, %ymm0 -; AVX-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm1, %ymm5, %ymm1 
-; AVX-NEXT: vpaddq %ymm7, %ymm1, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_mixed_width_too_wide: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8191,8191,8191,8191] +; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm6 = [2155905028,2155905036,2155905044,2155905052] +; AVXIFMA-NEXT: vpshufb %ymm6, %ymm1, %ymm7 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm7, %ymm7 +; AVXIFMA-NEXT: vpsllq $32, %ymm7, %ymm7 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 +; AVXIFMA-NEXT: vpshufb %ymm6, %ymm0, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpsllq $32, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm1, %ymm5, %ymm1 +; AVXIFMA-NEXT: vpaddq %ymm7, %ymm1, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_mixed_width_too_wide: ; AVX512: # %bb.0: @@ -286,6 +341,18 @@ define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64 ; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm1 ; AVX512-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_mixed_width_too_wide: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm3 +; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpsllq $32, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm2, %zmm1 +; AVX512-NOIFMA-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x40 = and <8 x i64> %x, splat (i64 1099511627775) %y13 = and <8 x i64> %y, splat (i64 8191) %mul = mul <8 x i64> %x40, %y13 @@ -294,19 +361,19 @@ define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64 } define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32, <8 x i64> %z) { -; AVX-LABEL: test_zext32_inputs_not_safe: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX-NEXT: vpmuludq %ymm5, %ymm4, %ymm4 -; AVX-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 -; AVX-NEXT: vpaddq %ymm4, %ymm2, %ymm0 -; AVX-NEXT: vpaddq %ymm1, %ymm3, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_zext32_inputs_not_safe: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVXIFMA-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVXIFMA-NEXT: vpmuludq %ymm5, %ymm4, %ymm4 +; AVXIFMA-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVXIFMA-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 +; AVXIFMA-NEXT: vpaddq %ymm4, %ymm2, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm1, %ymm3, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: 
test_zext32_inputs_not_safe: ; AVX512: # %bb.0: @@ -315,6 +382,14 @@ define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32, ; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_zext32_inputs_not_safe: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; AVX512-NOIFMA-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x = zext <8 x i32> %xi32 to <8 x i64> %y = zext <8 x i32> %yi32 to <8 x i64> %mul = mul <8 x i64> %x, %y @@ -323,36 +398,36 @@ define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32, } define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i64> %z) nounwind { -; AVX-LABEL: test_1024_combine_split: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $32, %rsp -; AVX-NEXT: vmovdqa 112(%rbp), %ymm8 -; AVX-NEXT: vmovdqa 80(%rbp), %ymm9 -; AVX-NEXT: vmovdqa 48(%rbp), %ymm10 -; AVX-NEXT: vmovdqa 16(%rbp), %ymm11 -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm12 = [67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm3, %ymm12, %ymm3 -; AVX-NEXT: vpand %ymm2, %ymm12, %ymm2 -; AVX-NEXT: vpand %ymm1, %ymm12, %ymm1 -; AVX-NEXT: vpand %ymm0, %ymm12, %ymm0 -; AVX-NEXT: vpand %ymm7, %ymm12, %ymm7 -; AVX-NEXT: {vex} vpmadd52luq %ymm7, %ymm3, %ymm8 -; AVX-NEXT: vpand %ymm6, %ymm12, %ymm3 -; AVX-NEXT: {vex} vpmadd52luq %ymm3, %ymm2, %ymm9 -; AVX-NEXT: vpand %ymm5, %ymm12, %ymm2 -; AVX-NEXT: {vex} vpmadd52luq %ymm2, %ymm1, %ymm10 -; AVX-NEXT: vpand %ymm4, %ymm12, %ymm1 -; AVX-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm11 -; AVX-NEXT: vmovdqa %ymm11, %ymm0 -; AVX-NEXT: vmovdqa %ymm10, %ymm1 -; AVX-NEXT: vmovdqa %ymm9, %ymm2 -; AVX-NEXT: vmovdqa %ymm8, %ymm3 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_1024_combine_split: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: pushq %rbp +; AVXIFMA-NEXT: movq %rsp, %rbp +; AVXIFMA-NEXT: andq $-32, %rsp +; AVXIFMA-NEXT: subq $32, %rsp +; AVXIFMA-NEXT: vmovdqa 112(%rbp), %ymm8 +; AVXIFMA-NEXT: vmovdqa 80(%rbp), %ymm9 +; AVXIFMA-NEXT: vmovdqa 48(%rbp), %ymm10 +; AVXIFMA-NEXT: vmovdqa 16(%rbp), %ymm11 +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm12 = [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm3, %ymm12, %ymm3 +; AVXIFMA-NEXT: vpand %ymm2, %ymm12, %ymm2 +; AVXIFMA-NEXT: vpand %ymm1, %ymm12, %ymm1 +; AVXIFMA-NEXT: vpand %ymm0, %ymm12, %ymm0 +; AVXIFMA-NEXT: vpand %ymm7, %ymm12, %ymm7 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm7, %ymm3, %ymm8 +; AVXIFMA-NEXT: vpand %ymm6, %ymm12, %ymm3 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm3, %ymm2, %ymm9 +; AVXIFMA-NEXT: vpand %ymm5, %ymm12, %ymm2 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm1, %ymm10 +; AVXIFMA-NEXT: vpand %ymm4, %ymm12, %ymm1 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm11 +; AVXIFMA-NEXT: vmovdqa %ymm11, %ymm0 +; AVXIFMA-NEXT: vmovdqa %ymm10, %ymm1 +; AVXIFMA-NEXT: vmovdqa %ymm9, %ymm2 +; AVXIFMA-NEXT: vmovdqa %ymm8, %ymm3 +; AVXIFMA-NEXT: movq %rbp, %rsp +; AVXIFMA-NEXT: popq %rbp +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_1024_combine_split: ; AVX512: # %bb.0: @@ -366,6 +441,27 @@ 
define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_1024_combine_split: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpbroadcastq {{.*#+}} zmm6 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863] +; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm1, %zmm1 +; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm2, %zmm2 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm2, %ymm6 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm7 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm4, %ymm8 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm6, %ymm7, %ymm8 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm0 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm5, %ymm6 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm4, %ymm6 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm3, %ymm1, %ymm5 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm1 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <16 x i64> %x, splat (i64 67108863) %y_masked = and <16 x i64> %y, splat (i64 67108863) %mul = mul <16 x i64> %x_masked, %y_masked @@ -388,13 +484,13 @@ define <1 x i64> @test_not_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z) { } define <3 x i64> @test_v3i64(<3 x i64> %x, <3 x i64> %y, <3 x i64> %z) { -; AVX-LABEL: test_v3i64: -; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpmuludq %ymm0, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_v3i64: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm1 = [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpmuludq %ymm0, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVXIFMA-NEXT: retq ; ; AVX512-NOVL-LABEL: test_v3i64: ; AVX512-NOVL: # %bb.0: @@ -410,6 +506,13 @@ define <3 x i64> @test_v3i64(<3 x i64> %x, <3 x i64> %y, <3 x i64> %z) { ; AVX512VL-NEXT: vpmuludq %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_v3i64: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512-NOIFMA-NEXT: vpmuludq %ymm0, %ymm0, %ymm0 +; AVX512-NOIFMA-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <3 x i64> %x, splat (i64 67108863) %y_masked = and <3 x i64> %x, splat (i64 67108863) %mul = mul <3 x i64> %x_masked, %y_masked @@ -418,35 +521,35 @@ define <3 x i64> @test_v3i64(<3 x i64> %x, <3 x i64> %y, <3 x i64> %z) { } define <5 x i64> @test_v5i64(<5 x i64> %x, <5 x i64> %y, <5 x i64> %z) { -; AVX-LABEL: test_v5i64: -; AVX: # %bb.0: -; AVX-NEXT: movq %rdi, %rax -; AVX-NEXT: vmovq %r8, %xmm0 -; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vmovq %rsi, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm2 -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm3 = 
[67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF -; AVX-NEXT: vmovq %rcx, %xmm3 -; AVX-NEXT: vmovq %r9, %xmm4 -; AVX-NEXT: vpand %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vpsrlq $32, %xmm3, %xmm4 -; AVX-NEXT: vpmuludq %xmm4, %xmm3, %xmm4 -; AVX-NEXT: vpsllq $33, %xmm4, %xmm4 -; AVX-NEXT: vpmuludq %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vpaddq %xmm4, %xmm1, %xmm1 -; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm2 -; AVX-NEXT: vmovdqa %ymm2, (%rdi) -; AVX-NEXT: vmovq %xmm1, 32(%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_v5i64: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: movq %rdi, %rax +; AVXIFMA-NEXT: vmovq %r8, %xmm0 +; AVXIFMA-NEXT: vmovq %rcx, %xmm1 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVXIFMA-NEXT: vmovq %rdx, %xmm1 +; AVXIFMA-NEXT: vmovq %rsi, %xmm2 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVXIFMA-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVXIFMA-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVXIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm2 +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVXIFMA-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF +; AVXIFMA-NEXT: vmovq %rcx, %xmm3 +; AVXIFMA-NEXT: vmovq %r9, %xmm4 +; AVXIFMA-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVXIFMA-NEXT: vpsrlq $32, %xmm3, %xmm4 +; AVXIFMA-NEXT: vpmuludq %xmm4, %xmm3, %xmm4 +; AVXIFMA-NEXT: vpsllq $33, %xmm4, %xmm4 +; AVXIFMA-NEXT: vpmuludq %xmm3, %xmm3, %xmm3 +; AVXIFMA-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVXIFMA-NEXT: vpaddq %xmm4, %xmm1, %xmm1 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm2 +; AVXIFMA-NEXT: vmovdqa %ymm2, (%rdi) +; AVXIFMA-NEXT: vmovq %xmm1, 32(%rdi) +; AVXIFMA-NEXT: vzeroupper +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_v5i64: ; AVX512: # %bb.0: @@ -454,6 +557,13 @@ define <5 x i64> @test_v5i64(<5 x i64> %x, <5 x i64> %y, <5 x i64> %z) { ; AVX512-NEXT: vpmuludq %zmm0, %zmm0, %zmm0 ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_v5i64: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm0, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <5 x i64> %x, splat (i64 67108863) %y_masked = and <5 x i64> %x, splat (i64 67108863) %mul = mul <5 x i64> %x_masked, %y_masked @@ -462,30 +572,30 @@ define <5 x i64> @test_v5i64(<5 x i64> %x, <5 x i64> %y, <5 x i64> %z) { } define <6 x i64> @test_v6i64(<6 x i64> %x, <6 x i64> %y, <6 x i64> %z) { -; AVX-LABEL: test_v6i64: -; AVX: # %bb.0: -; AVX-NEXT: movq %rdi, %rax -; AVX-NEXT: vmovq %r8, %xmm0 -; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vmovq %rsi, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm1 -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm1 -; AVX-NEXT: vmovq %r9, %xmm0 -; AVX-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpmuldq %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpaddq {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; 
AVX-NEXT: vmovdqa %xmm0, 32(%rdi) -; AVX-NEXT: vmovdqa %ymm1, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_v6i64: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: movq %rdi, %rax +; AVXIFMA-NEXT: vmovq %r8, %xmm0 +; AVXIFMA-NEXT: vmovq %rcx, %xmm1 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVXIFMA-NEXT: vmovq %rdx, %xmm1 +; AVXIFMA-NEXT: vmovq %rsi, %xmm2 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVXIFMA-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVXIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm1 +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm2 = [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm1 +; AVXIFMA-NEXT: vmovq %r9, %xmm0 +; AVXIFMA-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; AVXIFMA-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVXIFMA-NEXT: vpmuldq %xmm0, %xmm0, %xmm0 +; AVXIFMA-NEXT: vpaddq {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVXIFMA-NEXT: vmovdqa %xmm0, 32(%rdi) +; AVXIFMA-NEXT: vmovdqa %ymm1, (%rdi) +; AVXIFMA-NEXT: vzeroupper +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_v6i64: ; AVX512: # %bb.0: @@ -493,6 +603,13 @@ define <6 x i64> @test_v6i64(<6 x i64> %x, <6 x i64> %y, <6 x i64> %z) { ; AVX512-NEXT: vpmuludq %zmm0, %zmm0, %zmm0 ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_v6i64: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm0, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <6 x i64> %x, splat (i64 67108863) %y_masked = and <6 x i64> %x, splat (i64 67108863) %mul = mul <6 x i64> %x_masked, %y_masked @@ -501,43 +618,43 @@ define <6 x i64> @test_v6i64(<6 x i64> %x, <6 x i64> %y, <6 x i64> %z) { } define <9 x i64> @test_v9i64(<9 x i64> %x, <9 x i64> %y, <9 x i64> %z) { -; AVX-LABEL: test_v9i64: -; AVX: # %bb.0: -; AVX-NEXT: movq %rdi, %rax -; AVX-NEXT: vmovq %r8, %xmm0 -; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vmovq %rsi, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vmovq %r9, %xmm1 -; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 -; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm3 -; AVX-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm4 -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm5 = [67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF -; AVX-NEXT: vmovq %rcx, %xmm5 -; AVX-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero -; AVX-NEXT: vpand %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpsrlq $32, %xmm5, %xmm6 -; AVX-NEXT: vpmuludq %xmm6, %xmm5, %xmm6 -; AVX-NEXT: vpsllq $33, %xmm6, %xmm6 -; AVX-NEXT: vpmuludq %xmm5, %xmm5, %xmm5 -; AVX-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX-NEXT: vpaddq %xmm6, %xmm2, %xmm2 -; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm4 -; AVX-NEXT: {vex} vpmadd52luq %ymm1, %ymm1, %ymm3 -; AVX-NEXT: vmovdqa %ymm3, 32(%rdi) -; AVX-NEXT: vmovdqa %ymm4, (%rdi) -; AVX-NEXT: vmovq %xmm2, 64(%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_v9i64: +; AVXIFMA: # 
%bb.0: +; AVXIFMA-NEXT: movq %rdi, %rax +; AVXIFMA-NEXT: vmovq %r8, %xmm0 +; AVXIFMA-NEXT: vmovq %rcx, %xmm1 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVXIFMA-NEXT: vmovq %rdx, %xmm1 +; AVXIFMA-NEXT: vmovq %rsi, %xmm2 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVXIFMA-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVXIFMA-NEXT: vmovq %r9, %xmm1 +; AVXIFMA-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVXIFMA-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVXIFMA-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVXIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm3 +; AVXIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm4 +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm5 = [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVXIFMA-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF +; AVXIFMA-NEXT: vmovq %rcx, %xmm5 +; AVXIFMA-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero +; AVXIFMA-NEXT: vpand %xmm5, %xmm6, %xmm5 +; AVXIFMA-NEXT: vpsrlq $32, %xmm5, %xmm6 +; AVXIFMA-NEXT: vpmuludq %xmm6, %xmm5, %xmm6 +; AVXIFMA-NEXT: vpsllq $33, %xmm6, %xmm6 +; AVXIFMA-NEXT: vpmuludq %xmm5, %xmm5, %xmm5 +; AVXIFMA-NEXT: vpaddq %xmm2, %xmm5, %xmm2 +; AVXIFMA-NEXT: vpaddq %xmm6, %xmm2, %xmm2 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm4 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm1, %ymm3 +; AVXIFMA-NEXT: vmovdqa %ymm3, 32(%rdi) +; AVXIFMA-NEXT: vmovdqa %ymm4, (%rdi) +; AVXIFMA-NEXT: vmovq %xmm2, 64(%rdi) +; AVXIFMA-NEXT: vzeroupper +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_v9i64: ; AVX512: # %bb.0: @@ -572,6 +689,44 @@ define <9 x i64> @test_v9i64(<9 x i64> %x, <9 x i64> %y, <9 x i64> %z) { ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_v9i64: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: movq %rdi, %rax +; AVX512-NOIFMA-NEXT: vmovq %r8, %xmm0 +; AVX512-NOIFMA-NEXT: vmovq %rcx, %xmm1 +; AVX512-NOIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NOIFMA-NEXT: vmovq %rdx, %xmm1 +; AVX512-NOIFMA-NEXT: vmovq %rsi, %xmm2 +; AVX512-NOIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NOIFMA-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NOIFMA-NEXT: vmovq %r9, %xmm1 +; AVX512-NOIFMA-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NOIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NOIFMA-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF +; AVX512-NOIFMA-NEXT: vmovq %rcx, %xmm2 +; AVX512-NOIFMA-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NOIFMA-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX512-NOIFMA-NEXT: vpsrlq $32, %xmm2, %xmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %xmm3, %xmm2, %xmm3 +; AVX512-NOIFMA-NEXT: vpsllq $33, %xmm3, %xmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %xmm2, %xmm2, %xmm2 +; AVX512-NOIFMA-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX512-NOIFMA-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512-NOIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm3 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm2, %ymm3 +; AVX512-NOIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm2 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm2 +; AVX512-NOIFMA-NEXT: 
vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; AVX512-NOIFMA-NEXT: vmovq %xmm1, 64(%rdi) +; AVX512-NOIFMA-NEXT: vmovdqa64 %zmm0, (%rdi) +; AVX512-NOIFMA-NEXT: vzeroupper +; AVX512-NOIFMA-NEXT: retq %x_masked = and <9 x i64> %x, splat (i64 67108863) %y_masked = and <9 x i64> %x, splat (i64 67108863) %mul = mul <9 x i64> %x_masked, %y_masked |
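A note on when this combine applies at all, independent of the new 512-bit splitting: the add is only turned into vpmadd52luq when the two multiplicands are known to use at most 52 bits between them, so that the full product equals its low 52 bits. That is what the masking in the tests above establishes: 26-bit by 26-bit (and 50-bit by 2-bit) operands are combinable, while 52-bit by 52-bit or 40-bit by 13-bit operands are not. The sketch below restates the non-combinable case from test_512_no_combine, with an illustrative function name.

define <8 x i64> @no_madd52_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
  ; 52-bit * 52-bit can need up to 104 bits, so the low-52-bit guarantee is lost
  ; and the generic 64-bit multiply/add lowering is kept.
  %xm = and <8 x i64> %x, splat (i64 4503599627370495) ; (1 << 52) - 1
  %ym = and <8 x i64> %y, splat (i64 4503599627370495) ; (1 << 52) - 1
  %mul = mul nuw nsw <8 x i64> %xm, %ym
  %acc = add <8 x i64> %mul, %z
  ret <8 x i64> %acc
}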