diff options
author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2023-12-08 11:38:20 +0000 |
---|---|---|
committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2023-12-08 11:38:32 +0000 |
commit | 5f91335a55cd65dda8351f85b93eeaa7493e06c4 (patch) | |
tree | 9bb6dd82db2c275535d1aed73015abc368317eec | |
parent | ffd61c1e96e9c8a472f305585930b45be0d639d3 (diff) | |
download | llvm-5f91335a55cd65dda8351f85b93eeaa7493e06c4.zip llvm-5f91335a55cd65dda8351f85b93eeaa7493e06c4.tar.gz llvm-5f91335a55cd65dda8351f85b93eeaa7493e06c4.tar.bz2 |
[X86] canonicalizeBitSelect - always use VPTERNLOGD for sub-32bit types
We were using VPTERNLOGQ for everything but i32 types, which made broadcasts wider than necessary
Noticed in #73509
29 files changed, 381 insertions, 398 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 620014c..f7fe82f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48707,7 +48707,7 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, if (useVPTERNLOG(Subtarget, VT)) { // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C. // VPTERNLOG is only available as vXi32/64-bit types. - MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64; + MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64; MVT OpVT = MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits()); SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1)); diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith.ll b/llvm/test/CodeGen/X86/avx512fp16-arith.ll index 77cab3d..8d811d8 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-arith.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-arith.ll @@ -329,7 +329,7 @@ define half @fcopysign(half %x, half %y) { ; CHECK-LABEL: fcopysign: ; CHECK: ## %bb.0: ; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; CHECK-NEXT: vpternlogq $226, %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: vpternlogd $226, %xmm1, %xmm2, %xmm0 ; CHECK-NEXT: retq %a = call half @llvm.copysign.f16(half %x, half %y) ret half %a @@ -384,7 +384,7 @@ declare <8 x half> @llvm.fabs.v8f16(<8 x half>) define <8 x half> @fcopysignv8f16(<8 x half> %x, <8 x half> %y) { ; CHECK-LABEL: fcopysignv8f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 +; CHECK-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; CHECK-NEXT: retq %a = call <8 x half> @llvm.copysign.v8f16(<8 x half> %x, <8 x half> %y) ret <8 x half> %a @@ -439,7 +439,7 @@ declare <16 x half> @llvm.fabs.v16f16(<16 x half>) define <16 x half> @fcopysignv16f16(<16 x half> %x, <16 x half> %y) { ; CHECK-LABEL: fcopysignv16f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; CHECK-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; CHECK-NEXT: retq %a = call <16 x half> @llvm.copysign.v16f16(<16 x half> %x, <16 x half> %y) ret <16 x half> %a @@ -494,7 +494,7 @@ declare <32 x half> @llvm.fabs.v32f16(<32 x half>) define <32 x half> @fcopysignv32f16(<32 x half> %x, <32 x half> %y) { ; CHECK-LABEL: fcopysignv32f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; CHECK-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; CHECK-NEXT: retq %a = call <32 x half> @llvm.copysign.v32f16(<32 x half> %x, <32 x half> %y) ret <32 x half> %a diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll index 6cd85e0..8f12720 100644 --- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll @@ -31,7 +31,7 @@ define <16 x i8> @splatconstant_fshl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $3, %xmm0, %xmm2 ; GFNIAVX512-NEXT: vpsrlw $5, %xmm1, %xmm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0 +; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 ; GFNIAVX512-NEXT: retq %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>) ret <16 x i8> %res @@ -119,7 +119,7 @@ define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $4, %ymm0, %ymm2 ; GFNIAVX512-NEXT: vpsrlw $4, %ymm1, %ymm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 +; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 ; GFNIAVX512-NEXT: retq %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>) ret <32 x i8> %res @@ -175,7 +175,7 @@ define <32 x i8> @splatconstant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $2, %ymm0, %ymm2 ; GFNIAVX512-NEXT: vpsrlw $6, %ymm1, %ymm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 +; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 ; GFNIAVX512-NEXT: retq %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>) ret <32 x i8> %res @@ -339,7 +339,7 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $6, %zmm0, %zmm2 ; GFNIAVX512-NEXT: vpsrlw $2, %zmm1, %zmm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; GFNIAVX512-NEXT: retq %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll index 29b58d0..8b94a3f 100644 --- a/llvm/test/CodeGen/X86/gfni-rotates.ll +++ b/llvm/test/CodeGen/X86/gfni-rotates.ll @@ -32,7 +32,7 @@ define <16 x i8> @splatconstant_rotl_v16i8(<16 x i8> %a) nounwind { ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $3, %xmm0, %xmm1 ; GFNIAVX512-NEXT: vpsrlw $5, %xmm0, %xmm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 +; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; GFNIAVX512-NEXT: retq %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>) ret <16 x i8> %res @@ -121,7 +121,7 @@ define <32 x i8> @splatconstant_rotl_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $4, %ymm0, %ymm1 ; GFNIAVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; GFNIAVX512-NEXT: retq %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>) ret <32 x i8> %res @@ -177,7 +177,7 @@ define <32 x i8> @splatconstant_rotr_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $2, %ymm0, %ymm1 ; GFNIAVX512-NEXT: vpsrlw $6, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; GFNIAVX512-NEXT: retq %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>) ret <32 x i8> %res @@ -344,7 +344,7 @@ define <64 x i8> @splatconstant_rotr_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $6, %zmm0, %zmm1 ; GFNIAVX512-NEXT: vpsrlw $2, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; GFNIAVX512-NEXT: retq %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index 7dd4af7..5cd0c23 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -2010,7 +2010,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-v ; CHECK: # %bb.0: ; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; CHECK-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; CHECK-NEXT: retq %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> @@ -2023,7 +2023,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-le ; CHECK: # %bb.0: ; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; CHECK-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; CHECK-NEXT: retq %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> diff --git a/llvm/test/CodeGen/X86/vec_fcopysign.ll b/llvm/test/CodeGen/X86/vec_fcopysign.ll index 4572107..b34b02c 100644 --- a/llvm/test/CodeGen/X86/vec_fcopysign.ll +++ b/llvm/test/CodeGen/X86/vec_fcopysign.ll @@ -159,8 +159,8 @@ define <8 x half> @fcopysign_v8f16(ptr %p0, ptr %p1) nounwind { ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512-NEXT: vmovdqa (%ecx), %xmm1 -; X86-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X86-AVX512-NEXT: vpternlogq $202, (%eax), %xmm1, %xmm0 +; X86-AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147450879,2147450879,2147450879,2147450879] +; X86-AVX512-NEXT: vpternlogd $202, (%eax), %xmm1, %xmm0 ; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v8f16: @@ -193,8 +193,8 @@ define <8 x half> @fcopysign_v8f16(ptr %p0, ptr %p1) nounwind { ; X64-AVX512-LABEL: fcopysign_v8f16: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm1 -; X64-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm0 = [9223231297218904063,9223231297218904063] -; X64-AVX512-NEXT: vpternlogq $202, (%rsi), %xmm1, %xmm0 +; X64-AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147450879,2147450879,2147450879,2147450879] +; X64-AVX512-NEXT: vpternlogd $202, (%rsi), %xmm1, %xmm0 ; X64-AVX512-NEXT: retq %a0 = load <8 x half>, ptr %p0, align 16 %a1 = load <8 x half>, ptr %p1, align 16 @@ -405,8 +405,8 @@ define <16 x half> @fcopysign_v16f16(ptr %p0, ptr %p1) nounwind { ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512-NEXT: vmovdqu (%ecx), %ymm1 -; X86-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X86-AVX512-NEXT: vpternlogq $202, (%eax), %ymm1, %ymm0 +; X86-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] +; X86-AVX512-NEXT: vpternlogd $202, (%eax), %ymm1, %ymm0 ; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v16f16: @@ -444,8 +444,8 @@ define <16 x half> @fcopysign_v16f16(ptr %p0, ptr %p1) nounwind { ; X64-AVX512-LABEL: fcopysign_v16f16: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm1 -; X64-AVX512-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9223231297218904063,9223231297218904063,9223231297218904063,9223231297218904063] -; X64-AVX512-NEXT: vpternlogq $202, (%rsi), %ymm1, %ymm0 +; X64-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] +; X64-AVX512-NEXT: vpternlogd $202, (%rsi), %ymm1, %ymm0 ; X64-AVX512-NEXT: retq %a0 = load <16 x half>, ptr %p0, align 16 %a1 = load <16 x half>, ptr %p1, align 16 @@ -691,34 +691,14 @@ define <32 x half> @fcopysign_v32f16(ptr %p0, ptr %p1) nounwind { ; X86-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 ; X86-AVX2-NEXT: retl ; -; X86-AVX512VL-LABEL: fcopysign_v32f16: -; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512VL-NEXT: vmovdqu64 (%ecx), %zmm1 -; X86-AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X86-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; X86-AVX512VL-NEXT: vpternlogq $202, (%eax), %zmm1, %zmm0 -; X86-AVX512VL-NEXT: retl -; -; X86-AVX512FP16-LABEL: fcopysign_v32f16: -; X86-AVX512FP16: # %bb.0: -; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512FP16-NEXT: vmovdqu64 (%ecx), %zmm1 -; X86-AVX512FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X86-AVX512FP16-NEXT: vpternlogq $202, (%eax), %zmm1, %zmm0 -; X86-AVX512FP16-NEXT: retl -; -; X86-AVX512VLDQ-LABEL: fcopysign_v32f16: -; X86-AVX512VLDQ: # %bb.0: -; X86-AVX512VLDQ-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512VLDQ-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512VLDQ-NEXT: vmovdqu64 (%ecx), %zmm1 -; X86-AVX512VLDQ-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X86-AVX512VLDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; X86-AVX512VLDQ-NEXT: vpternlogq $202, (%eax), %zmm1, %zmm0 -; X86-AVX512VLDQ-NEXT: retl +; X86-AVX512-LABEL: fcopysign_v32f16: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vmovdqu64 (%ecx), %zmm1 +; X86-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] +; X86-AVX512-NEXT: vpternlogd $202, (%eax), %zmm1, %zmm0 +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v32f16: ; X64-SSE: # %bb.0: @@ -769,8 +749,8 @@ define <32 x half> @fcopysign_v32f16(ptr %p0, ptr %p1) nounwind { ; X64-AVX512-LABEL: fcopysign_v32f16: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqu64 (%rdi), %zmm1 -; X64-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [9223231297218904063,9223231297218904063,9223231297218904063,9223231297218904063,9223231297218904063,9223231297218904063,9223231297218904063,9223231297218904063] -; X64-AVX512-NEXT: vpternlogq $202, (%rsi), %zmm1, %zmm0 +; X64-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] +; X64-AVX512-NEXT: vpternlogd $202, (%rsi), %zmm1, %zmm0 ; X64-AVX512-NEXT: retq %a0 = load <32 x half>, ptr %p0, align 16 %a1 = load <32 x half>, ptr %p1, align 16 @@ -786,3 +766,6 @@ declare <32 x half> @llvm.copysign.v32f16(<32 x half>, <32 x half>) ; X64-AVX512VLDQ: {{.*}} ; X86: {{.*}} ; X86-AVX: {{.*}} +; X86-AVX512FP16: {{.*}} +; X86-AVX512VL: {{.*}} +; X86-AVX512VLDQ: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index 2d0e92a..4f100cd 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -2409,7 +2409,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512F-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2418,14 +2418,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512BW-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2434,7 +2434,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512VBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper ; AVX512VBMI2-NEXT: retq @@ -2443,14 +2443,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0 +; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0 +; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOP-LABEL: splatconstant_funnnel_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index 43d426c..05be4e1 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -2303,7 +2303,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; @@ -2311,14 +2311,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512BW-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; @@ -2326,7 +2326,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VBMI2-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq ; @@ -2334,14 +2334,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 +; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX10-LABEL: splatconstant_funnnel_v32i8: ; AVX10: # %bb.0: ; AVX10-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX10-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX10-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 +; AVX10-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 ; AVX10-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll index c246aaf..0724d87 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -1124,7 +1124,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v64i8: @@ -1137,35 +1137,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm0 -; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm0 -; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm0 -; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm0 -; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512VLVBMI2-NEXT: retq %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll index 37d4f3b..9ddd171 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -1851,7 +1851,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1860,14 +1860,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1876,14 +1876,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 +; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper ; AVX512VBMI2-NEXT: retq @@ -1892,7 +1892,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 +; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOP-LABEL: splatconstant_funnnel_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index 8762072..58719e6 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -447,12 +447,12 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2 @@ -467,12 +467,12 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 @@ -1652,7 +1652,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; @@ -1660,14 +1660,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; @@ -1675,14 +1675,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq ; @@ -1690,7 +1690,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll index 0a473dd..edfa56a 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -131,15 +131,15 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm3 ; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm4 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm5 = [17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520] -; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm4 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160] +; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $6, %ymm2, %ymm4 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm6 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm7 = [18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596] -; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm7, %zmm6 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268] +; AVX512F-NEXT: vpternlogd $226, %zmm4, %zmm7, %zmm6 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4 @@ -151,12 +151,12 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm4 +; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm7, %zmm4 +; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm7, %zmm4 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3 @@ -173,15 +173,15 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm4 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520] -; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm5, %ymm4 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160] +; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $6, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm6 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm7 = [18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596] -; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm7, %ymm6 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268] +; AVX512VL-NEXT: vpternlogd $226, %ymm4, %ymm7, %ymm6 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm4 @@ -192,12 +192,12 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm5, %ymm4 +; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm7, %ymm4 +; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm7, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 @@ -721,7 +721,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v64i8: @@ -733,35 +733,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VLVBMI2-NEXT: retq %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index 809735a..eb2df0d 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -2403,7 +2403,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512F-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2412,14 +2412,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512BW-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2428,7 +2428,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512VBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper ; AVX512VBMI2-NEXT: retq @@ -2437,14 +2437,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0 +; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0 +; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOP-LABEL: splatconstant_funnnel_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index fa41a10..1a6ecea 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -2104,7 +2104,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; @@ -2112,14 +2112,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512BW-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; @@ -2127,7 +2127,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VBMI2-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq ; @@ -2135,14 +2135,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 +; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX10-LABEL: splatconstant_funnnel_v32i8: ; AVX10: # %bb.0: ; AVX10-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX10-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX10-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 +; AVX10-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 ; AVX10-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll index b1fee9d..1c66461 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -1180,7 +1180,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v64i8: @@ -1193,35 +1193,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm0 -; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm0 -; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm0 -; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm0 -; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 ; AVX512VLVBMI2-NEXT: retq %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index 9c5fe49..402eb73 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -1915,7 +1915,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1924,14 +1924,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1940,14 +1940,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 +; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper ; AVX512VBMI2-NEXT: retq @@ -1956,7 +1956,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 +; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOP-LABEL: splatconstant_funnnel_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index e4867ba..bb31146 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -473,17 +473,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm2 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -492,17 +492,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -1703,7 +1703,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; @@ -1711,14 +1711,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; @@ -1726,14 +1726,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq ; @@ -1741,7 +1741,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll index 3c17bf2..4364c04 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -131,36 +131,36 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm4 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm5 = [1085102592571150095,1085102592571150095,1085102592571150095,1085102592571150095,1085102592571150095,1085102592571150095,1085102592571150095,1085102592571150095] -; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm4 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135] +; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $6, %ymm2, %ymm4 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm6 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm7 = [4557430888798830399,4557430888798830399,4557430888798830399,4557430888798830399,4557430888798830399,4557430888798830399,4557430888798830399,4557430888798830399] -; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm7, %zmm6 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm7 = [1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567] +; AVX512F-NEXT: vpternlogd $226, %zmm4, %zmm7, %zmm6 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm4 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm6 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm8 = [9187201950435737471,9187201950435737471,9187201950435737471,9187201950435737471,9187201950435737471,9187201950435737471,9187201950435737471,9187201950435737471] -; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm8, %zmm6 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm8 = [2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143] +; AVX512F-NEXT: vpternlogd $226, %zmm4, %zmm8, %zmm6 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm4 +; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm3 ; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm7, %zmm4 +; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm7, %zmm4 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm8, %zmm4 +; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm8, %zmm4 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -171,36 +171,36 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm4 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1085102592571150095,1085102592571150095,1085102592571150095,1085102592571150095] -; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm5, %ymm4 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135] +; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $6, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsrlw $2, %ymm2, %ymm6 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm7 = [4557430888798830399,4557430888798830399,4557430888798830399,4557430888798830399] -; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm7, %ymm6 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567] +; AVX512VL-NEXT: vpternlogd $226, %ymm4, %ymm7, %ymm6 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsrlw $1, %ymm2, %ymm6 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm8 = [9187201950435737471,9187201950435737471,9187201950435737471,9187201950435737471] -; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm8, %ymm6 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143] +; AVX512VL-NEXT: vpternlogd $226, %ymm4, %ymm8, %ymm6 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm5, %ymm4 +; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm7, %ymm4 +; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm7, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm8, %ymm4 +; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm8, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -721,7 +721,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v64i8: @@ -733,35 +733,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VLVBMI2-NEXT: retq %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll index e6de73d..3dc5818 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -1218,7 +1218,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10> ; AVX512F-NEXT: vpermd (%rdx), %zmm4, %zmm5 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] ; AVX512F-NEXT: vpshufb %ymm6, %ymm3, %ymm3 @@ -2386,7 +2386,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10> ; AVX512F-NEXT: vpermd 64(%rdx), %zmm18, %zmm10 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm10 +; AVX512F-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm10 ; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa %ymm6, %ymm2 ; AVX512F-NEXT: vpshufb %ymm6, %ymm0, %ymm0 @@ -2469,7 +2469,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-NEXT: vpermd (%rdx), %zmm18, %zmm1 -; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm1 +; AVX512F-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index 9f479b0..59866f2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -714,7 +714,7 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 ; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -753,7 +753,7 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 ; AVX512F-FAST-NEXT: vpsrlq $48, %xmm1, %xmm1 ; AVX512F-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] @@ -1396,7 +1396,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] @@ -1474,7 +1474,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] @@ -2804,7 +2804,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm0 ; AVX512F-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm2 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm19 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm19 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u> ; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm0 @@ -2862,7 +2862,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm2 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm10[0,1,1,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5] ; AVX512F-SLOW-NEXT: vprolq $16, %ymm3, %ymm8 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3],ymm4[4],ymm8[5,6],ymm4[7],ymm8[8,9],ymm4[10],ymm8[11],ymm4[12],ymm8[13,14],ymm4[15] @@ -2889,7 +2889,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm4 ; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm5 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm20[1,1,2,2] @@ -2995,7 +2995,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm3 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,1,1] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm10 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm10 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm10 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm15 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm1 @@ -3049,7 +3049,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm1 ; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm3 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm1 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm5 ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 @@ -3072,7 +3072,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm2 ; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm3 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 64(%r9) @@ -5924,12 +5924,12 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm2 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm18, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm31, %zmm18, %zmm1 ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm25, %zmm19, %zmm27 ; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm2 ; AVX512F-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm25 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm27, %zmm18, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm27, %zmm18, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm31 # 64-byte Folded Reload @@ -5972,28 +5972,28 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm18 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm18, %zmm8 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm18, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm22, %zmm18, %zmm8 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm27, %zmm22 ; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm21 = mem[0,1,0,1,4,5,4,5] ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm22, %zmm24, %zmm21 ; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm22 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm22, %zmm10 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm18, %zmm10 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm21, %zmm18, %zmm10 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm24, %zmm11 ; AVX512F-SLOW-NEXT: vpbroadcastq 112(%r8), %ymm12 ; AVX512F-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm13 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm16, %zmm12 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm11, %zmm16, %zmm12 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm24, %zmm4 ; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm6 ; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm7 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm16, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm4, %zmm16, %zmm6 ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm30, %zmm19, %zmm9 ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm19, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] @@ -6186,7 +6186,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 112(%r8), %ymm0 ; AVX512F-FAST-NEXT: vpbroadcastq 120(%r8), %ymm1 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm2, %zmm17, %zmm16 ; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm8 ; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm24 @@ -6231,7 +6231,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm6 ; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm10 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm7, %zmm17, %zmm6 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm7, %zmm17, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] @@ -6267,7 +6267,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 96(%r8), %ymm5 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm8, %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm8, %zmm5, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 ; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] @@ -6282,7 +6282,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm4 ; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm7 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm8, %zmm5, %zmm4 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm8, %zmm5, %zmm4 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload @@ -6294,10 +6294,10 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 64(%r8), %ymm5 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm20, %zmm5, %zmm2 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm20, %zmm5, %zmm2 ; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm7 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm22, %zmm5, %zmm7 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm22, %zmm5, %zmm7 ; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm26 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vpternlogq $226, %zmm25, %zmm1, %zmm12 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index 7133214..b2c0e00 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -3486,19 +3486,19 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm10, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm17, %zmm10, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm10, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm20, %zmm10, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm16, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm23, %zmm12, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm12, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm12, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm4, %zmm6, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm0, %zmm6, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) @@ -3695,23 +3695,23 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm0, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm18[0,1,2,3],zmm10[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm0, %zmm3, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm22, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm22, %zmm0, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm19, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm19, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm17, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm17, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm16, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm16, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512F-ONLY-FAST-NEXT: popq %rax ; AVX512F-ONLY-FAST-NEXT: vzeroupper @@ -3935,19 +3935,19 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm26, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm22, %zmm8 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm21, %zmm7, %zmm8 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm9, %zmm7 +; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm20, %zmm9, %zmm7 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm23, %zmm9, %zmm10 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm2, %zmm3, %zmm1 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm0, %zmm3, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) @@ -4154,22 +4154,22 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,2,1,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm4, %zmm5, %zmm6 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm7 +; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm3, %zmm5, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm1, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm18, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm18, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm20[0,1,2,3],zmm17[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm13 +; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm0, %zmm1, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) ; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm23[0,1,2,3] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm0, %zmm1, %zmm12 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -7763,10 +7763,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm23 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm23 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm2, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 @@ -7862,27 +7862,27 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm12 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm20, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm24, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm27, %zmm10, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm27, %zmm10, %zmm16 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm25, %zmm10, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm25, %zmm10, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm22, %zmm10, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm13, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm28, %zmm13, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm13, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm30, %zmm13, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm21, %zmm13, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm0, %zmm13, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) @@ -8295,40 +8295,40 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm18[0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm0, %zmm7, %zmm6 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm20[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm0, %zmm7, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 448(%rax) ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm28[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm0, %zmm7, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 640(%rax) ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm17[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm0, %zmm7, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm16, %zmm0, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm16, %zmm0, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, (%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm26, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm26, %zmm0, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm25, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm25, %zmm1, %zmm10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 384(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 576(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 512(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 704(%rax) ; AVX512F-ONLY-FAST-NEXT: addq $1256, %rsp # imm = 0x4E8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper @@ -8731,10 +8731,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm3 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8807,27 +8807,27 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm4 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm4 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm25 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm25 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm25 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm23, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm6 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm6 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm17, %zmm16 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm16 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm16 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm24, %zmm14 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm14 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm14 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm18, %zmm5 +; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm29, %zmm18, %zmm5 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm19, %zmm9, %zmm8 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm20, %zmm9, %zmm0 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm9, %zmm7 +; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm22, %zmm9, %zmm7 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm9, %zmm10 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 448(%rax) @@ -9253,43 +9253,43 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[0,0,2,1,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm18, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm9, %zmm11, %zmm13 +; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm9, %zmm11, %zmm13 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm5, %zmm11, %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm0, %zmm5, %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm29, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm29, %zmm5, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 320(%rax) ; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm20[0,1,2,3],zmm25[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm0, %zmm3, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) ; AVX512DQ-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: # zmm0 = zmm17[0,1,2,3],mem[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm0, %zmm3, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 448(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm11, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm12, %zmm11, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 576(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 512(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 704(%rax) ; AVX512DQ-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: # zmm0 = zmm26[0,1,2,3],mem[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm0, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 640(%rax) ; AVX512DQ-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: # zmm0 = zmm22[0,1,2,3],mem[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm0, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512DQ-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8 ; AVX512DQ-FAST-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index 5934b80..3de1918 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -458,7 +458,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[6,7],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-SLOW-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512F-SLOW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 ; AVX512F-SLOW-NEXT: vmovq %xmm0, 48(%rax) @@ -1059,7 +1059,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpbroadcastd 4(%r10), %ymm12 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 ; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11 ; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1141,7 +1141,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpbroadcastd 4(%r10), %ymm5 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512F-FAST-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512F-FAST-NEXT: vmovdqa %ymm1, 64(%rax) @@ -2427,22 +2427,22 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm4, %zmm4 ; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm5 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm4 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm20[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm7 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm3 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm1[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm0 ; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm1 ; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm4 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm1 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm6 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,1,3,2] @@ -2556,20 +2556,20 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm2 ; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm19[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 +; AVX512F-FAST-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm11[0,0,1,1,4,4,5,5] -; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm0 +; AVX512F-FAST-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm0 ; AVX512F-FAST-NEXT: vpbroadcastd (%rax), %ymm2 ; AVX512F-FAST-NEXT: vpbroadcastd 4(%rax), %ymm3 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm2 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm2 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm4 ; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm1 @@ -5262,12 +5262,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm9 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm15[2,1,3,2] ; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm9 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm18, %zmm8 @@ -5276,7 +5276,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm9 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm8, %zmm8 @@ -5295,7 +5295,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm10 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm9 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm14, %zmm2 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 @@ -5303,7 +5303,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] ; AVX512F-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpermd (%rax), %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm2 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) @@ -5541,11 +5541,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] ; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermi2d %zmm3, %zmm8, %zmm7 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 ; AVX512F-FAST-NEXT: vpbroadcastd 36(%rax), %ymm6 ; AVX512F-FAST-NEXT: vpbroadcastd 40(%rax), %ymm8 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm6 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] ; AVX512F-FAST-NEXT: vpternlogq $226, %zmm26, %zmm8, %zmm5 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 @@ -5558,7 +5558,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm19[0,0,1,3] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm18[2,2,2,3] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm17[2,1,3,2] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm5 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm5 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] @@ -5574,7 +5574,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 ; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm21, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm3 ; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm4 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm9 # 32-byte Folded Reload @@ -5591,7 +5591,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm16 ; AVX512F-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm0 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm0 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm0 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) @@ -10946,7 +10946,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] ; AVX512F-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpermd 64(%rax), %zmm11, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm17, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[3,3,3,3,7,7,7,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] @@ -10973,7 +10973,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm5 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm5, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm2, %zmm5, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 @@ -11007,7 +11007,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm2 ; AVX512F-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm3 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm16, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 @@ -11049,7 +11049,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm5 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm9, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm5 @@ -11085,7 +11085,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm2 ; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm3 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm9, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] @@ -11120,7 +11120,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 ; AVX512F-SLOW-NEXT: vpermd (%rax), %zmm11, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm2, %zmm17, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] @@ -11145,7 +11145,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] ; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm12 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm25 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm15, %zmm25 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm15, %zmm25 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 ; AVX512F-SLOW-NEXT: vprold $16, %xmm12, %xmm15 @@ -11178,7 +11178,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm0 ; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm6 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm28 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm28 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm28 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm2 @@ -11284,9 +11284,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm23 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm0, %zmm1, %zmm23 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload @@ -11643,7 +11643,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm19, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm22, %zmm2 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm3, %zmm22, %zmm2 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm0 @@ -11679,7 +11679,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] ; AVX512F-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm25, %zmm0 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm3, %zmm25, %zmm0 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm0 @@ -11776,7 +11776,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastd 68(%rax), %ymm6 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm23 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm4, %zmm23 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm2, %zmm4, %zmm23 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm23 ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -11790,7 +11790,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512F-FAST-NEXT: vpbroadcastd 4(%rax), %ymm6 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm20 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm4, %zmm20 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm1, %zmm4, %zmm20 ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm9, %zmm2, %zmm20 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] @@ -11835,7 +11835,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] ; AVX512F-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm5, %zmm19 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm19 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm2, %zmm25, %zmm19 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm19 ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 @@ -11851,7 +11851,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastd 96(%rax), %ymm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm21 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm13, %zmm30, %zmm21 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm13, %zmm30, %zmm21 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm21 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm2 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm1 @@ -11904,7 +11904,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastd 104(%rax), %ymm6 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm22, %zmm2 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm3, %zmm22, %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[3,3,3,3,7,7,7,7] @@ -11918,7 +11918,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm31, %ymm9 ; AVX512F-FAST-NEXT: vpbroadcastd 32(%rax), %ymm18 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm10, %zmm30, %zmm9 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm10, %zmm30, %zmm9 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm9 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm2 @@ -11944,7 +11944,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastd 36(%rax), %ymm3 ; AVX512F-FAST-NEXT: vpbroadcastd 40(%rax), %ymm5 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm22, %zmm3 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm4, %zmm22, %zmm3 ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm3 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] ; AVX512F-FAST-NEXT: vpternlogq $184, %ymm7, %ymm0, %ymm29 @@ -11955,7 +11955,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm1 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm29[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm12 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm12 ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index f4fda97..33f4e4b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -927,7 +927,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vporq %zmm7, %zmm5, %zmm5 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u> ; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] @@ -966,7 +966,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,0,0,0,0,u,1,1,9,9,u,10,10,10,10,u> ; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm7, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] @@ -1886,7 +1886,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u> ; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm8, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,13],zero,ymm3[u,u,u,14],zero,ymm3[u,u,u,15],zero,ymm3[u,u,u,16],zero,ymm3[u,u,u,17],zero,ymm3[u,u,u,18],zero,ymm3[u,u] ; AVX512F-SLOW-NEXT: vpor %ymm5, %ymm8, %ymm5 @@ -1968,7 +1968,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,0,0,0,0,u,1,1,9,9,u,10,10,10,10,u> ; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u] ; AVX512F-FAST-NEXT: vpor %ymm5, %ymm8, %ymm5 @@ -4041,15 +4041,15 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm3 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm9, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm30 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm30 ; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm25 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm27[0,0,1,1,4,4,5,5] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm8[0,0,1,1,4,4,5,5] ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u> ; AVX512F-SLOW-NEXT: vpermd %zmm31, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 64(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 128(%r9) @@ -4202,7 +4202,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; AVX512F-FAST-NEXT: vpternlogq $226, %zmm22, %zmm21, %zmm3 ; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm30, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm8 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm8 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm3 = mem[0,0,1,1] @@ -4226,13 +4226,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm27 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9> ; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm18[0,0,1,1,4,4,5,5] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm11[0,0,1,1,4,4,5,5] ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,0,0,0,0,u,1,1,9,9,u,10,10,10,10,u> ; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 64(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 128(%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index 3bc7b6e..d16daa0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -1750,7 +1750,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm13 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm14, %zmm13 +; AVX512F-SLOW-NEXT: vpternlogd $226, %zmm6, %zmm14, %zmm13 ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm11 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm12 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] @@ -1761,7 +1761,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm15 ; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm13 @@ -1779,7 +1779,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm7[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm8 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm12, %xmm10 @@ -1789,7 +1789,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm14, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm8, %zmm14, %zmm9 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm10 ; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm8 @@ -1815,7 +1815,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) @@ -1858,7 +1858,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm6 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm9 ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm8 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm11 @@ -1878,7 +1878,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vprold $16, %xmm14, %xmm14 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm14 = zmm13[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm14 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm14 ; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm12 ; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm15 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm13 @@ -1889,7 +1889,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm15, %zmm10 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,0,1,4,4,4,5] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm14, %zmm15, %zmm10 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm14, %zmm15, %zmm10 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] ; AVX512F-FAST-NEXT: vprold $16, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] @@ -1901,13 +1901,13 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX512F-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm15, %zmm3 +; AVX512F-FAST-NEXT: vpternlogd $226, %zmm5, %zmm15, %zmm3 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,16,17,22,23,24,25,24,25,24,25,24,25] ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) @@ -4306,7 +4306,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm28[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm13, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm8, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm5, %zmm8, %zmm1 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm29[0,0,0,1] ; AVX512F-SLOW-NEXT: vprold $16, %ymm30, %ymm9 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm21[0,0,0,1] @@ -4336,7 +4336,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm15[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm5 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm8, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm2, %zmm8, %zmm5 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm27[0,0,0,1,4,4,4,5] ; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm8 = mem[0,0,0,1,4,4,4,5] @@ -4350,7 +4350,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm8 = mem[0,0,0,1,4,4,4,5] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm10, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm2, %zmm10, %zmm8 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm26[0,0,0,1,4,4,4,5] ; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm11 = mem[0,0,0,1,4,4,4,5] @@ -4362,7 +4362,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm31[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm11 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm10, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm2, %zmm10, %zmm11 ; AVX512F-SLOW-NEXT: vpermq $234, (%rsp), %zmm2 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm2 = mem[2,2,2,3,6,6,6,7] ; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Folded Reload @@ -4382,11 +4382,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm2 = mem[2,2,2,3,6,6,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm0, %zmm6, %zmm2 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm23[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm0, %zmm6, %zmm3 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) @@ -4605,13 +4605,13 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm7 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm6 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm7, %zmm8, %zmm6 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) ; AVX512F-FAST-NEXT: vpternlogq $184, %ymm11, %ymm16, %ymm12 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm6 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm6[4,5,6,7] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm8, %zmm3 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm2, %zmm8, %zmm3 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm2 = mem[2,2,2,3,6,6,6,7] @@ -4634,13 +4634,13 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm7 = mem[2,2,2,3,6,6,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm8, %zmm7 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm3, %zmm8, %zmm7 ; AVX512F-FAST-NEXT: vpternlogq $184, %ymm6, %ymm2, %ymm31 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm3 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm31[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-FAST-NEXT: vpermq $234, (%rsp), %zmm6 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm6 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm8, %zmm6 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm3, %zmm8, %zmm6 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm23[0,0,0,1,4,4,4,5] ; AVX512F-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm8 = mem[0,0,0,1,4,4,4,5] @@ -4658,11 +4658,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm5[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm26[0,0,0,1,4,4,4,5] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm4, %zmm3 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm2, %zmm4, %zmm3 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm28[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vpternlogd $184, %zmm0, %zmm4, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 54ed0f1..a4f21ea 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -659,7 +659,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,12],zero,ymm1[u,u,u,u,5,13],zero,ymm1[u,u,u,u,6,14,22,u,u,u,u],zero,zero,ymm1[23,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512F-SLOW-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512F-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512F-SLOW-NEXT: vmovq %xmm1, 48(%rax) @@ -1560,7 +1560,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,8],zero,ymm0[u,u,u,u,1,9],zero,ymm0[u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm5, 96(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-SLOW-NEXT: vmovdqa %ymm4, 64(%rax) @@ -1631,7 +1631,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4],zero,ymm1[u,u,u,u,1,5],zero,ymm1[u,u,u,u,2,6],zero,ymm1[u,u,u,u,19,23],zero,ymm1[u,u,u,u,24,28],zero,ymm1[u] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 ; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm3, 96(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-FAST-NEXT: vmovdqa %ymm2, 64(%rax) @@ -3342,7 +3342,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] ; AVX512F-SLOW-NEXT: vporq %zmm7, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX512F-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] @@ -3371,7 +3371,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] ; AVX512F-SLOW-NEXT: vporq %zmm10, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 ; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm8 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm11 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u,u],zero @@ -3391,7 +3391,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm14, %zmm13 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm18 = zmm13[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm18 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm18 ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm13 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm14 @@ -3407,7 +3407,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm0[0,0,1,0,4,4,5,4] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm10 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm10 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm10 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14,u,u],zero,zero,zero,zero,ymm1[15,u,u],zero,zero,zero,zero,ymm1[16,u,u],zero,zero,zero,zero,ymm1[17,u,u],zero,zero,zero,zero,ymm1[18] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 @@ -3426,7 +3426,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm5[u,u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm7, %ymm0 @@ -3502,7 +3502,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm13, %zmm10 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm15 = zmm10[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm15 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm15 ; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm10 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[1,1,0,0,4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,1,u,1,u,0,0,u> @@ -3519,7 +3519,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7 +; AVX512F-FAST-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[0,1,14],zero,ymm2[u,u,0,1,14,15],zero,ymm2[u,u,13,2,3,16],zero,ymm2[u,u,28,29,16,17],zero,ymm2[u,u,19,28,29,18],zero @@ -3535,7 +3535,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm1[u,u,u,u,u,14],zero,ymm1[u,u,u,u,u,15],zero,ymm1[u,u,u,u,u,16],zero,ymm1[u,u,u,u,u,17],zero,ymm1[u,u,u] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[13,u,u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u] ; AVX512F-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0 @@ -3576,7 +3576,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] ; AVX512F-FAST-NEXT: vporq %zmm10, %zmm11, %zmm10 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[20],zero,ymm3[18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 @@ -3593,7 +3593,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm11 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm11 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] @@ -7623,19 +7623,19 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,0,1,0] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm3 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm3 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 ; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 ; AVX512F-SLOW-NEXT: vporq %ymm15, %ymm18, %ymm5 ; AVX512F-SLOW-NEXT: vporq %ymm19, %ymm20, %ymm6 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm16 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm16 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm16 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm26 +; AVX512F-SLOW-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm26 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm26 ; AVX512F-SLOW-NEXT: vporq %ymm21, %ymm22, %ymm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 @@ -7648,7 +7648,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm12[0,1,0,1,4,5,4,5] ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm7[0,0,1,0,4,4,5,4] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm0 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) @@ -8039,7 +8039,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm22 = mem[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm22 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm22 ; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload @@ -8061,11 +8061,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm24 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm24 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm21 +; AVX512F-FAST-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm21 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm21 ; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm0 ; AVX512F-FAST-NEXT: vpor %ymm12, %ymm15, %ymm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 @@ -8078,14 +8078,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm31[0,1,0,1,4,5,4,5] ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm5 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm17, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm2 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm2 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 ; AVX512F-FAST-NEXT: vpor %ymm1, %ymm13, %ymm1 ; AVX512F-FAST-NEXT: vpor %ymm11, %ymm14, %ymm5 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm20, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 +; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm4 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index 33f7a4e..43c9be2 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -1573,7 +1573,7 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind { ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512NOVLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512NOVLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512NOVLX-NEXT: vzeroupper ; AVX512NOVLX-NEXT: retq @@ -1582,7 +1582,7 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind { ; AVX512VLX: # %bb.0: ; AVX512VLX-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 +; AVX512VLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512VLX-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_v16i8: @@ -1811,7 +1811,7 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind { ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512NOVLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512NOVLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512NOVLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512NOVLX-NEXT: vzeroupper ; AVX512NOVLX-NEXT: retq @@ -1820,7 +1820,7 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind { ; AVX512VLX: # %bb.0: ; AVX512VLX-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 +; AVX512VLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512VLX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VLX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll index 7047d5e..c55335f 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -392,12 +392,12 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2 @@ -412,12 +412,12 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 @@ -1404,7 +1404,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind { ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512NOVLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512NOVLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512NOVLX-NEXT: retq ; @@ -1412,7 +1412,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind { ; AVX512VLX: # %bb.0: ; AVX512VLX-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; AVX512VLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; AVX512VLX-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_v32i8: @@ -1667,7 +1667,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind { ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512NOVLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512NOVLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512NOVLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512NOVLX-NEXT: retq ; @@ -1675,7 +1675,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind { ; AVX512VLX: # %bb.0: ; AVX512VLX-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; AVX512VLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; AVX512VLX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VLX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll index 6504c3e..29afbf4 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-512.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -135,15 +135,15 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm3 ; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm4 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm5 = [17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520] -; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm4 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160] +; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $6, %ymm2, %ymm4 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm6 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm7 = [18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596] -; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm7, %zmm6 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268] +; AVX512F-NEXT: vpternlogd $226, %zmm4, %zmm7, %zmm6 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4 @@ -155,12 +155,12 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm4 +; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm7, %zmm4 +; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm7, %zmm4 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3 @@ -177,15 +177,15 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm4 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520] -; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm5, %ymm4 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160] +; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $6, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm6 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm7 = [18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596] -; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm7, %ymm6 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268] +; AVX512VL-NEXT: vpternlogd $226, %ymm4, %ymm7, %ymm6 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm4 @@ -196,12 +196,12 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm5, %ymm4 +; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm7, %ymm4 +; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm7, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 @@ -754,7 +754,7 @@ define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_rotate_v64i8: @@ -766,35 +766,35 @@ define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind { ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatconstant_rotate_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_rotate_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_rotate_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VLVBMI2-NEXT: retq %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> @@ -902,7 +902,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; @@ -915,7 +915,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -923,7 +923,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -931,7 +931,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq ; @@ -939,7 +939,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: retq ; @@ -947,7 +947,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: retq %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll index 7159edc..dd10221 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll @@ -230,7 +230,7 @@ define <64 x i8> @f2(ptr %p0) { ; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: f2: @@ -517,7 +517,7 @@ define <64 x i8> @f4(ptr %p0) { ; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: f4: |