Diffstat (limited to 'llvm/test/CodeGen/X86')
44 files changed, 4608 insertions, 1883 deletions
diff --git a/llvm/test/CodeGen/X86/apx/cf.ll b/llvm/test/CodeGen/X86/apx/cf.ll index b111ae5..e52ce6c 100644 --- a/llvm/test/CodeGen/X86/apx/cf.ll +++ b/llvm/test/CodeGen/X86/apx/cf.ll @@ -194,3 +194,38 @@ entry: call void @llvm.masked.store.v1i64.p0(<1 x i64> %3, ptr %p, i32 4, <1 x i1> %0) ret void } + +define void @sink_gep(ptr %p, i1 %cond) { +; CHECK-LABEL: sink_gep: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: cfcmovnel %eax, 112(%rdi) +; CHECK-NEXT: cfcmovnel 112(%rdi), %eax +; CHECK-NEXT: movl %eax, (%rdi) +; CHECK-NEXT: retq +entry: + %0 = getelementptr i8, ptr %p, i64 112 + br label %next + +next: + %1 = bitcast i1 %cond to <1 x i1> + call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr %0, i32 1, <1 x i1> %1) + %2 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr %0, i32 1, <1 x i1> %1, <1 x i32> zeroinitializer) + store <1 x i32> %2, ptr %p, align 4 + ret void +} + +define void @xor_cond(ptr %p, i1 %cond) { +; CHECK-LABEL: xor_cond: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: cfcmovel %eax, (%rdi) +; CHECK-NEXT: retq +entry: + %0 = xor i1 %cond, true + %1 = insertelement <1 x i1> zeroinitializer, i1 %0, i64 0 + call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr %p, i32 1, <1 x i1> %1) + ret void +} diff --git a/llvm/test/CodeGen/X86/avg-mask.ll b/llvm/test/CodeGen/X86/avg-mask.ll index b148cd3..e886639 100644 --- a/llvm/test/CodeGen/X86/avg-mask.ll +++ b/llvm/test/CodeGen/X86/avg-mask.ll @@ -177,11 +177,11 @@ define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwin ; AVX512F-NEXT: shrq $32, %rdi ; AVX512F-NEXT: shrq $48, %rax ; AVX512F-NEXT: shrl $16, %ecx -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpavgb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: kmovw %eax, %k3 ; AVX512F-NEXT: kmovw %edi, %k4 @@ -364,11 +364,11 @@ define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nou ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: shrl $16, %edi -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpavgw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: kmovw %edi, %k2 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll index ba2cacc..2f86499 100644 --- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll +++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll @@ -1974,9 +1974,8 @@ define void @bcast_unfold_fmax_v4f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB60_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vmaxps %xmm0, %xmm1, 
%xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpnltps 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovups %xmm0, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB60_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2007,9 +2006,8 @@ define void @bcast_unfold_fmax_v8f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB61_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1 -; CHECK-NEXT: vmaxps %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpnltps 4096(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovups %ymm0, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB61_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2041,9 +2039,8 @@ define void @bcast_unfold_fmax_v16f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB62_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1 -; CHECK-NEXT: vmaxps %zmm0, %zmm1, %zmm1 -; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpnltps 4096(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovups %zmm0, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB62_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2076,9 +2073,8 @@ define void @bcast_unfold_fmax_v2f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB63_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1 -; CHECK-NEXT: vmaxpd %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpnltpd 8192(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovupd %xmm0, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB63_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2109,9 +2105,8 @@ define void @bcast_unfold_fmax_v4f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB64_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vmaxpd %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpnltpd 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovupd %ymm0, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB64_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2143,9 +2138,8 @@ define void @bcast_unfold_fmax_v8f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB65_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1 -; CHECK-NEXT: vmaxpd %zmm0, %zmm1, %zmm1 -; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpnltpd 8192(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovupd %zmm0, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB65_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2177,9 +2171,8 @@ define void @bcast_unfold_fmin_v4f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB66_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vminps %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpngtps 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovups %xmm0, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB66_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2210,9 +2203,8 @@ define void @bcast_unfold_fmin_v8f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB67_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1 -; CHECK-NEXT: vminps %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vmovups 
%ymm1, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpngtps 4096(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovups %ymm0, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB67_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2244,9 +2236,8 @@ define void @bcast_unfold_fmin_v16f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB68_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1 -; CHECK-NEXT: vminps %zmm0, %zmm1, %zmm1 -; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpngtps 4096(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovups %zmm0, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB68_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2279,9 +2270,8 @@ define void @bcast_unfold_fmin_v2f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB69_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1 -; CHECK-NEXT: vminpd %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpngtpd 8192(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovupd %xmm0, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB69_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2312,9 +2302,8 @@ define void @bcast_unfold_fmin_v4f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB70_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vminpd %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpngtpd 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovupd %ymm0, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB70_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2346,9 +2335,8 @@ define void @bcast_unfold_fmin_v8f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB71_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1 -; CHECK-NEXT: vminpd %zmm0, %zmm1, %zmm1 -; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpngtpd 8192(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovupd %zmm0, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB71_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3161,13 +3149,12 @@ define void @bcast_unfold_pcmpgt_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB96_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpltd 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB96_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3195,13 +3182,12 @@ define void @bcast_unfold_pcmpgt_v8i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB97_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1 -; CHECK-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) +; 
CHECK-NEXT: vpcmpltd 4096(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu32 %ymm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB97_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3230,13 +3216,12 @@ define void @bcast_unfold_pcmpgt_v16i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB98_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1 -; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpltd 4096(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovdqu32 %zmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB98_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3265,13 +3250,12 @@ define void @bcast_unfold_pcmpgt_v2i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB99_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1 -; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] -; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpltq 8192(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu64 %xmm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB99_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3299,13 +3283,12 @@ define void @bcast_unfold_pcmpgt_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB100_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vpcmpgtq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpltq 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB100_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3334,13 +3317,12 @@ define void @bcast_unfold_pcmpgt_v8i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB101_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1 -; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpltq 8192(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB101_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3369,13 +3351,12 @@ define void @bcast_unfold_pcmpeq_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; 
CHECK-NEXT: .LBB102_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpeqd 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB102_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3403,13 +3384,12 @@ define void @bcast_unfold_pcmpeq_v8i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB103_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1 -; CHECK-NEXT: vpcmpeqd %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpeqd 4096(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu32 %ymm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB103_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3438,13 +3418,12 @@ define void @bcast_unfold_pcmpeq_v16i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB104_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1 -; CHECK-NEXT: vpcmpeqd %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpeqd 4096(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovdqu32 %zmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB104_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3473,13 +3452,12 @@ define void @bcast_unfold_pcmpeq_v2i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB105_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1 -; CHECK-NEXT: vpcmpeqq %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] -; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpeqq 8192(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu64 %xmm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB105_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3507,13 +3485,12 @@ define void @bcast_unfold_pcmpeq_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB106_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vpcmpeqq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpeqq 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB106_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ 
-3542,13 +3519,12 @@ define void @bcast_unfold_pcmpeq_v8i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB107_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1 -; CHECK-NEXT: vpcmpeqq %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpeqq 8192(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB107_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3577,13 +3553,12 @@ define void @bcast_unfold_pcmp_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB108_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 -; CHECK-NEXT: vpcmpltd %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpgtd (%rdi,%rax,4), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $4, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: jg .LBB108_1 @@ -3612,13 +3587,12 @@ define void @bcast_unfold_pcmp_v8i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB109_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1 -; CHECK-NEXT: vpcmpltd %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpgtd (%rdi,%rax,4), %ymm0, %k1 +; CHECK-NEXT: vmovdqu32 %ymm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $8, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: jg .LBB109_1 @@ -3648,13 +3622,12 @@ define void @bcast_unfold_pcmp_v16i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB110_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1 -; CHECK-NEXT: vpcmpltd %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpgtd (%rdi,%rax,4), %zmm0, %k1 +; CHECK-NEXT: vmovdqu32 %zmm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: jg .LBB110_1 @@ -3684,13 +3657,12 @@ define void @bcast_unfold_pcmp_v2i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB111_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1 -; CHECK-NEXT: vpcmpltq %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] -; 
CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpgtq (%rdi,%rax,8), %xmm0, %k1 +; CHECK-NEXT: vmovdqu64 %xmm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $2, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: jg .LBB111_1 @@ -3719,13 +3691,12 @@ define void @bcast_unfold_pcmp_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB112_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1 -; CHECK-NEXT: vpcmpltq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpgtq (%rdi,%rax,8), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $4, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: jg .LBB112_1 @@ -3755,13 +3726,12 @@ define void @bcast_unfold_pcmp_v8i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB113_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1 -; CHECK-NEXT: vpcmpltq %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpgtq (%rdi,%rax,8), %zmm0, %k1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $8, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: jg .LBB113_1 @@ -3791,13 +3761,12 @@ define void @bcast_unfold_pcmpu_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB114_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 -; CHECK-NEXT: vpcmpltud %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpnleud (%rdi,%rax,4), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $4, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB114_1 @@ -3826,13 +3795,12 @@ define void @bcast_unfold_pcmpu_v8i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB115_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1 -; CHECK-NEXT: vpcmpltud %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpnleud (%rdi,%rax,4), %ymm0, %k1 +; CHECK-NEXT: vmovdqu32 %ymm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $8, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB115_1 @@ -3862,13 +3830,12 @@ define void @bcast_unfold_pcmpu_v16i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; 
CHECK-NEXT: .LBB116_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1 -; CHECK-NEXT: vpcmpltud %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpnleud (%rdi,%rax,4), %zmm0, %k1 +; CHECK-NEXT: vmovdqu32 %zmm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB116_1 @@ -3898,13 +3865,12 @@ define void @bcast_unfold_pcmpu_v2i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2] +; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB117_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1 -; CHECK-NEXT: vpcmpltuq %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] -; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpnleuq (%rdi,%rax,8), %xmm0, %k1 +; CHECK-NEXT: vmovdqu64 %xmm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $2, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB117_1 @@ -3933,13 +3899,12 @@ define void @bcast_unfold_pcmpu_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB118_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1 -; CHECK-NEXT: vpcmpltuq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpnleuq (%rdi,%rax,8), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $4, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB118_1 @@ -3969,13 +3934,12 @@ define void @bcast_unfold_pcmpu_v8i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2] +; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB119_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1 -; CHECK-NEXT: vpcmpltuq %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpnleuq (%rdi,%rax,8), %zmm0, %k1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $8, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB119_1 @@ -4009,10 +3973,8 @@ define void @bcast_unfold_cmp_v4f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB120_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm2 -; CHECK-NEXT: vcmpltps %xmm0, %xmm2, %k1 -; CHECK-NEXT: vblendmps %xmm2, %xmm1, %xmm2 {%k1} -; CHECK-NEXT: vmovups %xmm2, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpngtps 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB120_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4044,10 +4006,8 @@ define void @bcast_unfold_cmp_v8f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB121_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm2 -; CHECK-NEXT: vcmpltps %ymm0, %ymm2, %k1 
-; CHECK-NEXT: vblendmps %ymm2, %ymm1, %ymm2 {%k1} -; CHECK-NEXT: vmovups %ymm2, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpngtps 4096(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB121_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4080,10 +4040,8 @@ define void @bcast_unfold_cmp_v16f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB122_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm2 -; CHECK-NEXT: vcmpltps %zmm0, %zmm2, %k1 -; CHECK-NEXT: vblendmps %zmm2, %zmm1, %zmm2 {%k1} -; CHECK-NEXT: vmovups %zmm2, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpngtps 4096(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB122_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4118,10 +4076,8 @@ define void @bcast_unfold_cmp_v2f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB123_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm2 -; CHECK-NEXT: vcmpltpd %xmm0, %xmm2, %k1 -; CHECK-NEXT: vblendmpd %xmm2, %xmm1, %xmm2 {%k1} -; CHECK-NEXT: vmovupd %xmm2, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpngtpd 8192(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB123_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4153,10 +4109,8 @@ define void @bcast_unfold_cmp_v4f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB124_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm2 -; CHECK-NEXT: vcmpltpd %ymm0, %ymm2, %k1 -; CHECK-NEXT: vblendmpd %ymm2, %ymm1, %ymm2 {%k1} -; CHECK-NEXT: vmovupd %ymm2, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpngtpd 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB124_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4189,10 +4143,8 @@ define void @bcast_unfold_cmp_v8f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB125_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm2 -; CHECK-NEXT: vcmpltpd %zmm0, %zmm2, %k1 -; CHECK-NEXT: vblendmpd %zmm2, %zmm1, %zmm2 {%k1} -; CHECK-NEXT: vmovupd %zmm2, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpngtpd 8192(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB125_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4254,13 +4206,12 @@ define void @bcast_unfold_ptestm_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB127_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vptestmd %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vptestmd 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB127_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4289,13 +4240,12 @@ define void @bcast_unfold_ptestnm_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; 
CHECK-NEXT: .LBB128_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vptestnmd %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vptestnmd 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB128_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4324,13 +4274,12 @@ define void @bcast_unfold_ptestm_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB129_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vptestmq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vptestmq 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB129_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4360,13 +4309,12 @@ define void @bcast_unfold_ptestnm_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB130_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vptestnmq 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB130_1 ; CHECK-NEXT: # %bb.2: # %bb10 diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll index c60d9a3..1a712ff 100644 --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -6,7 +6,8 @@ define <8 x i16> @zext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone { ; KNL-LABEL: zext_8x8mem_to_8x16: ; KNL: # %bb.0: -; KNL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; KNL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; KNL-NEXT: vpsllw $15, %xmm0, %xmm0 ; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 ; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -21,7 +22,8 @@ define <8 x i16> @zext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone ; ; AVX512DQNOBW-LABEL: zext_8x8mem_to_8x16: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512DQNOBW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQNOBW-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX512DQNOBW-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX512DQNOBW-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -35,7 +37,8 @@ define <8 x i16> @zext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone define <8 x i16> 
@sext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone { ; KNL-LABEL: sext_8x8mem_to_8x16: ; KNL: # %bb.0: -; KNL-NEXT: vpmovsxbw (%rdi), %xmm1 +; KNL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; KNL-NEXT: vpmovsxbw %xmm1, %xmm1 ; KNL-NEXT: vpsllw $15, %xmm0, %xmm0 ; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 ; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -50,7 +53,8 @@ define <8 x i16> @sext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone ; ; AVX512DQNOBW-LABEL: sext_8x8mem_to_8x16: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %xmm1 +; AVX512DQNOBW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512DQNOBW-NEXT: vpmovsxbw %xmm1, %xmm1 ; AVX512DQNOBW-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX512DQNOBW-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX512DQNOBW-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -208,8 +212,10 @@ define <32 x i16> @zext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; KNL-NEXT: vmovdqu (%rdi), %ymm2 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 @@ -231,8 +237,10 @@ define <32 x i16> @zext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn ; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; 
AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512DQNOBW-NEXT: vmovdqu (%rdi), %ymm2 +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm0, %ymm0 @@ -253,8 +261,10 @@ define <32 x i16> @sext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm2 -; KNL-NEXT: vpmovsxbw (%rdi), %ymm3 +; KNL-NEXT: vmovdqu (%rdi), %ymm2 +; KNL-NEXT: vpmovsxbw %xmm2, %ymm3 +; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2 +; KNL-NEXT: vpmovsxbw %xmm2, %ymm2 ; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 @@ -276,8 +286,10 @@ define <32 x i16> @sext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn ; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512DQNOBW-NEXT: vpmovsxbw 16(%rdi), %ymm2 -; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %ymm3 +; AVX512DQNOBW-NEXT: vmovdqu (%rdi), %ymm2 +; AVX512DQNOBW-NEXT: vpmovsxbw %xmm2, %ymm3 +; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQNOBW-NEXT: vpmovsxbw %xmm2, %ymm2 ; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-shuffle-fma.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-shuffle-fma.ll index f02d1164..6d22f66 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-shuffle-fma.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-shuffle-fma.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=F16C ; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefix=FP16 -define <2 x half> @foo(<2 x half> %0) "unsafe-fp-math"="true" nounwind { +define <2 x half> @foo(<2 x half> %0) nounwind { ; AVX2-LABEL: foo: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $40, %rsp diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll new file mode 100644 index 0000000..1136287 --- /dev/null +++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll @@ -0,0 +1,43 @@ +;; Test if temporary labels are generated for each indirect callsite with a callee_type metadata. +;; Test if the .callgraph section contains the MD5 hash of callee type ids generated from +;; generalized type id strings. + +; RUN: llc -mtriple=x86_64-unknown-linux --call-graph-section -o - < %s | FileCheck %s + +; CHECK: ball: +; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: +define ptr @ball() { +entry: + %fp_foo_val = load ptr, ptr null, align 8 + ; CHECK: [[LABEL_TMP0:\.L.*]]: + call void (...) %fp_foo_val(), !callee_type !0 + %fp_bar_val = load ptr, ptr null, align 8 + ; CHECK: [[LABEL_TMP1:\.L.*]]: + %call_fp_bar = call i32 %fp_bar_val(i8 0), !callee_type !2 + %fp_baz_val = load ptr, ptr null, align 8 + ; CHECK: [[LABEL_TMP2:\.L.*]]: + %call_fp_baz = call ptr %fp_baz_val(ptr null), !callee_type !4 + ret ptr %call_fp_baz +} + +; CHECK: .section .callgraph,"o",@progbits,.text + +; CHECK-NEXT: .quad 0 +; CHECK-NEXT: .quad [[LABEL_FUNC]] +; CHECK-NEXT: .quad 1 +; CHECK-NEXT: .quad 3 +!0 = !{!1} +!1 = !{i64 0, !"_ZTSFvE.generalized"} +;; Test for MD5 hash of _ZTSFvE.generalized and the generated temporary callsite label. +; CHECK-NEXT: .quad 4524972987496481828 +; CHECK-NEXT: .quad [[LABEL_TMP0]] +!2 = !{!3} +!3 = !{i64 0, !"_ZTSFicE.generalized"} +;; Test for MD5 hash of _ZTSFicE.generalized and the generated temporary callsite label. +; CHECK-NEXT: .quad 3498816979441845844 +; CHECK-NEXT: .quad [[LABEL_TMP1]] +!4 = !{!5} +!5 = !{i64 0, !"_ZTSFPvS_E.generalized"} +;; Test for MD5 hash of _ZTSFPvS_E.generalized and the generated temporary callsite label. +; CHECK-NEXT: .quad 8646233951371320954 +; CHECK-NEXT: .quad [[LABEL_TMP2]] diff --git a/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll b/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll new file mode 100644 index 0000000..fa14a98 --- /dev/null +++ b/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll @@ -0,0 +1,34 @@ +;; Tests that we store the type identifiers in .callgraph section of the object file for tailcalls. + +; RUN: llc -mtriple=x86_64-unknown-linux --call-graph-section -filetype=obj -o - < %s | \ +; RUN: llvm-readelf -x .callgraph - | FileCheck %s + +define i32 @check_tailcall(ptr %func, i8 %x) !type !0 { +entry: + %call = tail call i32 %func(i8 signext %x), !callee_type !1 + ret i32 %call +} + +define i32 @main(i32 %argc) !type !3 { +entry: + %andop = and i32 %argc, 1 + %cmp = icmp eq i32 %andop, 0 + %foo.bar = select i1 %cmp, ptr @foo, ptr @bar + %call.i = tail call i32 %foo.bar(i8 signext 97), !callee_type !1 + ret i32 %call.i +} + +declare !type !2 i32 @foo(i8 signext) + +declare !type !2 i32 @bar(i8 signext) + +;; Check that the numeric type id (md5 hash) for the below type ids are emitted +;; to the callgraph section. 
+ +; CHECK: Hex dump of section '.callgraph': + +!0 = !{i64 0, !"_ZTSFiPvcE.generalized"} +!1 = !{!2} +; CHECK-DAG: 5486bc59 814b8e30 +!2 = !{i64 0, !"_ZTSFicE.generalized"} +!3 = !{i64 0, !"_ZTSFiiE.generalized"} diff --git a/llvm/test/CodeGen/X86/call-graph-section.ll b/llvm/test/CodeGen/X86/call-graph-section.ll new file mode 100644 index 0000000..4a9840e --- /dev/null +++ b/llvm/test/CodeGen/X86/call-graph-section.ll @@ -0,0 +1,38 @@ +;; Tests that we store the type identifiers in .callgraph section of the object file. + +; RUN: llc -mtriple=x86_64-unknown-linux --call-graph-section -filetype=obj -o - < %s | \ +; RUN: llvm-readelf -x .callgraph - | FileCheck %s + +declare !type !0 void @foo() + +declare !type !1 i32 @bar(i8) + +declare !type !2 ptr @baz(ptr) + +define void @main() { +entry: + %a = alloca i8, align 1 + %fp_foo_val = load ptr, ptr null, align 8 + call void (...) %fp_foo_val(), !callee_type !1 + %fp_bar_val = load ptr, ptr null, align 8 + %param = trunc i64 0 to i8 + %call_fp_bar = call i32 %fp_bar_val(i8 signext %param), !callee_type !3 + %fp_baz_val = load ptr, ptr null, align 8 + %call_fp_baz = call ptr %fp_baz_val(ptr %a), !callee_type !4 + ret void +} + +;; Check that the numeric type id (md5 hash) for the below type ids are emitted +;; to the callgraph section. + +; CHECK: Hex dump of section '.callgraph': + +; CHECK-DAG: 2444f731 f5eecb3e +!0 = !{i64 0, !"_ZTSFvE.generalized"} +!1 = !{!0} +; CHECK-DAG: 5486bc59 814b8e30 +!2 = !{i64 0, !"_ZTSFicE.generalized"} +!3 = !{!2} +; CHECK-DAG: 7ade6814 f897fd77 +!4 = !{!5} +!5 = !{i64 0, !"_ZTSFPvS_E.generalized"} diff --git a/llvm/test/CodeGen/X86/calleetypeid-directcall-mismatched.ll b/llvm/test/CodeGen/X86/calleetypeid-directcall-mismatched.ll new file mode 100644 index 0000000..7881ea7 --- /dev/null +++ b/llvm/test/CodeGen/X86/calleetypeid-directcall-mismatched.ll @@ -0,0 +1,32 @@ +;; Tests that callee_type metadata attached to direct call sites are safely ignored. + +; RUN: llc --call-graph-section -mtriple x86_64-linux-gnu < %s -stop-after=finalize-isel -o - | FileCheck --match-full-lines %s + +;; Test that `calleeTypeIds` field is not present in `callSites` +; CHECK-LABEL: callSites: +; CHECK-NEXT: - { bb: {{[0-9]+}}, offset: {{[0-9]+}}, fwdArgRegs: [] } +; CHECK-NEXT: - { bb: {{[0-9]+}}, offset: {{[0-9]+}}, fwdArgRegs: [] } +; CHECK-NEXT: - { bb: {{[0-9]+}}, offset: {{[0-9]+}}, fwdArgRegs: [] } +define i32 @foo(i32 %x, i32 %y) !type !0 { +entry: + ;; Call instruction with accurate callee_type. + ;; callee_type should be dropped seemlessly. + %call = call i32 @fizz(i32 %x, i32 %y), !callee_type !1 + ;; Call instruction with mismatched callee_type. + ;; callee_type should be dropped seemlessly without errors. + %call1 = call i32 @fizz(i32 %x, i32 %y), !callee_type !3 + %add = add nsw i32 %call, %call1 + ;; Call instruction with mismatched callee_type. + ;; callee_type should be dropped seemlessly without errors. 
+ %call2 = call i32 @fizz(i32 %add, i32 %y), !callee_type !3 + %sub = sub nsw i32 %add, %call2 + ret i32 %sub +} + +declare !type !2 i32 @fizz(i32, i32) + +!0 = !{i64 0, !"_ZTSFiiiiE.generalized"} +!1 = !{!2} +!2 = !{i64 0, !"_ZTSFiiiE.generalized"} +!3 = !{!4} +!4 = !{i64 0, !"_ZTSFicE.generalized"} diff --git a/llvm/test/CodeGen/X86/callsite-emit-calleetypeid-tailcall.ll b/llvm/test/CodeGen/X86/callsite-emit-calleetypeid-tailcall.ll new file mode 100644 index 0000000..8f6b7a6 --- /dev/null +++ b/llvm/test/CodeGen/X86/callsite-emit-calleetypeid-tailcall.ll @@ -0,0 +1,19 @@ +;; Tests that call site callee type ids can be extracted and set from +;; callee_type metadata for indirect tail calls. + +;; Verify the exact calleeTypeIds value to ensure it is not garbage but the value +;; computed as the type id from the callee_type metadata. +; RUN: llc --call-graph-section -mtriple=x86_64-unknown-linux < %s -stop-after=finalize-isel -o - | FileCheck --match-full-lines %s + +define i32 @check_tailcall(ptr %func, i8 %x) !type !0 { +entry: + ; CHECK: callSites: + ; CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [], calleeTypeIds: + ; CHECK-NEXT: [ 3498816979441845844 ] } + %call = tail call i32 %func(i8 signext %x), !callee_type !1 + ret i32 %call +} + +!0 = !{i64 0, !"_ZTSFiPvcE.generalized"} +!1 = !{!2} +!2 = !{i64 0, !"_ZTSFicE.generalized"} diff --git a/llvm/test/CodeGen/X86/callsite-emit-calleetypeid.ll b/llvm/test/CodeGen/X86/callsite-emit-calleetypeid.ll new file mode 100644 index 0000000..e97a6ac --- /dev/null +++ b/llvm/test/CodeGen/X86/callsite-emit-calleetypeid.ll @@ -0,0 +1,20 @@ +;; Tests that call site callee type ids can be extracted and set from +;; callee_type metadata. + +;; Verify the exact calleeTypeIds value to ensure it is not garbage but the value +;; computed as the type id from the callee_type metadata. 
+; RUN: llc --call-graph-section -mtriple=x86_64-unknown-linux < %s -stop-after=finalize-isel -o - | FileCheck --match-full-lines %s + +; CHECK: name: main +; CHECK: callSites: +; CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [], calleeTypeIds: +; CHECK-NEXT: [ 7854600665770582568 ] } +define i32 @main() { +entry: + %fn = load ptr, ptr null, align 8 + call void %fn(i8 0), !callee_type !0 + ret i32 0 +} + +!0 = !{!1} +!1 = !{i64 0, !"_ZTSFvcE.generalized"} diff --git a/llvm/test/CodeGen/X86/cmp.ll b/llvm/test/CodeGen/X86/cmp.ll index f3e1417..ed3f0e0 100644 --- a/llvm/test/CodeGen/X86/cmp.ll +++ b/llvm/test/CodeGen/X86/cmp.ll @@ -956,3 +956,185 @@ define i1 @fold_test_and_with_chain(ptr %x, ptr %y, i32 %z) { store i32 %z, ptr %y ret i1 %c } + +define i1 @sext_mask(i32 %a) { +; CHECK-LABEL: sext_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpl $-523, %edi # encoding: [0x81,0xff,0xf5,0xfd,0xff,0xff] +; CHECK-NEXT: # imm = 0xFDF5 +; CHECK-NEXT: setl %al # encoding: [0x0f,0x9c,0xc0] +; CHECK-NEXT: retq # encoding: [0xc3] + %a64 = sext i32 %a to i64 + %v1 = icmp slt i64 %a64, -523 + ret i1 %v1 +} + +define i1 @sext_i9_mask(i9 %a) { +; NO-NDD-LABEL: sext_i9_mask: +; NO-NDD: # %bb.0: +; NO-NDD-NEXT: # kill: def $edi killed $edi def $rdi +; NO-NDD-NEXT: shlq $55, %rdi # encoding: [0x48,0xc1,0xe7,0x37] +; NO-NDD-NEXT: sarq $55, %rdi # encoding: [0x48,0xc1,0xff,0x37] +; NO-NDD-NEXT: cmpl $-522, %edi # encoding: [0x81,0xff,0xf6,0xfd,0xff,0xff] +; NO-NDD-NEXT: # imm = 0xFDF6 +; NO-NDD-NEXT: setl %al # encoding: [0x0f,0x9c,0xc0] +; NO-NDD-NEXT: retq # encoding: [0xc3] +; +; NDD-LABEL: sext_i9_mask: +; NDD: # %bb.0: +; NDD-NEXT: # kill: def $edi killed $edi def $rdi +; NDD-NEXT: shlq $55, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xe7,0x37] +; NDD-NEXT: sarq $55, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xff,0x37] +; NDD-NEXT: cmpl $-522, %edi # encoding: [0x81,0xff,0xf6,0xfd,0xff,0xff] +; NDD-NEXT: # imm = 0xFDF6 +; NDD-NEXT: setl %al # encoding: [0x0f,0x9c,0xc0] +; NDD-NEXT: retq # encoding: [0xc3] + %a64 = sext i9 %a to i64 + %v1 = icmp slt i64 %a64, -522 + ret i1 %v1 +} + +define i1 @sext_i32_mask(i32 %a) { +; CHECK-LABEL: sext_i32_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpl $-522, %edi # encoding: [0x81,0xff,0xf6,0xfd,0xff,0xff] +; CHECK-NEXT: # imm = 0xFDF6 +; CHECK-NEXT: setl %al # encoding: [0x0f,0x9c,0xc0] +; CHECK-NEXT: retq # encoding: [0xc3] + %a64 = sext i32 %a to i64 + %v1 = icmp slt i64 %a64, -522 + ret i1 %v1 +} + +define i1 @i40(i40 %a) { +; NO-NDD-LABEL: i40: +; NO-NDD: # %bb.0: +; NO-NDD-NEXT: shlq $24, %rdi # encoding: [0x48,0xc1,0xe7,0x18] +; NO-NDD-NEXT: sarq $24, %rdi # encoding: [0x48,0xc1,0xff,0x18] +; NO-NDD-NEXT: cmpq $-521, %rdi # encoding: [0x48,0x81,0xff,0xf7,0xfd,0xff,0xff] +; NO-NDD-NEXT: # imm = 0xFDF7 +; NO-NDD-NEXT: setl %al # encoding: [0x0f,0x9c,0xc0] +; NO-NDD-NEXT: retq # encoding: [0xc3] +; +; NDD-LABEL: i40: +; NDD: # %bb.0: +; NDD-NEXT: shlq $24, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xe7,0x18] +; NDD-NEXT: sarq $24, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xff,0x18] +; NDD-NEXT: cmpq $-521, %rdi # encoding: [0x48,0x81,0xff,0xf7,0xfd,0xff,0xff] +; NDD-NEXT: # imm = 0xFDF7 +; NDD-NEXT: setl %al # encoding: [0x0f,0x9c,0xc0] +; NDD-NEXT: retq # encoding: [0xc3] + %a64 = sext i40 %a to i64 + %v1 = icmp slt i64 %a64, -521 + ret i1 %v1 +} + +define i1 @sext_i9_mask_sgt(i9 %a) { +; NO-NDD-LABEL: sext_i9_mask_sgt: +; NO-NDD: # %bb.0: +; NO-NDD-NEXT: # kill: def $edi killed $edi def $rdi +; NO-NDD-NEXT: 
shlq $55, %rdi # encoding: [0x48,0xc1,0xe7,0x37] +; NO-NDD-NEXT: sarq $55, %rdi # encoding: [0x48,0xc1,0xff,0x37] +; NO-NDD-NEXT: cmpl $-520, %edi # encoding: [0x81,0xff,0xf8,0xfd,0xff,0xff] +; NO-NDD-NEXT: # imm = 0xFDF8 +; NO-NDD-NEXT: setge %al # encoding: [0x0f,0x9d,0xc0] +; NO-NDD-NEXT: retq # encoding: [0xc3] +; +; NDD-LABEL: sext_i9_mask_sgt: +; NDD: # %bb.0: +; NDD-NEXT: # kill: def $edi killed $edi def $rdi +; NDD-NEXT: shlq $55, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xe7,0x37] +; NDD-NEXT: sarq $55, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xff,0x37] +; NDD-NEXT: cmpl $-520, %edi # encoding: [0x81,0xff,0xf8,0xfd,0xff,0xff] +; NDD-NEXT: # imm = 0xFDF8 +; NDD-NEXT: setge %al # encoding: [0x0f,0x9d,0xc0] +; NDD-NEXT: retq # encoding: [0xc3] + %a64 = sext i9 %a to i64 + %v1 = icmp sgt i64 %a64, -521 + ret i1 %v1 +} + +define i1 @sext_i32_mask_sgt(i32 %a) { +; CHECK-LABEL: sext_i32_mask_sgt: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpl $-521, %edi # encoding: [0x81,0xff,0xf7,0xfd,0xff,0xff] +; CHECK-NEXT: # imm = 0xFDF7 +; CHECK-NEXT: setge %al # encoding: [0x0f,0x9d,0xc0] +; CHECK-NEXT: retq # encoding: [0xc3] + %a64 = sext i32 %a to i64 + %v1 = icmp sgt i64 %a64, -522 + ret i1 %v1 +} + +define i1 @i40_sge(i40 %a) { +; NO-NDD-LABEL: i40_sge: +; NO-NDD: # %bb.0: +; NO-NDD-NEXT: shlq $24, %rdi # encoding: [0x48,0xc1,0xe7,0x18] +; NO-NDD-NEXT: sarq $24, %rdi # encoding: [0x48,0xc1,0xff,0x18] +; NO-NDD-NEXT: cmpq $-521, %rdi # encoding: [0x48,0x81,0xff,0xf7,0xfd,0xff,0xff] +; NO-NDD-NEXT: # imm = 0xFDF7 +; NO-NDD-NEXT: setge %al # encoding: [0x0f,0x9d,0xc0] +; NO-NDD-NEXT: retq # encoding: [0xc3] +; +; NDD-LABEL: i40_sge: +; NDD: # %bb.0: +; NDD-NEXT: shlq $24, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xe7,0x18] +; NDD-NEXT: sarq $24, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xff,0x18] +; NDD-NEXT: cmpq $-521, %rdi # encoding: [0x48,0x81,0xff,0xf7,0xfd,0xff,0xff] +; NDD-NEXT: # imm = 0xFDF7 +; NDD-NEXT: setge %al # encoding: [0x0f,0x9d,0xc0] +; NDD-NEXT: retq # encoding: [0xc3] + %a64 = sext i40 %a to i64 + %v1 = icmp sge i64 %a64, -521 + ret i1 %v1 +} + +define i1 @i40_eq(i40 %a) { +; NO-NDD-LABEL: i40_eq: +; NO-NDD: # %bb.0: +; NO-NDD-NEXT: movabsq $1099511627775, %rax # encoding: [0x48,0xb8,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00] +; NO-NDD-NEXT: # imm = 0xFFFFFFFFFF +; NO-NDD-NEXT: andq %rdi, %rax # encoding: [0x48,0x21,0xf8] +; NO-NDD-NEXT: movabsq $1099511627255, %rcx # encoding: [0x48,0xb9,0xf7,0xfd,0xff,0xff,0xff,0x00,0x00,0x00] +; NO-NDD-NEXT: # imm = 0xFFFFFFFDF7 +; NO-NDD-NEXT: cmpq %rcx, %rax # encoding: [0x48,0x39,0xc8] +; NO-NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NO-NDD-NEXT: retq # encoding: [0xc3] +; +; NDD-LABEL: i40_eq: +; NDD: # %bb.0: +; NDD-NEXT: movabsq $1099511627775, %rax # encoding: [0x48,0xb8,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00] +; NDD-NEXT: # imm = 0xFFFFFFFFFF +; NDD-NEXT: andq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x21,0xf8] +; NDD-NEXT: movabsq $1099511627255, %rcx # encoding: [0x48,0xb9,0xf7,0xfd,0xff,0xff,0xff,0x00,0x00,0x00] +; NDD-NEXT: # imm = 0xFFFFFFFDF7 +; NDD-NEXT: cmpq %rcx, %rax # encoding: [0x48,0x39,0xc8] +; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NDD-NEXT: retq # encoding: [0xc3] + %a64 = sext i40 %a to i64 + %v1 = icmp eq i64 %a64, -521 + ret i1 %v1 +} + +define i1 @i40_ult(i40 %a) { +; NO-NDD-LABEL: i40_ult: +; NO-NDD: # %bb.0: +; NO-NDD-NEXT: shlq $24, %rdi # encoding: [0x48,0xc1,0xe7,0x18] +; NO-NDD-NEXT: sarq $24, %rdi # encoding: 
[0x48,0xc1,0xff,0x18] +; NO-NDD-NEXT: cmpq $-521, %rdi # encoding: [0x48,0x81,0xff,0xf7,0xfd,0xff,0xff] +; NO-NDD-NEXT: # imm = 0xFDF7 +; NO-NDD-NEXT: setb %al # encoding: [0x0f,0x92,0xc0] +; NO-NDD-NEXT: retq # encoding: [0xc3] +; +; NDD-LABEL: i40_ult: +; NDD: # %bb.0: +; NDD-NEXT: shlq $24, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xe7,0x18] +; NDD-NEXT: sarq $24, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xff,0x18] +; NDD-NEXT: cmpq $-521, %rdi # encoding: [0x48,0x81,0xff,0xf7,0xfd,0xff,0xff] +; NDD-NEXT: # imm = 0xFDF7 +; NDD-NEXT: setb %al # encoding: [0x0f,0x92,0xc0] +; NDD-NEXT: retq # encoding: [0xc3] + %a64 = sext i40 %a to i64 + %v1 = icmp ult i64 %a64, -521 + ret i1 %v1 +} diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 1ae1d61..98187d6 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -2201,9 +2201,9 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE41-NEXT: psraw $8, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psllw $7, %xmm3 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5],xmm3[6],xmm0[7] +; SSE41-NEXT: paddw %xmm0, %xmm3 +; SSE41-NEXT: psllw $7, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7] ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE41-NEXT: psraw $8, %xmm2 @@ -2234,9 +2234,9 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6],xmm2[7] +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 +; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7] ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/combine-storetomstore.ll b/llvm/test/CodeGen/X86/combine-storetomstore.ll new file mode 100644 index 0000000..c18c89d --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-storetomstore.ll @@ -0,0 +1,1540 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s -check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s -check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s -check-prefix=AVX512 + +define void @test_masked_store_success_v4i8(<4 x i8> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: 
vpsllw $7, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovd %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovd %xmm1, (%rdi) +; AVX512-NEXT: retq + %load = load <4 x i8>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i8> %x, <4 x i8> %load + store <4 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4i16(<4 x i16> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,u,4,u,8,u,12,u,8,u,12,u,12,u,14,u] +; AVX-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,u,4,u,8,u,12,u,8,u,12,u,12,u,14,u] +; AVX2-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX2-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovq %xmm1, (%rdi) +; AVX512-NEXT: retq + %load = load <4 x i16>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i16> %x, <4 x i16> %load + store <4 x i16> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4i32(<4 x i32> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} +; AVX512-NEXT: retq + %load = load <4 x i32>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %load + store <4 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4i64(<4 x i64> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v4i64: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vpmovsxdq %xmm1, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vpmaskmovq %ymm0, %ymm1, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, 
%k1 +; AVX512-NEXT: vmovdqa64 %ymm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <4 x i64>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %load + store <4 x i64> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4f16(<4 x half> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v4f16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlq $48, %xmm0, %xmm2 +; AVX-NEXT: vpextrw $0, %xmm2, %edx +; AVX-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX-NEXT: vpextrw $0, %xmm2, %ecx +; AVX-NEXT: movzwl 2(%rdi), %eax +; AVX-NEXT: vpextrb $4, %xmm1, %esi +; AVX-NEXT: testb $1, %sil +; AVX-NEXT: cmovnel %ecx, %eax +; AVX-NEXT: vpextrb $8, %xmm1, %ecx +; AVX-NEXT: testb $1, %cl +; AVX-NEXT: jne .LBB4_1 +; AVX-NEXT: # %bb.2: +; AVX-NEXT: movl 4(%rdi), %ecx +; AVX-NEXT: jmp .LBB4_3 +; AVX-NEXT: .LBB4_1: +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX-NEXT: vpextrw $0, %xmm2, %ecx +; AVX-NEXT: .LBB4_3: +; AVX-NEXT: movzwl 6(%rdi), %esi +; AVX-NEXT: vpextrb $12, %xmm1, %r8d +; AVX-NEXT: testb $1, %r8b +; AVX-NEXT: cmovnel %edx, %esi +; AVX-NEXT: vmovd %xmm1, %edx +; AVX-NEXT: testb $1, %dl +; AVX-NEXT: jne .LBB4_4 +; AVX-NEXT: # %bb.5: +; AVX-NEXT: movl (%rdi), %edx +; AVX-NEXT: jmp .LBB4_6 +; AVX-NEXT: .LBB4_4: +; AVX-NEXT: vpextrw $0, %xmm0, %edx +; AVX-NEXT: .LBB4_6: +; AVX-NEXT: movw %dx, (%rdi) +; AVX-NEXT: movw %si, 6(%rdi) +; AVX-NEXT: movw %cx, 4(%rdi) +; AVX-NEXT: movw %ax, 2(%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v4f16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm2 +; AVX2-NEXT: vpextrw $0, %xmm2, %edx +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX2-NEXT: vpextrw $0, %xmm2, %ecx +; AVX2-NEXT: movzwl 2(%rdi), %eax +; AVX2-NEXT: vpextrb $4, %xmm1, %esi +; AVX2-NEXT: testb $1, %sil +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: vpextrb $8, %xmm1, %ecx +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: jne .LBB4_1 +; AVX2-NEXT: # %bb.2: +; AVX2-NEXT: movl 4(%rdi), %ecx +; AVX2-NEXT: jmp .LBB4_3 +; AVX2-NEXT: .LBB4_1: +; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX2-NEXT: vpextrw $0, %xmm2, %ecx +; AVX2-NEXT: .LBB4_3: +; AVX2-NEXT: movzwl 6(%rdi), %esi +; AVX2-NEXT: vpextrb $12, %xmm1, %r8d +; AVX2-NEXT: testb $1, %r8b +; AVX2-NEXT: cmovnel %edx, %esi +; AVX2-NEXT: vmovd %xmm1, %edx +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: jne .LBB4_4 +; AVX2-NEXT: # %bb.5: +; AVX2-NEXT: movl (%rdi), %edx +; AVX2-NEXT: jmp .LBB4_6 +; AVX2-NEXT: .LBB4_4: +; AVX2-NEXT: vpextrw $0, %xmm0, %edx +; AVX2-NEXT: .LBB4_6: +; AVX2-NEXT: movw %dx, (%rdi) +; AVX2-NEXT: movw %si, 6(%rdi) +; AVX2-NEXT: movw %cx, 4(%rdi) +; AVX2-NEXT: movw %ax, 2(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v4f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovq %xmm1, (%rdi) +; AVX512-NEXT: retq + %load = load <4 x half>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x half> %x, <4 x half> %load + store <4 x half> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4f32(<4 x float> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: 
vmaskmovps %xmm0, %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovaps %xmm0, (%rdi) {%k1} +; AVX512-NEXT: retq + %load = load <4 x float>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x float> %x, <4 x float> %load + store <4 x float> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4f64(<4 x double> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vpmovsxdq %xmm1, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovapd %ymm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <4 x double>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x double> %x, <4 x double> %load + store <4 x double> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8i8(<8 x i8> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovq %xmm1, (%rdi) +; AVX512-NEXT: retq + %load = load <8 x i8>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i8> %x, <8 x i8> %load + store <8 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8i16(<8 x i16> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX2-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m 
%xmm1, %k1 +; AVX512-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} +; AVX512-NEXT: retq + %load = load <8 x i16>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %load + store <8 x i16> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8i32(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovdqa32 %ymm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8i64(<8 x i64> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v8i64: +; AVX: # %bb.0: +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX-NEXT: vpmovsxdq %xmm3, %xmm4 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm3, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpmovsxdq %xmm2, %xmm4 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX-NEXT: vmaskmovpd %ymm0, %ymm2, (%rdi) +; AVX-NEXT: vmaskmovpd %ymm1, %ymm3, 32(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] +; AVX2-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2 +; AVX2-NEXT: vpmaskmovq %ymm0, %ymm2, (%rdi) +; AVX2-NEXT: vpmaskmovq %ymm1, %ymm3, 32(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i64>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %load + store <8 x i64> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8f16(<8 x half> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v8f16: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), 
%xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v8f16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX2-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} +; AVX512-NEXT: retq + %load = load <8 x half>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x half> %x, <8 x half> %load + store <8 x half> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8f32(<8 x float> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovaps %ymm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x float>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x float> %x, <8 x float> %load + store <8 x float> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8f64(<8 x double> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v8f64: +; AVX: # %bb.0: +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX-NEXT: vpmovsxdq %xmm3, %xmm4 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm3, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpmovsxdq %xmm2, %xmm4 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX-NEXT: vmaskmovpd %ymm0, %ymm2, (%rdi) +; AVX-NEXT: vmaskmovpd %ymm1, %ymm3, 32(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v8f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] +; AVX2-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2 +; AVX2-NEXT: vmaskmovpd %ymm0, %ymm2, (%rdi) +; AVX2-NEXT: vmaskmovpd %ymm1, %ymm3, 32(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v8f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; 
AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovupd %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x double>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x double> %x, <8 x double> %load + store <8 x double> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v16i8(<16 x i8> %x, ptr %ptr, <16 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX512-NEXT: vpmovb2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} +; AVX512-NEXT: retq + %load = load <16 x i8>, ptr %ptr, align 32 + %sel = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %load + store <16 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v16i16(<16 x i16> %x, ptr %ptr, <16 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v16i16: +; AVX: # %bb.0: +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpsllw $15, %xmm2, %xmm2 +; AVX-NEXT: vpsraw $15, %xmm2, %xmm2 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm3 +; AVX-NEXT: vpblendvb %xmm1, %xmm0, %xmm3, %xmm1 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vmovdqa %xmm1, (%rdi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX2-NEXT: vpsllw $15, %ymm1, %ymm1 +; AVX2-NEXT: vpsraw $15, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX512-NEXT: vpmovb2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <16 x i16>, ptr %ptr, align 32 + %sel = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %load + store <16 x i16> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v16i32(<16 x i32> %x, ptr %ptr, <16 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v16i32: +; AVX: # %bb.0: +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 
+; AVX-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) +; AVX-NEXT: vmaskmovps %ymm1, %ymm3, 32(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX2-NEXT: vpslld $31, %ymm3, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT: vpslld $31, %ymm2, %ymm2 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm2, (%rdi) +; AVX2-NEXT: vpmaskmovd %ymm1, %ymm3, 32(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX512-NEXT: vpmovb2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <16 x i32>, ptr %ptr, align 32 + %sel = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %load + store <16 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v32i8(<32 x i8> %x, ptr %ptr, <32 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v32i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $7, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-NEXT: vmovdqa (%rdi), %xmm4 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm5, %xmm1 +; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rdi) +; AVX-NEXT: vmovdqa %xmm1, 16(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $7, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $7, %ymm1, %ymm1 +; AVX512-NEXT: vpmovb2m %ymm1, %k1 +; AVX512-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <32 x i8>, ptr %ptr, align 32 + %sel = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %load + store <32 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v32i16(<32 x i16> %x, ptr %ptr, <32 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v32i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm3 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm7 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm8 = 
xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX-NEXT: vpsllw $15, %xmm8, %xmm8 +; AVX-NEXT: vpsraw $15, %xmm8, %xmm8 +; AVX-NEXT: vpblendvb %xmm8, %xmm1, %xmm5, %xmm5 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX-NEXT: vpsllw $15, %xmm8, %xmm8 +; AVX-NEXT: vpsraw $15, %xmm8, %xmm8 +; AVX-NEXT: vpblendvb %xmm8, %xmm0, %xmm3, %xmm3 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpsllw $15, %xmm7, %xmm7 +; AVX-NEXT: vpsraw $15, %xmm7, %xmm7 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm7, %xmm1, %xmm6, %xmm1 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpsllw $15, %xmm2, %xmm2 +; AVX-NEXT: vpsraw $15, %xmm2, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vmovdqa %xmm3, (%rdi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdi) +; AVX-NEXT: vmovdqa %xmm5, 32(%rdi) +; AVX-NEXT: vmovdqa %xmm1, 48(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero +; AVX2-NEXT: vpsllw $15, %ymm5, %ymm5 +; AVX2-NEXT: vpsraw $15, %ymm5, %ymm5 +; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX2-NEXT: vpsllw $15, %ymm2, %ymm2 +; AVX2-NEXT: vpsraw $15, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rdi) +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $7, %ymm1, %ymm1 +; AVX512-NEXT: vpmovb2m %ymm1, %k1 +; AVX512-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <32 x i16>, ptr %ptr, align 32 + %sel = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %load + store <32 x i16> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v64i8(<64 x i8> %x, ptr %ptr, <64 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v64i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %esi, %xmm2 +; AVX-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, 
%xmm2 +; AVX-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpsllw $7, %xmm2, %xmm2 +; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpsllw $7, %xmm3, %xmm3 +; AVX-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; AVX-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpsllw $7, %xmm4, %xmm4 +; AVX-NEXT: vmovd {{.*#+}} xmm5 = mem[0],zero,zero,zero +; AVX-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpsllw $7, %xmm5, %xmm5 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX-NEXT: vmovdqa (%rdi), %xmm7 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX-NEXT: vpblendvb %xmm5, %xmm6, %xmm10, %xmm5 +; AVX-NEXT: vpblendvb %xmm4, %xmm1, %xmm9, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX-NEXT: vpblendvb %xmm3, %xmm4, %xmm8, %xmm3 +; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm7, %xmm0 +; AVX-NEXT: vmovdqa %xmm3, 16(%rdi) +; AVX-NEXT: vmovdqa %xmm1, 32(%rdi) +; AVX-NEXT: vmovdqa %xmm5, 48(%rdi) +; AVX-NEXT: vmovdqa %xmm0, 
(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX2-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX2-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vpsllw $7, %ymm2, %ymm2 +; AVX2-NEXT: vmovd %esi, %xmm3 +; AVX2-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $3, %r8d, %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $4, %r9d, %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; AVX2-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $11, 
{{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vpsllw $7, %ymm3, %ymm3 +; AVX2-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdi) +; AVX2-NEXT: vmovdqa %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $7, %zmm1, %zmm1 +; AVX512-NEXT: vpmovb2m %zmm1, %k1 +; AVX512-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <64 x i8>, ptr %ptr, align 32 + %sel = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %load + store <64 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_invert_mask_v4i32(<4 x i32> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_invert_mask_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_invert_mask_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_invert_mask_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} +; AVX512-NEXT: retq + %load = load <4 x i32>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i32> %load, <4 x i32> %x + store <4 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_invert_mask_v8i32(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_invert_mask_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_invert_mask_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_invert_mask_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k0 +; AVX512-NEXT: knotb %k0, %k1 +; AVX512-NEXT: vmovdqa32 %ymm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %load, <8 x 
i32> %x + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_invert_mask_v16i32(<16 x i32> %x, ptr %ptr, <16 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_invert_mask_v16i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX-NEXT: vpxor %xmm5, %xmm2, %xmm2 +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX-NEXT: vmaskmovps %ymm1, %ymm2, 32(%rdi) +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vxorps %ymm1, %ymm3, %ymm1 +; AVX-NEXT: vpslld $31, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_invert_mask_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpslld $31, %ymm2, %ymm2 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm2, (%rdi) +; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm0 +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, 32(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_invert_mask_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX512-NEXT: vpmovb2m %xmm1, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <16 x i32>, ptr %ptr, align 32 + %sel = select <16 x i1> %mask, <16 x i32> %load, <16 x i32> %x + store <16 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_zextload(<4 x i64> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_zextload: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vpmovsxdq %xmm1, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovapd %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: 
test_masked_store_zextload: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_zextload: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <4 x i32>, ptr %ptr, align 32 + %zext = zext <4 x i32> %load to <4 x i64> + %masked = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %zext + store <4 x i64> %masked, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_volatile_load: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmovaps (%rdi), %ymm2 +; AVX-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_volatile_load: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_volatile_load: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load volatile <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_volatile_store: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmovaps (%rdi), %ymm2 +; AVX-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_volatile_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_volatile_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, 
%xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + store volatile <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +declare void @use_vec(<8 x i32>) + +define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) nounwind { +; AVX-LABEL: test_masked_store_intervening: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $32, %rsp +; AVX-NEXT: movq %rdi, %rbx +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmovaps (%rdi), %ymm2 +; AVX-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: callq use_vec@PLT +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, (%rbx) +; AVX-NEXT: addq $32, %rsp +; AVX-NEXT: popq %rbx +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_intervening: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $32, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: callq use_vec@PLT +; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, (%rbx) +; AVX2-NEXT: addq $32, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_intervening: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $80, %rsp +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm0 +; AVX512-NEXT: vpmovw2m %xmm0, %k1 +; AVX512-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512-NEXT: vmovaps (%rdi), %ymm0 +; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %ymm0, (%rdi) +; AVX512-NEXT: callq use_vec@PLT +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, (%rbx) +; AVX512-NEXT: addq $80, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr, align 32 + store <8 x i32> zeroinitializer, ptr %ptr, align 32 + %tmp = load <8 x i32>, ptr %ptr + call void @use_vec(<8 x i32> %tmp) + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + + +define void @test_masked_store_multiple_v8i32(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %mask, <8 x i1> %mask2) 
{ +; AVX-LABEL: test_masked_store_multiple_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX-NEXT: vmovaps (%rsi), %ymm4 +; AVX-NEXT: vblendvps %ymm3, %ymm1, %ymm4, %ymm1 +; AVX-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) +; AVX-NEXT: vmovaps %ymm1, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_multiple_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX2-NEXT: vpslld $31, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX2-NEXT: vpslld $31, %ymm3, %ymm3 +; AVX2-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-NEXT: vblendvps %ymm3, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm2, (%rdi) +; AVX2-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_multiple_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm2, %xmm2 +; AVX512-NEXT: vpmovw2m %xmm2, %k1 +; AVX512-NEXT: vpsllw $15, %xmm3, %xmm2 +; AVX512-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512-NEXT: vpmovw2m %xmm2, %k2 +; AVX512-NEXT: vmovdqa32 %ymm1, %ymm3 {%k2} +; AVX512-NEXT: vmovdqa32 %ymm0, (%rdi) {%k1} +; AVX512-NEXT: vmovdqa %ymm3, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr1, align 32 + %load2 = load <8 x i32>, ptr %ptr2, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + %sel2 = select <8 x i1> %mask2, <8 x i32> %y, <8 x i32> %load2 + store <8 x i32> %sel, ptr %ptr1, align 32 + store <8 x i32> %sel2, ptr %ptr2, align 32 + ret void +} + +define void @test_masked_store_multiple_v8i64(<8 x i64> %x, <8 x i64> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %mask, <8 x i1> %mask2) { +; AVX-LABEL: test_masked_store_multiple_v8i64: +; AVX: # %bb.0: +; AVX-NEXT: vmovapd (%rsi), %ymm6 +; AVX-NEXT: vmovapd 32(%rsi), %ymm7 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm8, %xmm8 +; AVX-NEXT: vpmovsxdq %xmm8, %xmm9 +; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm8, %xmm8 +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX-NEXT: vpmovsxdq %xmm4, %xmm9 +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm4, %xmm4 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm9, %ymm4 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm5[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm9, %xmm9 +; AVX-NEXT: vpmovsxdq %xmm9, %xmm10 +; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm9, %xmm9 +; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX-NEXT: vblendvpd %ymm9, %ymm3, %ymm7, %ymm3 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX-NEXT: vpslld $31, %xmm5, %xmm5 +; AVX-NEXT: 
vpmovsxdq %xmm5, %xmm7 +; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm5, %xmm5 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX-NEXT: vblendvpd %ymm5, %ymm2, %ymm6, %ymm2 +; AVX-NEXT: vmaskmovpd %ymm0, %ymm4, (%rdi) +; AVX-NEXT: vmaskmovpd %ymm1, %ymm8, 32(%rdi) +; AVX-NEXT: vmovapd %ymm3, 32(%rsi) +; AVX-NEXT: vmovapd %ymm2, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_multiple_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovapd (%rsi), %ymm6 +; AVX2-NEXT: vmovapd 32(%rsi), %ymm7 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4,4,5,5,6,6,7,7] +; AVX2-NEXT: vpslld $31, %xmm8, %xmm8 +; AVX2-NEXT: vpmovsxdq %xmm8, %ymm8 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX2-NEXT: vpmovsxdq %xmm4, %ymm4 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX2-NEXT: vpslld $31, %xmm9, %xmm9 +; AVX2-NEXT: vpmovsxdq %xmm9, %ymm9 +; AVX2-NEXT: vblendvpd %ymm9, %ymm2, %ymm6, %ymm2 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; AVX2-NEXT: vpslld $31, %xmm5, %xmm5 +; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm7, %ymm3 +; AVX2-NEXT: vpmaskmovq %ymm0, %ymm4, (%rdi) +; AVX2-NEXT: vpmaskmovq %ymm1, %ymm8, 32(%rdi) +; AVX2-NEXT: vmovapd %ymm3, 32(%rsi) +; AVX2-NEXT: vmovapd %ymm2, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_multiple_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm2, %xmm2 +; AVX512-NEXT: vpmovw2m %xmm2, %k1 +; AVX512-NEXT: vpsllw $15, %xmm3, %xmm2 +; AVX512-NEXT: vmovdqu64 (%rsi), %zmm3 +; AVX512-NEXT: vpmovw2m %xmm2, %k2 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vmovdqu64 %zmm3, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i64>, ptr %ptr1, align 32 + %load2 = load <8 x i64>, ptr %ptr2, align 32 + %sel = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %load + %sel2 = select <8 x i1> %mask2, <8 x i64> %y, <8 x i64> %load2 + store <8 x i64> %sel, ptr %ptr1, align 32 + store <8 x i64> %sel2, ptr %ptr2, align 32 + ret void +} + +define void @test_masked_store_unaligned_v4i32(<4 x i32> %data, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_unaligned_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vmaskmovps %xmm0, %xmm1, 1(%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_unaligned_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, 1(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_unaligned_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu32 %xmm0, 1(%rdi) {%k1} +; AVX512-NEXT: retq + %ptr_i8 = getelementptr i8, ptr %ptr, i32 1 + %ptr_vec = bitcast ptr %ptr_i8 to ptr + %load = load <4 x i32>, ptr %ptr_vec, align 1 + %sel = select <4 x i1> %mask, <4 x i32> %data, <4 x i32> %load + store <4 x i32> %sel, ptr %ptr_vec, align 1 + ret void +} + +define void @test_masked_store_unaligned_v4i64(<4 x i64> %data, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_unaligned_v4i64: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vpmovsxdq %xmm1, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; 
AVX-NEXT: vmaskmovpd %ymm0, %ymm1, 1(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_unaligned_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vpmaskmovq %ymm0, %ymm1, 1(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_unaligned_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu64 %ymm0, 1(%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %ptr_i8 = getelementptr i8, ptr %ptr, i64 1 + %ptr_vec = bitcast ptr %ptr_i8 to ptr + %load = load <4 x i64>, ptr %ptr_vec, align 1 + %sel = select <4 x i1> %mask, <4 x i64> %data, <4 x i64> %load + store <4 x i64> %sel, ptr %ptr_vec, align 1 + ret void +} + +define void @test_masked_store_unaligned_v8i32(<8 x i32> %data, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_unaligned_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovps %ymm0, %ymm1, 1(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_unaligned_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, 1(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_unaligned_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu32 %ymm0, 1(%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %ptr_i8 = getelementptr i8, ptr %ptr, i32 1 + %ptr_vec = bitcast ptr %ptr_i8 to ptr + %load = load <8 x i32>, ptr %ptr_vec, align 1 + %sel = select <8 x i1> %mask, <8 x i32> %data, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr_vec, align 1 + ret void +} + +define void @test_masked_store_unaligned_v8i64(<8 x i64> %data, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_unaligned_v8i64: +; AVX: # %bb.0: +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX-NEXT: vpmovsxdq %xmm3, %xmm4 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm3, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpmovsxdq %xmm2, %xmm4 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX-NEXT: vmaskmovpd %ymm0, %ymm2, 1(%rdi) +; AVX-NEXT: vmaskmovpd %ymm1, %ymm3, 33(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_unaligned_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] +; AVX2-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2 +; AVX2-NEXT: vpmaskmovq %ymm0, %ymm2, 1(%rdi) +; AVX2-NEXT: vpmaskmovq %ymm1, %ymm3, 33(%rdi) +; 
AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_unaligned_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu64 %zmm0, 1(%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %ptr_i8 = getelementptr i8, ptr %ptr, i64 1 + %ptr_vec = bitcast ptr %ptr_i8 to ptr + %load = load <8 x i64>, ptr %ptr_vec, align 1 + %sel = select <8 x i1> %mask, <8 x i64> %data, <8 x i64> %load + store <8 x i64> %sel, ptr %ptr_vec, align 1 + ret void +} diff --git a/llvm/test/CodeGen/X86/tail-dup-computed-goto.mir b/llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir index 17de405..0f28964 100644 --- a/llvm/test/CodeGen/X86/tail-dup-computed-goto.mir +++ b/llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=early-tailduplication -tail-dup-pred-size=1 -tail-dup-succ-size=1 %s -o - | FileCheck %s -# Check that only the computed goto is not be restrict by tail-dup-pred-size and tail-dup-succ-size. +# +# Check that the computed goto, like other blocks, is restricted by tail-dup-pred-size and tail-dup-succ-size. +# --- | @computed_goto.dispatch = constant [5 x ptr] [ptr null, ptr blockaddress(@computed_goto, %bb1), ptr blockaddress(@computed_goto, %bb2), ptr blockaddress(@computed_goto, %bb3), ptr blockaddress(@computed_goto, %bb4)] declare i64 @f0() @@ -30,54 +32,54 @@ tracksRegLiveness: true body: | ; CHECK-LABEL: name: computed_goto ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_nosp = COPY $rax - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY]] - ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY]], @computed_goto.dispatch, $noreg + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: JMP_1 %bb.5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.bb1 (ir-block-address-taken %ir-block.bb1): - ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY $rax - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY [[COPY2]] - ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: JMP_1 %bb.5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.bb2 (ir-block-address-taken %ir-block.bb2): -
; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64_nosp = COPY $rax - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr64_nosp = COPY [[COPY4]] - ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY4]], @computed_goto.dispatch, $noreg + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: JMP_1 %bb.5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.bb3 (ir-block-address-taken %ir-block.bb3): - ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = COPY $rax - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = COPY [[COPY6]] - ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY6]], @computed_goto.dispatch, $noreg + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: JMP_1 %bb.5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4.bb4 (ir-block-address-taken %ir-block.bb4): - ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64_nosp = COPY $rax - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64_nosp = COPY [[COPY8]] - ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY8]], @computed_goto.dispatch, $noreg + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gr64_nosp = PHI [[COPY]], %bb.0, [[COPY4]], %bb.4, [[COPY3]], %bb.3, [[COPY2]], %bb.2, [[COPY1]], %bb.1 + ; CHECK-NEXT: JMP64m $noreg, 8, [[PHI]], @computed_goto.dispatch, $noreg bb.0: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax diff --git a/llvm/test/CodeGen/X86/exp10-libcall-names.ll b/llvm/test/CodeGen/X86/exp10-libcall-names.ll index 
96e3aae..2688474 100644 --- a/llvm/test/CodeGen/X86/exp10-libcall-names.ll +++ b/llvm/test/CodeGen/X86/exp10-libcall-names.ll @@ -13,10 +13,7 @@ ; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=GISEL-X86 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=GISEL-X64 -; RUN: not llc -mtriple=x86_64-apple-macos10.8 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s -; Check exp10/exp10f is emitted as __exp10/__exp10f on assorted systems. - -; ERR: no libcall available for fexp10 +; Check exp10/exp10f is emitted as __exp10/__exp10f on assorted darwin systems. define float @test_exp10_f32(float %x) nounwind { ; LINUX-LABEL: test_exp10_f32: @@ -78,43 +75,3 @@ define double @test_exp10_f64(double %x) nounwind { %ret = call double @llvm.exp10.f64(double %x) ret double %ret } - -define x86_fp80 @test_exp10_f80(x86_fp80 %x) nounwind { -; LINUX-LABEL: test_exp10_f80: -; LINUX: # %bb.0: -; LINUX-NEXT: subq $24, %rsp -; LINUX-NEXT: fldt {{[0-9]+}}(%rsp) -; LINUX-NEXT: fstpt (%rsp) -; LINUX-NEXT: callq exp10l@PLT -; LINUX-NEXT: addq $24, %rsp -; LINUX-NEXT: retq -; -; APPLE-LABEL: test_exp10_f80: -; APPLE: ## %bb.0: -; APPLE-NEXT: subq $24, %rsp -; APPLE-NEXT: fldt {{[0-9]+}}(%rsp) -; APPLE-NEXT: fstpt (%rsp) -; APPLE-NEXT: callq _exp10l -; APPLE-NEXT: addq $24, %rsp -; APPLE-NEXT: retq -; -; GISEL-X86-LABEL: test_exp10_f80: -; GISEL-X86: # %bb.0: -; GISEL-X86-NEXT: subl $12, %esp -; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) -; GISEL-X86-NEXT: fstpt (%esp) -; GISEL-X86-NEXT: calll exp10l -; GISEL-X86-NEXT: addl $12, %esp -; GISEL-X86-NEXT: retl -; -; GISEL-X64-LABEL: test_exp10_f80: -; GISEL-X64: # %bb.0: -; GISEL-X64-NEXT: subq $24, %rsp -; GISEL-X64-NEXT: fldt {{[0-9]+}}(%rsp) -; GISEL-X64-NEXT: fstpt (%rsp) -; GISEL-X64-NEXT: callq exp10l -; GISEL-X64-NEXT: addq $24, %rsp -; GISEL-X64-NEXT: retq - %ret = call x86_fp80 @llvm.exp10.f80(x86_fp80 %x) - ret x86_fp80 %ret -} diff --git a/llvm/test/CodeGen/X86/exp10l-libcall-names.ll b/llvm/test/CodeGen/X86/exp10l-libcall-names.ll new file mode 100644 index 0000000..2e7f9e3 --- /dev/null +++ b/llvm/test/CodeGen/X86/exp10l-libcall-names.ll @@ -0,0 +1,46 @@ +; RUN: llc -mtriple=x86_64-linux-gnu < %s | FileCheck -check-prefix=LINUX %s +; RUN: not llc -mtriple=x86_64-apple-macos10.9 < %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not llc -mtriple=x86_64-apple-ios9.0 < %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not llc -mtriple=x86_64-apple-tvos9.0 < %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not llc -mtriple=x86_64-apple-watchos9.0 < %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not llc -mtriple=x86_64-apple-xros9.0 < %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not llc -mtriple=x86_64-apple-ios8.0 < %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not llc -mtriple=x86_64-apple-tvos8.0 < %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not llc -mtriple=x86_64-apple-xros8.0 < %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not llc -mtriple=x86_64-apple-driverkit < %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not llc -mtriple=x86_64-apple-driverkit24.0 < %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=GISEL-X86 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=GISEL-X64 + +; ERR: no libcall available for fexp10 + +define x86_fp80 @test_exp10_f80(x86_fp80 %x) nounwind { +; 
LINUX-LABEL: test_exp10_f80: +; LINUX: # %bb.0: +; LINUX-NEXT: subq $24, %rsp +; LINUX-NEXT: fldt {{[0-9]+}}(%rsp) +; LINUX-NEXT: fstpt (%rsp) +; LINUX-NEXT: callq exp10l@PLT +; LINUX-NEXT: addq $24, %rsp +; LINUX-NEXT: retq +; +; GISEL-X86-LABEL: test_exp10_f80: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: subl $12, %esp +; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: fstpt (%esp) +; GISEL-X86-NEXT: calll exp10l +; GISEL-X86-NEXT: addl $12, %esp +; GISEL-X86-NEXT: retl +; +; GISEL-X64-LABEL: test_exp10_f80: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: subq $24, %rsp +; GISEL-X64-NEXT: fldt {{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: fstpt (%rsp) +; GISEL-X64-NEXT: callq exp10l +; GISEL-X64-NEXT: addq $24, %rsp +; GISEL-X64-NEXT: retq + %ret = call x86_fp80 @llvm.exp10.f80(x86_fp80 %x) + ret x86_fp80 %ret +} diff --git a/llvm/test/CodeGen/X86/fmaddsub-combine.ll b/llvm/test/CodeGen/X86/fmaddsub-combine.ll index 5219ab3..2af219b 100644 --- a/llvm/test/CodeGen/X86/fmaddsub-combine.ll +++ b/llvm/test/CodeGen/X86/fmaddsub-combine.ll @@ -6,7 +6,7 @@ ; This test checks the fusing of MUL + ADDSUB to FMADDSUB. -define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 { +define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) { ; NOFMA-LABEL: mul_addsub_pd128: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: vmulpd %xmm1, %xmm0, %xmm0 @@ -23,14 +23,14 @@ define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B, <2 x do ; FMA4-NEXT: vfmaddsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2 ; FMA4-NEXT: retq entry: - %AB = fmul <2 x double> %A, %B - %Sub = fsub <2 x double> %AB, %C - %Add = fadd <2 x double> %AB, %C + %AB = fmul contract <2 x double> %A, %B + %Sub = fsub contract <2 x double> %AB, %C + %Add = fadd contract <2 x double> %AB, %C %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3> ret <2 x double> %Addsub } -define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 { +define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) { ; NOFMA-LABEL: mul_addsub_ps128: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 @@ -47,14 +47,14 @@ define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> ; FMA4-NEXT: vfmaddsubps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2 ; FMA4-NEXT: retq entry: - %AB = fmul <4 x float> %A, %B - %Sub = fsub <4 x float> %AB, %C - %Add = fadd <4 x float> %AB, %C + %AB = fmul contract <4 x float> %A, %B + %Sub = fsub contract <4 x float> %AB, %C + %Add = fadd contract <4 x float> %AB, %C %Addsub = shufflevector <4 x float> %Sub, <4 x float> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7> ret <4 x float> %Addsub } -define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 { +define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) { ; NOFMA-LABEL: mul_addsub_pd256: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: vmulpd %ymm1, %ymm0, %ymm0 @@ -71,14 +71,14 @@ define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x dou ; FMA4-NEXT: vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2 ; FMA4-NEXT: retq entry: - %AB = fmul <4 x double> %A, %B - %Sub = fsub <4 x double> %AB, %C - %Add = fadd <4 x double> %AB, %C + %AB = fmul contract <4 x double> %A, %B + %Sub = fsub contract <4 x double> %AB, %C + %Add = fadd contract <4 x double> %AB, %C %Addsub = shufflevector <4 x double> %Sub, <4 x double> %Add, <4 x i32> <i32 0, 
i32 5, i32 2, i32 7> ret <4 x double> %Addsub } -define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 { +define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) { ; NOFMA-LABEL: mul_addsub_ps256: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 @@ -95,14 +95,14 @@ define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> ; FMA4-NEXT: vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2 ; FMA4-NEXT: retq entry: - %AB = fmul <8 x float> %A, %B - %Sub = fsub <8 x float> %AB, %C - %Add = fadd <8 x float> %AB, %C + %AB = fmul contract <8 x float> %A, %B + %Sub = fsub contract <8 x float> %AB, %C + %Add = fadd contract <8 x float> %AB, %C %Addsub = shufflevector <8 x float> %Sub, <8 x float> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> ret <8 x float> %Addsub } -define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 { +define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) { ; NOFMA-LABEL: mul_addsub_pd512: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1 @@ -128,14 +128,14 @@ define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x dou ; FMA4-NEXT: vfmaddsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5 ; FMA4-NEXT: retq entry: - %AB = fmul <8 x double> %A, %B - %Sub = fsub <8 x double> %AB, %C - %Add = fadd <8 x double> %AB, %C + %AB = fmul contract <8 x double> %A, %B + %Sub = fsub contract <8 x double> %AB, %C + %Add = fadd contract <8 x double> %AB, %C %Addsub = shufflevector <8 x double> %Sub, <8 x double> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> ret <8 x double> %Addsub } -define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 { +define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) { ; NOFMA-LABEL: mul_addsub_ps512: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: vmulps %ymm3, %ymm1, %ymm1 @@ -161,14 +161,14 @@ define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x fl ; FMA4-NEXT: vfmaddsubps {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5 ; FMA4-NEXT: retq entry: - %AB = fmul <16 x float> %A, %B - %Sub = fsub <16 x float> %AB, %C - %Add = fadd <16 x float> %AB, %C + %AB = fmul contract <16 x float> %A, %B + %Sub = fsub contract <16 x float> %AB, %C + %Add = fadd contract <16 x float> %AB, %C %Addsub = shufflevector <16 x float> %Sub, <16 x float> %Add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> ret <16 x float> %Addsub } -define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 { +define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) { ; NOFMA-LABEL: buildvector_mul_addsub_ps128: ; NOFMA: # %bb.0: # %bb ; NOFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 @@ -185,19 +185,19 @@ define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, ; FMA4-NEXT: vfmaddsubps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2 ; FMA4-NEXT: retq bb: - %A = fmul <4 x float> %C, %D + %A = fmul contract <4 x float> %C, %D %A0 = extractelement <4 x float> %A, i32 0 %B0 = extractelement <4 x float> %B, i32 0 - %sub0 = fsub float %A0, %B0 + %sub0 = fsub contract float %A0, %B0 %A2 = extractelement <4 x float> %A, i32 2 %B2 = extractelement <4 x float> %B, i32 2 - %sub2 = fsub float %A2, %B2 
+ %sub2 = fsub contract float %A2, %B2 %A1 = extractelement <4 x float> %A, i32 1 %B1 = extractelement <4 x float> %B, i32 1 - %add1 = fadd float %A1, %B1 + %add1 = fadd contract float %A1, %B1 %A3 = extractelement <4 x float> %A, i32 3 %B3 = extractelement <4 x float> %B, i32 3 - %add3 = fadd float %A3, %B3 + %add3 = fadd contract float %A3, %B3 %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0 %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1 %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2 @@ -205,7 +205,7 @@ bb: ret <4 x float> %vecinsert4 } -define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 { +define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) { ; NOFMA-LABEL: buildvector_mul_addsub_pd128: ; NOFMA: # %bb.0: # %bb ; NOFMA-NEXT: vmulpd %xmm1, %xmm0, %xmm0 @@ -222,19 +222,19 @@ define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> ; FMA4-NEXT: vfmaddsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2 ; FMA4-NEXT: retq bb: - %A = fmul <2 x double> %C, %D + %A = fmul contract <2 x double> %C, %D %A0 = extractelement <2 x double> %A, i32 0 %B0 = extractelement <2 x double> %B, i32 0 - %sub0 = fsub double %A0, %B0 + %sub0 = fsub contract double %A0, %B0 %A1 = extractelement <2 x double> %A, i32 1 %B1 = extractelement <2 x double> %B, i32 1 - %add1 = fadd double %A1, %B1 + %add1 = fadd contract double %A1, %B1 %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0 %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1 ret <2 x double> %vecinsert2 } -define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 { +define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) { ; NOFMA-LABEL: buildvector_mul_addsub_ps256: ; NOFMA: # %bb.0: # %bb ; NOFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 @@ -251,31 +251,31 @@ define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, ; FMA4-NEXT: vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2 ; FMA4-NEXT: retq bb: - %A = fmul <8 x float> %C, %D + %A = fmul contract <8 x float> %C, %D %A0 = extractelement <8 x float> %A, i32 0 %B0 = extractelement <8 x float> %B, i32 0 - %sub0 = fsub float %A0, %B0 + %sub0 = fsub contract float %A0, %B0 %A2 = extractelement <8 x float> %A, i32 2 %B2 = extractelement <8 x float> %B, i32 2 - %sub2 = fsub float %A2, %B2 + %sub2 = fsub contract float %A2, %B2 %A4 = extractelement <8 x float> %A, i32 4 %B4 = extractelement <8 x float> %B, i32 4 - %sub4 = fsub float %A4, %B4 + %sub4 = fsub contract float %A4, %B4 %A6 = extractelement <8 x float> %A, i32 6 %B6 = extractelement <8 x float> %B, i32 6 - %sub6 = fsub float %A6, %B6 + %sub6 = fsub contract float %A6, %B6 %A1 = extractelement <8 x float> %A, i32 1 %B1 = extractelement <8 x float> %B, i32 1 - %add1 = fadd float %A1, %B1 + %add1 = fadd contract float %A1, %B1 %A3 = extractelement <8 x float> %A, i32 3 %B3 = extractelement <8 x float> %B, i32 3 - %add3 = fadd float %A3, %B3 + %add3 = fadd contract float %A3, %B3 %A5 = extractelement <8 x float> %A, i32 5 %B5 = extractelement <8 x float> %B, i32 5 - %add5 = fadd float %A5, %B5 + %add5 = fadd contract float %A5, %B5 %A7 = extractelement <8 x float> %A, i32 7 %B7 = extractelement <8 x float> %B, i32 7 - %add7 = fadd float %A7, %B7 + %add7 = fadd contract float %A7, %B7 %vecinsert1 = insertelement <8 x float> undef, float 
%sub0, i32 0 %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1 %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2 @@ -287,7 +287,7 @@ bb: ret <8 x float> %vecinsert8 } -define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 { +define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) { ; NOFMA-LABEL: buildvector_mul_addsub_pd256: ; NOFMA: # %bb.0: # %bb ; NOFMA-NEXT: vmulpd %ymm1, %ymm0, %ymm0 @@ -304,19 +304,19 @@ define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> ; FMA4-NEXT: vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2 ; FMA4-NEXT: retq bb: - %A = fmul <4 x double> %C, %D + %A = fmul contract <4 x double> %C, %D %A0 = extractelement <4 x double> %A, i32 0 %B0 = extractelement <4 x double> %B, i32 0 - %sub0 = fsub double %A0, %B0 + %sub0 = fsub contract double %A0, %B0 %A2 = extractelement <4 x double> %A, i32 2 %B2 = extractelement <4 x double> %B, i32 2 - %sub2 = fsub double %A2, %B2 + %sub2 = fsub contract double %A2, %B2 %A1 = extractelement <4 x double> %A, i32 1 %B1 = extractelement <4 x double> %B, i32 1 - %add1 = fadd double %A1, %B1 + %add1 = fadd contract double %A1, %B1 %A3 = extractelement <4 x double> %A, i32 3 %B3 = extractelement <4 x double> %B, i32 3 - %add3 = fadd double %A3, %B3 + %add3 = fadd contract double %A3, %B3 %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0 %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1 %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2 @@ -324,7 +324,7 @@ bb: ret <4 x double> %vecinsert4 } -define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 { +define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) { ; NOFMA-LABEL: buildvector_mul_addsub_ps512: ; NOFMA: # %bb.0: # %bb ; NOFMA-NEXT: vmulps %ymm3, %ymm1, %ymm1 @@ -350,55 +350,55 @@ define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> ; FMA4-NEXT: vfmaddsubps {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5 ; FMA4-NEXT: retq bb: - %A = fmul <16 x float> %C, %D + %A = fmul contract <16 x float> %C, %D %A0 = extractelement <16 x float> %A, i32 0 %B0 = extractelement <16 x float> %B, i32 0 - %sub0 = fsub float %A0, %B0 + %sub0 = fsub contract float %A0, %B0 %A2 = extractelement <16 x float> %A, i32 2 %B2 = extractelement <16 x float> %B, i32 2 - %sub2 = fsub float %A2, %B2 + %sub2 = fsub contract float %A2, %B2 %A4 = extractelement <16 x float> %A, i32 4 %B4 = extractelement <16 x float> %B, i32 4 - %sub4 = fsub float %A4, %B4 + %sub4 = fsub contract float %A4, %B4 %A6 = extractelement <16 x float> %A, i32 6 %B6 = extractelement <16 x float> %B, i32 6 - %sub6 = fsub float %A6, %B6 + %sub6 = fsub contract float %A6, %B6 %A8 = extractelement <16 x float> %A, i32 8 %B8 = extractelement <16 x float> %B, i32 8 - %sub8 = fsub float %A8, %B8 + %sub8 = fsub contract float %A8, %B8 %A10 = extractelement <16 x float> %A, i32 10 %B10 = extractelement <16 x float> %B, i32 10 - %sub10 = fsub float %A10, %B10 + %sub10 = fsub contract float %A10, %B10 %A12 = extractelement <16 x float> %A, i32 12 %B12 = extractelement <16 x float> %B, i32 12 - %sub12 = fsub float %A12, %B12 + %sub12 = fsub contract float %A12, %B12 %A14 = extractelement <16 x float> %A, i32 14 %B14 = extractelement <16 x float> %B, i32 14 - %sub14 = fsub float %A14, %B14 + %sub14 = fsub contract 
float %A14, %B14 %A1 = extractelement <16 x float> %A, i32 1 %B1 = extractelement <16 x float> %B, i32 1 - %add1 = fadd float %A1, %B1 + %add1 = fadd contract float %A1, %B1 %A3 = extractelement <16 x float> %A, i32 3 %B3 = extractelement <16 x float> %B, i32 3 - %add3 = fadd float %A3, %B3 + %add3 = fadd contract float %A3, %B3 %A5 = extractelement <16 x float> %A, i32 5 %B5 = extractelement <16 x float> %B, i32 5 - %add5 = fadd float %A5, %B5 + %add5 = fadd contract float %A5, %B5 %A7 = extractelement <16 x float> %A, i32 7 %B7 = extractelement <16 x float> %B, i32 7 - %add7 = fadd float %A7, %B7 + %add7 = fadd contract float %A7, %B7 %A9 = extractelement <16 x float> %A, i32 9 %B9 = extractelement <16 x float> %B, i32 9 - %add9 = fadd float %A9, %B9 + %add9 = fadd contract float %A9, %B9 %A11 = extractelement <16 x float> %A, i32 11 %B11 = extractelement <16 x float> %B, i32 11 - %add11 = fadd float %A11, %B11 + %add11 = fadd contract float %A11, %B11 %A13 = extractelement <16 x float> %A, i32 13 %B13 = extractelement <16 x float> %B, i32 13 - %add13 = fadd float %A13, %B13 + %add13 = fadd contract float %A13, %B13 %A15 = extractelement <16 x float> %A, i32 15 %B15 = extractelement <16 x float> %B, i32 15 - %add15 = fadd float %A15, %B15 + %add15 = fadd contract float %A15, %B15 %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0 %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1 %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2 @@ -418,7 +418,7 @@ bb: ret <16 x float> %vecinsert16 } -define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 { +define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) { ; NOFMA-LABEL: buildvector_mul_addsub_pd512: ; NOFMA: # %bb.0: # %bb ; NOFMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1 @@ -444,28 +444,28 @@ define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> ; FMA4-NEXT: vfmaddsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5 ; FMA4-NEXT: retq bb: - %A = fmul <8 x double> %C, %D + %A = fmul contract <8 x double> %C, %D %A0 = extractelement <8 x double> %A, i32 0 %B0 = extractelement <8 x double> %B, i32 0 - %sub0 = fsub double %A0, %B0 + %sub0 = fsub contract double %A0, %B0 %A2 = extractelement <8 x double> %A, i32 2 %B2 = extractelement <8 x double> %B, i32 2 - %sub2 = fsub double %A2, %B2 + %sub2 = fsub contract double %A2, %B2 %A4 = extractelement <8 x double> %A, i32 4 %B4 = extractelement <8 x double> %B, i32 4 - %sub4 = fsub double %A4, %B4 + %sub4 = fsub contract double %A4, %B4 %A6 = extractelement <8 x double> %A, i32 6 %B6 = extractelement <8 x double> %B, i32 6 - %sub6 = fsub double %A6, %B6 + %sub6 = fsub contract double %A6, %B6 %A1 = extractelement <8 x double> %A, i32 1 %B1 = extractelement <8 x double> %B, i32 1 - %add1 = fadd double %A1, %B1 + %add1 = fadd contract double %A1, %B1 %A3 = extractelement <8 x double> %A, i32 3 %B3 = extractelement <8 x double> %B, i32 3 - %add3 = fadd double %A3, %B3 + %add3 = fadd contract double %A3, %B3 %A7 = extractelement <8 x double> %A, i32 7 %B7 = extractelement <8 x double> %B, i32 7 - %add7 = fadd double %A7, %B7 + %add7 = fadd contract double %A7, %B7 %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0 %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1 %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2 @@ -477,7 +477,7 @@ bb: ret <8 x double> %vecinsert8 } -define <4 x 
float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 { +define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) { ; NOFMA-LABEL: buildvector_mul_subadd_ps128: ; NOFMA: # %bb.0: # %bb ; NOFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 @@ -506,19 +506,19 @@ define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, ; FMA4-NEXT: vfmsubaddps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2 ; FMA4-NEXT: retq bb: - %A = fmul <4 x float> %C, %D + %A = fmul contract <4 x float> %C, %D %A0 = extractelement <4 x float> %A, i32 0 %B0 = extractelement <4 x float> %B, i32 0 - %sub0 = fadd float %A0, %B0 + %sub0 = fadd contract float %A0, %B0 %A2 = extractelement <4 x float> %A, i32 2 %B2 = extractelement <4 x float> %B, i32 2 - %sub2 = fadd float %A2, %B2 + %sub2 = fadd contract float %A2, %B2 %A1 = extractelement <4 x float> %A, i32 1 %B1 = extractelement <4 x float> %B, i32 1 - %add1 = fsub float %A1, %B1 + %add1 = fsub contract float %A1, %B1 %A3 = extractelement <4 x float> %A, i32 3 %B3 = extractelement <4 x float> %B, i32 3 - %add3 = fsub float %A3, %B3 + %add3 = fsub contract float %A3, %B3 %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0 %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1 %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2 @@ -526,7 +526,7 @@ bb: ret <4 x float> %vecinsert4 } -define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 { +define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) { ; NOFMA-LABEL: buildvector_mul_subadd_pd128: ; NOFMA: # %bb.0: # %bb ; NOFMA-NEXT: vmulpd %xmm1, %xmm0, %xmm0 @@ -547,19 +547,19 @@ define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> ; FMA4-NEXT: vfmsubaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2 ; FMA4-NEXT: retq bb: - %A = fmul <2 x double> %C, %D + %A = fmul contract <2 x double> %C, %D %A0 = extractelement <2 x double> %A, i32 0 %B0 = extractelement <2 x double> %B, i32 0 - %sub0 = fadd double %A0, %B0 + %sub0 = fadd contract double %A0, %B0 %A1 = extractelement <2 x double> %A, i32 1 %B1 = extractelement <2 x double> %B, i32 1 - %add1 = fsub double %A1, %B1 + %add1 = fsub contract double %A1, %B1 %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0 %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1 ret <2 x double> %vecinsert2 } -define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 { +define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) { ; NOFMA-LABEL: buildvector_mul_subadd_ps256: ; NOFMA: # %bb.0: # %bb ; NOFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 @@ -604,31 +604,31 @@ define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, ; FMA4-NEXT: vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2 ; FMA4-NEXT: retq bb: - %A = fmul <8 x float> %C, %D + %A = fmul contract <8 x float> %C, %D %A0 = extractelement <8 x float> %A, i32 0 %B0 = extractelement <8 x float> %B, i32 0 - %sub0 = fadd float %A0, %B0 + %sub0 = fadd contract float %A0, %B0 %A2 = extractelement <8 x float> %A, i32 2 %B2 = extractelement <8 x float> %B, i32 2 - %sub2 = fadd float %A2, %B2 + %sub2 = fadd contract float %A2, %B2 %A4 = extractelement <8 x float> %A, i32 4 %B4 = extractelement <8 x float> %B, i32 4 - %sub4 = fadd float %A4, %B4 + %sub4 = fadd contract float %A4, %B4 
%A6 = extractelement <8 x float> %A, i32 6 %B6 = extractelement <8 x float> %B, i32 6 - %sub6 = fadd float %A6, %B6 + %sub6 = fadd contract float %A6, %B6 %A1 = extractelement <8 x float> %A, i32 1 %B1 = extractelement <8 x float> %B, i32 1 - %add1 = fsub float %A1, %B1 + %add1 = fsub contract float %A1, %B1 %A3 = extractelement <8 x float> %A, i32 3 %B3 = extractelement <8 x float> %B, i32 3 - %add3 = fsub float %A3, %B3 + %add3 = fsub contract float %A3, %B3 %A5 = extractelement <8 x float> %A, i32 5 %B5 = extractelement <8 x float> %B, i32 5 - %add5 = fsub float %A5, %B5 + %add5 = fsub contract float %A5, %B5 %A7 = extractelement <8 x float> %A, i32 7 %B7 = extractelement <8 x float> %B, i32 7 - %add7 = fsub float %A7, %B7 + %add7 = fsub contract float %A7, %B7 %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0 %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1 %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2 @@ -640,7 +640,7 @@ bb: ret <8 x float> %vecinsert8 } -define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 { +define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) { ; NOFMA-LABEL: buildvector_mul_subadd_pd256: ; NOFMA: # %bb.0: # %bb ; NOFMA-NEXT: vmulpd %ymm1, %ymm0, %ymm0 @@ -669,19 +669,19 @@ define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> ; FMA4-NEXT: vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2 ; FMA4-NEXT: retq bb: - %A = fmul <4 x double> %C, %D + %A = fmul contract <4 x double> %C, %D %A0 = extractelement <4 x double> %A, i32 0 %B0 = extractelement <4 x double> %B, i32 0 - %sub0 = fadd double %A0, %B0 + %sub0 = fadd contract double %A0, %B0 %A2 = extractelement <4 x double> %A, i32 2 %B2 = extractelement <4 x double> %B, i32 2 - %sub2 = fadd double %A2, %B2 + %sub2 = fadd contract double %A2, %B2 %A1 = extractelement <4 x double> %A, i32 1 %B1 = extractelement <4 x double> %B, i32 1 - %add1 = fsub double %A1, %B1 + %add1 = fsub contract double %A1, %B1 %A3 = extractelement <4 x double> %A, i32 3 %B3 = extractelement <4 x double> %B, i32 3 - %add3 = fsub double %A3, %B3 + %add3 = fsub contract double %A3, %B3 %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0 %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1 %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2 @@ -689,7 +689,7 @@ bb: ret <4 x double> %vecinsert4 } -define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 { +define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) { ; NOFMA-LABEL: buildvector_mul_subadd_ps512: ; NOFMA: # %bb.0: # %bb ; NOFMA-NEXT: vmulps %ymm3, %ymm1, %ymm1 @@ -765,55 +765,55 @@ define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> ; FMA4-NEXT: vfmsubaddps {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5 ; FMA4-NEXT: retq bb: - %A = fmul <16 x float> %C, %D + %A = fmul contract <16 x float> %C, %D %A0 = extractelement <16 x float> %A, i32 0 %B0 = extractelement <16 x float> %B, i32 0 - %sub0 = fadd float %A0, %B0 + %sub0 = fadd contract float %A0, %B0 %A2 = extractelement <16 x float> %A, i32 2 %B2 = extractelement <16 x float> %B, i32 2 - %sub2 = fadd float %A2, %B2 + %sub2 = fadd contract float %A2, %B2 %A4 = extractelement <16 x float> %A, i32 4 %B4 = extractelement <16 x float> %B, i32 4 - %sub4 = fadd float %A4, %B4 + %sub4 = 
fadd contract float %A4, %B4 %A6 = extractelement <16 x float> %A, i32 6 %B6 = extractelement <16 x float> %B, i32 6 - %sub6 = fadd float %A6, %B6 + %sub6 = fadd contract float %A6, %B6 %A8 = extractelement <16 x float> %A, i32 8 %B8 = extractelement <16 x float> %B, i32 8 - %sub8 = fadd float %A8, %B8 + %sub8 = fadd contract float %A8, %B8 %A10 = extractelement <16 x float> %A, i32 10 %B10 = extractelement <16 x float> %B, i32 10 - %sub10 = fadd float %A10, %B10 + %sub10 = fadd contract float %A10, %B10 %A12 = extractelement <16 x float> %A, i32 12 %B12 = extractelement <16 x float> %B, i32 12 - %sub12 = fadd float %A12, %B12 + %sub12 = fadd contract float %A12, %B12 %A14 = extractelement <16 x float> %A, i32 14 %B14 = extractelement <16 x float> %B, i32 14 - %sub14 = fadd float %A14, %B14 + %sub14 = fadd contract float %A14, %B14 %A1 = extractelement <16 x float> %A, i32 1 %B1 = extractelement <16 x float> %B, i32 1 - %add1 = fsub float %A1, %B1 + %add1 = fsub contract float %A1, %B1 %A3 = extractelement <16 x float> %A, i32 3 %B3 = extractelement <16 x float> %B, i32 3 - %add3 = fsub float %A3, %B3 + %add3 = fsub contract float %A3, %B3 %A5 = extractelement <16 x float> %A, i32 5 %B5 = extractelement <16 x float> %B, i32 5 - %add5 = fsub float %A5, %B5 + %add5 = fsub contract float %A5, %B5 %A7 = extractelement <16 x float> %A, i32 7 %B7 = extractelement <16 x float> %B, i32 7 - %add7 = fsub float %A7, %B7 + %add7 = fsub contract float %A7, %B7 %A9 = extractelement <16 x float> %A, i32 9 %B9 = extractelement <16 x float> %B, i32 9 - %add9 = fsub float %A9, %B9 + %add9 = fsub contract float %A9, %B9 %A11 = extractelement <16 x float> %A, i32 11 %B11 = extractelement <16 x float> %B, i32 11 - %add11 = fsub float %A11, %B11 + %add11 = fsub contract float %A11, %B11 %A13 = extractelement <16 x float> %A, i32 13 %B13 = extractelement <16 x float> %B, i32 13 - %add13 = fsub float %A13, %B13 + %add13 = fsub contract float %A13, %B13 %A15 = extractelement <16 x float> %A, i32 15 %B15 = extractelement <16 x float> %B, i32 15 - %add15 = fsub float %A15, %B15 + %add15 = fsub contract float %A15, %B15 %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0 %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1 %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2 @@ -833,7 +833,7 @@ bb: ret <16 x float> %vecinsert16 } -define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 { +define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) { ; NOFMA-LABEL: buildvector_mul_subadd_pd512: ; NOFMA: # %bb.0: # %bb ; NOFMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1 @@ -879,28 +879,28 @@ define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> ; FMA4-NEXT: vfmsubaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5 ; FMA4-NEXT: retq bb: - %A = fmul <8 x double> %C, %D + %A = fmul contract <8 x double> %C, %D %A0 = extractelement <8 x double> %A, i32 0 %B0 = extractelement <8 x double> %B, i32 0 - %sub0 = fadd double %A0, %B0 + %sub0 = fadd contract double %A0, %B0 %A2 = extractelement <8 x double> %A, i32 2 %B2 = extractelement <8 x double> %B, i32 2 - %sub2 = fadd double %A2, %B2 + %sub2 = fadd contract double %A2, %B2 %A4 = extractelement <8 x double> %A, i32 4 %B4 = extractelement <8 x double> %B, i32 4 - %sub4 = fadd double %A4, %B4 + %sub4 = fadd contract double %A4, %B4 %A6 = extractelement <8 x double> %A, i32 6 %B6 = extractelement <8 x double> %B, i32 6 - %sub6 
= fadd double %A6, %B6 + %sub6 = fadd contract double %A6, %B6 %A1 = extractelement <8 x double> %A, i32 1 %B1 = extractelement <8 x double> %B, i32 1 - %add1 = fsub double %A1, %B1 + %add1 = fsub contract double %A1, %B1 %A3 = extractelement <8 x double> %A, i32 3 %B3 = extractelement <8 x double> %B, i32 3 - %add3 = fsub double %A3, %B3 + %add3 = fsub contract double %A3, %B3 %A7 = extractelement <8 x double> %A, i32 7 %B7 = extractelement <8 x double> %B, i32 7 - %add7 = fsub double %A7, %B7 + %add7 = fsub contract double %A7, %B7 %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0 %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1 %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2 @@ -911,5 +911,3 @@ bb: %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7 ret <8 x double> %vecinsert8 } - -attributes #0 = { nounwind "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/X86/fmsubadd-combine.ll b/llvm/test/CodeGen/X86/fmsubadd-combine.ll index 674a1d5..3f562dd 100644 --- a/llvm/test/CodeGen/X86/fmsubadd-combine.ll +++ b/llvm/test/CodeGen/X86/fmsubadd-combine.ll @@ -6,7 +6,7 @@ ; This test checks the fusing of MUL + SUB/ADD to FMSUBADD. -define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 { +define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) { ; NOFMA-LABEL: mul_subadd_pd128: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: vmulpd %xmm1, %xmm0, %xmm0 @@ -25,14 +25,14 @@ define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x dou ; FMA4-NEXT: vfmsubaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2 ; FMA4-NEXT: retq entry: - %AB = fmul <2 x double> %A, %B - %Sub = fsub <2 x double> %AB, %C - %Add = fadd <2 x double> %AB, %C + %AB = fmul contract<2 x double> %A, %B + %Sub = fsub contract<2 x double> %AB, %C + %Add = fadd contract<2 x double> %AB, %C %subadd = shufflevector <2 x double> %Add, <2 x double> %Sub, <2 x i32> <i32 0, i32 3> ret <2 x double> %subadd } -define <4 x float> @mul_subadd_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 { +define <4 x float> @mul_subadd_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) { ; NOFMA-LABEL: mul_subadd_ps128: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 @@ -51,14 +51,14 @@ define <4 x float> @mul_subadd_ps128(<4 x float> %A, <4 x float> %B, <4 x float> ; FMA4-NEXT: vfmsubaddps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2 ; FMA4-NEXT: retq entry: - %AB = fmul <4 x float> %A, %B - %Sub = fsub <4 x float> %AB, %C - %Add = fadd <4 x float> %AB, %C + %AB = fmul contract <4 x float> %A, %B + %Sub = fsub contract <4 x float> %AB, %C + %Add = fadd contract <4 x float> %AB, %C %subadd = shufflevector <4 x float> %Add, <4 x float> %Sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7> ret <4 x float> %subadd } -define <4 x double> @mul_subadd_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 { +define <4 x double> @mul_subadd_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) { ; NOFMA-LABEL: mul_subadd_pd256: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: vmulpd %ymm1, %ymm0, %ymm0 @@ -77,14 +77,14 @@ define <4 x double> @mul_subadd_pd256(<4 x double> %A, <4 x double> %B, <4 x dou ; FMA4-NEXT: vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2 ; FMA4-NEXT: retq entry: - %AB = fmul <4 x double> %A, %B - %Sub = fsub <4 x double> %AB, %C - %Add = fadd <4 x double> %AB, %C + %AB = fmul contract <4 x double> %A, %B + %Sub = fsub contract <4 x double> %AB, %C + 
%Add = fadd contract <4 x double> %AB, %C %subadd = shufflevector <4 x double> %Add, <4 x double> %Sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7> ret <4 x double> %subadd } -define <8 x float> @mul_subadd_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 { +define <8 x float> @mul_subadd_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) { ; NOFMA-LABEL: mul_subadd_ps256: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 @@ -103,14 +103,14 @@ define <8 x float> @mul_subadd_ps256(<8 x float> %A, <8 x float> %B, <8 x float> ; FMA4-NEXT: vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2 ; FMA4-NEXT: retq entry: - %AB = fmul <8 x float> %A, %B - %Sub = fsub <8 x float> %AB, %C - %Add = fadd <8 x float> %AB, %C + %AB = fmul contract <8 x float> %A, %B + %Sub = fsub contract <8 x float> %AB, %C + %Add = fadd contract <8 x float> %AB, %C %subadd = shufflevector <8 x float> %Add, <8 x float> %Sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> ret <8 x float> %subadd } -define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 { +define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) { ; NOFMA-LABEL: mul_subadd_pd512: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: vmulpd %ymm2, %ymm0, %ymm0 @@ -140,14 +140,14 @@ define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x dou ; FMA4-NEXT: vfmsubaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5 ; FMA4-NEXT: retq entry: - %AB = fmul <8 x double> %A, %B - %Sub = fsub <8 x double> %AB, %C - %Add = fadd <8 x double> %AB, %C + %AB = fmul contract <8 x double> %A, %B + %Sub = fsub contract <8 x double> %AB, %C + %Add = fadd contract <8 x double> %AB, %C %subadd = shufflevector <8 x double> %Add, <8 x double> %Sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> ret <8 x double> %subadd } -define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 { +define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) { ; NOFMA-LABEL: mul_subadd_ps512: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -177,15 +177,15 @@ define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x fl ; FMA4-NEXT: vfmsubaddps {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5 ; FMA4-NEXT: retq entry: - %AB = fmul <16 x float> %A, %B - %Sub = fsub <16 x float> %AB, %C - %Add = fadd <16 x float> %AB, %C + %AB = fmul contract <16 x float> %A, %B + %Sub = fsub contract <16 x float> %AB, %C + %Add = fadd contract <16 x float> %AB, %C %subadd = shufflevector <16 x float> %Add, <16 x float> %Sub, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> ret <16 x float> %subadd } ; This should not be matched to fmsubadd because the mul is on the wrong side of the fsub. 
-define <2 x double> @mul_subadd_bad_commute(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 { +define <2 x double> @mul_subadd_bad_commute(<2 x double> %A, <2 x double> %B, <2 x double> %C) { ; CHECK-LABEL: mul_subadd_bad_commute: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmulpd %xmm1, %xmm0, %xmm0 @@ -194,11 +194,9 @@ define <2 x double> @mul_subadd_bad_commute(<2 x double> %A, <2 x double> %B, <2 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; CHECK-NEXT: retq entry: - %AB = fmul <2 x double> %A, %B + %AB = fmul contract <2 x double> %A, %B %Sub = fsub <2 x double> %C, %AB %Add = fadd <2 x double> %AB, %C %subadd = shufflevector <2 x double> %Add, <2 x double> %Sub, <2 x i32> <i32 0, i32 3> ret <2 x double> %subadd } - -attributes #0 = { nounwind "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll index 189de05..962ffe4 100644 --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -490,18 +490,19 @@ define i32 @freeze_ashr_exact(i32 %a0) nounwind { define i32 @freeze_ashr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86-LABEL: freeze_ashr_exact_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sarl $3, %eax -; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sarl $3, %ecx +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $6, %eax ; X86-NEXT: retl ; ; X64-LABEL: freeze_ashr_exact_extra_use: ; X64: # %bb.0: +; X64-NEXT: sarl $3, %edi +; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax -; X64-NEXT: sarl $3, %eax -; X64-NEXT: movl %eax, (%rsi) ; X64-NEXT: sarl $6, %eax ; X64-NEXT: retq %x = ashr exact i32 %a0, 3 @@ -603,18 +604,19 @@ define i32 @freeze_lshr_exact(i32 %a0) nounwind { define i32 @freeze_lshr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86-LABEL: freeze_lshr_exact_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: shrl $5, %eax ; X86-NEXT: retl ; ; X64-LABEL: freeze_lshr_exact_extra_use: ; X64: # %bb.0: +; X64-NEXT: shrl $3, %edi +; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl $3, %eax -; X64-NEXT: movl %eax, (%rsi) ; X64-NEXT: shrl $5, %eax ; X64-NEXT: retq %x = lshr exact i32 %a0, 3 diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll index 953a5e7..15b43c4 100644 --- a/llvm/test/CodeGen/X86/freeze-vector.ll +++ b/llvm/test/CodeGen/X86/freeze-vector.ll @@ -600,8 +600,8 @@ define void @freeze_buildvector_extrause(ptr %origin0, ptr %origin1, ptr %origin ; X86-NEXT: vpinsrd $1, (%edi), %xmm0, %xmm0 ; X86-NEXT: vpinsrd $2, (%esi), %xmm0, %xmm0 ; X86-NEXT: vpinsrd $3, (%edx), %xmm0, %xmm0 -; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-NEXT: vmovdqa %xmm0, (%ecx) +; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; X86-NEXT: vmovdqa %xmm1, (%ecx) ; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%eax) ; X86-NEXT: popl %esi @@ -616,8 +616,8 @@ define void @freeze_buildvector_extrause(ptr %origin0, ptr %origin1, ptr %origin ; X64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0 ; X64-NEXT: vpinsrd $3, (%rcx), %xmm0, %xmm0 ; X64-NEXT: vpbroadcastd 
{{.*#+}} xmm1 = [15,15,15,15] -; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-NEXT: vmovdqa %xmm0, (%r9) +; X64-NEXT: vpand %xmm1, %xmm0, %xmm1 +; X64-NEXT: vmovdqa %xmm1, (%r9) ; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovdqa %xmm0, (%r8) diff --git a/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll b/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll index 6376b4d..f3bb334 100644 --- a/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll +++ b/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll @@ -4,7 +4,14 @@ define <4 x i32> @hadd_select_v4i32(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: hadd_select_v4i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3] +; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9] +; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm1 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq entry: %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3> @@ -73,7 +80,15 @@ entry: define <4 x i32> @hsub_select_shl_v4i32(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: hsub_select_shl_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [65535,65535,65535,65535] +; CHECK-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpor %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vphsubd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpslld $16, %xmm0, %xmm1 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [9,9,9,9] +; CHECK-NEXT: vpmaxud %xmm2, %xmm1, %xmm2 +; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %or1 = or <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535> %or2 = or <4 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535> diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index 3a4a638..fb2433d 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -730,36 +730,36 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa (%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 ; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 ; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] -; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9 +; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm9 ; AVX1-NEXT: vpmuludq %xmm0, %xmm9, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4 +; AVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpmuludq %xmm7, 
%xmm9, %xmm5 +; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm4 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm7 +; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm7 ; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 +; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm5 +; AVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -767,20 +767,20 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; AVX2-LABEL: vec256_i64_signed_mem_reg: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm3 +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm4 ; AVX2-NEXT: vpsrlq $33, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 +; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm2 +; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -790,36 +790,36 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOP-NEXT: vmovdqa (%rdi), %xmm2 ; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm5 +; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm4 ; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0 -; XOP-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; XOP-NEXT: vpsubq %xmm0, %xmm5, %xmm0 +; XOP-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; XOP-NEXT: vpsubq %xmm0, %xmm4, %xmm0 +; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm5 ; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0 ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] -; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9 +; XOP-NEXT: vpor %xmm4, %xmm8, %xmm9 ; XOP-NEXT: vpmuludq %xmm0, %xmm9, %xmm0 -; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 -; XOP-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOP-NEXT: vpaddq %xmm0, %xmm5, %xmm0 +; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4 +; XOP-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; XOP-NEXT: vpsllq $32, %xmm0, %xmm0 -; XOP-NEXT: vpmuludq %xmm7, %xmm9, 
%xmm5 +; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm4 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vpor %xmm4, %xmm8, %xmm7 +; XOP-NEXT: vpor %xmm5, %xmm8, %xmm7 ; XOP-NEXT: vpmuludq %xmm7, %xmm1, %xmm1 -; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4 -; XOP-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 -; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 +; XOP-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 +; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1 ; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 -; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 -; XOP-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm5 +; XOP-NEXT: vpaddq %xmm3, %xmm5, %xmm3 ; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpaddq %xmm2, %xmm5, %xmm2 +; XOP-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOP-NEXT: retq @@ -897,101 +897,101 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwind { ; AVX1-LABEL: vec256_i64_signed_reg_mem: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6 -; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 -; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] -; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9 -; AVX1-NEXT: vpmuludq %xmm1, %xmm9, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm5 +; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm6 +; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm2, %xmm2 -; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm7 -; AVX1-NEXT: vpmuludq %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm9 +; AVX1-NEXT: vpmuludq %xmm2, %xmm9, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 ; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm4 +; AVX1-NEXT: vpsrlq $33, %xmm3, %xmm3 +; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm7 +; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 +; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm5 +; AVX1-NEXT: vpaddq %xmm1, 
%xmm5, %xmm1 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vec256_i64_signed_reg_mem: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm3 +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm4 ; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 +; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm2 +; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: vec256_i64_signed_reg_mem: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa (%rdi), %xmm1 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm2 -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOP-NEXT: vpcomgtq %xmm2, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm5 -; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1 -; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOP-NEXT: vmovdqa (%rdi), %xmm2 +; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 +; XOP-NEXT: vpcomgtq %xmm2, %xmm0, %xmm4 +; XOP-NEXT: vpsubq %xmm2, %xmm0, %xmm2 ; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpsubq %xmm2, %xmm4, %xmm2 -; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6 -; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7 -; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] -; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9 -; XOP-NEXT: vpmuludq %xmm1, %xmm9, %xmm1 -; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 -; XOP-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1 -; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 -; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm5 +; XOP-NEXT: vpcomgtq %xmm3, %xmm1, %xmm5 +; XOP-NEXT: vpsubq %xmm3, %xmm1, %xmm3 +; XOP-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; XOP-NEXT: vpsubq %xmm3, %xmm5, %xmm3 +; XOP-NEXT: vpsrlq $1, %xmm3, %xmm6 +; XOP-NEXT: vpsrlq $1, %xmm2, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm2, %xmm2 -; XOP-NEXT: vpor %xmm4, %xmm8, %xmm7 -; XOP-NEXT: vpmuludq %xmm7, %xmm2, %xmm2 +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: vpor %xmm4, %xmm8, %xmm9 +; XOP-NEXT: vpmuludq %xmm2, %xmm9, %xmm2 ; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4 -; XOP-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 +; XOP-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 ; XOP-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; XOP-NEXT: vpsllq $32, %xmm2, %xmm2 -; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 -; XOP-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; XOP-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; XOP-NEXT: vpaddq %xmm0, %xmm5, %xmm0 -; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm4 +; XOP-NEXT: vpsrlq $33, %xmm3, %xmm3 +; XOP-NEXT: vpor 
%xmm5, %xmm8, %xmm7 +; XOP-NEXT: vpmuludq %xmm7, %xmm3, %xmm3 +; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 +; XOP-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 +; XOP-NEXT: vpaddq %xmm3, %xmm5, %xmm3 +; XOP-NEXT: vpsllq $32, %xmm3, %xmm3 +; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm5 +; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; XOP-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; XOP-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i64_signed_reg_mem: @@ -1071,36 +1071,36 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX1-NEXT: vmovdqa (%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 ; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 ; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] -; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9 +; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm9 ; AVX1-NEXT: vpmuludq %xmm0, %xmm9, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4 +; AVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm5 +; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm4 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm7 +; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm7 ; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 +; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm5 +; AVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -1109,20 +1109,20 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm3 +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm4 ; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1 -; 
AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 +; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm2 +; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1133,36 +1133,36 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-NEXT: vmovdqa 16(%rsi), %xmm1 ; XOP-NEXT: vmovdqa (%rdi), %xmm2 ; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm5 +; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm4 ; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0 -; XOP-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; XOP-NEXT: vpsubq %xmm0, %xmm5, %xmm0 +; XOP-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; XOP-NEXT: vpsubq %xmm0, %xmm4, %xmm0 +; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm5 ; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0 ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] -; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9 +; XOP-NEXT: vpor %xmm4, %xmm8, %xmm9 ; XOP-NEXT: vpmuludq %xmm0, %xmm9, %xmm0 -; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 -; XOP-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOP-NEXT: vpaddq %xmm0, %xmm5, %xmm0 +; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4 +; XOP-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; XOP-NEXT: vpsllq $32, %xmm0, %xmm0 -; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm5 +; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm4 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vpor %xmm4, %xmm8, %xmm7 +; XOP-NEXT: vpor %xmm5, %xmm8, %xmm7 ; XOP-NEXT: vpmuludq %xmm7, %xmm1, %xmm1 -; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4 -; XOP-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 -; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 +; XOP-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 +; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1 ; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 -; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 -; XOP-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm5 +; XOP-NEXT: vpaddq %xmm3, %xmm5, %xmm3 ; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpaddq %xmm2, %xmm5, %xmm2 +; XOP-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOP-NEXT: retq @@ -1627,27 +1627,27 @@ define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounw define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounwind { ; AVX1-LABEL: vec256_i16_signed_reg_mem: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm6 -; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsubw %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm6 -; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpgtw %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm6 +; AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpsubw %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpminsw %xmm3, %xmm1, %xmm6 +; AVX1-NEXT: vpmaxsw %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpsubw %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vec256_i16_signed_reg_mem: @@ -1665,25 +1665,25 @@ define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounw ; ; XOP-LABEL: vec256_i16_signed_reg_mem: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa (%rdi), %xmm1 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm2 -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOP-NEXT: vpcomgtw %xmm2, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtw %xmm1, %xmm0, %xmm5 -; XOP-NEXT: vpminsw %xmm2, %xmm3, %xmm6 -; XOP-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOP-NEXT: vmovdqa (%rdi), %xmm2 +; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 +; XOP-NEXT: vpcomgtw %xmm3, %xmm1, %xmm4 +; XOP-NEXT: vpcomgtw %xmm2, %xmm0, %xmm5 +; XOP-NEXT: vpminsw %xmm3, %xmm1, %xmm6 +; XOP-NEXT: vpmaxsw %xmm3, %xmm1, %xmm3 +; XOP-NEXT: vpsubw %xmm6, %xmm3, %xmm3 +; XOP-NEXT: vpminsw %xmm2, %xmm0, %xmm6 +; XOP-NEXT: vpmaxsw %xmm2, %xmm0, %xmm2 ; XOP-NEXT: vpsubw %xmm6, %xmm2, %xmm2 -; XOP-NEXT: vpminsw %xmm1, %xmm0, %xmm6 -; XOP-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 -; XOP-NEXT: vpsubw %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 ; XOP-NEXT: vpsrlw $1, %xmm2, %xmm2 +; XOP-NEXT: vpsrlw $1, %xmm3, %xmm3 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 -; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm2, %xmm2 -; XOP-NEXT: vpmacsww %xmm0, %xmm5, %xmm1, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOP-NEXT: vpmacsww %xmm1, %xmm4, %xmm3, %xmm1 +; XOP-NEXT: vpmacsww %xmm0, %xmm5, %xmm2, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i16_signed_reg_mem: @@ -2425,9 +2425,9 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind { ; AVX1-LABEL: vec256_i8_signed_reg_mem: ; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa (%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm6 @@ -2487,38 +2487,38 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; ; XOP-LABEL: vec256_i8_signed_reg_mem: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa (%rdi), %xmm1 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm2 -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOP-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtb 
%xmm1, %xmm0, %xmm5 -; XOP-NEXT: vpminsb %xmm1, %xmm0, %xmm6 -; XOP-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 -; XOP-NEXT: vpsubb %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpminsb %xmm2, %xmm3, %xmm6 -; XOP-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOP-NEXT: vmovdqa (%rdi), %xmm2 +; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 +; XOP-NEXT: vpcomgtb %xmm3, %xmm1, %xmm4 +; XOP-NEXT: vpcomgtb %xmm2, %xmm0, %xmm5 +; XOP-NEXT: vpminsb %xmm2, %xmm0, %xmm6 +; XOP-NEXT: vpmaxsb %xmm2, %xmm0, %xmm2 ; XOP-NEXT: vpsubb %xmm6, %xmm2, %xmm2 +; XOP-NEXT: vpminsb %xmm3, %xmm1, %xmm6 +; XOP-NEXT: vpmaxsb %xmm3, %xmm1, %xmm3 +; XOP-NEXT: vpsubb %xmm6, %xmm3, %xmm3 ; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 +; XOP-NEXT: vpshlb %xmm6, %xmm3, %xmm3 ; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2 -; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 -; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 +; XOP-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8 ; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm2 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] -; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 +; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm2, %xmm2 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 -; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 +; XOP-NEXT: vpmaddubsw %xmm6, %xmm3, %xmm6 ; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 -; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 -; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOP-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 +; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm3, %xmm3 +; XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; XOP-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i8_signed_reg_mem: diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll index 5f6337e2..a4750b4 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -507,58 +507,58 @@ define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounw define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i16_signed_reg_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4 +; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm5 +; AVX512F-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpsubw %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm5 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5 -; AVX512F-NEXT: 
vpmaxsw %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3 ; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5)) +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm3, %ymm6, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5)) +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm3, %ymm1 
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_signed_reg_mem: @@ -939,66 +939,66 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i8_signed_reg_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5 -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX512F-NEXT: vpsubb %ymm3, %ymm7, %ymm3 +; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5)) +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512VL-FALLBACK-NEXT: 
vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 ; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm7, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5)) +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_reg_mem: diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll index 1921cf38..a75d42e 100644 --- a/llvm/test/CodeGen/X86/midpoint-int.ll +++ b/llvm/test/CodeGen/X86/midpoint-int.ll @@ -28,24 +28,27 @@ define i32 @scalar_i32_signed_reg_reg(i32 %a1, i32 %a2) nounwind { ; ; X86-LABEL: scalar_i32_signed_reg_reg: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %esi, %ecx -; X86-NEXT: setle %al -; X86-NEXT: leal -1(%eax,%eax), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %esi, %edx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subl %esi, %eax +; X86-NEXT: setle %bl +; X86-NEXT: leal -1(%ebx,%ebx), %esi ; X86-NEXT: jg .LBB0_2 ; X86-NEXT: # %bb.1: -; 
X86-NEXT: subl %ecx, %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: negl %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: .LBB0_2: ; X86-NEXT: shrl %eax -; X86-NEXT: imull %edx, %eax +; X86-NEXT: imull %esi, %eax ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: popl %esi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %t3 = icmp sgt i32 %a1, %a2 ; signed %t4 = select i1 %t3, i32 -1, i32 1 @@ -76,26 +79,27 @@ define i32 @scalar_i32_unsigned_reg_reg(i32 %a1, i32 %a2) nounwind { ; ; X86-LABEL: scalar_i32_unsigned_reg_reg: ; X86: # %bb.0: -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: movl %edi, %esi -; X86-NEXT: subl %ecx, %esi -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %edx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %esi, %edx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: subl %edi, %eax +; X86-NEXT: subl %esi, %eax +; X86-NEXT: setbe %bl +; X86-NEXT: leal -1(%ebx,%ebx), %esi ; X86-NEXT: ja .LBB1_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %esi, %eax +; X86-NEXT: negl %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: .LBB1_2: ; X86-NEXT: shrl %eax -; X86-NEXT: imull %edx, %eax +; X86-NEXT: imull %esi, %eax ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %t3 = icmp ugt i32 %a1, %a2 %t4 = select i1 %t3, i32 -1, i32 1 @@ -128,25 +132,28 @@ define i32 @scalar_i32_signed_mem_reg(ptr %a1_addr, i32 %a2) nounwind { ; ; X86-LABEL: scalar_i32_signed_mem_reg: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl (%eax), %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setle %al -; X86-NEXT: leal -1(%eax,%eax), %esi +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %esi, %edx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: subl %edx, %eax +; X86-NEXT: subl %esi, %eax +; X86-NEXT: setle %bl +; X86-NEXT: leal -1(%ebx,%ebx), %esi ; X86-NEXT: jg .LBB2_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: subl %ecx, %edx +; X86-NEXT: negl %edx ; X86-NEXT: movl %edx, %eax ; X86-NEXT: .LBB2_2: ; X86-NEXT: shrl %eax ; X86-NEXT: imull %esi, %eax ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: popl %esi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %a1 = load i32, ptr %a1_addr %t3 = icmp sgt i32 %a1, %a2 ; signed @@ -178,25 +185,28 @@ define i32 @scalar_i32_signed_reg_mem(i32 %a1, ptr %a2_addr) nounwind { ; ; X86-LABEL: scalar_i32_signed_reg_mem: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl (%eax), %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %esi, %ecx -; X86-NEXT: setle %al -; X86-NEXT: leal -1(%eax,%eax), %edx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %esi, %edx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subl %esi, %eax +; X86-NEXT: setle %bl +; X86-NEXT: leal -1(%ebx,%ebx), %esi ; X86-NEXT: jg .LBB3_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: subl %ecx, %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: negl %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: .LBB3_2: ; X86-NEXT: shrl %eax -; X86-NEXT: imull %edx, %eax +; X86-NEXT: imull %esi, %eax ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: popl %esi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %a2 = load i32, ptr 
%a2_addr %t3 = icmp sgt i32 %a1, %a2 ; signed @@ -229,26 +239,29 @@ define i32 @scalar_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; ; X86-LABEL: scalar_i32_signed_mem_mem: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx ; X86-NEXT: movl (%eax), %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %esi, %ecx -; X86-NEXT: setle %al -; X86-NEXT: leal -1(%eax,%eax), %edx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %esi, %edx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subl %esi, %eax +; X86-NEXT: setle %bl +; X86-NEXT: leal -1(%ebx,%ebx), %esi ; X86-NEXT: jg .LBB4_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: subl %ecx, %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: negl %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: .LBB4_2: ; X86-NEXT: shrl %eax -; X86-NEXT: imull %edx, %eax +; X86-NEXT: imull %esi, %eax ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: popl %esi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %a1 = load i32, ptr %a1_addr %a2 = load i32, ptr %a2_addr @@ -291,36 +304,34 @@ define i64 @scalar_i64_signed_reg_reg(i64 %a1, i64 %a2) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: setl %al -; X86-NEXT: movzbl %al, %edi -; X86-NEXT: negl %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: orl $1, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sbbl %ebp, %edi ; X86-NEXT: subl %esi, %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: setl %bl +; X86-NEXT: movzbl %bl, %ebx ; X86-NEXT: jl .LBB5_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: .LBB5_2: -; X86-NEXT: shrdl $1, %ebp, %eax -; X86-NEXT: shrl %ebp -; X86-NEXT: imull %eax, %edi -; X86-NEXT: mull %ebx -; X86-NEXT: addl %edi, %edx +; X86-NEXT: negl %ebx +; X86-NEXT: shrdl $1, %edi, %eax +; X86-NEXT: shrl %edi +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: imull %ebx, %ebp +; X86-NEXT: orl $1, %ebx +; X86-NEXT: mull %ebx ; X86-NEXT: addl %ebp, %edx -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: imull %edi, %ebx +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -359,10 +370,10 @@ define i64 @scalar_i64_unsigned_reg_reg(i64 %a1, i64 %a2) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpl %eax, %ebp ; X86-NEXT: sbbl %ecx, %esi @@ -429,45 +440,36 @@ define i64 @scalar_i64_signed_mem_reg(ptr %a1_addr, i64 %a2) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; 
X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %ebx -; X86-NEXT: movl 4(%eax), %esi -; X86-NEXT: cmpl %ebx, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl %esi, %eax -; X86-NEXT: setl %al -; X86-NEXT: movzbl %al, %edi -; X86-NEXT: negl %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: orl $1, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl (%eax), %esi +; X86-NEXT: movl 4(%eax), %ecx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sbbl %ebp, %edi +; X86-NEXT: subl %esi, %edx ; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl %ebx, %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: setl %bl +; X86-NEXT: movzbl %bl, %ebx ; X86-NEXT: jl .LBB7_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: .LBB7_2: -; X86-NEXT: shrdl $1, %ebp, %eax -; X86-NEXT: shrl %ebp -; X86-NEXT: imull %eax, %edi -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: negl %ebx +; X86-NEXT: shrdl $1, %edi, %eax +; X86-NEXT: shrl %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: imull %ebx, %ebp +; X86-NEXT: orl $1, %ebx +; X86-NEXT: mull %ebx ; X86-NEXT: addl %ebp, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: addl $12, %esp +; X86-NEXT: imull %edi, %ebx +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -508,37 +510,35 @@ define i64 @scalar_i64_signed_reg_mem(i64 %a1, ptr %a2_addr) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: setl %al -; X86-NEXT: movzbl %al, %edi -; X86-NEXT: negl %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: orl $1, %ebx +; X86-NEXT: movl 4(%eax), %ebp ; X86-NEXT: movl %esi, %eax ; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sbbl %ebp, %edi ; X86-NEXT: subl %esi, %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: setl %bl +; X86-NEXT: movzbl %bl, %ebx ; X86-NEXT: jl .LBB8_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: .LBB8_2: -; X86-NEXT: shrdl $1, %ebp, %eax -; X86-NEXT: shrl %ebp -; X86-NEXT: imull %eax, %edi -; X86-NEXT: mull %ebx -; X86-NEXT: addl %edi, %edx +; X86-NEXT: negl %ebx +; X86-NEXT: shrdl $1, %edi, %eax +; X86-NEXT: shrl %edi +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: imull %ebx, %ebp +; X86-NEXT: orl $1, %ebx +; X86-NEXT: mull %ebx ; X86-NEXT: addl %ebp, %edx -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: imull %edi, %ebx +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: 
popl %ebx @@ -579,46 +579,37 @@ define i64 @scalar_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ebx -; X86-NEXT: movl 4(%ecx), %esi +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: movl 4(%ecx), %ecx ; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: cmpl %ebx, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl %esi, %eax -; X86-NEXT: setl %al -; X86-NEXT: movzbl %al, %edi -; X86-NEXT: negl %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: orl $1, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl 4(%eax), %ebp +; X86-NEXT: movl %esi, %eax ; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sbbl %ebp, %edi +; X86-NEXT: subl %esi, %edx ; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl %ebx, %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: setl %bl +; X86-NEXT: movzbl %bl, %ebx ; X86-NEXT: jl .LBB9_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: .LBB9_2: -; X86-NEXT: shrdl $1, %ebp, %eax -; X86-NEXT: shrl %ebp -; X86-NEXT: imull %eax, %edi -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: negl %ebx +; X86-NEXT: shrdl $1, %edi, %eax +; X86-NEXT: shrl %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: imull %ebx, %ebp +; X86-NEXT: orl $1, %ebx +; X86-NEXT: mull %ebx ; X86-NEXT: addl %ebp, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: addl $12, %esp +; X86-NEXT: imull %edi, %ebx +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -667,17 +658,16 @@ define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subw %dx, %ax +; X86-NEXT: setle %bl +; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: jg .LBB10_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: negl %eax ; X86-NEXT: .LBB10_2: -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpw %dx, %cx -; X86-NEXT: setle %bl -; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: imull %edx, %eax @@ -720,17 +710,16 @@ define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subw %dx, %ax +; X86-NEXT: setbe %bl +; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: ja .LBB11_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: negl %eax ; X86-NEXT: .LBB11_2: -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpw %cx, %dx -; X86-NEXT: setae %bl -; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: imull %edx, %eax @@ 
-777,16 +766,15 @@ define i16 @scalar_i16_signed_mem_reg(ptr %a1_addr, i16 %a2) nounwind { ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl (%eax), %ecx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subw %dx, %ax +; X86-NEXT: setle %bl +; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: jg .LBB12_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: negl %eax ; X86-NEXT: .LBB12_2: -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpw %dx, %cx -; X86-NEXT: setle %bl -; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: imull %edx, %eax @@ -829,19 +817,18 @@ define i16 @scalar_i16_signed_reg_mem(i16 %a1, ptr %a2_addr) nounwind { ; X86-LABEL: scalar_i16_signed_reg_mem: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subw %dx, %ax +; X86-NEXT: setle %bl +; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: jg .LBB13_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: negl %eax ; X86-NEXT: .LBB13_2: -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpw %dx, %cx -; X86-NEXT: setle %bl -; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: imull %edx, %eax @@ -888,16 +875,15 @@ define i16 @scalar_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl (%ecx), %ecx ; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subw %dx, %ax +; X86-NEXT: setle %bl +; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: jg .LBB14_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: negl %eax ; X86-NEXT: .LBB14_2: -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpw %dx, %cx -; X86-NEXT: setle %bl -; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: imull %edx, %eax @@ -946,17 +932,16 @@ define i8 @scalar_i8_signed_reg_reg(i8 %a1, i8 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movb {{[0-9]+}}(%esp), %ah -; X86-NEXT: cmpb %ah, %cl -; X86-NEXT: setg %dl -; X86-NEXT: negb %dl -; X86-NEXT: orb $1, %dl ; X86-NEXT: movb %cl, %al ; X86-NEXT: subb %ah, %al +; X86-NEXT: setg %dl ; X86-NEXT: jg .LBB15_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: subb %cl, %ah ; X86-NEXT: movb %ah, %al ; X86-NEXT: .LBB15_2: +; X86-NEXT: negb %dl +; X86-NEXT: orb $1, %dl ; X86-NEXT: shrb %al ; X86-NEXT: mulb %dl ; X86-NEXT: addb %cl, %al @@ -993,18 +978,17 @@ define i8 @scalar_i8_unsigned_reg_reg(i8 %a1, i8 %a2) nounwind { ; X86-LABEL: scalar_i8_unsigned_reg_reg: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: movb %ch, %ah -; X86-NEXT: subb %cl, %ah -; X86-NEXT: sbbl %edx, %edx -; X86-NEXT: orb $1, %dl +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah ; X86-NEXT: movb %cl, %al -; X86-NEXT: subb %ch, %al +; X86-NEXT: subb %ah, %al +; X86-NEXT: seta %dl ; X86-NEXT: ja .LBB16_2 ; X86-NEXT: # %bb.1: +; X86-NEXT: subb %cl, %ah ; X86-NEXT: movb %ah, %al ; X86-NEXT: .LBB16_2: +; X86-NEXT: negb %dl +; X86-NEXT: orb $1, %dl ; X86-NEXT: shrb %al ; X86-NEXT: mulb %dl ; X86-NEXT: addb %cl, %al @@ -1046,17 +1030,16 @@ define i8 @scalar_i8_signed_mem_reg(ptr %a1_addr, i8 %a2) nounwind { ; X86-NEXT: movb {{[0-9]+}}(%esp), %ah ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl (%ecx), 
%ecx -; X86-NEXT: cmpb %ah, %cl -; X86-NEXT: setg %dl -; X86-NEXT: negb %dl -; X86-NEXT: orb $1, %dl ; X86-NEXT: movb %cl, %al ; X86-NEXT: subb %ah, %al +; X86-NEXT: setg %dl ; X86-NEXT: jg .LBB17_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: subb %cl, %ah ; X86-NEXT: movb %ah, %al ; X86-NEXT: .LBB17_2: +; X86-NEXT: negb %dl +; X86-NEXT: orb $1, %dl ; X86-NEXT: shrb %al ; X86-NEXT: mulb %dl ; X86-NEXT: addb %cl, %al @@ -1096,17 +1079,16 @@ define i8 @scalar_i8_signed_reg_mem(i8 %a1, ptr %a2_addr) nounwind { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movb (%eax), %ah -; X86-NEXT: cmpb %ah, %cl -; X86-NEXT: setg %dl -; X86-NEXT: negb %dl -; X86-NEXT: orb $1, %dl ; X86-NEXT: movb %cl, %al ; X86-NEXT: subb %ah, %al +; X86-NEXT: setg %dl ; X86-NEXT: jg .LBB18_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: subb %cl, %ah ; X86-NEXT: movb %ah, %al ; X86-NEXT: .LBB18_2: +; X86-NEXT: negb %dl +; X86-NEXT: orb $1, %dl ; X86-NEXT: shrb %al ; X86-NEXT: mulb %dl ; X86-NEXT: addb %cl, %al @@ -1148,17 +1130,16 @@ define i8 @scalar_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl (%ecx), %ecx ; X86-NEXT: movb (%eax), %ah -; X86-NEXT: cmpb %ah, %cl -; X86-NEXT: setg %dl -; X86-NEXT: negb %dl -; X86-NEXT: orb $1, %dl ; X86-NEXT: movb %cl, %al ; X86-NEXT: subb %ah, %al +; X86-NEXT: setg %dl ; X86-NEXT: jg .LBB19_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: subb %cl, %ah ; X86-NEXT: movb %ah, %al ; X86-NEXT: .LBB19_2: +; X86-NEXT: negb %dl +; X86-NEXT: orb $1, %dl ; X86-NEXT: shrb %al ; X86-NEXT: mulb %dl ; X86-NEXT: addb %cl, %al diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll index a1da40e7..f539830 100644 --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -155,10 +155,10 @@ define <16 x i32> @PR42819(ptr %a0) { define void @PR42833() { ; SSE2-LABEL: PR42833: ; SSE2: # %bb.0: -; SSE2-NEXT: movl b(%rip), %eax -; SSE2-NEXT: movdqa c+128(%rip), %xmm0 ; SSE2-NEXT: movdqa c+144(%rip), %xmm2 -; SSE2-NEXT: addl c+128(%rip), %eax +; SSE2-NEXT: movdqa c+128(%rip), %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: addl b(%rip), %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: paddd %xmm0, %xmm3 @@ -191,10 +191,10 @@ define void @PR42833() { ; ; SSE42-LABEL: PR42833: ; SSE42: # %bb.0: -; SSE42-NEXT: movl b(%rip), %eax -; SSE42-NEXT: movdqa c+128(%rip), %xmm0 ; SSE42-NEXT: movdqa c+144(%rip), %xmm1 -; SSE42-NEXT: addl c+128(%rip), %eax +; SSE42-NEXT: movdqa c+128(%rip), %xmm0 +; SSE42-NEXT: movd %xmm0, %eax +; SSE42-NEXT: addl b(%rip), %eax ; SSE42-NEXT: movd %eax, %xmm2 ; SSE42-NEXT: paddd %xmm0, %xmm2 ; SSE42-NEXT: movdqa d+144(%rip), %xmm3 diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll index 300da68..ead7110 100644 --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -2166,3 +2166,708 @@ define <8 x i16> @sse2_pmulhu_w_const(<8 x i16> %a0, <8 x i16> %a1) { } declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) +define <8 x i16> @zext_mul_and_shift17(<8 x i16> %a, <8 x i16> %b) { +; SSE-LABEL: zext_mul_and_shift17: +; SSE: # %bb.0: +; SSE-NEXT: pmulhuw %xmm1, %xmm0 +; SSE-NEXT: psrlw $1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: zext_mul_and_shift17: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX-NEXT: retq + %a.ext = zext <8 x i16> %a to <8 x i32> + %b.ext = zext <8 x i16> %b to <8 x i32> + %mul = mul <8 
x i32> %a.ext, %b.ext + %shift = lshr <8 x i32> %mul, splat(i32 17) + %trunc = trunc <8 x i32> %shift to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @zext_mul_and_shift24(<8 x i16> %a, <8 x i16> %b) { +; SSE-LABEL: zext_mul_and_shift24: +; SSE: # %bb.0: +; SSE-NEXT: pmulhuw %xmm1, %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: zext_mul_and_shift24: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: retq + %a.ext = zext <8 x i16> %a to <8 x i32> + %b.ext = zext <8 x i16> %b to <8 x i32> + %mul = mul <8 x i32> %a.ext, %b.ext + %shift = lshr <8 x i32> %mul, splat(i32 24) + %trunc = trunc <8 x i32> %shift to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @zext_mul_and_shift31(<8 x i16> %a, <8 x i16> %b) { +; SSE-LABEL: zext_mul_and_shift31: +; SSE: # %bb.0: +; SSE-NEXT: pmulhuw %xmm1, %xmm0 +; SSE-NEXT: psrlw $15, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: zext_mul_and_shift31: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX-NEXT: retq + %a.ext = zext <8 x i16> %a to <8 x i32> + %b.ext = zext <8 x i16> %b to <8 x i32> + %mul = mul <8 x i32> %a.ext, %b.ext + %shift = lshr <8 x i32> %mul, splat(i32 31) + %trunc = trunc <8 x i32> %shift to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @sext_mul_and_shift17(<8 x i16> %a, <8 x i16> %b) { +; SSE-LABEL: sext_mul_and_shift17: +; SSE: # %bb.0: +; SSE-NEXT: pmulhw %xmm1, %xmm0 +; SSE-NEXT: psrlw $1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sext_mul_and_shift17: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX-NEXT: retq + %a.ext = sext <8 x i16> %a to <8 x i32> + %b.ext = sext <8 x i16> %b to <8 x i32> + %mul = mul <8 x i32> %a.ext, %b.ext + %shift = lshr <8 x i32> %mul, splat(i32 17) + %trunc = trunc <8 x i32> %shift to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @sext_mul_and_shift24(<8 x i16> %a, <8 x i16> %b) { +; SSE-LABEL: sext_mul_and_shift24: +; SSE: # %bb.0: +; SSE-NEXT: pmulhw %xmm1, %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sext_mul_and_shift24: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: retq + %a.ext = sext <8 x i16> %a to <8 x i32> + %b.ext = sext <8 x i16> %b to <8 x i32> + %mul = mul <8 x i32> %a.ext, %b.ext + %shift = lshr <8 x i32> %mul, splat(i32 24) + %trunc = trunc <8 x i32> %shift to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @sext_mul_and_shift31(<8 x i16> %a, <8 x i16> %b) { +; SSE-LABEL: sext_mul_and_shift31: +; SSE: # %bb.0: +; SSE-NEXT: pmulhw %xmm1, %xmm0 +; SSE-NEXT: psrlw $15, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sext_mul_and_shift31: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX-NEXT: retq + %a.ext = sext <8 x i16> %a to <8 x i32> + %b.ext = sext <8 x i16> %b to <8 x i32> + %mul = mul <8 x i32> %a.ext, %b.ext + %shift = lshr <8 x i32> %mul, splat(i32 31) + %trunc = trunc <8 x i32> %shift to <8 x i16> + ret <8 x i16> %trunc +} + +define <4 x i16> @sext_mulhw_v4i16_shift17(<4 x i16> %a, <4 x i16> %b) { +; SSE-LABEL: sext_mulhw_v4i16_shift17: +; SSE: # %bb.0: +; SSE-NEXT: pmulhw %xmm1, %xmm0 +; SSE-NEXT: psrlw $1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sext_mulhw_v4i16_shift17: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX-NEXT: retq + %a1 = sext <4 x i16> %a to <4 x i32> + %b1 = 
sext <4 x i16> %b to <4 x i32> + %c = mul <4 x i32> %a1, %b1 + %d = lshr <4 x i32> %c, splat (i32 17) + %e = trunc <4 x i32> %d to <4 x i16> + ret <4 x i16> %e +} + +define <4 x i16> @sext_mulhw_v4i16_shift24(<4 x i16> %a, <4 x i16> %b) { +; SSE-LABEL: sext_mulhw_v4i16_shift24: +; SSE: # %bb.0: +; SSE-NEXT: pmulhw %xmm1, %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sext_mulhw_v4i16_shift24: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: retq + %a1 = sext <4 x i16> %a to <4 x i32> + %b1 = sext <4 x i16> %b to <4 x i32> + %c = mul <4 x i32> %a1, %b1 + %d = lshr <4 x i32> %c, splat (i32 24) + %e = trunc <4 x i32> %d to <4 x i16> + ret <4 x i16> %e +} + +define <4 x i16> @sext_mulhw_v4i16_shift31(<4 x i16> %a, <4 x i16> %b) { +; SSE-LABEL: sext_mulhw_v4i16_shift31: +; SSE: # %bb.0: +; SSE-NEXT: pmulhw %xmm1, %xmm0 +; SSE-NEXT: psrlw $15, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sext_mulhw_v4i16_shift31: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX-NEXT: retq + %a1 = sext <4 x i16> %a to <4 x i32> + %b1 = sext <4 x i16> %b to <4 x i32> + %c = mul <4 x i32> %a1, %b1 + %d = lshr <4 x i32> %c, splat (i32 31) + %e = trunc <4 x i32> %d to <4 x i16> + ret <4 x i16> %e +} + +define <4 x i16> @and_mulhuw_v4i16_shift17(<4 x i64> %a, <4 x i64> %b) { +; SSE2-LABEL: and_mulhuw_v4i16_shift17: +; SSE2: # %bb.0: +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm3, %xmm0 +; SSE2-NEXT: pmulhuw %xmm2, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: and_mulhuw_v4i16_shift17: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: packusdw %xmm4, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm4, %xmm0 +; SSE41-NEXT: pmulhuw %xmm2, %xmm0 +; SSE41-NEXT: psrlw $1, %xmm0 +; SSE41-NEXT: retq +; +; AVX2-LABEL: and_mulhuw_v4i16_shift17: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: and_mulhuw_v4i16_shift17: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a1 = and <4 x i64> %a, <i64 65535, i64 65535, i64 65535, i64 65535> + %b1 = and <4 x i64> %b, <i64 65535, i64 65535, i64 65535, i64 65535> + %c = mul <4 x i64> %a1, %b1 + %d = lshr <4 x i64> %c, splat (i64 17) + %e = trunc <4 x i64> %d to <4 x i16> + ret <4 x i16> %e +} + +define <4 x i16> 
@and_mulhuw_v4i16_shift24(<4 x i64> %a, <4 x i64> %b) { +; SSE2-LABEL: and_mulhuw_v4i16_shift24: +; SSE2: # %bb.0: +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm3, %xmm0 +; SSE2-NEXT: pmulhuw %xmm2, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: and_mulhuw_v4i16_shift24: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: packusdw %xmm4, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm4, %xmm0 +; SSE41-NEXT: pmulhuw %xmm2, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: retq +; +; AVX2-LABEL: and_mulhuw_v4i16_shift24: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: and_mulhuw_v4i16_shift24: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a1 = and <4 x i64> %a, <i64 65535, i64 65535, i64 65535, i64 65535> + %b1 = and <4 x i64> %b, <i64 65535, i64 65535, i64 65535, i64 65535> + %c = mul <4 x i64> %a1, %b1 + %d = lshr <4 x i64> %c, splat (i64 24) + %e = trunc <4 x i64> %d to <4 x i16> + ret <4 x i16> %e +} + +define <4 x i16> @and_mulhuw_v4i16_shift31(<4 x i64> %a, <4 x i64> %b) { +; SSE2-LABEL: and_mulhuw_v4i16_shift31: +; SSE2: # %bb.0: +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm3, %xmm0 +; SSE2-NEXT: pmulhuw %xmm2, %xmm0 +; SSE2-NEXT: psrlw $15, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: and_mulhuw_v4i16_shift31: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: packusdw %xmm4, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm4, %xmm0 +; SSE41-NEXT: pmulhuw %xmm2, %xmm0 +; SSE41-NEXT: psrlw $15, %xmm0 +; SSE41-NEXT: retq +; +; AVX2-LABEL: and_mulhuw_v4i16_shift31: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: 
vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: and_mulhuw_v4i16_shift31: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a1 = and <4 x i64> %a, <i64 65535, i64 65535, i64 65535, i64 65535> + %b1 = and <4 x i64> %b, <i64 65535, i64 65535, i64 65535, i64 65535> + %c = mul <4 x i64> %a1, %b1 + %d = lshr <4 x i64> %c, splat (i64 31) + %e = trunc <4 x i64> %d to <4 x i16> + ret <4 x i16> %e +} + +define <8 x i16> @lshr_mulhuw_v8i16_shift17(<8 x i32> %a, <8 x i32> %b) { +; SSE2-LABEL: lshr_mulhuw_v8i16_shift17: +; SSE2: # %bb.0: +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pmulhuw %xmm2, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: lshr_mulhuw_v8i16_shift17: +; SSE41: # %bb.0: +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm3 +; SSE41-NEXT: psrld $16, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pmulhuw %xmm2, %xmm0 +; SSE41-NEXT: psrlw $1, %xmm0 +; SSE41-NEXT: retq +; +; AVX2-LABEL: lshr_mulhuw_v8i16_shift17: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: lshr_mulhuw_v8i16_shift17: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a1 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %b1 = lshr <8 x i32> %b, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %c = mul <8 x i32> %a1, %b1 + %d = lshr <8 x i32> %c, splat (i32 17) + %e = trunc <8 x i32> %d to <8 x i16> + ret <8 x i16> %e +} + +define <8 x i16> @lshr_mulhuw_v8i16_shift24(<8 x i32> %a, <8 x i32> %b) { +; SSE2-LABEL: lshr_mulhuw_v8i16_shift24: +; SSE2: # %bb.0: +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pmulhuw %xmm2, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: lshr_mulhuw_v8i16_shift24: +; SSE41: # %bb.0: +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm3 +; SSE41-NEXT: psrld $16, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pmulhuw %xmm2, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: retq +; +; AVX2-LABEL: lshr_mulhuw_v8i16_shift24: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; 
AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: lshr_mulhuw_v8i16_shift24: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a1 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %b1 = lshr <8 x i32> %b, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %c = mul <8 x i32> %a1, %b1 + %d = lshr <8 x i32> %c, splat (i32 24) + %e = trunc <8 x i32> %d to <8 x i16> + ret <8 x i16> %e +} + +define <8 x i16> @lshr_mulhuw_v8i16_shift31(<8 x i32> %a, <8 x i32> %b) { +; SSE2-LABEL: lshr_mulhuw_v8i16_shift31: +; SSE2: # %bb.0: +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pmulhuw %xmm2, %xmm0 +; SSE2-NEXT: psrlw $15, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: lshr_mulhuw_v8i16_shift31: +; SSE41: # %bb.0: +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm3 +; SSE41-NEXT: psrld $16, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pmulhuw %xmm2, %xmm0 +; SSE41-NEXT: psrlw $15, %xmm0 +; SSE41-NEXT: retq +; +; AVX2-LABEL: lshr_mulhuw_v8i16_shift31: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: lshr_mulhuw_v8i16_shift31: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a1 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %b1 = lshr <8 x i32> %b, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %c = mul <8 x i32> %a1, %b1 + %d = lshr <8 x i32> %c, splat (i32 31) + %e = trunc <8 x i32> %d to <8 x i16> + ret <8 x i16> %e +} + +define <16 x i16> @and_mulhuw_v16i16_shift17(<16 x i32> %a, <16 x i32> %b) { +; SSE2-LABEL: and_mulhuw_v16i16_shift17: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: packssdw %xmm7, %xmm6 +; SSE2-NEXT: pmulhw %xmm2, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm8 +; SSE2-NEXT: packssdw %xmm5, %xmm8 +; SSE2-NEXT: pmulhw %xmm8, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: and_mulhuw_v16i16_shift17: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd {{.*#+}} xmm8 = [32767,32767,32767,32767] +; SSE41-NEXT: pand %xmm8, %xmm3 +; SSE41-NEXT: pand %xmm8, %xmm2 +; 
SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm8, %xmm1 +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm8, %xmm7 +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: packusdw %xmm7, %xmm6 +; SSE41-NEXT: pmulhw %xmm2, %xmm6 +; SSE41-NEXT: pand %xmm8, %xmm5 +; SSE41-NEXT: pand %xmm4, %xmm8 +; SSE41-NEXT: packusdw %xmm5, %xmm8 +; SSE41-NEXT: pmulhw %xmm8, %xmm0 +; SSE41-NEXT: psrlw $1, %xmm0 +; SSE41-NEXT: psrlw $1, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm1 +; SSE41-NEXT: retq +; +; AVX2-LABEL: and_mulhuw_v16i16_shift17: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 +; AVX2-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: and_mulhuw_v16i16_shift17: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: and_mulhuw_v16i16_shift17: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandd %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq + %a1 = and <16 x i32> %a, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %b1 = and <16 x i32> %b, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %c = mul <16 x i32> %a1, %b1 + %d = lshr <16 x i32> %c, splat (i32 17) + %e = trunc <16 x i32> %d to <16 x i16> + ret <16 x i16> %e +} + +define <16 x i16> @and_mulhuw_v16i16_shift24(<16 x i32> %a, <16 x i32> %b) { +; SSE2-LABEL: and_mulhuw_v16i16_shift24: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: packssdw %xmm7, %xmm6 +; SSE2-NEXT: pmulhw %xmm2, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm8 +; SSE2-NEXT: packssdw %xmm5, %xmm8 +; SSE2-NEXT: pmulhw %xmm8, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: and_mulhuw_v16i16_shift24: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd {{.*#+}} xmm8 = [32767,32767,32767,32767] +; SSE41-NEXT: pand %xmm8, %xmm3 +; SSE41-NEXT: pand %xmm8, %xmm2 +; 
SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm8, %xmm1 +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm8, %xmm7 +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: packusdw %xmm7, %xmm6 +; SSE41-NEXT: pmulhw %xmm2, %xmm6 +; SSE41-NEXT: pand %xmm8, %xmm5 +; SSE41-NEXT: pand %xmm4, %xmm8 +; SSE41-NEXT: packusdw %xmm5, %xmm8 +; SSE41-NEXT: pmulhw %xmm8, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm1 +; SSE41-NEXT: retq +; +; AVX2-LABEL: and_mulhuw_v16i16_shift24: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 +; AVX2-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: and_mulhuw_v16i16_shift24: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: and_mulhuw_v16i16_shift24: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandd %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512BW-NEXT: retq + %a1 = and <16 x i32> %a, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %b1 = and <16 x i32> %b, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %c = mul <16 x i32> %a1, %b1 + %d = lshr <16 x i32> %c, splat (i32 24) + %e = trunc <16 x i32> %d to <16 x i16> + ret <16 x i16> %e +} + +define <16 x i16> @and_mulhuw_v16i16_shift31(<16 x i32> %a, <16 x i32> %b) { +; SSE-LABEL: and_mulhuw_v16i16_shift31: +; SSE: # %bb.0: +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: and_mulhuw_v16i16_shift31: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %a1 = and <16 x i32> %a, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %b1 = and <16 x i32> %b, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %c = mul <16 x i32> %a1, %b1 + %d = lshr <16 x i32> %c, splat (i32 31) + %e = trunc <16 x i32> %d to <16 x i16> + ret <16 x i16> %e +} diff --git a/llvm/test/CodeGen/X86/pr30284.ll b/llvm/test/CodeGen/X86/pr30284.ll index 
f4fb1b3..708f0f7 100644 --- a/llvm/test/CodeGen/X86/pr30284.ll +++ b/llvm/test/CodeGen/X86/pr30284.ll @@ -19,14 +19,12 @@ define void @f_f___un_3C_unf_3E_un_3C_unf_3E_(<16 x i1> %x) { ; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 ; CHECK-NEXT: vpslld $31, %zmm0, %zmm0 ; CHECK-NEXT: vpmovd2m %zmm0, %k1 -; CHECK-NEXT: vmovapd 0, %zmm0 -; CHECK-NEXT: vmovapd 64, %zmm1 -; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm2 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16] +; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16] +; CHECK-NEXT: vporq 64, %zmm0, %zmm1 +; CHECK-NEXT: vporq 0, %zmm0, %zmm0 ; CHECK-NEXT: kshiftrw $8, %k1, %k2 -; CHECK-NEXT: vorpd %zmm2, %zmm1, %zmm1 {%k2} -; CHECK-NEXT: vorpd %zmm2, %zmm0, %zmm0 {%k1} -; CHECK-NEXT: vmovapd %zmm0, 0 -; CHECK-NEXT: vmovapd %zmm1, 64 +; CHECK-NEXT: vmovdqa64 %zmm0, 0 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, 64 {%k2} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl %a_load22 = load <16 x i64>, ptr null, align 1 diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll index b633c28a..4124553 100644 --- a/llvm/test/CodeGen/X86/pr38539.ll +++ b/llvm/test/CodeGen/X86/pr38539.ll @@ -23,7 +23,7 @@ define void @f() nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $160, %esp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzbl (%eax), %eax diff --git a/llvm/test/CodeGen/X86/pr78897.ll b/llvm/test/CodeGen/X86/pr78897.ll index 4613c2b..db77baa 100644 --- a/llvm/test/CodeGen/X86/pr78897.ll +++ b/llvm/test/CodeGen/X86/pr78897.ll @@ -22,7 +22,7 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind { ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: pxor %xmm0, %xmm0 ; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] +; X86-SSE2-NEXT: movq {{.*#+}} xmm1 = [17,17,17,17,17,17,17,17,0,0,0,0,0,0,0,0] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm2, %esi diff --git a/llvm/test/CodeGen/X86/sibcall.ll b/llvm/test/CodeGen/X86/sibcall.ll index 4a0a68e..2759a98 100644 --- a/llvm/test/CodeGen/X86/sibcall.ll +++ b/llvm/test/CodeGen/X86/sibcall.ll @@ -444,21 +444,11 @@ define dso_local void @t15(ptr noalias sret(%struct.foo) %agg.result) nounwind ; ; X64-LABEL: t15: ; X64: # %bb.0: -; X64-NEXT: pushq %rbx -; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: callq f -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: popq %rbx -; X64-NEXT: retq +; X64-NEXT: jmp f # TAILCALL ; ; X32-LABEL: t15: ; X32: # %bb.0: -; X32-NEXT: pushq %rbx -; X32-NEXT: movq %rdi, %rbx -; X32-NEXT: callq f -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: popq %rbx -; X32-NEXT: retq +; X32-NEXT: jmp f # TAILCALL tail call fastcc void @f(ptr noalias sret(%struct.foo) %agg.result) nounwind ret void } @@ -607,32 +597,15 @@ declare dso_local fastcc double @foo20(double) nounwind define fastcc void @t21_sret_to_sret(ptr noalias sret(%struct.foo) %agg.result) nounwind { ; X86-LABEL: t21_sret_to_sret: ; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: calll t21_f_sret -; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $8, %esp -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-NEXT: jmp t21_f_sret # TAILCALL ; ; X64-LABEL: t21_sret_to_sret: ; X64: # %bb.0: -; X64-NEXT: pushq %rbx -; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: 
callq t21_f_sret -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: popq %rbx -; X64-NEXT: retq +; X64-NEXT: jmp t21_f_sret # TAILCALL ; ; X32-LABEL: t21_sret_to_sret: ; X32: # %bb.0: -; X32-NEXT: pushq %rbx -; X32-NEXT: movq %rdi, %rbx -; X32-NEXT: callq t21_f_sret -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: popq %rbx -; X32-NEXT: retq +; X32-NEXT: jmp t21_f_sret # TAILCALL tail call fastcc void @t21_f_sret(ptr noalias sret(%struct.foo) %agg.result) nounwind ret void } @@ -640,34 +613,15 @@ define fastcc void @t21_sret_to_sret(ptr noalias sret(%struct.foo) %agg.result) define fastcc void @t21_sret_to_sret_more_args(ptr noalias sret(%struct.foo) %agg.result, i32 %a, i32 %b) nounwind { ; X86-LABEL: t21_sret_to_sret_more_args: ; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, (%esp) -; X86-NEXT: calll f_sret@PLT -; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $8, %esp -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-NEXT: jmp f_sret@PLT # TAILCALL ; ; X64-LABEL: t21_sret_to_sret_more_args: ; X64: # %bb.0: -; X64-NEXT: pushq %rbx -; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: callq f_sret@PLT -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: popq %rbx -; X64-NEXT: retq +; X64-NEXT: jmp f_sret@PLT # TAILCALL ; ; X32-LABEL: t21_sret_to_sret_more_args: ; X32: # %bb.0: -; X32-NEXT: pushq %rbx -; X32-NEXT: movq %rdi, %rbx -; X32-NEXT: callq f_sret@PLT -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: popq %rbx -; X32-NEXT: retq +; X32-NEXT: jmp f_sret@PLT # TAILCALL tail call fastcc void @f_sret(ptr noalias sret(%struct.foo) %agg.result, i32 %a, i32 %b) nounwind ret void } @@ -675,35 +629,18 @@ define fastcc void @t21_sret_to_sret_more_args(ptr noalias sret(%struct.foo) %ag define fastcc void @t21_sret_to_sret_second_arg_sret(ptr noalias %agg.result, ptr noalias sret(%struct.foo) %ret) nounwind { ; X86-LABEL: t21_sret_to_sret_second_arg_sret: ; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: calll t21_f_sret -; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $8, %esp -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-NEXT: jmp t21_f_sret # TAILCALL ; ; X64-LABEL: t21_sret_to_sret_second_arg_sret: ; X64: # %bb.0: -; X64-NEXT: pushq %rbx -; X64-NEXT: movq %rsi, %rbx ; X64-NEXT: movq %rsi, %rdi -; X64-NEXT: callq t21_f_sret -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: popq %rbx -; X64-NEXT: retq +; X64-NEXT: jmp t21_f_sret # TAILCALL ; ; X32-LABEL: t21_sret_to_sret_second_arg_sret: ; X32: # %bb.0: -; X32-NEXT: pushq %rbx -; X32-NEXT: movq %rsi, %rbx ; X32-NEXT: movq %rsi, %rdi -; X32-NEXT: callq t21_f_sret -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: popq %rbx -; X32-NEXT: retq +; X32-NEXT: jmp t21_f_sret # TAILCALL tail call fastcc void @t21_f_sret(ptr noalias sret(%struct.foo) %ret) nounwind ret void } @@ -725,27 +662,17 @@ define fastcc void @t21_sret_to_sret_more_args2(ptr noalias sret(%struct.foo) %a ; ; X64-LABEL: t21_sret_to_sret_more_args2: ; X64: # %bb.0: -; X64-NEXT: pushq %rbx ; X64-NEXT: movl %esi, %eax -; X64-NEXT: movq %rdi, %rbx ; X64-NEXT: movl %edx, %esi ; X64-NEXT: movl %eax, %edx -; X64-NEXT: callq f_sret@PLT -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: popq %rbx -; X64-NEXT: retq +; X64-NEXT: jmp f_sret@PLT # TAILCALL ; ; X32-LABEL: t21_sret_to_sret_more_args2: ; X32: # %bb.0: -; X32-NEXT: pushq %rbx ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movq %rdi, %rbx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %edx -; X32-NEXT: callq 
f_sret@PLT -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: popq %rbx -; X32-NEXT: retq +; X32-NEXT: jmp f_sret@PLT # TAILCALL tail call fastcc void @f_sret(ptr noalias sret(%struct.foo) %agg.result, i32 %b, i32 %a) nounwind ret void } @@ -977,6 +904,176 @@ define ccc void @t22_non_sret_to_sret(ptr %agg.result) nounwind { ret void } +; Not tailcallable, caller and callee have different return types. +define void @t23_sret_to_non_sret(ptr noalias sret(%struct.foo) align 4 %agg.result, ptr %arg) { +; X86-LABEL: t23_sret_to_non_sret: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: subl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll callee_1@PLT +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl $4 +; +; X64-LABEL: t23_sret_to_non_sret: +; X64: # %bb.0: +; X64-NEXT: pushq %rbx +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: .cfi_offset %rbx, -16 +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: movq %rsi, %rdi +; X64-NEXT: callq callee_1@PLT +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: popq %rbx +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq +; +; X32-LABEL: t23_sret_to_non_sret: +; X32: # %bb.0: +; X32-NEXT: pushq %rbx +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %rbx, -16 +; X32-NEXT: movq %rdi, %rbx +; X32-NEXT: movq %rsi, %rdi +; X32-NEXT: callq callee_1@PLT +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: popq %rbx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: retq + tail call void @callee_1(ptr %arg) + ret void +} + +; Not tailcallable, caller and callee have the same return type, but different return values. 
+define void @t24_sret_to_sret_different_val(ptr noalias sret(%struct.foo) align 4 %agg.result, ptr %arg) { +; X86-LABEL: t24_sret_to_sret_different_val: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: subl $24, %esp +; X86-NEXT: .cfi_def_cfa_offset 32 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorps %xmm0, %xmm0 +; X86-NEXT: movsd %xmm0, 8(%esi) +; X86-NEXT: movsd %xmm0, (%esi) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll callee_2@PLT +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $24, %esp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl $4 +; +; X64-LABEL: t24_sret_to_sret_different_val: +; X64: # %bb.0: +; X64-NEXT: pushq %rbx +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: subq $16, %rsp +; X64-NEXT: .cfi_def_cfa_offset 32 +; X64-NEXT: .cfi_offset %rbx, -16 +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: movq $0, 8(%rdi) +; X64-NEXT: movq $0, (%rdi) +; X64-NEXT: movq %rsp, %rdi +; X64-NEXT: callq callee_2@PLT +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: addq $16, %rsp +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: popq %rbx +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq +; +; X32-LABEL: t24_sret_to_sret_different_val: +; X32: # %bb.0: +; X32-NEXT: pushq %rbx +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: subl $16, %esp +; X32-NEXT: .cfi_def_cfa_offset 32 +; X32-NEXT: .cfi_offset %rbx, -16 +; X32-NEXT: movq %rdi, %rbx +; X32-NEXT: movq $0, 8(%ebx) +; X32-NEXT: movq $0, (%ebx) +; X32-NEXT: movl %esp, %edi +; X32-NEXT: callq callee_2@PLT +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: addl $16, %esp +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: popq %rbx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: retq + %callee.return = alloca %struct.foo, align 4 + tail call void @llvm.memset.p0.i64(ptr align 4 %agg.result, i8 0, i64 16, i1 false) + tail call void @callee_2(ptr sret(%struct.foo) align 4 %callee.return) + ret void +} + +; Not tailcallable, caller and callee have the same return type, but different return values. 
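+; In this variant the callee's sret slot is the separate %arg pointer rather than %agg.result, so the call again cannot be folded into a tail call.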
+define void @t25_sret_to_sret_different_val(ptr noalias sret(%struct.foo) align 8 %agg.result, ptr %arg) { +; X86-LABEL: t25_sret_to_sret_different_val: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: subl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll callee_2@PLT +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl $4 +; +; X64-LABEL: t25_sret_to_sret_different_val: +; X64: # %bb.0: +; X64-NEXT: pushq %rbx +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: .cfi_offset %rbx, -16 +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: movq %rsi, %rdi +; X64-NEXT: callq callee_2@PLT +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: popq %rbx +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq +; +; X32-LABEL: t25_sret_to_sret_different_val: +; X32: # %bb.0: +; X32-NEXT: pushq %rbx +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %rbx, -16 +; X32-NEXT: movq %rdi, %rbx +; X32-NEXT: movq %rsi, %rdi +; X32-NEXT: callq callee_2@PLT +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: popq %rbx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: retq + tail call void @callee_2(ptr sret(%struct.foo) align 8 %arg) + ret void +} + +declare void @llvm.memset.p0.i64(ptr, i8, i64, i1) +declare void @callee_1(ptr) +declare void @callee_2(ptr noalias sret(%struct.foo)) + declare dso_local fastcc void @t21_f_sret(ptr noalias sret(%struct.foo)) nounwind declare dso_local fastcc void @t21_f_sret2(ptr noalias sret(%struct.foo), ptr noalias) nounwind declare dso_local fastcc void @t21_f_non_sret(ptr) nounwind diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll index 5cd604c..a260b32 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll @@ -410,34 +410,34 @@ define <4 x float> @v4f32_estimate(<4 x float> %x) #1 { define <4 x float> @v4f32_estimate2(<4 x float> %x) #5 { ; SSE-LABEL: v4f32_estimate2: ; SSE: # %bb.0: -; SSE-NEXT: rsqrtps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm0, %xmm2 -; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] -; SSE-NEXT: cmpleps %xmm0, %xmm1 -; SSE-NEXT: andps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; SSE-NEXT: andps %xmm0, %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; SSE-NEXT: cmpleps %xmm1, %xmm2 +; SSE-NEXT: rsqrtps %xmm0, %xmm1 +; SSE-NEXT: mulps %xmm1, %xmm0 +; SSE-NEXT: andps %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: v4f32_estimate2: ; AVX1: # %bb.0: -; AVX1-NEXT: vrsqrtps %xmm0, %xmm1 -; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] -; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vcmpleps %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vrsqrtps %xmm0, %xmm2 +; AVX1-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: v4f32_estimate2: ; AVX512: 
# %bb.0: -; AVX512-NEXT: vrsqrtps %xmm0, %xmm1 -; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX512-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] -; AVX512-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vcmpleps %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vrsqrtps %xmm0, %xmm2 +; AVX512-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vandps %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %sqrt = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) ret <4 x float> %sqrt diff --git a/llvm/test/CodeGen/X86/trunc-nsw-nuw.ll b/llvm/test/CodeGen/X86/trunc-nsw-nuw.ll index 5c5f704..6b07891 100644 --- a/llvm/test/CodeGen/X86/trunc-nsw-nuw.ll +++ b/llvm/test/CodeGen/X86/trunc-nsw-nuw.ll @@ -62,10 +62,11 @@ entry: define i32 @simplify_demanded_bits_drop_flag(i1 zeroext %x, i1 zeroext %y) nounwind { ; CHECK-LABEL: simplify_demanded_bits_drop_flag: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: negl %edi +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi ; CHECK-NEXT: shll $2, %esi -; CHECK-NEXT: xorl %edi, %esi -; CHECK-NEXT: movslq %esi, %rax +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: negq %rax +; CHECK-NEXT: xorq %rsi, %rax ; CHECK-NEXT: imulq $-1634202141, %rax, %rax # imm = 0x9E980DE3 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $63, %rcx diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll index ebb5e13..b8e83da 100644 --- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -281,7 +281,7 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-AVX2-NEXT: vpsllvd %ymm1, %ymm2, %ymm2 -; X64-AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; X64-AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; X64-AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; X64-AVX2-NEXT: vpsrlvd %ymm1, %ymm3, %ymm1 diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll index 7f4111e..6174011 100644 --- a/llvm/test/CodeGen/X86/var-permute-128.ll +++ b/llvm/test/CodeGen/X86/var-permute-128.ll @@ -501,39 +501,39 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw ; SSE3-NEXT: pextrw $0, %xmm1, %eax ; SSE3-NEXT: pextrw $1, %xmm1, %ecx ; SSE3-NEXT: pextrw $2, %xmm1, %edx -; SSE3-NEXT: pextrw $3, %xmm1, %esi -; SSE3-NEXT: pextrw $4, %xmm1, %edi -; SSE3-NEXT: pextrw $5, %xmm1, %r8d -; SSE3-NEXT: pextrw $6, %xmm1, %r9d -; SSE3-NEXT: pextrw $7, %xmm1, %r10d +; SSE3-NEXT: pextrw $3, %xmm1, %edi +; SSE3-NEXT: pextrw $4, %xmm1, %r8d +; SSE3-NEXT: pextrw $5, %xmm1, %r9d +; SSE3-NEXT: pextrw $6, %xmm1, %r10d +; SSE3-NEXT: pextrw $7, %xmm1, %esi ; SSE3-NEXT: movdqa %xmm2, -24(%rsp) ; SSE3-NEXT: andl $7, 
%eax +; SSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax ; SSE3-NEXT: andl $7, %ecx +; SSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx ; SSE3-NEXT: andl $7, %edx -; SSE3-NEXT: andl $7, %esi +; SSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx ; SSE3-NEXT: andl $7, %edi +; SSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi ; SSE3-NEXT: andl $7, %r8d +; SSE3-NEXT: movzwl -24(%rsp,%r8,2), %r8d ; SSE3-NEXT: andl $7, %r9d +; SSE3-NEXT: movzwl -24(%rsp,%r9,2), %r9d ; SSE3-NEXT: andl $7, %r10d ; SSE3-NEXT: movzwl -24(%rsp,%r10,2), %r10d -; SSE3-NEXT: movd %r10d, %xmm1 -; SSE3-NEXT: movzwl -24(%rsp,%r9,2), %r9d -; SSE3-NEXT: movd %r9d, %xmm2 +; SSE3-NEXT: andl $7, %esi +; SSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi +; SSE3-NEXT: movd %esi, %xmm1 +; SSE3-NEXT: movd %r10d, %xmm2 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE3-NEXT: movzwl -24(%rsp,%r8,2), %r8d -; SSE3-NEXT: movd %r8d, %xmm1 -; SSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi -; SSE3-NEXT: movd %edi, %xmm3 +; SSE3-NEXT: movd %r9d, %xmm1 +; SSE3-NEXT: movd %r8d, %xmm3 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi -; SSE3-NEXT: movd %esi, %xmm1 -; SSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx +; SSE3-NEXT: movd %edi, %xmm1 ; SSE3-NEXT: movd %edx, %xmm2 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx ; SSE3-NEXT: movd %ecx, %xmm1 -; SSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax ; SSE3-NEXT: movd %eax, %xmm4 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] @@ -1053,8 +1053,9 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices) ; SSE3-NEXT: movq %xmm1, %rcx ; SSE3-NEXT: andl $1, %ecx ; SSE3-NEXT: movaps %xmm0, -24(%rsp) -; SSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero -; SSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1] +; SSE3-NEXT: movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero +; SSE3-NEXT: movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE3-NEXT: pandn %xmm0, %xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 ; SSE3-NEXT: retq @@ -1077,8 +1078,9 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices) ; SSSE3-NEXT: movq %xmm1, %rcx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movaps %xmm0, -24(%rsp) -; SSSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero -; SSSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1] +; SSSE3-NEXT: movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero +; SSSE3-NEXT: movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: pandn %xmm0, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq @@ -1251,16 +1253,16 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n ; SSE3-NEXT: movd %xmm1, %esi ; SSE3-NEXT: movaps %xmm2, -24(%rsp) ; SSE3-NEXT: andl $3, %eax +; SSE3-NEXT: movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero ; SSE3-NEXT: andl $3, %ecx +; SSE3-NEXT: movd -24(%rsp,%rcx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero ; SSE3-NEXT: andl $3, %edx +; SSE3-NEXT: movd -24(%rsp,%rdx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero ; SSE3-NEXT: andl $3, %esi 
-; SSE3-NEXT: movd -24(%rsp,%rsi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero -; SSE3-NEXT: movd -24(%rsp,%rdx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero -; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE3-NEXT: movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero -; SSE3-NEXT: movd -24(%rsp,%rcx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE3-NEXT: movd -24(%rsp,%rsi,4), %xmm4 # xmm4 = mem[0],zero,zero,zero +; SSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE3-NEXT: pandn %xmm1, %xmm0 ; SSE3-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll index 11e7fe85..0e37e5a 100644 --- a/llvm/test/CodeGen/X86/vector-bo-select.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select.ll @@ -468,29 +468,29 @@ define <16 x float> @fsub_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 ; ; SSE42-LABEL: fsub_v16f32_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm8 ; SSE42-NEXT: psrad $31, %xmm8 -; SSE42-NEXT: pandn %xmm7, %xmm8 +; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm9 +; SSE42-NEXT: psrad $31, %xmm9 +; SSE42-NEXT: pandn %xmm7, %xmm9 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm7 ; SSE42-NEXT: psrad $31, %xmm7 ; SSE42-NEXT: pandn %xmm6, %xmm7 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm6 -; SSE42-NEXT: psrad $31, %xmm6 -; SSE42-NEXT: pandn %xmm5, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: psrad $31, %xmm0 -; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0 -; SSE42-NEXT: subps %xmm6, %xmm1 +; SSE42-NEXT: pandn %xmm5, %xmm0 +; SSE42-NEXT: subps %xmm0, %xmm1 ; SSE42-NEXT: subps %xmm7, %xmm2 -; SSE42-NEXT: subps %xmm8, %xmm3 -; SSE42-NEXT: subps %xmm0, %xmm4 +; SSE42-NEXT: subps %xmm9, %xmm3 +; SSE42-NEXT: subps %xmm8, %xmm4 ; SSE42-NEXT: movaps %xmm1, %xmm0 ; SSE42-NEXT: movaps %xmm2, %xmm1 ; SSE42-NEXT: movaps %xmm3, %xmm2 @@ -562,33 +562,32 @@ define <16 x float> @fsub_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef ; ; SSE42-LABEL: fsub_v16f32_commute_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: movaps %xmm2, %xmm8 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm2 -; SSE42-NEXT: psrad $31, %xmm2 -; SSE42-NEXT: pandn %xmm7, %xmm2 +; SSE42-NEXT: movaps %xmm3, %xmm8 +; 
SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm3 +; SSE42-NEXT: psrad $31, %xmm3 +; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3 +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm9 +; SSE42-NEXT: psrad $31, %xmm9 +; SSE42-NEXT: pandn %xmm7, %xmm9 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm7 ; SSE42-NEXT: psrad $31, %xmm7 ; SSE42-NEXT: pandn %xmm6, %xmm7 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm6 -; SSE42-NEXT: psrad $31, %xmm6 -; SSE42-NEXT: pandn %xmm5, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm5 -; SSE42-NEXT: psrad $31, %xmm5 -; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 -; SSE42-NEXT: subps %xmm1, %xmm6 -; SSE42-NEXT: subps %xmm8, %xmm7 -; SSE42-NEXT: subps %xmm3, %xmm2 -; SSE42-NEXT: subps %xmm4, %xmm5 -; SSE42-NEXT: movaps %xmm6, %xmm0 +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm0 +; SSE42-NEXT: psrad $31, %xmm0 +; SSE42-NEXT: pandn %xmm5, %xmm0 +; SSE42-NEXT: subps %xmm1, %xmm0 +; SSE42-NEXT: subps %xmm2, %xmm7 +; SSE42-NEXT: subps %xmm8, %xmm9 +; SSE42-NEXT: subps %xmm4, %xmm3 ; SSE42-NEXT: movaps %xmm7, %xmm1 -; SSE42-NEXT: movaps %xmm5, %xmm3 +; SSE42-NEXT: movaps %xmm9, %xmm2 ; SSE42-NEXT: retq ; ; AVX2-LABEL: fsub_v16f32_commute_swap: @@ -2407,29 +2406,29 @@ define <16 x i32> @sub_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i3 ; ; SSE42-LABEL: sub_v16i32_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm8 ; SSE42-NEXT: psrad $31, %xmm8 -; SSE42-NEXT: pandn %xmm7, %xmm8 +; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm9 +; SSE42-NEXT: psrad $31, %xmm9 +; SSE42-NEXT: pandn %xmm7, %xmm9 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm7 ; SSE42-NEXT: psrad $31, %xmm7 ; SSE42-NEXT: pandn %xmm6, %xmm7 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm6 -; SSE42-NEXT: psrad $31, %xmm6 -; SSE42-NEXT: pandn %xmm5, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: psrad $31, %xmm0 -; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0 -; SSE42-NEXT: psubd %xmm6, %xmm1 +; SSE42-NEXT: pandn %xmm5, %xmm0 +; SSE42-NEXT: psubd %xmm0, %xmm1 ; SSE42-NEXT: psubd %xmm7, %xmm2 -; SSE42-NEXT: psubd %xmm8, %xmm3 -; SSE42-NEXT: psubd %xmm0, %xmm4 +; SSE42-NEXT: psubd %xmm9, %xmm3 +; SSE42-NEXT: psubd %xmm8, %xmm4 ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: movdqa %xmm2, %xmm1 ; SSE42-NEXT: movdqa %xmm3, %xmm2 @@ -2501,33 +2500,32 @@ define <16 x i32> @sub_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; ; SSE42-LABEL: sub_v16i32_commute_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa %xmm2, %xmm8 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm2 -; SSE42-NEXT: psrad $31, %xmm2 -; SSE42-NEXT: pandn %xmm7, %xmm2 +; SSE42-NEXT: movdqa %xmm3, %xmm8 +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm3 +; SSE42-NEXT: psrad $31, %xmm3 +; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3 +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm9 +; SSE42-NEXT: psrad $31, %xmm9 +; SSE42-NEXT: pandn %xmm7, %xmm9 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm7 ; SSE42-NEXT: psrad $31, %xmm7 ; SSE42-NEXT: pandn %xmm6, %xmm7 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm6 -; SSE42-NEXT: psrad $31, %xmm6 -; SSE42-NEXT: pandn %xmm5, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm5 -; SSE42-NEXT: psrad $31, %xmm5 -; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 -; SSE42-NEXT: psubd %xmm1, %xmm6 -; SSE42-NEXT: psubd %xmm8, %xmm7 -; SSE42-NEXT: psubd %xmm3, %xmm2 -; SSE42-NEXT: psubd %xmm4, %xmm5 -; SSE42-NEXT: movdqa %xmm6, %xmm0 +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm0 +; SSE42-NEXT: psrad $31, %xmm0 +; SSE42-NEXT: pandn %xmm5, %xmm0 +; SSE42-NEXT: psubd %xmm1, %xmm0 +; SSE42-NEXT: psubd %xmm2, %xmm7 +; SSE42-NEXT: psubd %xmm8, %xmm9 +; SSE42-NEXT: psubd %xmm4, %xmm3 ; SSE42-NEXT: movdqa %xmm7, %xmm1 -; SSE42-NEXT: movdqa %xmm5, %xmm3 +; SSE42-NEXT: movdqa %xmm9, %xmm2 ; SSE42-NEXT: retq ; ; AVX2-LABEL: sub_v16i32_commute_swap: @@ -3371,41 +3369,41 @@ define <16 x i32> @shl_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i3 ; ; SSE42-LABEL: shl_v16i32_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = 
xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm8 ; SSE42-NEXT: psrad $31, %xmm8 -; SSE42-NEXT: pandn %xmm7, %xmm8 +; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm9 +; SSE42-NEXT: psrad $31, %xmm9 +; SSE42-NEXT: pandn %xmm7, %xmm9 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm7 ; SSE42-NEXT: psrad $31, %xmm7 ; SSE42-NEXT: pandn %xmm6, %xmm7 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm6 -; SSE42-NEXT: psrad $31, %xmm6 -; SSE42-NEXT: pandn %xmm5, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm5 -; SSE42-NEXT: psrad $31, %xmm5 -; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 -; SSE42-NEXT: pslld $23, %xmm6 -; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [1065353216,1065353216,1065353216,1065353216] -; SSE42-NEXT: paddd %xmm9, %xmm6 -; SSE42-NEXT: cvttps2dq %xmm6, %xmm0 +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm0 +; SSE42-NEXT: psrad $31, %xmm0 +; SSE42-NEXT: pandn %xmm5, %xmm0 +; SSE42-NEXT: pslld $23, %xmm0 +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] +; SSE42-NEXT: paddd %xmm5, %xmm0 +; SSE42-NEXT: cvttps2dq %xmm0, %xmm0 ; SSE42-NEXT: pmulld %xmm1, %xmm0 ; SSE42-NEXT: pslld $23, %xmm7 -; SSE42-NEXT: paddd %xmm9, %xmm7 +; SSE42-NEXT: paddd %xmm5, %xmm7 ; SSE42-NEXT: cvttps2dq %xmm7, %xmm1 ; SSE42-NEXT: pmulld %xmm2, %xmm1 -; SSE42-NEXT: pslld $23, %xmm8 -; SSE42-NEXT: paddd %xmm9, %xmm8 -; SSE42-NEXT: cvttps2dq %xmm8, %xmm2 +; SSE42-NEXT: pslld $23, %xmm9 +; SSE42-NEXT: paddd %xmm5, %xmm9 +; SSE42-NEXT: cvttps2dq %xmm9, %xmm2 ; SSE42-NEXT: pmulld %xmm3, %xmm2 -; SSE42-NEXT: pslld $23, %xmm5 -; SSE42-NEXT: paddd %xmm9, %xmm5 -; SSE42-NEXT: cvttps2dq %xmm5, %xmm3 +; SSE42-NEXT: pslld $23, %xmm8 +; SSE42-NEXT: paddd %xmm5, %xmm8 +; SSE42-NEXT: cvttps2dq %xmm8, %xmm3 ; SSE42-NEXT: pmulld %xmm4, %xmm3 ; SSE42-NEXT: retq ; @@ -3508,11 +3506,16 @@ define <16 x i32> @shl_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; ; SSE42-LABEL: shl_v16i32_commute_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm8 ; SSE42-NEXT: psrad $31, %xmm8 -; SSE42-NEXT: pandn %xmm7, %xmm8 +; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm9 +; SSE42-NEXT: psrad $31, %xmm9 +; SSE42-NEXT: pandn %xmm7, %xmm9 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = 
xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm7 @@ -3522,28 +3525,23 @@ define <16 x i32> @shl_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; SSE42-NEXT: pslld $31, %xmm6 ; SSE42-NEXT: psrad $31, %xmm6 ; SSE42-NEXT: pandn %xmm5, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm5 -; SSE42-NEXT: psrad $31, %xmm5 -; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 ; SSE42-NEXT: pslld $23, %xmm1 -; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [1065353216,1065353216,1065353216,1065353216] -; SSE42-NEXT: paddd %xmm9, %xmm1 +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] +; SSE42-NEXT: paddd %xmm5, %xmm1 ; SSE42-NEXT: cvttps2dq %xmm1, %xmm0 ; SSE42-NEXT: pmulld %xmm6, %xmm0 ; SSE42-NEXT: pslld $23, %xmm2 -; SSE42-NEXT: paddd %xmm9, %xmm2 +; SSE42-NEXT: paddd %xmm5, %xmm2 ; SSE42-NEXT: cvttps2dq %xmm2, %xmm1 ; SSE42-NEXT: pmulld %xmm7, %xmm1 ; SSE42-NEXT: pslld $23, %xmm3 -; SSE42-NEXT: paddd %xmm9, %xmm3 +; SSE42-NEXT: paddd %xmm5, %xmm3 ; SSE42-NEXT: cvttps2dq %xmm3, %xmm2 -; SSE42-NEXT: pmulld %xmm8, %xmm2 +; SSE42-NEXT: pmulld %xmm9, %xmm2 ; SSE42-NEXT: pslld $23, %xmm4 -; SSE42-NEXT: paddd %xmm9, %xmm4 +; SSE42-NEXT: paddd %xmm5, %xmm4 ; SSE42-NEXT: cvttps2dq %xmm4, %xmm3 -; SSE42-NEXT: pmulld %xmm5, %xmm3 +; SSE42-NEXT: pmulld %xmm8, %xmm3 ; SSE42-NEXT: retq ; ; AVX2-LABEL: shl_v16i32_commute_swap: @@ -4078,85 +4076,85 @@ define <16 x i32> @lshr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i ; ; SSE42-LABEL: lshr_v16i32_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm8 ; SSE42-NEXT: psrad $31, %xmm8 -; SSE42-NEXT: pandn %xmm7, %xmm8 +; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm9 +; SSE42-NEXT: psrad $31, %xmm9 +; SSE42-NEXT: pandn %xmm7, %xmm9 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm7 ; SSE42-NEXT: psrad $31, %xmm7 ; SSE42-NEXT: pandn %xmm6, %xmm7 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm6 -; SSE42-NEXT: psrad $31, %xmm6 -; SSE42-NEXT: pandn %xmm5, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm5 -; SSE42-NEXT: psrad $31, %xmm5 -; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 -; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm9 -; SSE42-NEXT: psrld %xmm0, %xmm9 -; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm0 +; SSE42-NEXT: psrad $31, %xmm0 +; SSE42-NEXT: pandn %xmm5, %xmm0 +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm6 +; SSE42-NEXT: psrld %xmm5, %xmm6 +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm11 -; SSE42-NEXT: psrld %xmm0, %xmm11 -; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm9[0,1,2,3],xmm11[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,1,4,5,6,7] +; SSE42-NEXT: psrld %xmm10, %xmm11 +; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm6[0,1,2,3],xmm11[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: psrld %xmm6, %xmm0 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,1,1,4,5,6,7] -; SSE42-NEXT: psrld %xmm6, %xmm1 +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] +; SSE42-NEXT: psrld %xmm5, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm2, %xmm6 -; SSE42-NEXT: psrld %xmm1, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm2, %xmm5 +; SSE42-NEXT: psrld %xmm1, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm10 ; SSE42-NEXT: psrld %xmm1, %xmm10 -; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm10[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm5[0,1,2,3],xmm10[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm1 -; SSE42-NEXT: psrld %xmm6, %xmm1 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7] -; SSE42-NEXT: psrld %xmm6, %xmm2 +; SSE42-NEXT: psrld %xmm5, %xmm1 +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,1,1,1,4,5,6,7] +; SSE42-NEXT: psrld %xmm5, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3],xmm1[4,5],xmm10[6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm3, %xmm6 -; SSE42-NEXT: psrld %xmm2, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm3, %xmm9 -; SSE42-NEXT: psrld %xmm2, %xmm9 -; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm6[0,1,2,3],xmm9[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm3, %xmm5 +; SSE42-NEXT: psrld %xmm2, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm3, %xmm7 +; SSE42-NEXT: psrld %xmm2, %xmm7 +; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm3, %xmm2 -; SSE42-NEXT: psrld %xmm6, %xmm2 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] -; SSE42-NEXT: psrld %xmm6, %xmm3 +; SSE42-NEXT: psrld %xmm5, %xmm2 +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,1,1,1,4,5,6,7] +; SSE42-NEXT: psrld 
%xmm5, %xmm3 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5],xmm9[6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm4, %xmm6 -; SSE42-NEXT: psrld %xmm3, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm4, %xmm8 -; SSE42-NEXT: psrld %xmm3, %xmm8 -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3],xmm2[4,5],xmm7[6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm4, %xmm5 +; SSE42-NEXT: psrld %xmm3, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm8[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm4, %xmm7 +; SSE42-NEXT: psrld %xmm3, %xmm7 +; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm4, %xmm3 ; SSE42-NEXT: psrld %xmm5, %xmm3 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm5, %xmm4 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3],xmm3[4,5],xmm8[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3],xmm3[4,5],xmm7[6,7] ; SSE42-NEXT: retq ; ; AVX2-LABEL: lshr_v16i32_swap: @@ -4280,74 +4278,73 @@ define <16 x i32> @lshr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; ; SSE42-LABEL: lshr_v16i32_commute_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa %xmm3, %xmm10 -; SSE42-NEXT: movdqa %xmm2, %xmm9 -; SSE42-NEXT: movdqa %xmm1, %xmm8 -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: movdqa %xmm3, %xmm8 +; SSE42-NEXT: movdqa %xmm2, %xmm10 +; SSE42-NEXT: movdqa %xmm1, %xmm9 +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm3 +; SSE42-NEXT: psrad $31, %xmm3 +; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3 +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm2 ; SSE42-NEXT: psrad $31, %xmm2 ; SSE42-NEXT: pandn %xmm7, %xmm2 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm1 ; SSE42-NEXT: psrad $31, %xmm1 ; SSE42-NEXT: pandn %xmm6, %xmm1 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; 
SSE42-NEXT: psrad $31, %xmm0 ; SSE42-NEXT: pandn %xmm5, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm3 -; SSE42-NEXT: psrad $31, %xmm3 -; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm7 ; SSE42-NEXT: psrld %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm11 ; SSE42-NEXT: psrld %xmm6, %xmm11 ; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm7[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm6 ; SSE42-NEXT: psrld %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm5, %xmm0 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm7 ; SSE42-NEXT: psrld %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm8 -; SSE42-NEXT: psrld %xmm6, %xmm8 -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm9 +; SSE42-NEXT: psrld %xmm6, %xmm9 +; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm7[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm6 ; SSE42-NEXT: psrld %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm5, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5],xmm9[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm7 ; SSE42-NEXT: psrld %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm2, %xmm8 -; SSE42-NEXT: psrld %xmm6, %xmm8 -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm2, %xmm9 +; SSE42-NEXT: psrld %xmm6, %xmm9 +; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm7[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm6 ; SSE42-NEXT: psrld %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm5, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5],xmm8[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5],xmm9[6,7] ; SSE42-NEXT: 
pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm3, %xmm7 @@ -4929,85 +4926,85 @@ define <16 x i32> @ashr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i ; ; SSE42-LABEL: ashr_v16i32_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm8 ; SSE42-NEXT: psrad $31, %xmm8 -; SSE42-NEXT: pandn %xmm7, %xmm8 +; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm9 +; SSE42-NEXT: psrad $31, %xmm9 +; SSE42-NEXT: pandn %xmm7, %xmm9 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm7 ; SSE42-NEXT: psrad $31, %xmm7 ; SSE42-NEXT: pandn %xmm6, %xmm7 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm6 -; SSE42-NEXT: psrad $31, %xmm6 -; SSE42-NEXT: pandn %xmm5, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm5 -; SSE42-NEXT: psrad $31, %xmm5 -; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 -; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm9 -; SSE42-NEXT: psrad %xmm0, %xmm9 -; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm0 +; SSE42-NEXT: psrad $31, %xmm0 +; SSE42-NEXT: pandn %xmm5, %xmm0 +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm6 +; SSE42-NEXT: psrad %xmm5, %xmm6 +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm11 -; SSE42-NEXT: psrad %xmm0, %xmm11 -; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm9[0,1,2,3],xmm11[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,1,4,5,6,7] +; SSE42-NEXT: psrad %xmm10, %xmm11 +; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm6[0,1,2,3],xmm11[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: psrad %xmm6, %xmm0 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,1,1,4,5,6,7] -; SSE42-NEXT: psrad %xmm6, %xmm1 +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] +; SSE42-NEXT: psrad %xmm5, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm2, %xmm6 -; SSE42-NEXT: psrad %xmm1, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm2, %xmm5 +; SSE42-NEXT: psrad %xmm1, %xmm5 +; 
SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm10 ; SSE42-NEXT: psrad %xmm1, %xmm10 -; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm10[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm5[0,1,2,3],xmm10[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm1 -; SSE42-NEXT: psrad %xmm6, %xmm1 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7] -; SSE42-NEXT: psrad %xmm6, %xmm2 +; SSE42-NEXT: psrad %xmm5, %xmm1 +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,1,1,1,4,5,6,7] +; SSE42-NEXT: psrad %xmm5, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3],xmm1[4,5],xmm10[6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm3, %xmm6 -; SSE42-NEXT: psrad %xmm2, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm3, %xmm9 -; SSE42-NEXT: psrad %xmm2, %xmm9 -; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm6[0,1,2,3],xmm9[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm3, %xmm5 +; SSE42-NEXT: psrad %xmm2, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm3, %xmm7 +; SSE42-NEXT: psrad %xmm2, %xmm7 +; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm3, %xmm2 -; SSE42-NEXT: psrad %xmm6, %xmm2 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] -; SSE42-NEXT: psrad %xmm6, %xmm3 +; SSE42-NEXT: psrad %xmm5, %xmm2 +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,1,1,1,4,5,6,7] +; SSE42-NEXT: psrad %xmm5, %xmm3 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5],xmm9[6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm4, %xmm6 -; SSE42-NEXT: psrad %xmm3, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm4, %xmm8 -; SSE42-NEXT: psrad %xmm3, %xmm8 -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3],xmm2[4,5],xmm7[6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm4, %xmm5 +; SSE42-NEXT: psrad %xmm3, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm8[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm4, %xmm7 +; SSE42-NEXT: psrad %xmm3, %xmm7 +; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm4, %xmm3 ; SSE42-NEXT: psrad %xmm5, %xmm3 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm5, %xmm4 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3],xmm3[4,5],xmm8[6,7] +; SSE42-NEXT: pblendw {{.*#+}} 
xmm3 = xmm3[0,1],xmm7[2,3],xmm3[4,5],xmm7[6,7] ; SSE42-NEXT: retq ; ; AVX2-LABEL: ashr_v16i32_swap: @@ -5131,74 +5128,73 @@ define <16 x i32> @ashr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; ; SSE42-LABEL: ashr_v16i32_commute_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa %xmm3, %xmm10 -; SSE42-NEXT: movdqa %xmm2, %xmm9 -; SSE42-NEXT: movdqa %xmm1, %xmm8 -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: movdqa %xmm3, %xmm8 +; SSE42-NEXT: movdqa %xmm2, %xmm10 +; SSE42-NEXT: movdqa %xmm1, %xmm9 +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm3 +; SSE42-NEXT: psrad $31, %xmm3 +; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3 +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm2 ; SSE42-NEXT: psrad $31, %xmm2 ; SSE42-NEXT: pandn %xmm7, %xmm2 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm1 ; SSE42-NEXT: psrad $31, %xmm1 ; SSE42-NEXT: pandn %xmm6, %xmm1 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: psrad $31, %xmm0 ; SSE42-NEXT: pandn %xmm5, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm3 -; SSE42-NEXT: psrad $31, %xmm3 -; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm7 ; SSE42-NEXT: psrad %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm11 ; SSE42-NEXT: psrad %xmm6, %xmm11 ; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm7[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm6 ; SSE42-NEXT: psrad %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm5, %xmm0 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm7 ; SSE42-NEXT: psrad %xmm6, %xmm7 -; SSE42-NEXT: 
pshuflw {{.*#+}} xmm6 = xmm9[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm8 -; SSE42-NEXT: psrad %xmm6, %xmm8 -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm9 +; SSE42-NEXT: psrad %xmm6, %xmm9 +; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm7[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm6 ; SSE42-NEXT: psrad %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm5, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5],xmm9[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm7 ; SSE42-NEXT: psrad %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm2, %xmm8 -; SSE42-NEXT: psrad %xmm6, %xmm8 -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm2, %xmm9 +; SSE42-NEXT: psrad %xmm6, %xmm9 +; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm7[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm6 ; SSE42-NEXT: psrad %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm5, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5],xmm8[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5],xmm9[6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm3, %xmm7 diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll index 894186f..1ab1a1a 100644 --- a/llvm/test/CodeGen/X86/vector-compress.ll +++ b/llvm/test/CodeGen/X86/vector-compress.ll @@ -1094,26 +1094,25 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $1, %xmm1, %r11d -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: movzbl %al, %edx -; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vpextrb $1, %xmm1, %r13d +; AVX2-NEXT: vmovd %xmm1, %esi +; AVX2-NEXT: movl %esi, %eax ; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: subb %r11b, %al -; AVX2-NEXT: vpextrb $2, %xmm1, %esi -; AVX2-NEXT: subb %sil, %al -; AVX2-NEXT: vpextrb $3, %xmm1, %r13d ; AVX2-NEXT: subb %r13b, %al +; AVX2-NEXT: vpextrb $2, %xmm1, %edx +; AVX2-NEXT: subb %dl, %al +; AVX2-NEXT: vpextrb $3, %xmm1, %ebp +; AVX2-NEXT: subb %bpl, %al ; AVX2-NEXT: vpextrb $4, %xmm1, %r12d ; AVX2-NEXT: subb %r12b, %al ; AVX2-NEXT: vpextrb $5, %xmm1, %r15d ; AVX2-NEXT: subb %r15b, %al ; AVX2-NEXT: vpextrb $6, %xmm1, %r14d ; AVX2-NEXT: subb %r14b, %al -; AVX2-NEXT: vpextrb $7, %xmm1, %ebp -; AVX2-NEXT: subb %bpl, %al -; AVX2-NEXT: vpextrb $8, %xmm1, %ebx +; AVX2-NEXT: vpextrb $7, %xmm1, %ebx ; 
AVX2-NEXT: subb %bl, %al +; AVX2-NEXT: vpextrb $8, %xmm1, %r11d +; AVX2-NEXT: subb %r11b, %al ; AVX2-NEXT: vpextrb $9, %xmm1, %r10d ; AVX2-NEXT: subb %r10b, %al ; AVX2-NEXT: vpextrb $10, %xmm1, %r9d @@ -1123,108 +1122,94 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> ; AVX2-NEXT: vpextrb $12, %xmm1, %edi ; AVX2-NEXT: subb %dil, %al ; AVX2-NEXT: vpextrb $13, %xmm1, %ecx -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: subb %cl, %al ; AVX2-NEXT: vpextrb $14, %xmm1, %ecx -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: subb %cl, %al ; AVX2-NEXT: vpextrb $15, %xmm1, %ecx -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX2-NEXT: subb %cl, %al ; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzbl -40(%rsp,%rax), %eax ; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX2-NEXT: vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: vpextrb $1, %xmm0, -40(%rsp,%rsi) +; AVX2-NEXT: andl $1, %r13d +; AVX2-NEXT: addq %rsi, %r13 +; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%r13) ; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: vpextrb $1, %xmm0, -40(%rsp,%rdx) -; AVX2-NEXT: movzbl %r11b, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rdx, %rax -; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%rax) -; AVX2-NEXT: movzbl %sil, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: vpextrb $3, %xmm0, -40(%rsp,%rcx) -; AVX2-NEXT: movzbl %r13b, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: vpextrb $4, %xmm0, -40(%rsp,%rax) -; AVX2-NEXT: movzbl %r12b, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: movzbl %r15b, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $5, %xmm0, -40(%rsp,%rcx) -; AVX2-NEXT: movzbl %r14b, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $6, %xmm0, -40(%rsp,%rax) -; AVX2-NEXT: movzbl %bpl, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $7, %xmm0, -40(%rsp,%rcx) -; AVX2-NEXT: movzbl %bl, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $8, %xmm0, -40(%rsp,%rax) -; AVX2-NEXT: movzbl %r10b, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $9, %xmm0, -40(%rsp,%rcx) -; AVX2-NEXT: movzbl %r9b, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $10, %xmm0, -40(%rsp,%rax) -; AVX2-NEXT: movzbl %r8b, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $11, %xmm0, -40(%rsp,%rcx) -; AVX2-NEXT: movzbl %dil, %ecx -; AVX2-NEXT: andl $1, %ecx -; 
AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $12, %xmm0, -40(%rsp,%rax) -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: addq %r13, %rdx +; AVX2-NEXT: vpextrb $3, %xmm0, -40(%rsp,%rdx) +; AVX2-NEXT: andl $1, %ebp +; AVX2-NEXT: addq %rdx, %rbp +; AVX2-NEXT: vpextrb $4, %xmm0, -40(%rsp,%rbp) +; AVX2-NEXT: andl $1, %r12d +; AVX2-NEXT: addq %rbp, %r12 +; AVX2-NEXT: andl $1, %r15d +; AVX2-NEXT: addq %r12, %r15 +; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12 +; AVX2-NEXT: andl $15, %r12d +; AVX2-NEXT: vpextrb $5, %xmm0, -40(%rsp,%r12) +; AVX2-NEXT: andl $1, %r14d +; AVX2-NEXT: addq %r15, %r14 +; AVX2-NEXT: # kill: def $r15d killed $r15d killed $r15 def $r15 +; AVX2-NEXT: andl $15, %r15d +; AVX2-NEXT: vpextrb $6, %xmm0, -40(%rsp,%r15) +; AVX2-NEXT: andl $1, %ebx +; AVX2-NEXT: addq %r14, %rbx +; AVX2-NEXT: # kill: def $r14d killed $r14d killed $r14 def $r14 +; AVX2-NEXT: andl $15, %r14d +; AVX2-NEXT: vpextrb $7, %xmm0, -40(%rsp,%r14) +; AVX2-NEXT: andl $1, %r11d +; AVX2-NEXT: addq %rbx, %r11 +; AVX2-NEXT: # kill: def $ebx killed $ebx killed $rbx def $rbx +; AVX2-NEXT: andl $15, %ebx +; AVX2-NEXT: vpextrb $8, %xmm0, -40(%rsp,%rbx) +; AVX2-NEXT: andl $1, %r10d +; AVX2-NEXT: addq %r11, %r10 +; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11 +; AVX2-NEXT: andl $15, %r11d +; AVX2-NEXT: vpextrb $9, %xmm0, -40(%rsp,%r11) +; AVX2-NEXT: andl $1, %r9d +; AVX2-NEXT: addq %r10, %r9 +; AVX2-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10 +; AVX2-NEXT: andl $15, %r10d +; AVX2-NEXT: vpextrb $10, %xmm0, -40(%rsp,%r10) +; AVX2-NEXT: andl $1, %r8d +; AVX2-NEXT: addq %r9, %r8 +; AVX2-NEXT: # kill: def $r9d killed $r9d killed $r9 def $r9 +; AVX2-NEXT: andl $15, %r9d +; AVX2-NEXT: vpextrb $11, %xmm0, -40(%rsp,%r9) +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: addq %r8, %rdi +; AVX2-NEXT: # kill: def $r8d killed $r8d killed $r8 def $r8 +; AVX2-NEXT: andl $15, %r8d +; AVX2-NEXT: vpextrb $12, %xmm0, -40(%rsp,%r8) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: addq %rdi, %rsi +; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi +; AVX2-NEXT: andl $15, %edi +; AVX2-NEXT: vpextrb $13, %xmm0, -40(%rsp,%rdi) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $13, %xmm0, -40(%rsp,%rcx) -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: addq %rsi, %rax +; AVX2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: vpextrb $14, %xmm0, -40(%rsp,%rsi) ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $14, %xmm0, -40(%rsp,%rax) -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $15, %xmm0, -40(%rsp,%rcx) -; AVX2-NEXT: cmpq $15, %rax -; AVX2-NEXT: movl $15, %ecx -; AVX2-NEXT: cmovbq %rax, %rcx -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload -; 
AVX2-NEXT: movb %al, -40(%rsp,%rcx) +; AVX2-NEXT: vpextrb $15, %xmm0, -40(%rsp,%rax) +; AVX2-NEXT: cmpq $15, %rcx +; AVX2-NEXT: movl $15, %eax +; AVX2-NEXT: cmovbq %rcx, %rax +; AVX2-NEXT: vpextrb $15, %xmm0, %ecx +; AVX2-NEXT: cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload +; AVX2-NEXT: movb %cl, -40(%rsp,%rax) ; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %r12 @@ -1805,140 +1790,137 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: andq $-32, %rsp -; AVX2-NEXT: subq $128, %rsp -; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 -; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 -; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl %ecx, %r13d -; AVX2-NEXT: movl %edx, %r15d -; AVX2-NEXT: movl %esi, %ebx +; AVX2-NEXT: subq $96, %rsp +; AVX2-NEXT: movl %r9d, %r11d +; AVX2-NEXT: movl %r8d, %r10d +; AVX2-NEXT: movl %ecx, %r9d +; AVX2-NEXT: movl %edx, %r8d +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi ; AVX2-NEXT: # kill: def $edi killed $edi def $rdi -; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl 360(%rbp), %eax -; AVX2-NEXT: movl 352(%rbp), %ecx +; AVX2-NEXT: movzbl 360(%rbp), %eax +; AVX2-NEXT: movzbl 352(%rbp), %ecx ; AVX2-NEXT: vmovd %ecx, %xmm4 ; AVX2-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 368(%rbp), %eax +; AVX2-NEXT: movzbl 368(%rbp), %eax ; AVX2-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 376(%rbp), %eax +; AVX2-NEXT: movzbl 376(%rbp), %eax ; AVX2-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 384(%rbp), %eax +; AVX2-NEXT: movzbl 384(%rbp), %eax ; AVX2-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 392(%rbp), %eax +; AVX2-NEXT: movzbl 392(%rbp), %eax ; AVX2-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 400(%rbp), %eax +; AVX2-NEXT: movzbl 400(%rbp), %eax ; AVX2-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 408(%rbp), %eax +; AVX2-NEXT: movzbl 408(%rbp), %eax ; AVX2-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 416(%rbp), %eax +; AVX2-NEXT: movzbl 416(%rbp), %eax ; AVX2-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 424(%rbp), %eax +; AVX2-NEXT: movzbl 424(%rbp), %eax ; AVX2-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 432(%rbp), %eax +; AVX2-NEXT: movzbl 432(%rbp), %eax ; AVX2-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 440(%rbp), %eax +; AVX2-NEXT: movzbl 440(%rbp), %eax ; AVX2-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 448(%rbp), %eax +; AVX2-NEXT: movzbl 448(%rbp), %eax ; AVX2-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 456(%rbp), %eax +; AVX2-NEXT: movzbl 456(%rbp), %eax ; AVX2-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 464(%rbp), %eax +; AVX2-NEXT: movzbl 464(%rbp), %eax ; AVX2-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 472(%rbp), %eax +; AVX2-NEXT: movzbl 472(%rbp), %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 224(%rbp), %eax +; AVX2-NEXT: movzbl 224(%rbp), %eax ; AVX2-NEXT: vmovd %eax, %xmm5 -; AVX2-NEXT: movl 232(%rbp), %eax +; AVX2-NEXT: movzbl 232(%rbp), %eax ; AVX2-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 240(%rbp), %eax +; AVX2-NEXT: movzbl 240(%rbp), %eax ; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 248(%rbp), %eax +; AVX2-NEXT: movzbl 248(%rbp), %eax ; AVX2-NEXT: vpinsrb 
$3, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 256(%rbp), %eax +; AVX2-NEXT: movzbl 256(%rbp), %eax ; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 264(%rbp), %eax +; AVX2-NEXT: movzbl 264(%rbp), %eax ; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 272(%rbp), %eax +; AVX2-NEXT: movzbl 272(%rbp), %eax ; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 280(%rbp), %eax +; AVX2-NEXT: movzbl 280(%rbp), %eax ; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 288(%rbp), %eax +; AVX2-NEXT: movzbl 288(%rbp), %eax ; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 296(%rbp), %eax +; AVX2-NEXT: movzbl 296(%rbp), %eax ; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 304(%rbp), %eax +; AVX2-NEXT: movzbl 304(%rbp), %eax ; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 312(%rbp), %eax +; AVX2-NEXT: movzbl 312(%rbp), %eax ; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 320(%rbp), %eax +; AVX2-NEXT: movzbl 320(%rbp), %eax ; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 328(%rbp), %eax +; AVX2-NEXT: movzbl 328(%rbp), %eax ; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 336(%rbp), %eax +; AVX2-NEXT: movzbl 336(%rbp), %eax ; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 344(%rbp), %eax +; AVX2-NEXT: movzbl 344(%rbp), %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX2-NEXT: movl 96(%rbp), %eax -; AVX2-NEXT: vmovd %eax, %xmm5 -; AVX2-NEXT: movl 104(%rbp), %eax -; AVX2-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 112(%rbp), %eax -; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 120(%rbp), %eax -; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 128(%rbp), %eax -; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 136(%rbp), %eax -; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 144(%rbp), %eax -; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 152(%rbp), %eax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 160(%rbp), %eax -; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 168(%rbp), %eax -; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 176(%rbp), %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 184(%rbp), %eax +; AVX2-NEXT: vmovd %edi, %xmm5 +; AVX2-NEXT: vpinsrb $1, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $2, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $3, %r9d, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $4, %r10d, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $5, %r11d, %xmm5, %xmm5 +; AVX2-NEXT: movzbl 16(%rbp), %ebx +; AVX2-NEXT: vpinsrb $6, %ebx, %xmm5, %xmm5 +; AVX2-NEXT: movzbl 24(%rbp), %r14d +; AVX2-NEXT: vpinsrb $7, %r14d, %xmm5, %xmm5 +; AVX2-NEXT: movzbl 32(%rbp), %r15d +; AVX2-NEXT: vpinsrb $8, %r15d, %xmm5, %xmm5 +; AVX2-NEXT: movzbl 40(%rbp), %r12d +; AVX2-NEXT: vpinsrb $9, %r12d, %xmm5, %xmm5 +; AVX2-NEXT: movzbl 48(%rbp), %r13d +; AVX2-NEXT: vpinsrb $10, %r13d, %xmm5, %xmm5 +; AVX2-NEXT: movzbl 56(%rbp), %eax ; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 192(%rbp), %eax +; AVX2-NEXT: movzbl 64(%rbp), %eax ; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 200(%rbp), %eax +; AVX2-NEXT: movzbl 72(%rbp), %eax ; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 208(%rbp), %eax +; AVX2-NEXT: movzbl 80(%rbp), %eax ; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 216(%rbp), %eax +; AVX2-NEXT: movzbl 88(%rbp), %eax ; 
AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 -; AVX2-NEXT: vmovd %edi, %xmm6 -; AVX2-NEXT: vpinsrb $1, %esi, %xmm6, %xmm6 -; AVX2-NEXT: vpinsrb $2, %edx, %xmm6, %xmm6 -; AVX2-NEXT: vpinsrb $3, %r13d, %xmm6, %xmm6 -; AVX2-NEXT: vpinsrb $4, %r8d, %xmm6, %xmm6 -; AVX2-NEXT: vpinsrb $5, %r9d, %xmm6, %xmm6 -; AVX2-NEXT: movl 16(%rbp), %esi -; AVX2-NEXT: vpinsrb $6, %esi, %xmm6, %xmm6 -; AVX2-NEXT: movl 24(%rbp), %edi -; AVX2-NEXT: vpinsrb $7, %edi, %xmm6, %xmm6 -; AVX2-NEXT: movl 32(%rbp), %r8d -; AVX2-NEXT: vpinsrb $8, %r8d, %xmm6, %xmm6 -; AVX2-NEXT: movl 40(%rbp), %r9d -; AVX2-NEXT: vpinsrb $9, %r9d, %xmm6, %xmm6 -; AVX2-NEXT: movl 48(%rbp), %r10d -; AVX2-NEXT: vpinsrb $10, %r10d, %xmm6, %xmm6 -; AVX2-NEXT: movl 56(%rbp), %r11d -; AVX2-NEXT: vpinsrb $11, %r11d, %xmm6, %xmm6 -; AVX2-NEXT: movl 64(%rbp), %r14d -; AVX2-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6 -; AVX2-NEXT: movl 72(%rbp), %r12d -; AVX2-NEXT: vpinsrb $13, %r12d, %xmm6, %xmm6 -; AVX2-NEXT: movl 80(%rbp), %eax +; AVX2-NEXT: movzbl 96(%rbp), %eax +; AVX2-NEXT: vmovd %eax, %xmm6 +; AVX2-NEXT: movzbl 104(%rbp), %eax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 112(%rbp), %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 120(%rbp), %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 128(%rbp), %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 136(%rbp), %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 144(%rbp), %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 152(%rbp), %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 160(%rbp), %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 168(%rbp), %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 176(%rbp), %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 184(%rbp), %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 192(%rbp), %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 200(%rbp), %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 208(%rbp), %eax ; AVX2-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6 -; AVX2-NEXT: movl 88(%rbp), %eax +; AVX2-NEXT: movzbl 216(%rbp), %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm6, %xmm6 -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpand %ymm6, %ymm4, %ymm4 @@ -1980,379 +1962,435 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX2-NEXT: vmovaps %ymm2, (%rsp) ; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movzbl (%rsp,%rax), %edx ; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp) -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) -; AVX2-NEXT: andl $1, %ebx -; AVX2-NEXT: addq %rax, %rbx -; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rbx) -; AVX2-NEXT: andl $1, %r15d -; AVX2-NEXT: addq %rbx, %r15 -; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r15) -; AVX2-NEXT: andl $1, %r13d -; AVX2-NEXT: addq %r15, %r13 -; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r13) -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %r13, 
%rcx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: andl $1, %esi -; AVX2-NEXT: addq %rax, %rsi -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) ; AVX2-NEXT: andl $1, %edi -; AVX2-NEXT: addq %rsi, %rdi -; AVX2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi -; AVX2-NEXT: andl $63, %esi -; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rsi) +; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rdi) +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: addq %rdi, %rsi +; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rsi) ; AVX2-NEXT: andl $1, %r8d -; AVX2-NEXT: addq %rdi, %r8 -; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi -; AVX2-NEXT: andl $63, %edi -; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rdi) +; AVX2-NEXT: addq %rsi, %r8 +; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r8) ; AVX2-NEXT: andl $1, %r9d ; AVX2-NEXT: addq %r8, %r9 -; AVX2-NEXT: # kill: def $r8d killed $r8d killed $r8 def $r8 -; AVX2-NEXT: andl $63, %r8d -; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%r8) +; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r9) ; AVX2-NEXT: andl $1, %r10d ; AVX2-NEXT: addq %r9, %r10 -; AVX2-NEXT: # kill: def $r9d killed $r9d killed $r9 def $r9 -; AVX2-NEXT: andl $63, %r9d -; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%r9) +; AVX2-NEXT: movl %r10d, %eax +; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) ; AVX2-NEXT: andl $1, %r11d ; AVX2-NEXT: addq %r10, %r11 -; AVX2-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10 -; AVX2-NEXT: andl $63, %r10d -; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%r10) -; AVX2-NEXT: andl $1, %r14d -; AVX2-NEXT: addq %r11, %r14 +; AVX2-NEXT: movzbl %bl, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %r11, %rax ; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11 ; AVX2-NEXT: andl $63, %r11d -; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%r11) -; AVX2-NEXT: andl $1, %r12d -; AVX2-NEXT: addq %r14, %r12 -; AVX2-NEXT: # kill: def $r14d killed $r14d killed $r14 def $r14 -; AVX2-NEXT: andl $63, %r14d -; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%r14) -; AVX2-NEXT: movl 80(%rbp), %eax +; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%r11) +; AVX2-NEXT: movzbl %r14b, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movzbl %r15b, %eax ; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %r12, %rax -; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12 -; AVX2-NEXT: andl $63, %r12d -; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%r12) -; AVX2-NEXT: movl 88(%rbp), %ecx +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl %r12b, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 96(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movzbl %r13b, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, 
(%rsp,%rcx) +; AVX2-NEXT: movzbl 56(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movzbl 64(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 72(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movzbl 80(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 88(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movzbl 96(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 104(%rbp), %ecx +; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 104(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 112(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 120(%rbp), %ecx +; AVX2-NEXT: movzbl 112(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 120(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 128(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 136(%rbp), %ecx +; AVX2-NEXT: movzbl 128(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 136(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # 
kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 144(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 152(%rbp), %ecx +; AVX2-NEXT: movzbl 144(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 152(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 160(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 168(%rbp), %ecx +; AVX2-NEXT: movzbl 160(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 168(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 176(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 184(%rbp), %ecx +; AVX2-NEXT: movzbl 176(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 184(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 192(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 200(%rbp), %ecx +; AVX2-NEXT: movzbl 192(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 200(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 208(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; 
AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 216(%rbp), %ecx +; AVX2-NEXT: movzbl 208(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 216(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 224(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 232(%rbp), %ecx +; AVX2-NEXT: movzbl 224(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rcx) +; AVX2-NEXT: movzbl 232(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $1, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 240(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 248(%rbp), %ecx +; AVX2-NEXT: movzbl 240(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rcx) +; AVX2-NEXT: movzbl 248(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $3, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 256(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 264(%rbp), %ecx +; AVX2-NEXT: movzbl 256(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rcx) +; AVX2-NEXT: movzbl 264(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $5, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 272(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 280(%rbp), %ecx +; AVX2-NEXT: movzbl 272(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: 
andl $63, %ecx +; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rcx) +; AVX2-NEXT: movzbl 280(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $7, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 288(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 296(%rbp), %ecx +; AVX2-NEXT: movzbl 288(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rcx) +; AVX2-NEXT: movzbl 296(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $9, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 304(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 312(%rbp), %ecx +; AVX2-NEXT: movzbl 304(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rcx) +; AVX2-NEXT: movzbl 312(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $11, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 320(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 328(%rbp), %ecx +; AVX2-NEXT: movzbl 320(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rcx) +; AVX2-NEXT: movzbl 328(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $13, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 336(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 344(%rbp), %ecx +; AVX2-NEXT: movzbl 336(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rcx) +; AVX2-NEXT: movzbl 344(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax 
killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $15, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 352(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: movzbl 352(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 360(%rbp), %ecx +; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 360(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 368(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 376(%rbp), %ecx +; AVX2-NEXT: movzbl 368(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 376(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 384(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 392(%rbp), %ecx +; AVX2-NEXT: movzbl 384(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 392(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 400(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 408(%rbp), %ecx +; AVX2-NEXT: movzbl 400(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 408(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 416(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: 
andl $63, %eax -; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 424(%rbp), %ecx +; AVX2-NEXT: movzbl 416(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 424(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 432(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 440(%rbp), %ecx +; AVX2-NEXT: movzbl 432(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 440(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 448(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 456(%rbp), %ecx +; AVX2-NEXT: movzbl 448(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 456(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 464(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 472(%rbp), %ecx +; AVX2-NEXT: movzbl 464(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 472(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) ; AVX2-NEXT: vpextrb $15, %xmm0, %eax ; AVX2-NEXT: cmpq $64, %rcx -; AVX2-NEXT: cmovbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload +; AVX2-NEXT: cmovbl %edx, %eax ; AVX2-NEXT: cmpq $63, %rcx -; AVX2-NEXT: movq %rcx, %rdx -; AVX2-NEXT: movl $63, %ecx -; AVX2-NEXT: cmovbq %rdx, %rcx -; AVX2-NEXT: movb %al, (%rsp,%rcx) +; AVX2-NEXT: movl $63, %edx +; AVX2-NEXT: cmovbq %rcx, %rdx +; AVX2-NEXT: movb %al, 
(%rsp,%rdx) ; AVX2-NEXT: vmovaps (%rsp), %ymm0 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: leaq -40(%rbp), %rsp diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll index e60b565..d0690bd 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -509,10 +509,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: psrlw $7, %xmm3 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: paddb %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: paddb %xmm2, %xmm4 +; SSE2-NEXT: psrlw $7, %xmm4 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE2-NEXT: por %xmm3, %xmm4 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 @@ -545,10 +545,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $7, %xmm0 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm3 +; SSE41-NEXT: psrlw $7, %xmm3 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: por %xmm0, %xmm3 ; SSE41-NEXT: paddb %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -572,10 +572,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $7, %xmm0, %xmm2 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm3 -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -704,10 +704,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; X86-SSE2-NEXT: pandn %xmm3, %xmm2 ; X86-SSE2-NEXT: por %xmm4, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: psrlw $7, %xmm3 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: paddb %xmm2, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: paddb %xmm2, %xmm4 +; X86-SSE2-NEXT: psrlw $7, %xmm4 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 ; X86-SSE2-NEXT: por %xmm3, %xmm4 ; X86-SSE2-NEXT: paddb %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index 11a02f8..421fa98 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -431,10 +431,10 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX2-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -451,10 +451,10 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2)) ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index d979997..4969cb5 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -533,10 +533,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrlw $7, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: paddb %xmm2, %xmm4 +; SSE2-NEXT: psrlw $7, %xmm4 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE2-NEXT: por %xmm1, %xmm4 ; SSE2-NEXT: paddb %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm0 @@ -568,10 +568,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; SSE41-NEXT: paddb %xmm0, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psrlw $7, %xmm1 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: paddb %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddb %xmm2, %xmm3 +; SSE41-NEXT: psrlw $7, %xmm3 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: por %xmm1, %xmm3 ; SSE41-NEXT: paddb %xmm0, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 @@ -596,10 +596,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $7, %xmm0, %xmm2 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm3 -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -731,10 +731,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm4, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: psrlw $7, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: paddb %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: paddb %xmm2, %xmm4 +; X86-SSE2-NEXT: psrlw $7, %xmm4 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 ; 
X86-SSE2-NEXT: por %xmm1, %xmm4 ; X86-SSE2-NEXT: paddb %xmm3, %xmm3 ; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index 15e09c3..e2a3e26 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -457,10 +457,10 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index 6c79be7..93f4ce7 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -442,10 +442,10 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: psrlw $7, %xmm3 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: paddb %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: paddb %xmm2, %xmm4 +; SSE2-NEXT: psrlw $7, %xmm4 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE2-NEXT: por %xmm3, %xmm4 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 @@ -478,10 +478,10 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $7, %xmm0 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm3 +; SSE41-NEXT: psrlw $7, %xmm3 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: por %xmm0, %xmm3 ; SSE41-NEXT: paddb %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -505,10 +505,10 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $7, %xmm0, %xmm2 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm3 -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -637,10 +637,10 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X86-SSE2-NEXT: pandn %xmm3, %xmm2 ; X86-SSE2-NEXT: por %xmm4, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: psrlw $7, %xmm3 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: paddb %xmm2, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: paddb %xmm2, %xmm4 +; X86-SSE2-NEXT: psrlw $7, %xmm4 +; 
X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 ; X86-SSE2-NEXT: por %xmm3, %xmm4 ; X86-SSE2-NEXT: paddb %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll index 684721f..64c3118 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -375,10 +375,10 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -395,10 +395,10 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2)) ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/xray-custom-log.ll b/llvm/test/CodeGen/X86/xray-custom-log.ll index 8f23055..f4cdc23 100644 --- a/llvm/test/CodeGen/X86/xray-custom-log.ll +++ b/llvm/test/CodeGen/X86/xray-custom-log.ll @@ -1,9 +1,6 @@ ; RUN: llc -mtriple=x86_64 < %s | FileCheck %s ; RUN: llc -mtriple=x86_64 -relocation-model=pic < %s | FileCheck %s --check-prefix=PIC -; RUN: llc -mtriple=x86_64 -filetype=obj %s -o %t -; RUN: llvm-dwarfdump %t | FileCheck %s --check-prefix=DBG - define i32 @customevent() nounwind "function-instrument"="xray-always" !dbg !1 { %eventptr = alloca i8 %eventsize = alloca i64 @@ -93,17 +90,6 @@ define void @leaf_func() "function-instrument"="xray-always" "frame-pointer"="no declare void @llvm.xray.customevent(ptr, i64) declare void @llvm.xray.typedevent(i64, ptr, i64) -;; Construct call site entries for PATCHABLE_EVENT_CALL. -; DBG: DW_TAG_subprogram -; DBG: DW_TAG_call_site -; DBG-NEXT: DW_AT_call_target (DW_OP_reg{{.*}}) -; DBG-NEXT: DW_AT_call_return_pc - -; DBG: DW_TAG_subprogram -; DBG: DW_TAG_call_site -; DBG-NEXT: DW_AT_call_target (DW_OP_reg{{.*}}) -; DBG-NEXT: DW_AT_call_return_pc - !llvm.dbg.cu = !{!7} !llvm.module.flags = !{!10, !11} |