author    Simon Pilgrim <llvm-dev@redking.me.uk>  2025-03-08 17:10:39 +0000
committer GitHub <noreply@github.com>             2025-03-08 17:10:39 +0000
commit    b15e7f3c23bfffe3b4a0180b5433bf5a876b87ae
tree      98e69817b159f35c416c7b378d820da49ac27193
parent    037756242f6eb3cdd6e73ab6597651f87aa7b933
[X86] combineINSERT_SUBVECTOR - attempt to recursively shuffle combine if both base/sub-vectors are already shuffles (#130304)
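If both the base vector and the inserted subvector (looking through bitcasts) are already target shuffles, the whole INSERT_SUBVECTOR node is handed to combineX86ShufflesRecursively, which can fold the insertion into a single wider shuffle instead of lowering the pieces separately. The effect shows up in the AVX512BW test updates below; in vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12, for example, a permute + broadcast + reinsert sequence collapses into one permute (instructions quoted from the test diff):

```asm
# before: permute, broadcast element 0, then reinsert the broadcast half
vpermw        %zmm0, %zmm1, %zmm1
vpbroadcastw  %xmm0, %ymm0
vinserti64x4  $1, %ymm0, %zmm1, %zmm0
# after: a single lane-crossing word permute
vpermw        %zmm0, %zmm1, %zmm0
```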
 llvm/lib/Target/X86/X86ISelLowering.cpp                        |  8
 llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll  | 22
 llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll | 87
 3 files changed, 45 insertions(+), 72 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0602f50..f9661b0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58758,6 +58758,14 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // Attempt to recursively combine to a shuffle.
+  if (isTargetShuffle(peekThroughBitcasts(Vec).getOpcode()) &&
+      isTargetShuffle(peekThroughBitcasts(SubVec).getOpcode())) {
+    SDValue Op(N, 0);
+    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+      return Res;
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 1102044..2dfa515 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -3519,9 +3519,7 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,25,0,27,0,29,0,31]
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpbroadcastw %xmm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -3651,9 +3649,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,25,26,0,28,29,0,31]
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -3776,9 +3772,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,25,26,27,0,29,30,31]
 ; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpbroadcastw %xmm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -3909,9 +3903,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,25,26,27,28,29,0,31]
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpbroadcastw %xmm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -4401,10 +4393,8 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX512BW-SLOW:       # %bb.0:
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
-; AVX512BW-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,13,14,0,0,0,0,0,0,0,1,1,0,0,0,0]
+; AVX512BW-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-SLOW-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 409d889..35f25d3 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -3652,13 +3652,11 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
 ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
-; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
-; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,0,0,0,0,0,0,0]
+; AVX512BW-FAST-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT:    vzeroupper
 ; AVX512BW-FAST-NEXT:    retq
@@ -3857,13 +3855,11 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
 ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
-; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero
-; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0,48,49,0,51,52,0,54,55,0,0,0,0,0,0,0,0]
+; AVX512BW-FAST-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT:    vzeroupper
 ; AVX512BW-FAST-NEXT:    retq
@@ -4085,13 +4081,11 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
 ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
-; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
-; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,0,0,0,0,0,0,0]
+; AVX512BW-FAST-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT:    vzeroupper
 ; AVX512BW-FAST-NEXT:    retq
@@ -4292,13 +4286,11 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
-; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47,48,49,0,51,52,53,54,55,0,0,0,0,0,0,0,0]
+; AVX512BW-FAST-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT:    vzeroupper
 ; AVX512BW-FAST-NEXT:    retq
@@ -5101,32 +5093,17 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,21,22,23]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1
-; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,28,29,30,31]
-; AVX512BW-FAST-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,28,29,30,31]
+; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -5381,13 +5358,11 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
 ; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
 ; AVX512BW-SLOW:       # %bb.0:
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,0,11]
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512BW-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-SLOW-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0]
+; AVX512BW-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-SLOW-NEXT:    vzeroupper
 ; AVX512BW-SLOW-NEXT:    retq
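For experimenting with the combine outside the large broadcast tests, a reduced .ll along the following lines should lower to an INSERT_SUBVECTOR whose base and subvector are both target shuffles. This is a hypothetical sketch, not taken from the patch: the function name, shuffle masks, and insertion index are invented for illustration, and whether the node survives to this combine in exactly this form depends on earlier DAG combines (assumed build: llc -mtriple=x86_64-- -mattr=+avx512bw).

```llvm
; Hypothetical reduced case: both operands of the insert are shuffles.
define <8 x i64> @insert_subvector_of_shuffles(<8 x i64> %v) {
  ; Base: a lane-crossing qword permute (a target shuffle after lowering).
  %base = shufflevector <8 x i64> %v, <8 x i64> poison,
                        <8 x i32> <i32 0, i32 7, i32 0, i32 5, i32 4, i32 5, i32 6, i32 7>
  ; Subvector: a broadcast of element 0 (also a target shuffle).
  %sub = shufflevector <8 x i64> %v, <8 x i64> poison, <2 x i32> zeroinitializer
  ; Insert the broadcast pair at qword index 6: with the new combine this can
  ; merge into one wide permute instead of permute + broadcast + insert.
  %ins = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> %base, <2 x i64> %sub, i64 6)
  ret <8 x i64> %ins
}
declare <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64>, <2 x i64>, i64)
```

Gating the recursion on both operands already being shuffles keeps it from firing on plain inserts, where rebuilding a wide permute could easily be a pessimization rather than a win.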