aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJakub Jelinek <jakub@redhat.com>2021-01-13 08:02:54 +0100
committerJakub Jelinek <jakub@redhat.com>2021-01-13 08:06:25 +0100
commitb668a06e37f72fd96bacd6769990ec97dac4ac6d (patch)
tree0c71ee709d96084a2d1e36cea217ba4007040b8b
parent7993fe1877a689463d8c71a0873e5cc8db080273 (diff)
downloadgcc-b668a06e37f72fd96bacd6769990ec97dac4ac6d.zip
gcc-b668a06e37f72fd96bacd6769990ec97dac4ac6d.tar.gz
gcc-b668a06e37f72fd96bacd6769990ec97dac4ac6d.tar.bz2
i386: Optimize _mm_unpacklo_epi8 of 0 vector as second argument or similar VEC_PERM_EXPRs into pmovzx [PR95905]
The following patch adds patterns (so far 128-bit only) for permutations
like { 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23 } where the second operand
is a CONST0_RTX CONST_VECTOR, so that they are emitted as pmovzx.

2021-01-13  Jakub Jelinek  <jakub@redhat.com>

	PR target/95905
	* config/i386/predicates.md (pmovzx_parallel): New predicate.
	* config/i386/sse.md (*sse4_1_zero_extendv8qiv8hi2_3): New
	define_insn_and_split pattern.
	(*sse4_1_zero_extendv4hiv4si2_3): Likewise.
	(*sse4_1_zero_extendv2siv2di2_3): Likewise.

	* gcc.target/i386/pr95905-1.c: New test.
	* gcc.target/i386/pr95905-2.c: New test.
-rw-r--r--gcc/config/i386/predicates.md32
-rw-r--r--gcc/config/i386/sse.md84
-rw-r--r--gcc/testsuite/gcc.target/i386/pr95905-1.c26
-rw-r--r--gcc/testsuite/gcc.target/i386/pr95905-2.c46
4 files changed, 188 insertions, 0 deletions
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 1b69d7b..0a3ab4d 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1600,6 +1600,38 @@
return true;
})
+;; Return true if OP is the selector parallel of a vec_select that a
+;; pmovz{bw,wd,dq} instruction can implement: even result slots take
+;; consecutive low elements of the first vec_concat operand and odd result
+;; slots take elements of the second vec_concat operand (which the insn
+;; patterns using this predicate require to be const0_operand).
+(define_predicate "pmovzx_parallel"
+  (and (match_code "parallel")
+       (match_code "const_int" "a"))
+{
+  const int count = XVECLEN (op, 0);
+  int pos;
+
+  /* At least one (low element, other-operand element) pair is needed.  */
+  if (count < 2)
+    return false;
+
+  /* Only selectors that start with element 0 are accepted, for example
+     { 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23 }.  */
+  if (INTVAL (XVECEXP (op, 0, 0)) != 0)
+    return false;
+
+  for (pos = 1; pos < count; pos++)
+    {
+      const HOST_WIDE_INT sel = INTVAL (XVECEXP (op, 0, pos));
+
+      if (pos % 2 == 1)
+        {
+          /* Odd slots must index past the first operand, i.e. into the
+             second vec_concat operand.  */
+          if (sel < count)
+            return false;
+        }
+      /* Even slot 2*k must select element k of the first operand.  */
+      else if (sel != pos / 2)
+        return false;
+    }
+
+  return true;
+})
+
;; Return true if OP is a parallel for a vbroadcast permute.
(define_predicate "avx_vbroadcast_operand"
(and (match_code "parallel")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 928eff5..2a260c1c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17683,6 +17683,36 @@
(any_extend:V8HI (match_dup 1)))]
"operands[1] = adjust_address_nv (operands[1], V8QImode, 0);")
+;; Match a V16QI vec_select from the V32QI concatenation of operand 1 with
+;; a const0_operand (operand 2), using a selector accepted by
+;; pmovzx_parallel.  The insn has no assembler template of its own ("#");
+;; after reload it is split into a zero_extend:V8HI of the low eight
+;; V16QI elements of operand 1.
+(define_insn_and_split "*sse4_1_zero_extendv8qiv8hi2_3"
+ [(set (match_operand:V16QI 0 "register_operand" "=Yr,*x,v")
+ (vec_select:V16QI
+ (vec_concat:V32QI
+ (match_operand:V16QI 1 "vector_operand" "Yrm,*xm,vm")
+ (match_operand:V16QI 2 "const0_operand" "C,C,C"))
+ (match_parallel 3 "pmovzx_parallel"
+ [(match_operand 4 "const_int_operand" "n,n,n")])))]
+ "TARGET_SSE4_1"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0)
+ (zero_extend:V8HI
+ (vec_select:V8QI
+ (match_dup 1)
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)
+ (const_int 4) (const_int 5)
+ (const_int 6) (const_int 7)]))))]
+{
+ /* The split result is V8HI, so view the V16QI destination in that mode.  */
+ operands[0] = lowpart_subreg (V8HImode, operands[0], V16QImode);
+ if (MEM_P (operands[1]))
+ {
+ /* For a memory source, do not use the vec_select form of the split
+ template: take the V8QI lowpart of the memory operand, zero-extend it
+ to V8HI directly, emit that set and finish the split here.  */
+ operands[1] = lowpart_subreg (V8QImode, operands[1], V16QImode);
+ operands[1] = gen_rtx_ZERO_EXTEND (V8HImode, operands[1]);
+ emit_insn (gen_rtx_SET (operands[0], operands[1]));
+ DONE;
+ }
+})
+
(define_expand "<insn>v8qiv8hi2"
[(set (match_operand:V8HI 0 "register_operand")
(any_extend:V8HI
@@ -17929,6 +17959,34 @@
}
})
+;; Match a V8HI vec_select from the V16HI concatenation of operand 1 with
+;; a const0_operand (operand 2), using a selector accepted by
+;; pmovzx_parallel.  The insn has no assembler template of its own ("#");
+;; after reload it is split into a zero_extend:V4SI of the low four
+;; V8HI elements of operand 1.
+(define_insn_and_split "*sse4_1_zero_extendv4hiv4si2_3"
+ [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v")
+ (vec_select:V8HI
+ (vec_concat:V16HI
+ (match_operand:V8HI 1 "vector_operand" "Yrm,*xm,vm")
+ (match_operand:V8HI 2 "const0_operand" "C,C,C"))
+ (match_parallel 3 "pmovzx_parallel"
+ [(match_operand 4 "const_int_operand" "n,n,n")])))]
+ "TARGET_SSE4_1"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0)
+ (zero_extend:V4SI
+ (vec_select:V4HI
+ (match_dup 1)
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)]))))]
+{
+ /* The split result is V4SI, so view the V8HI destination in that mode.  */
+ operands[0] = lowpart_subreg (V4SImode, operands[0], V8HImode);
+ if (MEM_P (operands[1]))
+ {
+ /* For a memory source, do not use the vec_select form of the split
+ template: take the V4HI lowpart of the memory operand, zero-extend it
+ to V4SI directly, emit that set and finish the split here.  */
+ operands[1] = lowpart_subreg (V4HImode, operands[1], V8HImode);
+ operands[1] = gen_rtx_ZERO_EXTEND (V4SImode, operands[1]);
+ emit_insn (gen_rtx_SET (operands[0], operands[1]));
+ DONE;
+ }
+})
+
(define_insn "avx512f_<code>v8qiv8di2<mask_name>"
[(set (match_operand:V8DI 0 "register_operand" "=v")
(any_extend:V8DI
@@ -18283,6 +18341,32 @@
(any_extend:V2DI (match_dup 1)))]
"operands[1] = adjust_address_nv (operands[1], V2SImode, 0);")
+;; Match a V4SI vec_select from the V8SI concatenation of operand 1 with
+;; a const0_operand (operand 2), using a selector accepted by
+;; pmovzx_parallel.  The insn has no assembler template of its own ("#");
+;; after reload it is split into a zero_extend:V2DI of the low two
+;; V4SI elements of operand 1.
+(define_insn_and_split "*sse4_1_zero_extendv2siv2di2_3"
+ [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
+ (vec_select:V4SI
+ (vec_concat:V8SI
+ (match_operand:V4SI 1 "vector_operand" "Yrm,*xm,vm")
+ (match_operand:V4SI 2 "const0_operand" "C,C,C"))
+ (match_parallel 3 "pmovzx_parallel"
+ [(match_operand 4 "const_int_operand" "n,n,n")])))]
+ "TARGET_SSE4_1"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0)
+ (zero_extend:V2DI
+ (vec_select:V2SI (match_dup 1)
+ (parallel [(const_int 0) (const_int 1)]))))]
+{
+ /* The split result is V2DI, so view the V4SI destination in that mode.  */
+ operands[0] = lowpart_subreg (V2DImode, operands[0], V4SImode);
+ if (MEM_P (operands[1]))
+ {
+ /* For a memory source, do not use the vec_select form of the split
+ template: take the V2SI lowpart of the memory operand, zero-extend it
+ to V2DI directly, emit that set and finish the split here.  */
+ operands[1] = lowpart_subreg (V2SImode, operands[1], V4SImode);
+ operands[1] = gen_rtx_ZERO_EXTEND (V2DImode, operands[1]);
+ emit_insn (gen_rtx_SET (operands[0], operands[1]));
+ DONE;
+ }
+})
+
(define_expand "<insn>v2siv2di2"
[(set (match_operand:V2DI 0 "register_operand")
(any_extend:V2DI
diff --git a/gcc/testsuite/gcc.target/i386/pr95905-1.c b/gcc/testsuite/gcc.target/i386/pr95905-1.c
new file mode 100644
index 0000000..8de715e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95905-1.c
@@ -0,0 +1,26 @@
+/* PR target/95905 */
+/* Compile-only test: each function below unpacks the low half of its
+   argument with an all-zero vector, and the assembler output is scanned
+   for the non-VEX pmovzxbw, pmovzxwd and pmovzxdq mnemonics (AVX is
+   disabled in the options).  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1 -mno-avx" } */
+/* { dg-final { scan-assembler "\tpmovzxbw\t" } } */
+/* { dg-final { scan-assembler "\tpmovzxwd\t" } } */
+/* { dg-final { scan-assembler "\tpmovzxdq\t" } } */
+
+#include <x86intrin.h>
+
+/* Interleave the low 8-bit elements of A with zeros; expected to provide
+   the pmovzxbw match.  */
+__m128i
+f1 (__m128i a)
+{
+ return _mm_unpacklo_epi8 (a, _mm_setzero_si128 ());
+}
+
+/* Interleave the low 16-bit elements of A with zeros; expected to provide
+   the pmovzxwd match.  */
+__m128i
+f2 (__m128i a)
+{
+ return _mm_unpacklo_epi16 (a, _mm_setzero_si128 ());
+}
+
+/* Interleave the low 32-bit elements of A with zeros; expected to provide
+   the pmovzxdq match.  */
+__m128i
+f3 (__m128i a)
+{
+ return _mm_unpacklo_epi32 (a, _mm_setzero_si128 ());
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr95905-2.c b/gcc/testsuite/gcc.target/i386/pr95905-2.c
new file mode 100644
index 0000000..7cd20a3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95905-2.c
@@ -0,0 +1,46 @@
+/* PR target/95905 */
+/* Compile-only test: the same low-half/zero interleave permutations
+   written with __builtin_shuffle on generic vector types, once with the
+   vector passed by value (f1-f3) and once loaded through a pointer
+   (f4-f6).  The scan patterns accept an optional "v" prefix on the
+   mnemonic.  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+/* { dg-final { scan-assembler "\tv?pmovzxbw\t" } } */
+/* { dg-final { scan-assembler "\tv?pmovzxwd\t" } } */
+/* { dg-final { scan-assembler "\tv?pmovzxdq\t" } } */
+
+/* 16-byte vectors of 16 bytes, 8 shorts and 4 ints respectively.  */
+typedef unsigned char V1 __attribute__((vector_size (16)));
+typedef unsigned short V2 __attribute__((vector_size (16)));
+typedef unsigned int V3 __attribute__((vector_size (16)));
+
+/* Byte case: even result elements are x[0..7], odd ones come from the
+   zero vector (indices 16..23).  */
+V1
+f1 (V1 x)
+{
+ return __builtin_shuffle (x, (V1) {}, (V1) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+/* Short case: even result elements are x[0..3], odd ones come from the
+   zero vector (indices 8..11).  */
+V2
+f2 (V2 x)
+{
+ return __builtin_shuffle (x, (V2) {}, (V2) { 0, 8, 1, 9, 2, 10, 3, 11 });
+}
+
+/* Int case: even result elements are x[0..1], odd ones come from the
+   zero vector (indices 4..5).  */
+V3
+f3 (V3 x)
+{
+ return __builtin_shuffle (x, (V3) {}, (V3) { 0, 4, 1, 5 });
+}
+
+/* Same permutation as f1, but the source vector is read through a
+   pointer (presumably to cover a memory source operand).  */
+V1
+f4 (V1 *x)
+{
+ return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+/* Same permutation as f2, but the source vector is read through a
+   pointer.  */
+V2
+f5 (V2 *x)
+{
+ return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 8, 1, 9, 2, 10, 3, 11 });
+}
+
+/* Same permutation as f3, but the source vector is read through a
+   pointer.  */
+V3
+f6 (V3 *x)
+{
+ return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 4, 1, 5 });
+}