aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorMichael Meissner <meissner@linux.ibm.com>2024-09-17 21:05:27 -0400
committerMichael Meissner <meissner@linux.ibm.com>2024-09-17 21:05:27 -0400
commit9a07ac151327f61963b092062eb8566dd0c6f0cd (patch)
tree740ec055d0f50e1ef617cd805efab0608d428e95 /gcc
parent38b5a568f8738c16baa978a62a9e983a85b4c6f0 (diff)
downloadgcc-9a07ac151327f61963b092062eb8566dd0c6f0cd.zip
gcc-9a07ac151327f61963b092062eb8566dd0c6f0cd.tar.gz
gcc-9a07ac151327f61963b092062eb8566dd0c6f0cd.tar.bz2
PR 89213: Add better support for shifting vectors with 64-bit elements
This patch fixes PR target/89213 to allow better code to be generated to do constant shifts of V2DI/V2DF vectors. Previously GCC would do constant shifts of vectors with 64-bit elements by using: XXSPLTIB 32,4 VEXTSB2D 0,0 VSRAD 2,2,0 I.e., the PowerPC does not have a VSPLTISD instruction to load -15..14 for the 64-bit shift count in one instruction. Instead, it would need to load a byte and then convert it to 64-bit. With this patch, GCC now realizes that the vector shift instructions will look at the bottom 6 bits for the shift count, and it can use either a VSPLTISW or XXSPLTIB instruction to load the shift count. 2024-09-17 Michael Meissner <meissner@linux.ibm.com> gcc/ PR target/89213 * config/rs6000/altivec.md (UNSPEC_VECTOR_SHIFT): New unspec. (VSHIFT_MODE): New mode iterator. (vshift_code): New code iterator. (vshift_attr): New code attribute. (altivec_<mode>_<vshift_attr>_const): New pattern to optimize vector long long/int shifts by a constant. (altivec_<mode>_shift_const): New helper insn to load up a constant used by the shift operation. * config/rs6000/predicates.md (vector_shift_constant): New predicate. gcc/testsuite/ PR target/89213 * gcc.target/powerpc/pr89213.c: New test. * gcc.target/powerpc/vec-rlmi-rlnm.c: Update instruction count.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/rs6000/altivec.md51
-rw-r--r--gcc/config/rs6000/predicates.md63
-rw-r--r--gcc/testsuite/gcc.target/powerpc/pr89213.c106
-rw-r--r--gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c4
4 files changed, 222 insertions, 2 deletions
diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index 1f5489b..e4576c6 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -170,6 +170,7 @@
UNSPEC_VSTRIL
UNSPEC_SLDB
UNSPEC_SRDB
+ UNSPEC_VECTOR_SHIFT
])
(define_c_enum "unspecv"
@@ -2176,6 +2177,56 @@
"vsro %0,%1,%2"
[(set_attr "type" "vecperm")])
+;; Optimize V2DI shifts by constants. This relies on the shift instructions
+;; only looking at the bits needed to do the shift. This means we can use
+;; VSPLTISW or XXSPLTIB to load up the constant, and not worry about the bits
+;; that the vector shift instructions will not use.
+(define_mode_iterator VSHIFT_MODE [(V4SI "TARGET_P9_VECTOR")
+ (V2DI "TARGET_P8_VECTOR")])
+
+(define_code_iterator vshift_code [ashift ashiftrt lshiftrt])
+(define_code_attr vshift_attr [(ashift "ashift")
+ (ashiftrt "ashiftrt")
+ (lshiftrt "lshiftrt")])
+
+(define_insn_and_split "*altivec_<mode>_<vshift_attr>_const"
+ [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v")
+ (vshift_code:VSHIFT_MODE
+ (match_operand:VSHIFT_MODE 1 "register_operand" "v")
+ (match_operand:VSHIFT_MODE 2 "vector_shift_constant" "")))
+ (clobber (match_scratch:VSHIFT_MODE 3 "=&v"))]
+ "((<MODE>mode == V2DImode && TARGET_P8_VECTOR)
+ || (<MODE>mode == V4SImode && TARGET_P9_VECTOR))"
+ "#"
+ "&& 1"
+ [(set (match_dup 3)
+ (unspec:VSHIFT_MODE [(match_dup 4)] UNSPEC_VECTOR_SHIFT))
+ (set (match_dup 0)
+ (vshift_code:VSHIFT_MODE (match_dup 1)
+ (match_dup 3)))]
+{
+ if (GET_CODE (operands[3]) == SCRATCH)
+ operands[3] = gen_reg_rtx (<MODE>mode);
+
+ operands[4] = GET_CODE (operands[2]) == CONST_VECTOR
+ ? CONST_VECTOR_ELT (operands[2], 0)
+ : XEXP (operands[2], 0);
+})
+
+(define_insn "*altivec_<mode>_shift_const"
+ [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v")
+ (unspec:VSHIFT_MODE [(match_operand 1 "const_int_operand" "n")]
+ UNSPEC_VECTOR_SHIFT))]
+ "TARGET_P8_VECTOR"
+{
+ if (UINTVAL (operands[1]) <= 15)
+ return "vspltisw %0,%1";
+ else if (TARGET_P9_VECTOR)
+ return "xxspltib %x0,%1";
+ else
+ gcc_unreachable ();
+})
+
(define_insn "altivec_vsum4ubs"
[(set (match_operand:V4SI 0 "register_operand" "=v")
(unspec:V4SI [(match_operand:V16QI 1 "register_operand" "v")
diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index 7f0b4ab..0b78901 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -861,6 +861,69 @@
return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
})
+;; Return 1 if the operand is a V2DI or V4SI const_vector, where each element
+;; is the same constant, and the constant can be used for a shift operation.
+;; This is to prevent sub-optimal code, that needs to load up the constant and
+;; then zero extend it 32 or 64-bit vectors or load up the constant from the
+;; literal pool.
+;;
+;; For V4SImode, we only recognize shifts by 16..31 on ISA 3.0, since shifts by
+;; 1..15 can be handled by the normal VSPLTISW and vector shift instruction.
+;; For V2DImode, we do this all of the time, since there is no convenient
+;; instruction to load up a vector long long splatted constant.
+;;
+;; If we can use XXSPLTIB, then allow constants up to 63. If not, we restrict
+;; the constant to 0..15 that can be loaded with VSPLTISW. V4SI shifts are
+;; only optimized for ISA 3.0 when the shift value is >= 16 and <= 31. Values
+;; between 0 and 15 can use a normal VSPLTISW to load the value, and it doesn't
+;; need this optimization.
+(define_predicate "vector_shift_constant"
+ (match_code "const_vector,vec_duplicate")
+{
+ unsigned HOST_WIDE_INT min_value;
+
+ if (mode == V2DImode)
+ {
+ min_value = 0;
+ if (!TARGET_P8_VECTOR)
+ return 0;
+ }
+ else if (mode == V4SImode)
+ {
+ min_value = 16;
+ if (!TARGET_P9_VECTOR)
+ return 0;
+ }
+ else
+ return 0;
+
+ unsigned HOST_WIDE_INT max_value = TARGET_P9_VECTOR ? 63 : 15;
+
+ if (GET_CODE (op) == CONST_VECTOR)
+ {
+ unsigned HOST_WIDE_INT first = UINTVAL (CONST_VECTOR_ELT (op, 0));
+ unsigned nunits = GET_MODE_NUNITS (mode);
+ unsigned i;
+
+ if (!IN_RANGE (first, min_value, max_value))
+ return 0;
+
+ for (i = 1; i < nunits; i++)
+ if (first != UINTVAL (CONST_VECTOR_ELT (op, i)))
+ return 0;
+
+ return 1;
+ }
+ else
+ {
+ rtx op0 = XEXP (op, 0);
+ if (!CONST_INT_P (op0))
+ return 0;
+
+ return IN_RANGE (UINTVAL (op0), min_value, max_value);
+ }
+})
+
;; Return 1 if operand is 0.0.
(define_predicate "zero_fp_constant"
(and (match_code "const_double")
diff --git a/gcc/testsuite/gcc.target/powerpc/pr89213.c b/gcc/testsuite/gcc.target/powerpc/pr89213.c
new file mode 100644
index 0000000..90a8b1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr89213.c
@@ -0,0 +1,106 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_vsx } */
+/* { dg-options "-mcpu=power9 -O2" } */
+
+/* Optimize vector shifts by constants. */
+
+#include <altivec.h>
+
+typedef vector long long vi64_t;
+typedef vector unsigned long long vui64_t;
+
+typedef vector int vi32_t;
+typedef vector unsigned int vui32_t;
+
+vi64_t
+shiftra_test64_4 (vi64_t a)
+{
+ vui64_t x = {4, 4};
+ return (vi64_t) vec_vsrad (a, x);
+}
+
+vi64_t
+shiftrl_test64_4 (vi64_t a)
+{
+ vui64_t x = {4, 4};
+ return (vi64_t) vec_vsrd (a, x);
+}
+
+vi64_t
+shiftl_test64_4 (vi64_t a)
+{
+ vui64_t x = {4, 4};
+ return (vi64_t) vec_vsld (a, x);
+}
+
+vi64_t
+shiftra_test64_29 (vi64_t a)
+{
+ vui64_t x = {29, 29};
+ return (vi64_t) vec_vsrad (a, x);
+}
+
+vi64_t
+shiftrl_test64_29 (vi64_t a)
+{
+ vui64_t x = {29, 29};
+ return (vi64_t) vec_vsrd (a, x);
+}
+
+vi64_t
+shiftl_test64_29 (vi64_t a)
+{
+ vui64_t x = {29, 29};
+ return (vi64_t) vec_vsld (a, x);
+}
+
+vi32_t
+shiftra_test32_4 (vi32_t a)
+{
+ vui32_t x = {4, 4, 4, 4};
+ return (vi32_t) vec_vsraw (a, x);
+}
+
+vi32_t
+shiftrl_test32_4 (vi32_t a)
+{
+ vui32_t x = {4, 4, 4, 4};
+ return (vi32_t) vec_vsrw (a, x);
+}
+
+vi32_t
+shiftl_test32_4 (vi32_t a)
+{
+ vui32_t x = {4, 4, 4, 4};
+ return (vi32_t) vec_vslw (a, x);
+}
+
+vi32_t
+shiftra_test32_29 (vi32_t a)
+{
+ vui32_t x = {29, 29, 29, 29};
+ return (vi32_t) vec_vsraw (a, x);
+}
+
+vi32_t
+shiftrl_test32_29 (vi32_t a)
+{
+ vui32_t x = {29, 29, 29, 29};
+ return (vi32_t) vec_vsrw (a, x);
+}
+
+vi32_t
+shiftl_test32_29 (vi32_t a)
+{
+ vui32_t x = {29, 29, 29, 29};
+ return (vi32_t) vec_vslw (a, x);
+}
+
+/* { dg-final { scan-assembler-times {\mxxspltib\M} 6 } } */
+/* { dg-final { scan-assembler-times {\mvsld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvslw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvspltisw\M} 6 } } */
+/* { dg-final { scan-assembler-times {\mvsrd\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsrw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsrad\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsraw\M} 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c b/gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c
index 6834733..01fa0a9 100644
--- a/gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c
+++ b/gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c
@@ -54,12 +54,12 @@ rlnm_test_2 (vector unsigned long long x, vector unsigned long long y,
- For rlnm_test_1: vspltisw, vslw, xxlor, vrlwnm.
- For rlnm_test_2: xxspltib, vextsb2d, vsld, xxlor, vrldnm.
There is a choice of splat instructions in both cases, so we
- just check for "splt". */
+ just check for "splt". In the past vextsb2d would be generated for
+ rlnm_test_2, but the compiler no longer generates it. */
/* { dg-final { scan-assembler-times "vrlwmi" 1 } } */
/* { dg-final { scan-assembler-times "vrldmi" 1 } } */
/* { dg-final { scan-assembler-times "splt" 2 } } */
-/* { dg-final { scan-assembler-times "vextsb2d" 1 } } */
/* { dg-final { scan-assembler-times "vslw" 1 } } */
/* { dg-final { scan-assembler-times "vsld" 1 } } */
/* { dg-final { scan-assembler-times "xxlor" 4 } } */