diff options
author | Haochen Gui <guihaoc@gcc.gnu.org> | 2023-11-17 17:12:32 +0800 |
---|---|---|
committer | Haochen Gui <guihaoc@gcc.gnu.org> | 2023-11-17 17:19:39 +0800 |
commit | cd295a80c91040fd4d826528c8e8e07fe909ae62 (patch) | |
tree | 6831d9e61e1285c088e922d444ba8efa63aa784d /gcc | |
parent | 5c8cb429762062f851a56801f021d81f4ecfe3d9 (diff) | |
download | gcc-cd295a80c91040fd4d826528c8e8e07fe909ae62.zip gcc-cd295a80c91040fd4d826528c8e8e07fe909ae62.tar.gz gcc-cd295a80c91040fd4d826528c8e8e07fe909ae62.tar.bz2 |
rs6000: Enable vector mode for by pieces equality compare
This patch adds a new expand pattern - cbranchv16qi4 to enable vector
mode by pieces equality compare on rs6000. The macro MOVE_MAX_PIECES
(COMPARE_MAX_PIECES) is set to 16 bytes when EFFICIENT_UNALIGNED_VSX
is enabled, otherwise keeps unchanged. The macro STORE_MAX_PIECES is
set to the same value as MOVE_MAX_PIECES by default, so now it's
explicitly defined and keeps unchanged.
gcc/
PR target/111449
* config/rs6000/altivec.md (cbranchv16qi4): New expand pattern.
* config/rs6000/rs6000.cc (rs6000_generate_compare): Generate
insn sequence for V16QImode equality compare.
* config/rs6000/rs6000.h (MOVE_MAX_PIECES): Define.
(STORE_MAX_PIECES): Define.
gcc/testsuite/
PR target/111449
* gcc.target/powerpc/pr111449-1.c: New.
* gcc.dg/tree-ssa/sra-17.c: Add additional options for 32-bit powerpc.
* gcc.dg/tree-ssa/sra-18.c: Likewise.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/rs6000/altivec.md | 42 | ||||
-rw-r--r-- | gcc/config/rs6000/rs6000.cc | 12 | ||||
-rw-r--r-- | gcc/config/rs6000/rs6000.h | 3 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/tree-ssa/sra-17.c | 1 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/tree-ssa/sra-18.c | 1 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/powerpc/pr111449-1.c | 18 |
6 files changed, 77 insertions, 0 deletions
diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index e8a596f..8a4998c 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -2605,6 +2605,48 @@ } [(set_attr "type" "vecperm")]) +/* The cbranch_optab doesn't allow FAIL, so old cpus which are + inefficient on unaligned vsx are disabled as the cost is high + for unaligned load/store. */ +(define_expand "cbranchv16qi4" + [(use (match_operator 0 "equality_operator" + [(match_operand:V16QI 1 "reg_or_mem_operand") + (match_operand:V16QI 2 "reg_or_mem_operand")])) + (use (match_operand 3))] + "VECTOR_MEM_VSX_P (V16QImode) + && TARGET_EFFICIENT_UNALIGNED_VSX" +{ + /* Use direct move for P8 LE to skip doubleword swap, as the byte + order doesn't matter for equality compare. If any operands are + altivec indexed or indirect operands, the load can be implemented + directly by altivec aligned load instruction and swap is no + need. */ + if (!TARGET_P9_VECTOR + && !BYTES_BIG_ENDIAN + && MEM_P (operands[1]) + && !altivec_indexed_or_indirect_operand (operands[1], V16QImode) + && MEM_P (operands[2]) + && !altivec_indexed_or_indirect_operand (operands[2], V16QImode)) + { + rtx reg_op1 = gen_reg_rtx (V16QImode); + rtx reg_op2 = gen_reg_rtx (V16QImode); + rs6000_emit_le_vsx_permute (reg_op1, operands[1], V16QImode); + rs6000_emit_le_vsx_permute (reg_op2, operands[2], V16QImode); + operands[1] = reg_op1; + operands[2] = reg_op2; + } + else + { + operands[1] = force_reg (V16QImode, operands[1]); + operands[2] = force_reg (V16QImode, operands[2]); + } + + rtx_code code = GET_CODE (operands[0]); + operands[0] = gen_rtx_fmt_ee (code, V16QImode, operands[1], operands[2]); + rs6000_emit_cbranch (V16QImode, operands); + DONE; +}) + ;; Compare vectors producing a vector result and a predicate, setting CR6 to ;; indicate a combined status (define_insn "altivec_vcmpequ<VI_char>_p" diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 5f56c3e..3dfd79c 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -15472,6 +15472,18 @@ rs6000_generate_compare (rtx cmp, machine_mode mode) else emit_insn (gen_stack_protect_testsi (compare_result, op0, op1b)); } + else if (mode == V16QImode) + { + gcc_assert (code == EQ || code == NE); + + rtx result_vector = gen_reg_rtx (V16QImode); + rtx cc_bit = gen_reg_rtx (SImode); + emit_insn (gen_altivec_vcmpequb_p (result_vector, op0, op1)); + emit_insn (gen_cr6_test_for_lt (cc_bit)); + emit_insn (gen_rtx_SET (compare_result, + gen_rtx_COMPARE (comp_mode, cc_bit, + const1_rtx))); + } else emit_insn (gen_rtx_SET (compare_result, gen_rtx_COMPARE (comp_mode, op0, op1))); diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h index 22595f6..326c452 100644 --- a/gcc/config/rs6000/rs6000.h +++ b/gcc/config/rs6000/rs6000.h @@ -1730,6 +1730,9 @@ typedef struct rs6000_args in one reasonably fast instruction. */ #define MOVE_MAX (! TARGET_POWERPC64 ? 4 : 8) #define MAX_MOVE_MAX 8 +#define MOVE_MAX_PIECES (TARGET_EFFICIENT_UNALIGNED_VSX \ + ? 16 : (TARGET_POWERPC64 ? 8 : 4)) +#define STORE_MAX_PIECES (TARGET_POWERPC64 ? 8 : 4) /* Nonzero if access to memory by bytes is no faster than for words. Also nonzero if doing byte operations (specifically shifts) in registers diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-17.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-17.c index 221d96b..b0d4811 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/sra-17.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-17.c @@ -1,6 +1,7 @@ /* { dg-do run { target { aarch64*-*-* alpha*-*-* arm*-*-* hppa*-*-* powerpc*-*-* s390*-*-* } } } */ /* { dg-options "-O2 -fdump-tree-esra --param sra-max-scalarization-size-Ospeed=32" } */ /* { dg-additional-options "-mcpu=ev4" { target alpha*-*-* } } */ +/* { dg-additional-options "-mno-vsx" { target { powerpc*-*-* && ilp32 } } } */ extern void abort (void); diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-18.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-18.c index f5e6a21..2cdeae6 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/sra-18.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-18.c @@ -1,6 +1,7 @@ /* { dg-do run { target { aarch64*-*-* alpha*-*-* arm*-*-* hppa*-*-* powerpc*-*-* s390*-*-* } } } */ /* { dg-options "-O2 -fdump-tree-esra --param sra-max-scalarization-size-Ospeed=32" } */ /* { dg-additional-options "-mcpu=ev4" { target alpha*-*-* } } */ +/* { dg-additional-options "-mno-vsx" { target { powerpc*-*-* && ilp32 } } } */ extern void abort (void); struct foo { long x; }; diff --git a/gcc/testsuite/gcc.target/powerpc/pr111449-1.c b/gcc/testsuite/gcc.target/powerpc/pr111449-1.c new file mode 100644 index 0000000..0c9e176 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr111449-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target powerpc_p8vector_ok } */ +/* { dg-options "-mdejagnu-cpu=power8 -mvsx -O2" } */ + +/* Ensure vector mode is used for 16-byte by pieces equality compare. */ + +int compare1 (const char* s1, const char* s2) +{ + return __builtin_memcmp (s1, s2, 16) == 0; +} + +int compare2 (const char* s1) +{ + return __builtin_memcmp (s1, "0123456789012345", 16) == 0; +} + +/* { dg-final { scan-assembler-times {\mvcmpequb\.} 2 } } */ +/* { dg-final { scan-assembler-not {\mcmpd\M} } } */ |