diff options
author | Sylvia Taylor <sylvia.taylor@arm.com> | 2019-07-18 15:42:13 +0000 |
---|---|---|
committer | Kyrylo Tkachov <ktkachov@gcc.gnu.org> | 2019-07-18 15:42:13 +0000 |
commit | e38341a8e0c7f89eb2146feddea8c2f3bf25a331 (patch) | |
tree | a586be544218866de4a61d166b629597a7a1c2c8 /gcc | |
parent | 979526c9ce7bb79315f0f91fde0668a5ad8536df (diff) | |
download | gcc-e38341a8e0c7f89eb2146feddea8c2f3bf25a331.zip gcc-e38341a8e0c7f89eb2146feddea8c2f3bf25a331.tar.gz gcc-e38341a8e0c7f89eb2146feddea8c2f3bf25a331.tar.bz2 |
[patch1/2][arm][PR90317]: fix sha1 patterns
This patch fixes:
1) ICE (internal compiler error) message thrown when using the
crypto_sha1h intrinsic due to an incompatible mode used for
zero_extend. Removed the zero extend, as it is not a good choice for
vector modes; using an equivalent single mode like TI (128 bits)
instead of V4SI produces extra instructions, making it inefficient.
This affects gcc version 8 and above.
2) Incorrect combine optimizations made due to vec_select usage
in the sha1 patterns on arm. The patterns should only combine
a vec select within a sha1h<op> instruction when the lane is 0.
This affects gcc version 5 and above.
- Fixed by explicitly declaring the valid const int for such
optimizations. For cases when the lane is not 0, the vector
lane selection now occurs in e.g. a vmov instruction prior
to sha1h<op>.
- Updated the sha1h testcases on arm to check for additional
cases with custom vector lane selection.
The intrinsic functions for the sha1 patterns have also been
simplified which seems to eliminate extra vmovs like:
- vmov.i32 q8, #0.
2019-07-18 Sylvia Taylor <sylvia.taylor@arm.com>
PR target/90317
* config/arm/arm_neon.h
(vsha1h_u32): Refactor.
(vsha1cq_u32): Likewise.
(vsha1pq_u32): Likewise.
(vsha1mq_u32): Likewise.
* config/arm/crypto.md:
(crypto_sha1h): Remove zero extend, correct vec select.
(crypto_sha1c): Correct vec select.
(crypto_sha1m): Likewise.
(crypto_sha1p): Likewise.
* gcc.target/arm/crypto-vsha1cq_u32.c (foo): Change return type to
uint32_t.
(GET_LANE, TEST_SHA1C_VEC_SELECT): New.
* gcc.target/arm/crypto-vsha1h_u32.c (foo): Change return type to
uint32_t.
(GET_LANE, TEST_SHA1H_VEC_SELECT): New.
* gcc.target/arm/crypto-vsha1mq_u32.c (foo): Change return type to
uint32_t.
(GET_LANE, TEST_SHA1M_VEC_SELECT): New.
* gcc.target/arm/crypto-vsha1pq_u32.c (foo): Change return type to
uint32_t.
(GET_LANE, TEST_SHA1P_VEC_SELECT): New.
From-SVN: r273574
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog | 14 | ||||
-rw-r--r-- | gcc/config/arm/arm_neon.h | 21 | ||||
-rw-r--r-- | gcc/config/arm/crypto.md | 22 | ||||
-rw-r--r-- | gcc/testsuite/ChangeLog | 16 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/arm/crypto-vsha1cq_u32.c | 23 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/arm/crypto-vsha1h_u32.c | 23 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/arm/crypto-vsha1mq_u32.c | 23 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/arm/crypto-vsha1pq_u32.c | 23 |
8 files changed, 133 insertions, 32 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index a4a625e..668dc40 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,17 @@ +2019-07-18 Sylvia Taylor <sylvia.taylor@arm.com> + + PR target/90317 + * config/arm/arm_neon.h + (vsha1h_u32): Refactor. + (vsha1cq_u32): Likewise. + (vsha1pq_u32): Likewise. + (vsha1mq_u32): Likewise. + * config/arm/crypto.md: + (crypto_sha1h): Remove zero extend, correct vec select. + (crypto_sha1c): Correct vec select. + (crypto_sha1m): Likewise. + (crypto_sha1p): Likewise. + 2019-07-18 Richard Earnshaw <rearnsha@arm.com> * config/arm/predicates.md (arm_borrow_operation): New predicate. diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 6b98239..1f200d4 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -16938,37 +16938,32 @@ __extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsha1h_u32 (uint32_t __hash_e) { - uint32x4_t __t = vdupq_n_u32 (0); - __t = vsetq_lane_u32 (__hash_e, __t, 0); - __t = __builtin_arm_crypto_sha1h (__t); - return vgetq_lane_u32 (__t, 0); + return vgetq_lane_u32 (__builtin_arm_crypto_sha1h (vdupq_n_u32 (__hash_e)), + 0); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) { - uint32x4_t __t = vdupq_n_u32 (0); - __t = vsetq_lane_u32 (__hash_e, __t, 0); - return __builtin_arm_crypto_sha1c (__hash_abcd, __t, __wk); + return __builtin_arm_crypto_sha1c (__hash_abcd, vdupq_n_u32 (__hash_e), + __wk); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) { - uint32x4_t __t = vdupq_n_u32 (0); - __t = vsetq_lane_u32 (__hash_e, __t, 0); - return __builtin_arm_crypto_sha1p (__hash_abcd, __t, __wk); + return __builtin_arm_crypto_sha1p (__hash_abcd, vdupq_n_u32 
(__hash_e), + __wk); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) { - uint32x4_t __t = vdupq_n_u32 (0); - __t = vsetq_lane_u32 (__hash_e, __t, 0); - return __builtin_arm_crypto_sha1m (__hash_abcd, __t, __wk); + return __builtin_arm_crypto_sha1m (__hash_abcd, vdupq_n_u32 (__hash_e), + __wk); } __extension__ extern __inline uint32x4_t diff --git a/gcc/config/arm/crypto.md b/gcc/config/arm/crypto.md index bf34f69..115c515 100644 --- a/gcc/config/arm/crypto.md +++ b/gcc/config/arm/crypto.md @@ -105,14 +105,18 @@ [(set_attr "type" "<crypto_type>")] ) +/* The vec_select operation always selects index 0 from the lower V2SI subreg + of the V4SI, adjusted for endianness. Required due to neon_vget_lane and + neon_set_lane that change the element ordering in memory for big-endian. */ + (define_insn "crypto_sha1h" [(set (match_operand:V4SI 0 "register_operand" "=w") - (zero_extend:V4SI - (unspec:SI [(vec_select:SI - (match_operand:V4SI 1 "register_operand" "w") - (parallel [(match_operand:SI 2 "immediate_operand" "i")]))] - UNSPEC_SHA1H)))] - "TARGET_CRYPTO" + (unspec:V4SI + [(vec_select:SI + (match_operand:V4SI 1 "register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand" "i")]))] + UNSPEC_SHA1H))] + "TARGET_CRYPTO && INTVAL (operands[2]) == NEON_ENDIAN_LANE_N (V2SImode, 0)" "sha1h.32\\t%q0, %q1" [(set_attr "type" "crypto_sha1_fast")] ) @@ -127,6 +131,10 @@ [(set_attr "type" "crypto_pmull")] ) +/* The vec_select operation always selects index 0 from the lower V2SI subreg + of the V4SI, adjusted for endianness. Required due to neon_vget_lane and + neon_set_lane that change the element ordering in memory for big-endian. 
*/ + (define_insn "crypto_<crypto_pattern>" [(set (match_operand:V4SI 0 "register_operand" "=w") (unspec:<crypto_mode> @@ -136,7 +144,7 @@ (parallel [(match_operand:SI 4 "immediate_operand" "i")])) (match_operand:<crypto_mode> 3 "register_operand" "w")] CRYPTO_SELECTING))] - "TARGET_CRYPTO" + "TARGET_CRYPTO && INTVAL (operands[4]) == NEON_ENDIAN_LANE_N (V2SImode, 0)" "<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q2, %q3" [(set_attr "type" "<crypto_type>")] ) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 0f47604..7bf322f 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,19 @@ +2019-07-18 Sylvia Taylor <sylvia.taylor@arm.com> + + PR target/90317 + * gcc.target/arm/crypto-vsha1cq_u32.c (foo): Change return type to + uint32_t. + (GET_LANE, TEST_SHA1C_VEC_SELECT): New. + * gcc.target/arm/crypto-vsha1h_u32.c (foo): Change return type to + uint32_t. + (GET_LANE, TEST_SHA1H_VEC_SELECT): New. + * gcc.target/arm/crypto-vsha1mq_u32.c (foo): Change return type to + uint32_t. + (GET_LANE, TEST_SHA1M_VEC_SELECT): New. + * gcc.target/arm/crypto-vsha1pq_u32.c (foo): Change return type to + uint32_t. + (GET_LANE, TEST_SHA1P_VEC_SELECT): New. + 2019-07-18 Jan Hubicka <hubicka@ucw.cz> * g++.dg/lto/alias-5_0.C: New testcase. 
diff --git a/gcc/testsuite/gcc.target/arm/crypto-vsha1cq_u32.c b/gcc/testsuite/gcc.target/arm/crypto-vsha1cq_u32.c index 4dc9dee..41f97a7 100644 --- a/gcc/testsuite/gcc.target/arm/crypto-vsha1cq_u32.c +++ b/gcc/testsuite/gcc.target/arm/crypto-vsha1cq_u32.c @@ -1,11 +1,12 @@ /* { dg-do compile } */ /* { dg-require-effective-target arm_crypto_ok } */ /* { dg-add-options arm_crypto } */ +/* { dg-additional-options "-O3" } */ #include "arm_neon.h" -int -foo (void) +uint32_t foo (void) + { uint32_t hash = 0xdeadbeef; uint32x4_t a = {0, 1, 2, 3}; @@ -15,4 +16,20 @@ foo (void) return res[0]; } -/* { dg-final { scan-assembler "sha1c.32\tq\[0-9\]+, q\[0-9\]+" } } */ +#define GET_LANE(lane) \ + uint32x4_t foo_lane##lane (uint32x4_t val,uint32x4_t a, uint32x4_t b)\ + { \ + return vsha1cq_u32 (a, vgetq_lane_u32 (val, lane), b); \ + } + +#define TEST_SHA1C_VEC_SELECT(FUNC) \ + FUNC (0) \ + FUNC (1) \ + FUNC (2) \ + FUNC (3) \ + +TEST_SHA1C_VEC_SELECT (GET_LANE) + +/* { dg-final { scan-assembler-times {sha1c.32\tq[0-9]+, q[0-9]+} 5 } } */ +/* { dg-final { scan-assembler-times {vdup.32\tq[0-9]+, r[0-9]+} 3 } } */ +/* { dg-final { scan-assembler-times {vmov.32\tr[0-9]+, d[0-9]+\[[0-9]+\]+} 4 } } */ diff --git a/gcc/testsuite/gcc.target/arm/crypto-vsha1h_u32.c b/gcc/testsuite/gcc.target/arm/crypto-vsha1h_u32.c index dee2774..b284667 100644 --- a/gcc/testsuite/gcc.target/arm/crypto-vsha1h_u32.c +++ b/gcc/testsuite/gcc.target/arm/crypto-vsha1h_u32.c @@ -1,14 +1,31 @@ /* { dg-do compile } */ /* { dg-require-effective-target arm_crypto_ok } */ /* { dg-add-options arm_crypto } */ +/* { dg-additional-options "-O3" } */ #include "arm_neon.h" -int -foo (void) +uint32_t foo (void) + { uint32_t val = 0xdeadbeef; return vsha1h_u32 (val); } -/* { dg-final { scan-assembler "sha1h.32\tq\[0-9\]+, q\[0-9\]+" } } */ +#define GET_LANE(lane) \ + uint32_t foo_lane##lane (uint32x4_t val) \ + { \ + return vsha1h_u32 (vgetq_lane_u32 (val, lane)); \ + } + +#define TEST_SHA1H_VEC_SELECT(FUNC) \ + FUNC (0) 
\ + FUNC (1) \ + FUNC (2) \ + FUNC (3) \ + +TEST_SHA1H_VEC_SELECT (GET_LANE) + +/* { dg-final { scan-assembler-times {sha1h.32\tq[0-9]+, q[0-9]+} 5 } } */ +/* { dg-final { scan-assembler-times {vdup.32\tq[0-9]+, r[0-9]+} 3 } } */ +/* { dg-final { scan-assembler-times {vmov.32\tr[0-9]+, d[0-9]+\[[0-9]+\]+} 8 } } */ diff --git a/gcc/testsuite/gcc.target/arm/crypto-vsha1mq_u32.c b/gcc/testsuite/gcc.target/arm/crypto-vsha1mq_u32.c index 672b93a..676e64c 100644 --- a/gcc/testsuite/gcc.target/arm/crypto-vsha1mq_u32.c +++ b/gcc/testsuite/gcc.target/arm/crypto-vsha1mq_u32.c @@ -1,11 +1,12 @@ /* { dg-do compile } */ /* { dg-require-effective-target arm_crypto_ok } */ /* { dg-add-options arm_crypto } */ +/* { dg-additional-options "-O3" } */ #include "arm_neon.h" -int -foo (void) +uint32_t foo (void) + { uint32_t hash = 0xdeadbeef; uint32x4_t a = {0, 1, 2, 3}; @@ -15,4 +16,20 @@ foo (void) return res[0]; } -/* { dg-final { scan-assembler "sha1m.32\tq\[0-9\]+, q\[0-9\]+" } } */ +#define GET_LANE(lane) \ + uint32x4_t foo_lane##lane (uint32x4_t val,uint32x4_t a, uint32x4_t b)\ + { \ + return vsha1mq_u32 (a, vgetq_lane_u32 (val, lane), b); \ + } + +#define TEST_SHA1M_VEC_SELECT(FUNC) \ + FUNC (0) \ + FUNC (1) \ + FUNC (2) \ + FUNC (3) \ + +TEST_SHA1M_VEC_SELECT (GET_LANE) + +/* { dg-final { scan-assembler-times {sha1m.32\tq[0-9]+, q[0-9]+} 5 } } */ +/* { dg-final { scan-assembler-times {vdup.32\tq[0-9]+, r[0-9]+} 3 } } */ +/* { dg-final { scan-assembler-times {vmov.32\tr[0-9]+, d[0-9]+\[[0-9]+\]+} 4 } } */ diff --git a/gcc/testsuite/gcc.target/arm/crypto-vsha1pq_u32.c b/gcc/testsuite/gcc.target/arm/crypto-vsha1pq_u32.c index ff508e0..ed10fe2 100644 --- a/gcc/testsuite/gcc.target/arm/crypto-vsha1pq_u32.c +++ b/gcc/testsuite/gcc.target/arm/crypto-vsha1pq_u32.c @@ -1,11 +1,12 @@ /* { dg-do compile } */ /* { dg-require-effective-target arm_crypto_ok } */ /* { dg-add-options arm_crypto } */ +/* { dg-additional-options "-O3" } */ #include "arm_neon.h" -int -foo (void) +uint32_t foo 
(void) + { uint32_t hash = 0xdeadbeef; uint32x4_t a = {0, 1, 2, 3}; @@ -15,4 +16,20 @@ foo (void) return res[0]; } -/* { dg-final { scan-assembler "sha1p.32\tq\[0-9\]+, q\[0-9\]+" } } */ +#define GET_LANE(lane) \ + uint32x4_t foo_lane##lane (uint32x4_t val,uint32x4_t a, uint32x4_t b)\ + { \ + return vsha1pq_u32 (a, vgetq_lane_u32 (val, lane), b); \ + } + +#define TEST_SHA1P_VEC_SELECT(FUNC) \ + FUNC (0) \ + FUNC (1) \ + FUNC (2) \ + FUNC (3) \ + +TEST_SHA1P_VEC_SELECT (GET_LANE) + +/* { dg-final { scan-assembler-times {sha1p.32\tq[0-9]+, q[0-9]+} 5 } } */ +/* { dg-final { scan-assembler-times {vdup.32\tq[0-9]+, r[0-9]+} 3 } } */ +/* { dg-final { scan-assembler-times {vmov.32\tr[0-9]+, d[0-9]+\[[0-9]+\]+} 4 } } */ |