author     Richard Sandiford <richard.sandiford@arm.com>  2023-12-05 10:11:18 +0000
committer  Richard Sandiford <richard.sandiford@arm.com>  2023-12-05 10:11:18 +0000
commit     80f47d7bbe38234e1530d27fe5c2f130223ca7a0
tree       156bdf4004485e1ae4a121ba8d674545db66a56a
parent     c0cf2c893d54420b0c19fee7bd41ae40017d0106
aarch64: Use SVE's RDVL instruction
We didn't previously use SVE's RDVL instruction, since the CNT*
forms are preferred and provide most of the range.  However,
there are some cases that RDVL can handle and CNT* can't,
and using RDVL-like instructions becomes important for SME.

gcc/
	* config/aarch64/aarch64-protos.h (aarch64_sve_rdvl_immediate_p)
	(aarch64_output_sve_rdvl): Declare.
	* config/aarch64/aarch64.cc (aarch64_sve_cnt_factor_p): New
	function, split out from...
	(aarch64_sve_cnt_immediate_p): ...here.
	(aarch64_sve_rdvl_factor_p): New function.
	(aarch64_sve_rdvl_immediate_p): Likewise.
	(aarch64_output_sve_rdvl): Likewise.
	(aarch64_offset_temporaries): Rewrite the SVE handling to use RDVL
	for some cases.
	(aarch64_expand_mov_immediate): Handle RDVL immediates.
	(aarch64_mov_operand_p): Likewise.
	* config/aarch64/constraints.md (Usr): New constraint.
	* config/aarch64/aarch64.md (*mov<SHORT:mode>_aarch64): Add an RDVL
	alternative.
	(*movsi_aarch64, *movdi_aarch64): Likewise.

gcc/testsuite/
	* gcc.target/aarch64/sve/acle/asm/cntb.c: Tweak expected output.
	* gcc.target/aarch64/sve/acle/asm/cnth.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/cntw.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/cntd.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/prfb.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/prfh.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/prfw.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/prfd.c: Likewise.
	* gcc.target/aarch64/sve/loop_add_4.c: Expect RDVL to be used
	to calculate the -17 and 17 factors.
	* gcc.target/aarch64/sve/pcs/stack_clash_1.c: Likewise the 18
	factor.
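As a concrete illustration (not part of the patch): with this change, a VL
multiple that CNT[BHWD] cannot encode becomes a single instruction.  A minimal
sketch, assuming an SVE-enabled compiler at -O2; the expected assembly is taken
from the cntb.c testsuite updates below.

    #include <arm_sve.h>
    #include <stdint.h>

    /* 17 is outside CNTB's multiplier range [1, 16], so GCC previously
       emitted "cntb x0, all, mul #16; incb x0".  With this patch the
       constant is loaded directly as "rdvl x0, #17".  */
    uint64_t
    seventeen_vls (void)
    {
      return svcntb () * 17;
    }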
-rw-r--r--  gcc/config/aarch64/aarch64-protos.h                         2
-rw-r--r--  gcc/config/aarch64/aarch64.cc                             191
-rw-r--r--  gcc/config/aarch64/aarch64.md                               3
-rw-r--r--  gcc/config/aarch64/constraints.md                           6
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c       71
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c       12
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c       20
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c       16
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c        6
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c        4
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c        4
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c        4
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c           6
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c    3
14 files changed, 225 insertions, 123 deletions
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index b0b7d33..765c429 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -798,6 +798,7 @@ bool aarch64_sve_mode_p (machine_mode);
HOST_WIDE_INT aarch64_fold_sve_cnt_pat (aarch64_svpattern, unsigned int);
bool aarch64_sve_cnt_immediate_p (rtx);
bool aarch64_sve_scalar_inc_dec_immediate_p (rtx);
+bool aarch64_sve_rdvl_immediate_p (rtx);
bool aarch64_sve_addvl_addpl_immediate_p (rtx);
bool aarch64_sve_vector_inc_dec_immediate_p (rtx);
int aarch64_add_offset_temporaries (rtx);
@@ -810,6 +811,7 @@ char *aarch64_output_sve_prefetch (const char *, rtx, const char *);
char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx);
char *aarch64_output_sve_cnt_pat_immediate (const char *, const char *, rtx *);
char *aarch64_output_sve_scalar_inc_dec (rtx);
+char *aarch64_output_sve_rdvl (rtx);
char *aarch64_output_sve_addvl_addpl (rtx);
char *aarch64_output_sve_vector_inc_dec (const char *, rtx);
char *aarch64_output_scalar_simd_mov_immediate (rtx, scalar_int_mode);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index c864f4c..7a5d0d3 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -2933,6 +2933,18 @@ aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
return -1;
}
+/* Return true if a single CNT[BHWD] instruction can multiply FACTOR
+ by the number of 128-bit quadwords in an SVE vector. */
+
+static bool
+aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
+{
+ /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
+ return (IN_RANGE (factor, 2, 16 * 16)
+ && (factor & 1) == 0
+ && factor <= 16 * (factor & -factor));
+}
+
/* Return true if we can move VALUE into a register using a single
CNT[BHWD] instruction. */
@@ -2940,11 +2952,7 @@ static bool
aarch64_sve_cnt_immediate_p (poly_int64 value)
{
HOST_WIDE_INT factor = value.coeffs[0];
- /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
- return (value.coeffs[1] == factor
- && IN_RANGE (factor, 2, 16 * 16)
- && (factor & 1) == 0
- && factor <= 16 * (factor & -factor));
+ return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
}
/* Likewise for rtx X. */
@@ -3060,6 +3068,50 @@ aarch64_output_sve_scalar_inc_dec (rtx offset)
-offset_value.coeffs[1], 0);
}
+/* Return true if a single RDVL instruction can multiply FACTOR by the
+ number of 128-bit quadwords in an SVE vector. */
+
+static bool
+aarch64_sve_rdvl_factor_p (HOST_WIDE_INT factor)
+{
+ return (multiple_p (factor, 16)
+ && IN_RANGE (factor, -32 * 16, 31 * 16));
+}
+
+/* Return true if we can move VALUE into a register using a single
+ RDVL instruction. */
+
+static bool
+aarch64_sve_rdvl_immediate_p (poly_int64 value)
+{
+ HOST_WIDE_INT factor = value.coeffs[0];
+ return value.coeffs[1] == factor && aarch64_sve_rdvl_factor_p (factor);
+}
+
+/* Likewise for rtx X. */
+
+bool
+aarch64_sve_rdvl_immediate_p (rtx x)
+{
+ poly_int64 value;
+ return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
+}
+
+/* Return the asm string for moving RDVL immediate OFFSET into register
+ operand 0. */
+
+char *
+aarch64_output_sve_rdvl (rtx offset)
+{
+ static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
+ poly_int64 offset_value = rtx_to_poly_int64 (offset);
+ gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));
+
+ int factor = offset_value.coeffs[1];
+ snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
+ return buffer;
+}
+
/* Return true if we can add VALUE to a register using a single ADDVL
or ADDPL instruction. */
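For reference, the two new factor tests can be restated as standalone C
(a sketch mirroring the predicates above, not GCC-internal code):

    #include <stdbool.h>
    #include <stdint.h>

    /* Mirror of aarch64_sve_cnt_factor_p: FACTOR must be m * esize with
       m in [1, 16] and esize in {2, 4, 8, 16}.  */
    static bool
    cnt_factor_p (int64_t factor)
    {
      return (factor >= 2 && factor <= 16 * 16
	      && (factor & 1) == 0
	      && factor <= 16 * (factor & -factor));
    }

    /* Mirror of aarch64_sve_rdvl_factor_p: RDVL takes a 6-bit signed
       multiplier of the vector length (16 bytes per quadword).  */
    static bool
    rdvl_factor_p (int64_t factor)
    {
      return factor % 16 == 0 && factor >= -32 * 16 && factor <= 31 * 16;
    }

For example, factor 272 (17 * 16) passes rdvl_factor_p but fails
cnt_factor_p, since 272 > 16 * 16; hence "rdvl x0, #17" for svcntb () * 17.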
@@ -3689,13 +3741,13 @@ aarch64_offset_temporaries (bool add_p, poly_int64 offset)
count += 1;
else if (factor != 0)
{
- factor = abs (factor);
- if (factor > 16 * (factor & -factor))
- /* Need one register for the CNT result and one for the multiplication
- factor. If necessary, the second temporary can be reused for the
- constant part of the offset. */
+ factor /= (HOST_WIDE_INT) least_bit_hwi (factor);
+ if (!IN_RANGE (factor, -32, 31))
+ /* Need one register for the CNT or RDVL result and one for the
+ multiplication factor. If necessary, the second temporary
+ can be reused for the constant part of the offset. */
return 2;
- /* Need one register for the CNT result (which might then
+ /* Need one register for the CNT or RDVL result (which might then
be shifted). */
count += 1;
}
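The new temporary-count test above can be summarized as: after stripping the
low set bit, the remaining odd multiplier must fit RDVL's [-32, 31] range for
a single-temporary sequence.  A sketch of just that test (not GCC code, and
ignoring the constant part of the offset):

    #include <stdint.h>

    static int
    sve_factor_temporaries (int64_t factor)
    {
      int64_t low_bit = factor & -factor;	/* least set bit */
      int64_t rel_factor = factor / low_bit;
      return (rel_factor >= -32 && rel_factor <= 31) ? 1 : 2;
    }

    /* factor -544 (-34 * 16 bytes): rel_factor = -17, one temporary.
       factor 1040 (65 * 16 bytes): rel_factor = 65, two temporaries.  */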
@@ -3784,85 +3836,100 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
/* Otherwise use a CNT-based sequence. */
else if (factor != 0)
{
- /* Use a subtraction if we have a negative factor. */
- rtx_code code = PLUS;
- if (factor < 0)
- {
- factor = -factor;
- code = MINUS;
- }
+ /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
+ with negative shifts indicating a shift right. */
+ HOST_WIDE_INT low_bit = least_bit_hwi (factor);
+ HOST_WIDE_INT rel_factor = factor / low_bit;
+ int shift = exact_log2 (low_bit) - 4;
+ gcc_assert (shift >= -4 && (rel_factor & 1) != 0);
+
+ /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
+ equal to CNTB * FACTOR / 16, with CODE being the [+-].
- /* Calculate CNTD * FACTOR / 2. First try to fold the division
- into the multiplication. */
+ We can avoid a multiplication if REL_FACTOR is in the range
+ of RDVL, although there are then various optimizations that
+ we can try on top. */
+ rtx_code code = PLUS;
rtx val;
- int shift = 0;
- if (factor & 1)
- /* Use a right shift by 1. */
- shift = -1;
- else
- factor /= 2;
- HOST_WIDE_INT low_bit = factor & -factor;
- if (factor <= 16 * low_bit)
+ if (IN_RANGE (rel_factor, -32, 31))
{
- if (factor > 16 * 8)
+ /* Try to use an unshifted CNT[BHWD] or RDVL. */
+ if (aarch64_sve_cnt_factor_p (factor)
+ || aarch64_sve_rdvl_factor_p (factor))
+ {
+ val = gen_int_mode (poly_int64 (factor, factor), mode);
+ shift = 0;
+ }
+ /* Try to subtract an unshifted CNT[BHWD]. */
+ else if (aarch64_sve_cnt_factor_p (-factor))
{
- /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
- the value with the minimum multiplier and shift it into
- position. */
- int extra_shift = exact_log2 (low_bit);
- shift += extra_shift;
- factor >>= extra_shift;
+ code = MINUS;
+ val = gen_int_mode (poly_int64 (-factor, -factor), mode);
+ shift = 0;
}
- val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
+ /* If subtraction is free, prefer to load a positive constant.
+ In the best case this will fit a shifted CNTB. */
+ else if (src != const0_rtx && rel_factor < 0)
+ {
+ code = MINUS;
+ val = gen_int_mode (-rel_factor * BYTES_PER_SVE_VECTOR, mode);
+ }
+ /* Otherwise use a shifted RDVL or CNT[BHWD]. */
+ else
+ val = gen_int_mode (rel_factor * BYTES_PER_SVE_VECTOR, mode);
}
else
{
- /* Base the factor on LOW_BIT if we can calculate LOW_BIT
- directly, since that should increase the chances of being
- able to use a shift and add sequence. If LOW_BIT itself
- is out of range, just use CNTD. */
- if (low_bit <= 16 * 8)
- factor /= low_bit;
+ /* If we can calculate CNTB << SHIFT directly, prefer to do that,
+ since it should increase the chances of being able to use
+ a shift and add sequence for the multiplication.
+ If CNTB << SHIFT is out of range, stick with the current
+ shift factor. */
+ if (IN_RANGE (low_bit, 2, 16 * 16))
+ {
+ val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
+ shift = 0;
+ }
else
- low_bit = 1;
+ val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
- val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
val = aarch64_force_temporary (mode, temp1, val);
+ /* Prefer to multiply by a positive factor and subtract rather
+ than multiply by a negative factor and add, since positive
+ values are usually easier to move. */
+ if (rel_factor < 0 && src != const0_rtx)
+ {
+ rel_factor = -rel_factor;
+ code = MINUS;
+ }
+
if (can_create_pseudo_p ())
{
- rtx coeff1 = gen_int_mode (factor, mode);
+ rtx coeff1 = gen_int_mode (rel_factor, mode);
val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
}
else
{
- /* Go back to using a negative multiplication factor if we have
- no register from which to subtract. */
- if (code == MINUS && src == const0_rtx)
- {
- factor = -factor;
- code = PLUS;
- }
- rtx coeff1 = gen_int_mode (factor, mode);
+ rtx coeff1 = gen_int_mode (rel_factor, mode);
coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
val = gen_rtx_MULT (mode, val, coeff1);
}
}
+ /* Multiply by 2 ** SHIFT. */
if (shift > 0)
{
- /* Multiply by 1 << SHIFT. */
val = aarch64_force_temporary (mode, temp1, val);
val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
}
- else if (shift == -1)
+ else if (shift < 0)
{
- /* Divide by 2. */
val = aarch64_force_temporary (mode, temp1, val);
- val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
+ val = gen_rtx_ASHIFTRT (mode, val, GEN_INT (-shift));
}
- /* Calculate SRC +/- CNTD * FACTOR / 2. */
+ /* Add the result to SRC or subtract the result from SRC. */
if (src != const0_rtx)
{
val = aarch64_force_temporary (mode, temp1, val);
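To make the new decomposition concrete, here is the
CNTB * FACTOR / 16 == CNTB * REL_FACTOR * 2**SHIFT split as a sketch
(assuming 64-bit HOST_WIDE_INT semantics), with two cases from the
testsuite updates below worked through:

    #include <stdint.h>

    /* Split FACTOR (the per-quadword coefficient, in bytes) into an odd
       REL_FACTOR and a power-of-two SHIFT, as the rewritten
       aarch64_add_offset does.  */
    static void
    decompose (int64_t factor, int64_t *rel_factor, int *shift)
    {
      int64_t low_bit = factor & -factor;	/* least set bit */
      *rel_factor = factor / low_bit;
      *shift = __builtin_ctzll (low_bit) - 4;	/* -4 divides by 16 */
    }

    /* factor -544 (an offset of -34 * VL bytes): low_bit = 32, giving
       rel_factor = -17 and shift = 1.  -17 fits RDVL, so we get
       "rdvl x, #-17; lsl x0, x, #1", matching the cntb_m34 test.
       factor 34 (svcntd () * 17): low_bit = 2, rel_factor = 17,
       shift = -3, hence "rdvl x, #17; asr x0, x, #3" as in cntd_17.  */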
@@ -4508,7 +4575,9 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
aarch64_report_sve_required ();
return;
}
- if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
+ if (base == const0_rtx
+ && (aarch64_sve_cnt_immediate_p (offset)
+ || aarch64_sve_rdvl_immediate_p (offset)))
emit_insn (gen_rtx_SET (dest, imm));
else
{
@@ -19641,7 +19710,9 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
return true;
- if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
+ if (TARGET_SVE
+ && (aarch64_sve_cnt_immediate_p (x)
+ || aarch64_sve_rdvl_immediate_p (x)))
return true;
return aarch64_classify_symbolic_expression (x)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 202190c..d843f47 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1230,6 +1230,7 @@
[w, D<hq>; neon_move , simd ] << aarch64_output_scalar_simd_mov_immediate (operands[1], <MODE>mode);
/* The "mov_imm" type for CNT is just a placeholder. */
[r, Usv ; mov_imm , sve ] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
+ [r, Usr ; mov_imm , sve ] << aarch64_output_sve_rdvl (operands[1]);
[r, m ; load_4 , * ] ldr<size>\t%w0, %1
[w, m ; load_4 , * ] ldr\t%<size>0, %1
[m, r Z ; store_4 , * ] str<size>\\t%w1, %0
@@ -1289,6 +1290,7 @@
[r , n ; mov_imm , * ,16] #
/* The "mov_imm" type for CNT is just a placeholder. */
[r , Usv; mov_imm , sve , 4] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
+ [r , Usr; mov_imm , sve, 4] << aarch64_output_sve_rdvl (operands[1]);
[r , m ; load_4 , * , 4] ldr\t%w0, %1
[w , m ; load_4 , fp , 4] ldr\t%s0, %1
[m , r Z; store_4 , * , 4] str\t%w1, %0
@@ -1324,6 +1326,7 @@
[r, n ; mov_imm , * ,16] #
/* The "mov_imm" type for CNT is just a placeholder. */
[r, Usv; mov_imm , sve , 4] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
+ [r, Usr; mov_imm , sve, 4] << aarch64_output_sve_rdvl (operands[1]);
[r, m ; load_8 , * , 4] ldr\t%x0, %1
[w, m ; load_8 , fp , 4] ldr\t%d0, %1
[m, r Z; store_8 , * , 4] str\t%x1, %0
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index b3922bc..5c02d15 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -219,6 +219,12 @@
(and (match_code "const_int")
(match_test "aarch64_high_bits_all_ones_p (ival)")))
+(define_constraint "Usr"
+ "@internal
+ A constraint that matches a value produced by RDVL."
+ (and (match_code "const_poly_int")
+ (match_test "aarch64_sve_rdvl_immediate_p (op)")))
+
(define_constraint "Usv"
"@internal
A constraint that matches a VG-based constant that can be loaded by
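In practice the new "Usr" alternative fires whenever a move source is a
const_poly_int accepted by aarch64_sve_rdvl_immediate_p.  A sketch, compiled
with SVE enabled; the expected output matches the cntb_31 test below:

    #include <arm_sve.h>
    #include <stdint.h>

    /* 31 * VL is a multiple of 16 whose multiplier is in RDVL's
       [-32, 31] range, so this is a single "rdvl x0, #31".  */
    uint64_t
    thirty_one_vls (void)
    {
      return svcntb () * 31;
    }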
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c
index 8b8fe8e..a22d8a2 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c
@@ -51,19 +51,24 @@ PROTO (cntb_15, uint64_t, ()) { return svcntb () * 15; }
*/
PROTO (cntb_16, uint64_t, ()) { return svcntb () * 16; }
-/* Other sequences would be OK. */
/*
** cntb_17:
-** cntb x0, all, mul #16
-** incb x0
+** rdvl x0, #17
** ret
*/
PROTO (cntb_17, uint64_t, ()) { return svcntb () * 17; }
/*
+** cntb_31:
+** rdvl x0, #31
+** ret
+*/
+PROTO (cntb_31, uint64_t, ()) { return svcntb () * 31; }
+
+/*
** cntb_32:
-** cntd (x[0-9]+)
-** lsl x0, \1, 8
+** cntb (x[0-9]+)
+** lsl x0, \1, 5
** ret
*/
PROTO (cntb_32, uint64_t, ()) { return svcntb () * 32; }
@@ -80,16 +85,16 @@ PROTO (cntb_33, uint64_t, ()) { return svcntb () * 33; }
/*
** cntb_64:
-** cntd (x[0-9]+)
-** lsl x0, \1, 9
+** cntb (x[0-9]+)
+** lsl x0, \1, 6
** ret
*/
PROTO (cntb_64, uint64_t, ()) { return svcntb () * 64; }
/*
** cntb_128:
-** cntd (x[0-9]+)
-** lsl x0, \1, 10
+** cntb (x[0-9]+)
+** lsl x0, \1, 7
** ret
*/
PROTO (cntb_128, uint64_t, ()) { return svcntb () * 128; }
@@ -106,47 +111,71 @@ PROTO (cntb_129, uint64_t, ()) { return svcntb () * 129; }
/*
** cntb_m1:
-** cntb (x[0-9]+)
-** neg x0, \1
+** rdvl x0, #-1
** ret
*/
PROTO (cntb_m1, uint64_t, ()) { return -svcntb (); }
/*
** cntb_m13:
-** cntb (x[0-9]+), all, mul #13
-** neg x0, \1
+** rdvl x0, #-13
** ret
*/
PROTO (cntb_m13, uint64_t, ()) { return -svcntb () * 13; }
/*
** cntb_m15:
-** cntb (x[0-9]+), all, mul #15
-** neg x0, \1
+** rdvl x0, #-15
** ret
*/
PROTO (cntb_m15, uint64_t, ()) { return -svcntb () * 15; }
/*
** cntb_m16:
-** cntb (x[0-9]+), all, mul #16
-** neg x0, \1
+** rdvl x0, #-16
** ret
*/
PROTO (cntb_m16, uint64_t, ()) { return -svcntb () * 16; }
-/* Other sequences would be OK. */
/*
** cntb_m17:
-** cntb x0, all, mul #16
-** incb x0
-** neg x0, x0
+** rdvl x0, #-17
** ret
*/
PROTO (cntb_m17, uint64_t, ()) { return -svcntb () * 17; }
/*
+** cntb_m32:
+** rdvl x0, #-32
+** ret
+*/
+PROTO (cntb_m32, uint64_t, ()) { return -svcntb () * 32; }
+
+/*
+** cntb_m33:
+** rdvl x0, #-32
+** decb x0
+** ret
+*/
+PROTO (cntb_m33, uint64_t, ()) { return -svcntb () * 33; }
+
+/*
+** cntb_m34:
+** rdvl (x[0-9]+), #-17
+** lsl x0, \1, #?1
+** ret
+*/
+PROTO (cntb_m34, uint64_t, ()) { return -svcntb () * 34; }
+
+/*
+** cntb_m64:
+** rdvl (x[0-9]+), #-1
+** lsl x0, \1, #?6
+** ret
+*/
+PROTO (cntb_m64, uint64_t, ()) { return -svcntb () * 64; }
+
+/*
** incb_1:
** incb x0
** ret
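The shifted sequences in these expectations rely on simple identities; a quick
runtime check (a sketch; needs SVE hardware to execute):

    #include <arm_sve.h>
    #include <stdint.h>

    int
    main (void)
    {
      int64_t vl = (int64_t) svcntb ();	/* vector length in bytes */
      /* cntb_32: 32 * VL == VL << 5; cntb_m34: -34 * VL == -17 * VL * 2.  */
      return (32 * vl == vl << 5 && -34 * vl == -17 * vl * 2) ? 0 : 1;
    }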
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c
index 0d0ed48..090a643 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c
@@ -54,8 +54,8 @@ PROTO (cntd_16, uint64_t, ()) { return svcntd () * 16; }
/* Other sequences would be OK. */
/*
** cntd_17:
-** cntb x0, all, mul #2
-** incd x0
+** rdvl (x[0-9]+), #17
+** asr x0, \1, 3
** ret
*/
PROTO (cntd_17, uint64_t, ()) { return svcntd () * 17; }
@@ -107,8 +107,7 @@ PROTO (cntd_m15, uint64_t, ()) { return -svcntd () * 15; }
/*
** cntd_m16:
-** cntb (x[0-9]+), all, mul #2
-** neg x0, \1
+** rdvl x0, #-2
** ret
*/
PROTO (cntd_m16, uint64_t, ()) { return -svcntd () * 16; }
@@ -116,9 +115,8 @@ PROTO (cntd_m16, uint64_t, ()) { return -svcntd () * 16; }
/* Other sequences would be OK. */
/*
** cntd_m17:
-** cntb x0, all, mul #2
-** incd x0
-** neg x0, x0
+** rdvl (x[0-9]+), #-17
+** asr x0, \1, 3
** ret
*/
PROTO (cntd_m17, uint64_t, ()) { return -svcntd () * 17; }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c
index c29930f..1a4e7dc 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c
@@ -54,8 +54,8 @@ PROTO (cnth_16, uint64_t, ()) { return svcnth () * 16; }
/* Other sequences would be OK. */
/*
** cnth_17:
-** cntb x0, all, mul #8
-** inch x0
+** rdvl (x[0-9]+), #17
+** asr x0, \1, 1
** ret
*/
PROTO (cnth_17, uint64_t, ()) { return svcnth () * 17; }
@@ -69,16 +69,16 @@ PROTO (cnth_32, uint64_t, ()) { return svcnth () * 32; }
/*
** cnth_64:
-** cntd (x[0-9]+)
-** lsl x0, \1, 8
+** cntb (x[0-9]+)
+** lsl x0, \1, 5
** ret
*/
PROTO (cnth_64, uint64_t, ()) { return svcnth () * 64; }
/*
** cnth_128:
-** cntd (x[0-9]+)
-** lsl x0, \1, 9
+** cntb (x[0-9]+)
+** lsl x0, \1, 6
** ret
*/
PROTO (cnth_128, uint64_t, ()) { return svcnth () * 128; }
@@ -109,8 +109,7 @@ PROTO (cnth_m15, uint64_t, ()) { return -svcnth () * 15; }
/*
** cnth_m16:
-** cntb (x[0-9]+), all, mul #8
-** neg x0, \1
+** rdvl x0, #-8
** ret
*/
PROTO (cnth_m16, uint64_t, ()) { return -svcnth () * 16; }
@@ -118,9 +117,8 @@ PROTO (cnth_m16, uint64_t, ()) { return -svcnth () * 16; }
/* Other sequences would be OK. */
/*
** cnth_m17:
-** cntb x0, all, mul #8
-** inch x0
-** neg x0, x0
+** rdvl (x[0-9]+), #-17
+** asr x0, \1, 1
** ret
*/
PROTO (cnth_m17, uint64_t, ()) { return -svcnth () * 17; }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c
index e26cc67..9d16976 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c
@@ -54,8 +54,8 @@ PROTO (cntw_16, uint64_t, ()) { return svcntw () * 16; }
/* Other sequences would be OK. */
/*
** cntw_17:
-** cntb x0, all, mul #4
-** incw x0
+** rdvl (x[0-9]+), #17
+** asr x0, \1, 2
** ret
*/
PROTO (cntw_17, uint64_t, ()) { return svcntw () * 17; }
@@ -76,8 +76,8 @@ PROTO (cntw_64, uint64_t, ()) { return svcntw () * 64; }
/*
** cntw_128:
-** cntd (x[0-9]+)
-** lsl x0, \1, 8
+** cntb (x[0-9]+)
+** lsl x0, \1, 5
** ret
*/
PROTO (cntw_128, uint64_t, ()) { return svcntw () * 128; }
@@ -108,8 +108,7 @@ PROTO (cntw_m15, uint64_t, ()) { return -svcntw () * 15; }
/*
** cntw_m16:
-** cntb (x[0-9]+), all, mul #4
-** neg x0, \1
+** rdvl x0, #-4
** ret
*/
PROTO (cntw_m16, uint64_t, ()) { return -svcntw () * 16; }
@@ -117,9 +116,8 @@ PROTO (cntw_m16, uint64_t, ()) { return -svcntw () * 16; }
/* Other sequences would be OK. */
/*
** cntw_m17:
-** cntb x0, all, mul #4
-** incw x0
-** neg x0, x0
+** rdvl (x[0-9]+), #-17
+** asr x0, \1, 2
** ret
*/
PROTO (cntw_m17, uint64_t, ()) { return -svcntw () * 17; }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c
index c90730a..94cd3a0 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c
@@ -218,8 +218,8 @@ TEST_PREFETCH (prfb_vnum_31, uint16_t,
/*
** prfb_vnum_32:
-** cntd (x[0-9]+)
-** lsl (x[0-9]+), \1, #?8
+** cntb (x[0-9]+)
+** lsl (x[0-9]+), \1, #?5
** add (x[0-9]+), (\2, x0|x0, \2)
** prfb pldl1keep, p0, \[\3\]
** ret
@@ -240,7 +240,7 @@ TEST_PREFETCH (prfb_vnum_m32, uint16_t,
/*
** prfb_vnum_m33:
** ...
-** prfb pldl1keep, p0, \[x[0-9]+\]
+** prfb pldl1keep, p0, \[x[0-9]+(, x[0-9]+)?\]
** ret
*/
TEST_PREFETCH (prfb_vnum_m33, uint16_t,
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c
index 869ef3d..b7a116c 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c
@@ -218,8 +218,8 @@ TEST_PREFETCH (prfd_vnum_31, uint16_t,
/*
** prfd_vnum_32:
-** cntd (x[0-9]+)
-** lsl (x[0-9]+), \1, #?8
+** cntb (x[0-9]+)
+** lsl (x[0-9]+), \1, #?5
** add (x[0-9]+), (\2, x0|x0, \2)
** prfd pldl1keep, p0, \[\3\]
** ret
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c
index 45a735e..9d3df6b 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c
@@ -218,8 +218,8 @@ TEST_PREFETCH (prfh_vnum_31, uint16_t,
/*
** prfh_vnum_32:
-** cntd (x[0-9]+)
-** lsl (x[0-9]+), \1, #?8
+** cntb (x[0-9]+)
+** lsl (x[0-9]+), \1, #?5
** add (x[0-9]+), (\2, x0|x0, \2)
** prfh pldl1keep, p0, \[\3\]
** ret
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c
index 444187f..6962aba 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c
@@ -218,8 +218,8 @@ TEST_PREFETCH (prfw_vnum_31, uint16_t,
/*
** prfw_vnum_32:
-** cntd (x[0-9]+)
-** lsl (x[0-9]+), \1, #?8
+** cntb (x[0-9]+)
+** lsl (x[0-9]+), \1, #?5
** add (x[0-9]+), (\2, x0|x0, \2)
** prfw pldl1keep, p0, \[\3\]
** ret
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c b/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c
index 9ead9c2..7f02497 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c
@@ -68,8 +68,7 @@ TEST_ALL (LOOP)
/* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.s, w[0-9]+, w[0-9]+\n} 3 } } */
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */
/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */
-/* 2 for the calculations of -17 and 17. */
-/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 10 } } */
+/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 8 } } */
/* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #16\n} 1 } } */
/* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #15\n} 1 } } */
@@ -86,8 +85,7 @@ TEST_ALL (LOOP)
/* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.d, x[0-9]+, x[0-9]+\n} 3 } } */
/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */
/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */
-/* 2 for the calculations of -17 and 17. */
-/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 10 } } */
+/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 8 } } */
/* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #16\n} 1 } } */
/* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #15\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c
index 110947a..5de34fc 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c
@@ -6,8 +6,7 @@
/*
** test_1:
-** cntd x12, all, mul #9
-** lsl x12, x12, #?4
+** rdvl x12, #18
** mov x11, sp
** ...
** sub sp, sp, x12
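The replaced sequence computed the same quantity in two steps: with
CNTD = VL / 8, "cntd x12, all, mul #9" followed by "lsl x12, x12, #4"
yields 9 * (VL / 8) * 16 = 18 * VL, which is exactly what
"rdvl x12, #18" loads.  A sanity check (a sketch; needs SVE hardware):

    #include <arm_sve.h>
    #include <stdint.h>

    int
    main (void)
    {
      int64_t vl = (int64_t) svcntb ();		/* VL in bytes */
      int64_t cntd = (int64_t) svcntd ();	/* VL / 8 */
      return (cntd * 9) << 4 == 18 * vl ? 0 : 1;	/* always 0 */
    }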