author     Soumya AR <soumyaa@nvidia.com>  2024-11-13 10:20:14 +0530
committer  Soumya AR <soumyaa@nvidia.com>  2024-11-13 10:20:14 +0530
commit     9b2915d95d855333d4d8f66b71a75f653ee0d076
tree       a31c5d16c0f9792665a7ed761776d206c35c5dbe /gcc
parent     445d8bb6a89eb2275c4930ec87a98d5123e5abdd
aarch64: Optimise calls to ldexp with SVE FSCALE instruction [PR111733]
This patch uses the FSCALE instruction provided by SVE to implement the
standard ldexp family of functions.
Currently, with '-Ofast -mcpu=neoverse-v2', GCC generates libcalls for the
following code:
float
test_ldexpf (float x, int i)
{
  return __builtin_ldexpf (x, i);
}

double
test_ldexp (double x, int i)
{
  return __builtin_ldexp (x, i);
}
GCC Output:
test_ldexpf:
b ldexpf
test_ldexp:
b ldexp
Since SVE provides the FSCALE instruction, we can use it for scalar floats by
moving them into vector registers and applying FSCALE there, similar to how
LLVM handles the ldexp builtins.
New Output:
test_ldexpf:
fmov s31, w0
ptrue p7.b, vl4
fscale z0.s, p7/m, z0.s, z31.s
ret
test_ldexp:
sxtw x0, w0
ptrue p7.b, vl8
fmov d31, x0
fscale z0.d, p7/m, z0.d, z31.d
ret
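To illustrate the idea (this sketch is an assumption for exposition, not part of
the patch; the function name ldexpf_via_fscale is hypothetical), the expansion
behaves much like the following ACLE intrinsics, which broadcast the scalar
operands into SVE registers, apply FSCALE under a predicate with a single active
element, and read lane 0 back. It assumes arm_sve.h and an SVE-enabled target
(e.g. -march=armv8-a+sve):

#include <arm_sve.h>

/* ldexpf (x, i) == x * 2^i, computed with FSCALE on one vector lane.  */
float
ldexpf_via_fscale (float x, int i)
{
  svbool_t pg = svptrue_pat_b8 (SV_VL4);         /* exactly one .s lane active */
  svfloat32_t vx = svdup_n_f32 (x);              /* scalar value into a Z register */
  svint32_t vi = svdup_n_s32 (i);                /* scalar exponent into a Z register */
  svfloat32_t res = svscale_f32_m (pg, vx, vi);  /* FSCALE under the predicate */
  return svlastb_f32 (pg, res);                  /* extract lane 0 */
}

The generated code above is tighter than this sketch: the float input already
sits in the low lane of z0, so only the exponent needs an fmov, and the
single-element ptrue makes the remaining lanes irrelevant.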
This is a revision of an earlier patch, and now uses the extended definition of
aarch64_ptrue_reg to generate predicate registers with the appropriate set bits.
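As a rough illustration of that point (again an assumption, written with ACLE
intrinsics rather than GCC internals; the helper names are hypothetical), the
expander requests a predicate whose first GET_MODE_UNIT_SIZE bytes are set,
i.e. exactly one lane of the scalar's element width, which is what the
vl2/vl4/vl8 patterns in the new test check for:

#include <arm_sve.h>

/* One active lane per scalar element width.  */
svbool_t one_active_h (void) { return svptrue_pat_b8 (SV_VL2); } /* _Float16: 2 bytes -> ptrue p.b, vl2 */
svbool_t one_active_s (void) { return svptrue_pat_b8 (SV_VL4); } /* float:    4 bytes -> ptrue p.b, vl4 */
svbool_t one_active_d (void) { return svptrue_pat_b8 (SV_VL8); } /* double:   8 bytes -> ptrue p.b, vl8 */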
The patch was bootstrapped and regtested on aarch64-linux-gnu with no regressions.
OK for mainline?
Signed-off-by: Soumya AR <soumyaa@nvidia.com>
gcc/ChangeLog:
PR target/111733
* config/aarch64/aarch64-sve.md
(ldexp<mode>3): Added a new pattern to match ldexp calls with scalar
floating modes and expand to the existing pattern for FSCALE.
* config/aarch64/iterators.md:
(SVE_FULL_F_SCALAR): Added an iterator to match all FP SVE modes as well
as their scalar equivalents.
(VPRED): Extended the attribute to handle GPF_HF modes.
* internal-fn.def (LDEXP): Changed macro to incorporate ldexpf16.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/sve/fscale.c: New test.
Diffstat (limited to 'gcc')
gcc/config/aarch64/aarch64-sve.md             | 25
gcc/config/aarch64/iterators.md               |  6
gcc/internal-fn.def                           |  2
gcc/testsuite/gcc.target/aarch64/sve/fscale.c | 46
4 files changed, 72 insertions, 7 deletions
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 5f0ecf4..affdb24 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -5088,6 +5088,21 @@
 ;; - FTSSEL
 ;; -------------------------------------------------------------------------
 
+(define_expand "ldexp<mode>3"
+  [(set (match_operand:GPF_HF 0 "register_operand")
+        (unspec:GPF_HF
+          [(match_dup 3)
+           (const_int SVE_STRICT_GP)
+           (match_operand:GPF_HF 1 "register_operand")
+           (match_operand:<V_INT_EQUIV> 2 "register_operand")]
+          UNSPEC_COND_FSCALE))]
+  "TARGET_SVE"
+  {
+    operands[3] = aarch64_ptrue_reg (<VPRED>mode,
+                                     GET_MODE_UNIT_SIZE (<MODE>mode));
+  }
+)
+
 ;; Unpredicated floating-point binary operations that take an integer as
 ;; their second operand.
 (define_insn "@aarch64_sve_<optab><mode>"
@@ -5103,17 +5118,17 @@
 ;; Predicated floating-point binary operations that take an integer
 ;; as their second operand.
 (define_insn "@aarch64_pred_<optab><mode>"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand")
-        (unspec:SVE_FULL_F
+  [(set (match_operand:SVE_FULL_F_SCALAR 0 "register_operand")
+        (unspec:SVE_FULL_F_SCALAR
           [(match_operand:<VPRED> 1 "register_operand")
            (match_operand:SI 4 "aarch64_sve_gp_strictness")
-           (match_operand:SVE_FULL_F 2 "register_operand")
+           (match_operand:SVE_FULL_F_SCALAR 2 "register_operand")
            (match_operand:<V_INT_EQUIV> 3 "register_operand")]
           SVE_COND_FP_BINARY_INT))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
-     [ w , Upl , 0 , w ; * ] <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
-     [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+     [ w , Upl , 0 , w ; * ] <sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
+     [ ?&w , Upl , w , w ; yes ] movprfx\t%Z0, %Z2\;<sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
   }
 )
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 8e3b5731..ce8f032 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -452,6 +452,9 @@
 ;; All fully-packed SVE floating-point vector modes.
 (define_mode_iterator SVE_FULL_F [VNx8HF VNx4SF VNx2DF])
 
+;; Fully-packed SVE floating-point vector modes and their scalar equivalents.
+(define_mode_iterator SVE_FULL_F_SCALAR [SVE_FULL_F GPF_HF])
+
 ;; Fully-packed SVE integer vector modes that have 8-bit or 16-bit elements.
 (define_mode_iterator SVE_FULL_BHI [VNx16QI VNx8HI])
 
@@ -2354,7 +2357,8 @@
                                  (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
                                  (V8QI "VNx8BI") (V16QI "VNx16BI")
                                  (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
-                                 (V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")])
+                                 (V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")
+                                 (HF "VNx8BI") (SF "VNx4BI") (DF "VNx2BI")])
 
 ;; ...and again in lower case.
 (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index c3d0efc..09b7844 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -441,7 +441,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_FMADDSUB, ECF_CONST, vec_fmaddsub, ternary)
 DEF_INTERNAL_OPTAB_FN (VEC_FMSUBADD, ECF_CONST, vec_fmsubadd, ternary)
 
 /* FP scales.  */
-DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp, binary)
+DEF_INTERNAL_FLT_FLOATN_FN (LDEXP, ECF_CONST, ldexp, binary)
 
 /* Ternary math functions.  */
 DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fscale.c b/gcc/testsuite/gcc.target/aarch64/sve/fscale.c
new file mode 100644
index 0000000..23e295d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fscale.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+/*
+** test_ldexpf16:
+**	...
+**	ptrue	(p[0-7])\.b, vl2
+**	...
+**	fscale	z[0-9]+\.h, \1/m, z[0-9]+\.h, z[0-9]+\.h
+**	ret
+*/
+_Float16
+test_ldexpf16 (_Float16 x, int i)
+{
+  return __builtin_ldexpf16 (x, i);
+}
+
+/*
+** test_ldexpf:
+**	...
+**	ptrue	(p[0-7])\.b, vl4
+**	...
+**	fscale	z[0-9]+\.s, \1/m, z[0-9]+\.s, z[0-9]+\.s
+**	ret
+*/
+float
+test_ldexpf (float x, int i)
+{
+  return __builtin_ldexpf (x, i);
+}
+
+/*
+** test_ldexp:
+**	...
+**	ptrue	(p[0-7])\.b, vl8
+**	...
+**	fscale	z[0-9]+\.d, \1/m, z[0-9]+\.d, z[0-9]+\.d
+**	ret
+*/
+double
+test_ldexp (double x, int i)
+{
+  return __builtin_ldexp (x, i);
+}