aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorFilip Kastl <fkastl@suse.cz>2024-11-13 16:11:14 +0100
committerFilip Kastl <fkastl@suse.cz>2024-11-13 16:11:14 +0100
commit99ec0eb32a03506142f30c158276b4131aa73fe8 (patch)
treeb8016d2ff70bc969fb5ff5a22e5ed574803e3cac /gcc
parent0342d024ff9af258df3bc8a23c9e999bea6be3e0 (diff)
downloadgcc-99ec0eb32a03506142f30c158276b4131aa73fe8.zip
gcc-99ec0eb32a03506142f30c158276b4131aa73fe8.tar.gz
gcc-99ec0eb32a03506142f30c158276b4131aa73fe8.tar.bz2
i386: Add -mveclibabi=aocl [PR56504]
We currently support generating vectorized math calls to the AMD core math library (ACML) (-mveclibabi=acml). That library is end-of-life and its successor is the math library from AMD Optimizing CPU Libraries (AOCL). This patch adds support for AOCL (-mveclibabi=aocl). That significantly broadens the range of vectorized math functions optimized for AMD CPUs that GCC can generate calls to. See the edit to invoke.texi for a complete list of added functions. Compared to the list of functions in AOCL LibM docs I left out these vectorized function families: - sincos and all functions working with arrays ... Because these functions have pointer arguments and that would require a bigger rework of ix86_veclibabi_aocl(). Also, I'm not sure if GCC even ever generates calls to these functions. - linearfrac ... Because these functions are specific to the AMD library. There's no equivalent glibc function nor GCC internal function nor GCC built-in. - powx, sqrt, fabs ... Because GCC doesn't vectorize these functions into calls and uses instructions instead. I also left out amd_vrd2_expm1() (the AMD docs list the function but I wasn't able to link calls to it with the current version of the library). gcc/ChangeLog: PR target/56504 * config/i386/i386-options.cc (ix86_option_override_internal): Add ix86_veclibabi_type_aocl case. * config/i386/i386-options.h (ix86_veclibabi_aocl): Add extern ix86_veclibabi_aocl(). * config/i386/i386-opts.h (enum ix86_veclibabi): Add ix86_veclibabi_type_aocl into the ix86_veclibabi enum. * config/i386/i386.cc (ix86_veclibabi_aocl): New function. * config/i386/i386.opt: Add the 'aocl' type. * doc/invoke.texi: Document -mveclibabi=aocl. gcc/testsuite/ChangeLog: PR target/56504 * gcc.target/i386/vectorize-aocl1.c: New test. Signed-off-by: Filip Kastl <fkastl@suse.cz>
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/i386/i386-options.cc4
-rw-r--r--gcc/config/i386/i386-options.h1
-rw-r--r--gcc/config/i386/i386-opts.h3
-rw-r--r--gcc/config/i386/i386.cc142
-rw-r--r--gcc/config/i386/i386.opt3
-rw-r--r--gcc/doc/invoke.texi57
-rw-r--r--gcc/testsuite/gcc.target/i386/vectorize-aocl1.c224
7 files changed, 418 insertions, 16 deletions
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 603166d..76a2017 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2877,6 +2877,10 @@ ix86_option_override_internal (bool main_args_p,
ix86_veclib_handler = &ix86_veclibabi_acml;
break;
+ case ix86_veclibabi_type_aocl:
+ ix86_veclib_handler = &ix86_veclibabi_aocl;
+ break;
+
default:
gcc_unreachable ();
}
diff --git a/gcc/config/i386/i386-options.h b/gcc/config/i386/i386-options.h
index 0d448ef..591a615 100644
--- a/gcc/config/i386/i386-options.h
+++ b/gcc/config/i386/i386-options.h
@@ -60,6 +60,7 @@ void ix86_simd_clone_adjust (struct cgraph_node *node);
extern tree (*ix86_veclib_handler) (combined_fn, tree, tree);
extern tree ix86_veclibabi_svml (combined_fn, tree, tree);
extern tree ix86_veclibabi_acml (combined_fn, tree, tree);
+extern tree ix86_veclibabi_aocl (combined_fn, tree, tree);
enum ix86_function_specific_strings
{
diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h
index 35542b2..69fcd82 100644
--- a/gcc/config/i386/i386-opts.h
+++ b/gcc/config/i386/i386-opts.h
@@ -87,7 +87,8 @@ enum asm_dialect {
enum ix86_veclibabi {
ix86_veclibabi_type_none,
ix86_veclibabi_type_svml,
- ix86_veclibabi_type_acml
+ ix86_veclibabi_type_acml,
+ ix86_veclibabi_type_aocl
};
enum stack_protector_guard {
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 526c9df..9d3d8ab 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -19882,6 +19882,148 @@ ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
return new_fndecl;
}
+/* Handler for an AOCL-LibM-style vectorized math library interface. Return a
+ decl for the amd_vr* counterpart of FN, or NULL_TREE if none is available. */
+
+tree
+ix86_veclibabi_aocl (combined_fn fn, tree type_out, tree type_in)
+{
+ char name[20] = "amd_vr";
+ int name_len = 6;
+ tree fntype, new_fndecl, args;
+ unsigned arity;
+ const char *bname;
+ machine_mode el_mode, in_mode;
+ int n, in_n;
+
+ /* AOCL-LibM is 64-bit only. It is also suitable only for unsafe math
+ as it trades off some accuracy for increased performance. */
+ if (!TARGET_64BIT
+ || !flag_unsafe_math_optimizations)
+ return NULL_TREE;
+
+ el_mode = TYPE_MODE (TREE_TYPE (type_out));
+ n = TYPE_VECTOR_SUBPARTS (type_out);
+ in_mode = TYPE_MODE (TREE_TYPE (type_in));
+ in_n = TYPE_VECTOR_SUBPARTS (type_in);
+ if (el_mode != in_mode
+ || n != in_n)
+ return NULL_TREE;
+
+ gcc_checking_assert (n > 0);
+
+ /* Decide whether there exists a function for the combination of FN, the mode
+ and the vector width. Return early if it doesn't. */
+
+ if (el_mode != DFmode && el_mode != SFmode)
+ return NULL_TREE;
+
+ /* Supported vector widths, indexed [operation][0 single, 1 double][slot].
+ Zeros are used to fill out unused positions in the arrays. */
+ static const int supported_n[][2][3] = {
+ /* Single prec. , Double prec. */
+ { { 16, 0, 0 }, { 2, 4, 8 } }, /* TAN. */
+ { { 4, 8, 16 }, { 2, 4, 8 } }, /* EXP. */
+ { { 4, 8, 16 }, { 2, 4, 8 } }, /* EXP2. */
+ { { 4, 8, 16 }, { 2, 4, 8 } }, /* LOG. */
+ { { 4, 8, 16 }, { 2, 4, 8 } }, /* LOG2. */
+ { { 4, 8, 16 }, { 2, 4, 8 } }, /* COS. */
+ { { 4, 8, 16 }, { 2, 4, 8 } }, /* SIN. */
+ { { 4, 8, 16 }, { 2, 4, 8 } }, /* POW. */
+ { { 4, 8, 16 }, { 2, 4, 8 } }, /* ERF. */
+ { { 4, 8, 16 }, { 2, 8, 0 } }, /* ATAN. */
+ { { 4, 8, 16 }, { 2, 0, 0 } }, /* LOG10. */
+ { { 4, 0, 0 }, { 2, 0, 0 } }, /* EXP10. */
+ { { 4, 0, 0 }, { 2, 0, 0 } }, /* LOG1P. */
+ { { 4, 8, 16 }, { 8, 0, 0 } }, /* ASIN. */
+ { { 4, 16, 0 }, { 0, 0, 0 } }, /* ACOS. */
+ { { 4, 8, 16 }, { 0, 0, 0 } }, /* TANH. */
+ { { 4, 0, 0 }, { 0, 0, 0 } }, /* EXPM1. */
+ { { 4, 8, 0 }, { 0, 0, 0 } }, /* COSH. */
+ };
+
+ /* We cannot simply index the supported_n array with FN since multiple FNs
+ may correspond to a single operation (see the definitions of these
+ CASE_CFN_* macros). */
+ int i;
+ switch (fn)
+ {
+ CASE_CFN_TAN : i = 0; break;
+ CASE_CFN_EXP : i = 1; break;
+ CASE_CFN_EXP2 : i = 2; break;
+ CASE_CFN_LOG : i = 3; break;
+ CASE_CFN_LOG2 : i = 4; break;
+ CASE_CFN_COS : i = 5; break;
+ CASE_CFN_SIN : i = 6; break;
+ CASE_CFN_POW : i = 7; break;
+ CASE_CFN_ERF : i = 8; break;
+ CASE_CFN_ATAN : i = 9; break;
+ CASE_CFN_LOG10 : i = 10; break;
+ CASE_CFN_EXP10 : i = 11; break;
+ CASE_CFN_LOG1P : i = 12; break;
+ CASE_CFN_ASIN : i = 13; break;
+ CASE_CFN_ACOS : i = 14; break;
+ CASE_CFN_TANH : i = 15; break;
+ CASE_CFN_EXPM1 : i = 16; break;
+ CASE_CFN_COSH : i = 17; break;
+ default: return NULL_TREE;
+ }
+
+ int j = el_mode == DFmode;
+ bool n_is_supported = false;
+ for (unsigned k = 0; k < 3; k++)
+ if (supported_n[i][j][k] == n)
+ {
+ n_is_supported = true;
+ break;
+ }
+ if (!n_is_supported)
+ return NULL_TREE;
+
+ /* Append the precision and the vector width to the function name we are
+ constructing. */
+ name[name_len++] = el_mode == DFmode ? 'd' : 's';
+ switch (n)
+ {
+ case 2:
+ case 4:
+ case 8:
+ name[name_len++] = '0' + n;
+ break;
+ case 16:
+ name[name_len++] = '1';
+ name[name_len++] = '6';
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ name[name_len++] = '_';
+
+ /* Append the operation name (the builtin's name past its 10-char prefix). */
+ tree fndecl = mathfn_built_in (el_mode == DFmode
+ ? double_type_node : float_type_node, fn);
+ bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
+ sprintf (name + name_len, "%s", bname + 10);
+
+ arity = 0;
+ for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
+ arity++;
+
+ if (arity == 1)
+ fntype = build_function_type_list (type_out, type_in, NULL);
+ else
+ fntype = build_function_type_list (type_out, type_in, type_in, NULL);
+
+ /* Build a function declaration for the vectorized function. */
+ new_fndecl = build_decl (BUILTINS_LOCATION,
+ FUNCTION_DECL, get_identifier (name), fntype);
+ TREE_PUBLIC (new_fndecl) = 1;
+ DECL_EXTERNAL (new_fndecl) = 1;
+ TREE_READONLY (new_fndecl) = 1;
+
+ return new_fndecl;
+}
+
/* Returns a decl of a function that implements scatter store with
register type VECTYPE and index type INDEX_TYPE and SCALE.
Return NULL_TREE if it is not available. */
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 99e86f5..ea29265 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -588,6 +588,9 @@ Enum(ix86_veclibabi) String(svml) Value(ix86_veclibabi_type_svml)
EnumValue
Enum(ix86_veclibabi) String(acml) Value(ix86_veclibabi_type_acml)
+EnumValue
+Enum(ix86_veclibabi) String(aocl) Value(ix86_veclibabi_type_aocl)
+
mvect8-ret-in-mem
Target Mask(VECT8_RETURNS) Save
Return 8-byte vectors in memory.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index aa979c3..4a494f6 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -35944,25 +35944,52 @@ all of the reciprocal approximations, except for square root.
@opindex mveclibabi
@item -mveclibabi=@var{type}
-Specifies the ABI type to use for vectorizing intrinsics using an
-external library. Supported values for @var{type} are @samp{svml}
-for the Intel short
-vector math library and @samp{acml} for the AMD math core library.
+Specifies the ABI type to use for vectorizing intrinsics using an external
+library. Supported values for @var{type} are @samp{svml} for the Intel short
+vector math library, @samp{aocl} for the math library (LibM) from AMD
+Optimizing CPU Libraries (AOCL) and @samp{acml} for the end-of-life AMD core
+math library (to which AOCL-LibM is the successor).
To use this option, both @option{-ftree-vectorize} and
@option{-funsafe-math-optimizations} have to be enabled, and an SVML or ACML
ABI-compatible library must be specified at link time.
-GCC currently emits calls to @code{vmldExp2},
-@code{vmldLn2}, @code{vmldLog102}, @code{vmldPow2},
-@code{vmldTanh2}, @code{vmldTan2}, @code{vmldAtan2}, @code{vmldAtanh2},
-@code{vmldCbrt2}, @code{vmldSinh2}, @code{vmldSin2}, @code{vmldAsinh2},
-@code{vmldAsin2}, @code{vmldCosh2}, @code{vmldCos2}, @code{vmldAcosh2},
-@code{vmldAcos2}, @code{vmlsExp4}, @code{vmlsLn4},
-@code{vmlsLog104}, @code{vmlsPow4}, @code{vmlsTanh4}, @code{vmlsTan4},
-@code{vmlsAtan4}, @code{vmlsAtanh4}, @code{vmlsCbrt4}, @code{vmlsSinh4},
-@code{vmlsSin4}, @code{vmlsAsinh4}, @code{vmlsAsin4}, @code{vmlsCosh4},
-@code{vmlsCos4}, @code{vmlsAcosh4} and @code{vmlsAcos4} for corresponding
-function type when @option{-mveclibabi=svml} is used, and @code{__vrd2_sin},
+GCC currently emits calls to @code{vmldExp2}, @code{vmldLn2},
+@code{vmldLog102}, @code{vmldPow2}, @code{vmldTanh2}, @code{vmldTan2},
+@code{vmldAtan2}, @code{vmldAtanh2}, @code{vmldCbrt2}, @code{vmldSinh2},
+@code{vmldSin2}, @code{vmldAsinh2}, @code{vmldAsin2}, @code{vmldCosh2},
+@code{vmldCos2}, @code{vmldAcosh2}, @code{vmldAcos2}, @code{vmlsExp4},
+@code{vmlsLn4}, @code{vmlsLog104}, @code{vmlsPow4}, @code{vmlsTanh4},
+@code{vmlsTan4}, @code{vmlsAtan4}, @code{vmlsAtanh4}, @code{vmlsCbrt4},
+@code{vmlsSinh4}, @code{vmlsSin4}, @code{vmlsAsinh4}, @code{vmlsAsin4},
+@code{vmlsCosh4}, @code{vmlsCos4}, @code{vmlsAcosh4} and @code{vmlsAcos4} for
+corresponding function type when @option{-mveclibabi=svml} is used,
+@code{amd_vrs4_acosf}, @code{amd_vrs16_acosf}, @code{amd_vrd8_asin},
+@code{amd_vrs4_asinf}, @code{amd_vrs8_asinf}, @code{amd_vrs16_asinf},
+@code{amd_vrd2_atan}, @code{amd_vrd8_atan}, @code{amd_vrs4_atanf},
+@code{amd_vrs8_atanf}, @code{amd_vrs16_atanf}, @code{amd_vrd2_cos},
+@code{amd_vrd4_cos}, @code{amd_vrd8_cos}, @code{amd_vrs4_cosf},
+@code{amd_vrs8_cosf}, @code{amd_vrs16_cosf}, @code{amd_vrs4_coshf},
+@code{amd_vrs8_coshf}, @code{amd_vrd2_erf}, @code{amd_vrd4_erf},
+@code{amd_vrd8_erf}, @code{amd_vrs4_erff}, @code{amd_vrs8_erff},
+@code{amd_vrs16_erff}, @code{amd_vrd2_exp}, @code{amd_vrd4_exp},
+@code{amd_vrd8_exp}, @code{amd_vrs4_expf}, @code{amd_vrs8_expf},
+@code{amd_vrs16_expf}, @code{amd_vrd2_exp10}, @code{amd_vrs4_exp10f},
+@code{amd_vrd2_exp2}, @code{amd_vrd4_exp2}, @code{amd_vrd8_exp2},
+@code{amd_vrs4_exp2f}, @code{amd_vrs8_exp2f}, @code{amd_vrs16_exp2f},
+@code{amd_vrs4_expm1f}, @code{amd_vrd2_log}, @code{amd_vrd4_log},
+@code{amd_vrd8_log}, @code{amd_vrs4_logf}, @code{amd_vrs8_logf},
+@code{amd_vrs16_logf}, @code{amd_vrd2_log10}, @code{amd_vrs4_log10f},
+@code{amd_vrs8_log10f}, @code{amd_vrs16_log10f}, @code{amd_vrd2_log1p},
+@code{amd_vrs4_log1pf}, @code{amd_vrd2_log2}, @code{amd_vrd4_log2},
+@code{amd_vrd8_log2}, @code{amd_vrs4_log2f}, @code{amd_vrs8_log2f},
+@code{amd_vrs16_log2f}, @code{amd_vrd2_pow}, @code{amd_vrd4_pow},
+@code{amd_vrd8_pow}, @code{amd_vrs4_powf}, @code{amd_vrs8_powf},
+@code{amd_vrs16_powf}, @code{amd_vrd2_sin}, @code{amd_vrd4_sin},
+@code{amd_vrd8_sin}, @code{amd_vrs4_sinf}, @code{amd_vrs8_sinf},
+@code{amd_vrs16_sinf}, @code{amd_vrd2_tan}, @code{amd_vrd4_tan},
+@code{amd_vrd8_tan}, @code{amd_vrs16_tanf}, @code{amd_vrs4_tanhf},
+@code{amd_vrs8_tanhf} and @code{amd_vrs16_tanhf} for the corresponding function
+type when @option{-mveclibabi=aocl} is used, and @code{__vrd2_sin},
@code{__vrd2_cos}, @code{__vrd2_exp}, @code{__vrd2_log}, @code{__vrd2_log2},
@code{__vrd2_log10}, @code{__vrs4_sinf}, @code{__vrs4_cosf},
@code{__vrs4_expf}, @code{__vrs4_logf}, @code{__vrs4_log2f},
diff --git a/gcc/testsuite/gcc.target/i386/vectorize-aocl1.c b/gcc/testsuite/gcc.target/i386/vectorize-aocl1.c
new file mode 100644
index 0000000..5ffb04a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vectorize-aocl1.c
@@ -0,0 +1,224 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -march=znver5 -mveclibabi=aocl" } */
+
+
+/* Declare glibc math functions we need since this testcase may be run on
+ systems that don't have glibc. */
+float tanf(float);
+float expf(float);
+float exp2f(float);
+float logf(float);
+float log2f(float);
+float cosf(float);
+float sinf(float);
+float powf(float, float);
+float erff(float);
+float atanf(float);
+float log10f(float);
+float exp10f(float);
+float expm1f(float);
+float log1pf(float);
+float asinf(float);
+float acosf(float);
+float tanhf(float);
+float coshf(float);
+
+double tan(double);
+double exp(double);
+double exp2(double);
+double log(double);
+double log2(double);
+double cos(double);
+double sin(double);
+double pow(double, double);
+double erf(double);
+double atan(double);
+double log10(double);
+double exp10(double);
+double expm1(double);
+double log1p(double);
+double asin(double);
+double acos(double);
+double tanh(double);
+double cosh(double);
+/* Define test_FUN_BASE_VF: d[i] = FUN (s[i]) over VF-element extern arrays. */
+#define gentest1(FUN, BASE, VF) \
+ extern BASE s_##FUN##_##BASE##_##VF[VF]; \
+ extern BASE d_##FUN##_##BASE##_##VF[VF]; \
+ void test_##FUN##_##BASE##_##VF (void) \
+ { \
+ for (int i = 0; i < VF; i++) \
+ d_##FUN##_##BASE##_##VF[i] \
+ = FUN (s_##FUN##_##BASE##_##VF[i]); \
+ } \
+
+/* Define test_FUN_BASE_VF: d[i] = FUN (s1[i], s2[i]) over VF-element arrays. */
+#define gentest2(FUN, BASE, VF) \
+ extern BASE s1_##FUN##_##BASE##_##VF[VF]; \
+ extern BASE s2_##FUN##_##BASE##_##VF[VF]; \
+ extern BASE d_##FUN##_##BASE##_##VF[VF]; \
+ void test_##FUN##_##BASE##_##VF (void) \
+ { \
+ for (int i = 0; i < VF; i++) \
+ d_##FUN##_##BASE##_##VF[i] \
+ = FUN (s1_##FUN##_##BASE##_##VF[i], \
+ s2_##FUN##_##BASE##_##VF[i]); \
+ } \
+
+
+gentest1(tan, float, 16)
+/* Instantiate gentest1 for FUN on float arrays with VF = 4, 8 and 16. */
+#define COMMON_FLOAT_TESTS1(FUN) \
+ gentest1(FUN, float, 4) \
+ gentest1(FUN, float, 8) \
+ gentest1(FUN, float, 16)
+
+COMMON_FLOAT_TESTS1(exp)
+COMMON_FLOAT_TESTS1(exp2)
+COMMON_FLOAT_TESTS1(log)
+COMMON_FLOAT_TESTS1(log2)
+COMMON_FLOAT_TESTS1(cos)
+COMMON_FLOAT_TESTS1(sin)
+
+gentest2(powf, float, 4)
+gentest2(powf, float, 8)
+gentest2(powf, float, 16)
+
+//COMMON_FLOAT_TESTS1(sqrt) provided by an instruction
+COMMON_FLOAT_TESTS1(erf)
+
+//gentest1(fabsf, float, 4) provided by an instruction
+//gentest1(fabsf, float, 8) provided by an instruction
+
+COMMON_FLOAT_TESTS1(atan)
+COMMON_FLOAT_TESTS1(log10)
+
+gentest1(exp10f, float, 4)
+gentest1(expm1f, float, 4)
+gentest1(log1pf, float, 4)
+
+COMMON_FLOAT_TESTS1(asinf)
+
+gentest1(acosf, float, 4)
+gentest1(acosf, float, 16)
+
+COMMON_FLOAT_TESTS1(tanhf)
+
+gentest1(coshf, float, 4)
+gentest1(coshf, float, 8)
+/* Instantiate gentest1 for FUN on double arrays with VF = 2, 4 and 8. */
+#define COMMON_DOUBLE_TESTS1(FUN) \
+ gentest1(FUN, double, 2) \
+ gentest1(FUN, double, 4) \
+ gentest1(FUN, double, 8)
+
+
+COMMON_DOUBLE_TESTS1(tan)
+COMMON_DOUBLE_TESTS1(exp)
+COMMON_DOUBLE_TESTS1(exp2)
+COMMON_DOUBLE_TESTS1(log)
+COMMON_DOUBLE_TESTS1(log2)
+COMMON_DOUBLE_TESTS1(cos)
+COMMON_DOUBLE_TESTS1(sin)
+
+gentest2(pow, double, 2)
+gentest2(pow, double, 4)
+gentest2(pow, double, 8)
+
+//COMMON_DOUBLE_TESTS1(sqrt) provided by an instruction
+COMMON_DOUBLE_TESTS1(erf)
+
+//gentest1(fabs, double, 2) provided by an instruction
+//gentest1(fabs, double, 4) provided by an instruction
+
+gentest1(atan, double, 2)
+gentest1(atan, double, 8)
+
+gentest1(log10, double, 2)
+gentest1(exp10, double, 2)
+gentest1(expm1, double, 2)
+gentest1(log1p, double, 2)
+
+gentest1(asin, double, 8)
+
+
+/* { dg-final { scan-assembler-times "amd_vrs8_expf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs8_exp2f" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_expf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_exp2f" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_exp10f" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_expm1f" 1 } } */
+/* { dg-final { scan-assembler "amd_vrd2_exp" } } */
+/* { dg-final { scan-assembler-times "amd_vrd2_exp2" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd2_exp10" 1 } } */
+/* { dg-final { scan-assembler "amd_vrd4_exp" } } */
+/* { dg-final { scan-assembler-times "amd_vrd4_exp2" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs16_expf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs16_exp2f" 1 } } */
+/* { dg-final { scan-assembler "amd_vrd8_exp" } } */
+/* { dg-final { scan-assembler-times "amd_vrd8_exp2" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs8_logf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs8_log2f" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs8_log10f" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_logf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_log2f" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_log10f" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_log1pf" 1 } } */
+/* { dg-final { scan-assembler "amd_vrd4_log" } } */
+/* { dg-final { scan-assembler-times "amd_vrd4_log2" 1 } } */
+/* { dg-final { scan-assembler "amd_vrd2_log" } } */
+/* { dg-final { scan-assembler-times "amd_vrd2_log2" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd2_log10" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd2_log1p" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs16_logf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs16_log2f" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs16_log10f" 1 } } */
+/* { dg-final { scan-assembler "amd_vrd8_log" } } */
+/* { dg-final { scan-assembler-times "amd_vrd8_log2" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_cosf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs8_cosf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_sinf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs8_sinf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd4_sin" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd4_cos" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd4_tan" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd2_cos" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd2_sin" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd2_tan" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs16_cosf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs16_sinf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs16_tanf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd8_cos" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd8_sin" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd8_tan" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_acosf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_asinf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs8_asinf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_atanf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs8_atanf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd2_atan" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs16_atanf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs16_asinf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs16_acosf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd8_atan" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd8_asin" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_coshf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_tanhf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs8_coshf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs8_tanhf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs16_tanhf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_powf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd2_pow" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd4_pow" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs8_powf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs16_powf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd8_pow" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs4_erff" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd2_erf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs8_erff" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd4_erf" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrs16_erff" 1 } } */
+/* { dg-final { scan-assembler-times "amd_vrd8_erf" 1 } } */
+
+
+