diff options
author | Evandro Menezes <evandro@gcc.gnu.org> | 2016-06-13 19:02:52 +0000 |
---|---|---|
committer | Evandro Menezes <evandro@gcc.gnu.org> | 2016-06-13 19:02:52 +0000 |
commit | 9acc9cbeb8570573b8d73053453d65becc2d386a (patch) | |
tree | 83b24b59e7b7a16a7bcfa358cd38d75be76f2d6f | |
parent | d4f076c9b8f4288d748c871b5849146a518dee7b (diff) | |
download | gcc-9acc9cbeb8570573b8d73053453d65becc2d386a.zip gcc-9acc9cbeb8570573b8d73053453d65becc2d386a.tar.gz gcc-9acc9cbeb8570573b8d73053453d65becc2d386a.tar.bz2 |
[AArch64] Add more choices for the reciprocal square root approximation
Allow a target to prefer such operation depending on the operation mode.
gcc/
* config/aarch64/aarch64-protos.h
(AARCH64_APPROX_MODE): New macro.
(AARCH64_APPROX_{NONE,ALL}): Likewise.
(cpu_approx_modes): New structure.
(tune_params): New member "approx_modes".
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNE_APPROX_RSQRT): Remove macro.
* config/aarch64/aarch64.c
({generic,exynosm1,xgene1}_approx_modes): New core
"cpu_approx_modes" structures.
(generic_tunings): New member "approx_modes".
(cortexa35_tunings): Likewise.
(cortexa53_tunings): Likewise.
(cortexa57_tunings): Likewise.
(cortexa72_tunings): Likewise.
(exynosm1_tunings): Likewise.
(thunderx_tunings): Likewise.
(xgene1_tunings): Likewise.
(use_rsqrt_p): New argument for the mode and use new member from
"tune_params".
(aarch64_builtin_reciprocal): Devise mode from builtin.
(aarch64_optab_supported_p): New argument for the mode.
* doc/invoke.texi (-mlow-precision-recip-sqrt): Reword description.
From-SVN: r237395
-rw-r--r-- | gcc/config/aarch64/aarch64-protos.h | 18 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-tuning-flags.def | 2 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64.c | 44 | ||||
-rw-r--r-- | gcc/doc/invoke.texi | 2 |
4 files changed, 55 insertions, 11 deletions
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index ab9b37a..1c56a1d 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -178,6 +178,23 @@ struct cpu_branch_cost const int unpredictable; /* Unpredictable branch or optimizing for speed. */ }; +/* Control approximate alternatives to certain FP operators. */ +#define AARCH64_APPROX_MODE(MODE) \ + ((MIN_MODE_FLOAT <= (MODE) && (MODE) <= MAX_MODE_FLOAT) \ + ? (1 << ((MODE) - MIN_MODE_FLOAT)) \ + : (MIN_MODE_VECTOR_FLOAT <= (MODE) && (MODE) <= MAX_MODE_VECTOR_FLOAT) \ + ? (1 << ((MODE) - MIN_MODE_VECTOR_FLOAT \ + + MAX_MODE_FLOAT - MIN_MODE_FLOAT + 1)) \ + : (0)) +#define AARCH64_APPROX_NONE (0) +#define AARCH64_APPROX_ALL (-1) + +/* Allowed modes for approximations. */ +struct cpu_approx_modes +{ + const unsigned int recip_sqrt; /* Reciprocal square root. */ +}; + struct tune_params { const struct cpu_cost_table *insn_extra_cost; @@ -185,6 +202,7 @@ struct tune_params const struct cpu_regmove_cost *regmove_cost; const struct cpu_vector_cost *vec_costs; const struct cpu_branch_cost *branch_costs; + const struct cpu_approx_modes *approx_modes; int memmov_cost; int issue_rate; unsigned int fusible_ops; diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index 7e45a0c..048c2a3 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -29,5 +29,3 @@ AARCH64_TUNE_ to give an enum name. */ AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS) -AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT) - diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index a0db3a4f..6bc9910 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -393,6 +393,24 @@ static const struct cpu_branch_cost cortexa57_branch_cost = 3 /* Unpredictable. */ }; +/* Generic approximation modes. */ +static const cpu_approx_modes generic_approx_modes = +{ + AARCH64_APPROX_NONE /* recip_sqrt */ +}; + +/* Approximation modes for Exynos M1. */ +static const cpu_approx_modes exynosm1_approx_modes = +{ + AARCH64_APPROX_ALL /* recip_sqrt */ +}; + +/* Approximation modes for X-Gene 1. */ +static const cpu_approx_modes xgene1_approx_modes = +{ + AARCH64_APPROX_ALL /* recip_sqrt */ +}; + static const struct tune_params generic_tunings = { &cortexa57_extra_costs, @@ -400,6 +418,7 @@ static const struct tune_params generic_tunings = &generic_regmove_cost, &generic_vector_cost, &generic_branch_cost, + &generic_approx_modes, 4, /* memmov_cost */ 2, /* issue_rate */ AARCH64_FUSE_NOTHING, /* fusible_ops */ @@ -424,6 +443,7 @@ static const struct tune_params cortexa35_tunings = &cortexa53_regmove_cost, &generic_vector_cost, &generic_branch_cost, + &generic_approx_modes, 4, /* memmov_cost */ 1, /* issue_rate */ (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD @@ -449,6 +469,7 @@ static const struct tune_params cortexa53_tunings = &cortexa53_regmove_cost, &generic_vector_cost, &generic_branch_cost, + &generic_approx_modes, 4, /* memmov_cost */ 2, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD @@ -474,6 +495,7 @@ static const struct tune_params cortexa57_tunings = &cortexa57_regmove_cost, &cortexa57_vector_cost, &cortexa57_branch_cost, + &generic_approx_modes, 4, /* memmov_cost */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD @@ -499,6 +521,7 @@ static const struct tune_params cortexa72_tunings = &cortexa57_regmove_cost, &cortexa57_vector_cost, &generic_branch_cost, + &generic_approx_modes, 4, /* memmov_cost */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD @@ -524,6 +547,7 @@ static const struct tune_params exynosm1_tunings = &exynosm1_regmove_cost, &exynosm1_vector_cost, &generic_branch_cost, + &exynosm1_approx_modes, 4, /* memmov_cost */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC), /* fusible_ops */ @@ -538,7 +562,7 @@ static const struct tune_params exynosm1_tunings = 48, /* max_case_values. */ 64, /* cache_line_size. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ }; static const struct tune_params thunderx_tunings = @@ -548,6 +572,7 @@ static const struct tune_params thunderx_tunings = &thunderx_regmove_cost, &generic_vector_cost, &generic_branch_cost, + &generic_approx_modes, 6, /* memmov_cost */ 2, /* issue_rate */ AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */ @@ -572,6 +597,7 @@ static const struct tune_params xgene1_tunings = &xgene1_regmove_cost, &xgene1_vector_cost, &generic_branch_cost, + &xgene1_approx_modes, 6, /* memmov_cost */ 4, /* issue_rate */ AARCH64_FUSE_NOTHING, /* fusible_ops */ @@ -586,7 +612,7 @@ static const struct tune_params xgene1_tunings = 0, /* max_case_values. */ 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ }; /* Support for fine-grained override of the tuning structures. */ @@ -7320,12 +7346,12 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED, to optimize 1.0/sqrt. */ static bool -use_rsqrt_p (void) +use_rsqrt_p (machine_mode mode) { return (!flag_trapping_math && flag_unsafe_math_optimizations - && ((aarch64_tune_params.extra_tuning_flags - & AARCH64_EXTRA_TUNE_APPROX_RSQRT) + && ((aarch64_tune_params.approx_modes->recip_sqrt + & AARCH64_APPROX_MODE (mode)) || flag_mrecip_low_precision_sqrt)); } @@ -7335,7 +7361,9 @@ use_rsqrt_p (void) static tree aarch64_builtin_reciprocal (tree fndecl) { - if (!use_rsqrt_p ()) + machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl)); + + if (!use_rsqrt_p (mode)) return NULL_TREE; return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl)); } @@ -13731,13 +13759,13 @@ aarch64_promoted_type (const_tree t) /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */ static bool -aarch64_optab_supported_p (int op, machine_mode, machine_mode, +aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode, optimization_type opt_type) { switch (op) { case rsqrt_optab: - return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (); + return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1); default: return true; diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index aa11209..e107da9 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -13022,7 +13022,7 @@ corresponding flag to the linker. @item -mno-low-precision-recip-sqrt @opindex mlow-precision-recip-sqrt @opindex mno-low-precision-recip-sqrt -Enable or disable reciprocal square root approximation. +Enable or disable the reciprocal square root approximation. This option only has an effect if @option{-ffast-math} or @option{-funsafe-math-optimizations} is used as well. Enabling this reduces precision of reciprocal square root results to about 16 bits for |