Correct imul (r64) latency for modern Intel CPUs

Since Sandy Bridge, the 64-bit multiplication latency has been three cycles, not four, so update the costs to reflect reality.

* x86-tune-costs.h (skylake_cost, core_cost): Decrease r64 multiply latencies.
* gcc.target/i386/wmul-3.c: New test.

From-SVN: r255760
author: Markus Trippelsdorf <markus@trippelsdorf.de> 2017-12-17 12:01:25 +0000
committer: Markus Trippelsdorf <trippels@gcc.gnu.org> 2017-12-17 12:01:25 +0000
commit: a2ef9558d17aeb038cbc8a66a203f7a8e6c6e81e (patch)
tree: b25c6818a61fd8d5752c2aa8a73c912df16c2234 /gcc/config
parent: d7f06bc3f7e1e1da11c065cc96a81f15bd0ca68f (diff)
download: gcc-a2ef9558d17aeb038cbc8a66a203f7a8e6c6e81e.zip
gcc-a2ef9558d17aeb038cbc8a66a203f7a8e6c6e81e.tar.gz
gcc-a2ef9558d17aeb038cbc8a66a203f7a8e6c6e81e.tar.bz2
1 file changed, 5 insertions, 4 deletions
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 64821933..477e478 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1538,8 +1538,8 @@ struct processor_costs skylake_cost = {
   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
    COSTS_N_INSNS (4),			/*				 HI */
    COSTS_N_INSNS (3),			/*				 SI */
-   COSTS_N_INSNS (4),			/*				 DI */
-   COSTS_N_INSNS (4)},			/*			      other */
+   COSTS_N_INSNS (3),			/*				 DI */
+   COSTS_N_INSNS (3)},			/*			      other */
   0,					/* cost of multiply per each bit set */
   /* Expanding div/mod currently doesn't consider parallelism. So the cost
      model is not realistic. We compensate by increasing the latencies a bit.  */
@@ -2341,8 +2341,9 @@ struct processor_costs core_cost = {
   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
    COSTS_N_INSNS (4),			/*				 HI */
    COSTS_N_INSNS (3),			/*				 SI */
-   COSTS_N_INSNS (4),			/*				 DI */
-   COSTS_N_INSNS (4)},			/*			      other */
+   /* Here we tune for Sandybridge or newer.  */
+   COSTS_N_INSNS (3),			/*				 DI */
+   COSTS_N_INSNS (3)},			/*			      other */
   0,					/* cost of multiply per each bit set */
   /* Expanding div/mod currently doesn't consider parallelism. So the cost
      model is not realistic. We compensate by increasing the latencies a bit.  */
author	Markus Trippelsdorf <markus@trippelsdorf.de>	2017-12-17 12:01:25 +0000
committer	Markus Trippelsdorf <trippels@gcc.gnu.org>	2017-12-17 12:01:25 +0000
commit	a2ef9558d17aeb038cbc8a66a203f7a8e6c6e81e (patch)
tree	b25c6818a61fd8d5752c2aa8a73c912df16c2234 /gcc/config
parent	d7f06bc3f7e1e1da11c065cc96a81f15bd0ca68f (diff)
download	gcc-a2ef9558d17aeb038cbc8a66a203f7a8e6c6e81e.zip gcc-a2ef9558d17aeb038cbc8a66a203f7a8e6c6e81e.tar.gz gcc-a2ef9558d17aeb038cbc8a66a203f7a8e6c6e81e.tar.bz2