aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
author H.J. Lu <hjl.tools@gmail.com> 2021-03-11 16:56:26 -0800
committer H.J. Lu <hjl.tools@gmail.com> 2021-04-06 05:36:00 -0700
commit a32452a5442cd05040af53787af0d8b537ac77a6 (patch)
tree 1090d473d81fc8ed8a028d92d22ecb224c264f49 /gcc
parent e5c170e080399fb3d24a38bbfcd66bd4675abe53 (diff)
download gcc-a32452a5442cd05040af53787af0d8b537ac77a6.zip
gcc-a32452a5442cd05040af53787af0d8b537ac77a6.tar.gz
gcc-a32452a5442cd05040af53787af0d8b537ac77a6.tar.bz2
x86: Update memcpy/memset inline strategies for Skylake family CPUs
Simplify memcpy and memset inline strategies to avoid branches for Skylake family CPUs: 1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector load and store for up to 16 * 16 (256) bytes when the data size is fixed and known. 2. Inline only if data size is known to be <= 256. a. Use "rep movsb/stosb" with simple code sequence if the data size is a constant. b. Use loop if data size is not a constant. 3. Use memcpy/memset library function if data size is unknown or > 256. On Cascadelake processor with -march=native -Ofast -flto, 1. Performance impacts of SPEC CPU 2017 rate are: 500.perlbench_r 0.17% 502.gcc_r -0.36% 505.mcf_r 0.00% 520.omnetpp_r 0.08% 523.xalancbmk_r -0.62% 525.x264_r 1.04% 531.deepsjeng_r 0.11% 541.leela_r -1.09% 548.exchange2_r -0.25% 557.xz_r 0.17% Geomean -0.08% 503.bwaves_r 0.00% 507.cactuBSSN_r 0.69% 508.namd_r -0.07% 510.parest_r 1.12% 511.povray_r 1.82% 519.lbm_r 0.00% 521.wrf_r -1.32% 526.blender_r -0.47% 527.cam4_r 0.23% 538.imagick_r -1.72% 544.nab_r -0.56% 549.fotonik3d_r 0.12% 554.roms_r 0.43% Geomean 0.02% 2. Significant impacts on eembc benchmarks are: eembc/idctrn01 9.23% eembc/nnet_test 29.26% gcc/ * config/i386/x86-tune-costs.h (skylake_memcpy): Updated. (skylake_memset): Likewise. (skylake_cost): Change CLEAR_RATIO to 17. * config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB): Replace m_CANNONLAKE, m_ICELAKE_CLIENT, m_ICELAKE_SERVER, m_TIGERLAKE and m_SAPPHIRERAPIDS with m_SKYLAKE and m_CORE_AVX512. gcc/testsuite/ * gcc.target/i386/memcpy-strategy-9.c: New test. * gcc.target/i386/memcpy-strategy-10.c: Likewise. * gcc.target/i386/memcpy-strategy-11.c: Likewise. * gcc.target/i386/memset-strategy-7.c: Likewise. * gcc.target/i386/memset-strategy-8.c: Likewise. * gcc.target/i386/memset-strategy-9.c: Likewise.
Diffstat (limited to 'gcc')
-rw-r--r-- gcc/config/i386/x86-tune-costs.h 27
-rw-r--r-- gcc/config/i386/x86-tune.def 3
-rw-r--r-- gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c 11
-rw-r--r-- gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c 18
-rw-r--r-- gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c 9
-rw-r--r-- gcc/testsuite/gcc.target/i386/memset-strategy-7.c 11
-rw-r--r-- gcc/testsuite/gcc.target/i386/memset-strategy-8.c 9
-rw-r--r-- gcc/testsuite/gcc.target/i386/memset-strategy-9.c 17
8 files changed, 93 insertions, 12 deletions
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 0e00ff9..ffe810f 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1822,17 +1822,24 @@ struct processor_costs znver3_cost = {
/* skylake_cost should produce code tuned for Skylake familly of CPUs. */
static stringop_algs skylake_memcpy[2] = {
- {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
- {libcall, {{16, loop, false}, {512, unrolled_loop, false},
- {-1, libcall, false}}}};
+ {libcall,
+ {{256, rep_prefix_1_byte, true},
+ {256, loop, false},
+ {-1, libcall, false}}},
+ {libcall,
+ {{256, rep_prefix_1_byte, true},
+ {256, loop, false},
+ {-1, libcall, false}}}};
static stringop_algs skylake_memset[2] = {
- {libcall, {{6, loop_1_byte, true},
- {24, loop, true},
- {8192, rep_prefix_4_byte, true},
- {-1, libcall, false}}},
- {libcall, {{24, loop, true}, {512, unrolled_loop, false},
- {-1, libcall, false}}}};
+ {libcall,
+ {{256, rep_prefix_1_byte, true},
+ {256, loop, false},
+ {-1, libcall, false}}},
+ {libcall,
+ {{256, rep_prefix_1_byte, true},
+ {256, loop, false},
+ {-1, libcall, false}}}};
static const
struct processor_costs skylake_cost = {
@@ -1889,7 +1896,7 @@ struct processor_costs skylake_cost = {
COSTS_N_INSNS (0), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
- 6, /* CLEAR_RATIO */
+ 17, /* CLEAR_RATIO */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 134916c..eb057a6 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -273,8 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
move/set sequences of bytes with known size. */
DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
"prefer_known_rep_movsb_stosb",
- m_CANNONLAKE | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_TIGERLAKE
- | m_ALDERLAKE | m_SAPPHIRERAPIDS)
+ m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
compact prologues and epilogues by issuing a misaligned moves. This
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c
new file mode 100644
index 0000000..970aa74
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mno-sse" } */
+/* { dg-final { scan-assembler "jmp\tmemcpy" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler "call\tmemcpy" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
+
+void
+foo (char *dest, char *src)
+{
+ __builtin_memcpy (dest, src, 257);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c
new file mode 100644
index 0000000..b604194
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+/* { dg-final { scan-assembler-not "jmp\tmemcpy" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not "call\tmemcpy" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
+
+typedef unsigned char e_u8;
+
+#define MAXBC 8
+
+void MixColumn(e_u8 a[4][MAXBC], e_u8 BC)
+{
+ e_u8 b[4][MAXBC];
+ int i, j;
+
+ for(i = 0; i < 4; i++)
+ for(j = 0; j < BC; j++) a[i][j] = b[i][j];
+}
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c
new file mode 100644
index 0000000..b0dc748
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mno-sse" } */
+/* { dg-final { scan-assembler "rep movsb" } } */
+
+void
+foo (char *dest, char *src)
+{
+ __builtin_memcpy (dest, src, 256);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-7.c b/gcc/testsuite/gcc.target/i386/memset-strategy-7.c
new file mode 100644
index 0000000..07c2816
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-7.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mno-sse" } */
+/* { dg-final { scan-assembler "jmp\tmemset" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler "call\tmemset" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
+
+void
+foo (char *dest)
+{
+ __builtin_memset (dest, 0, 257);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-8.c b/gcc/testsuite/gcc.target/i386/memset-strategy-8.c
new file mode 100644
index 0000000..52ea882
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-8.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mno-sse" } */
+/* { dg-final { scan-assembler "rep stosb" } } */
+
+void
+foo (char *dest)
+{
+ __builtin_memset (dest, 0, 256);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-9.c b/gcc/testsuite/gcc.target/i386/memset-strategy-9.c
new file mode 100644
index 0000000..d4db031
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-9.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+/* { dg-final { scan-assembler-not "jmp\tmemset" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not "call\tmemset" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
+
+typedef unsigned char e_u8;
+
+#define MAXBC 8
+
+void MixColumn(e_u8 a[4][MAXBC], e_u8 BC)
+{
+ int i, j;
+
+ for(i = 0; i < 4; i++)
+ for(j = 0; j < BC; j++) a[i][j] = 1;
+}