diff options
author | H.J. Lu <hjl.tools@gmail.com> | 2021-03-11 16:56:26 -0800 |
---|---|---|
committer | H.J. Lu <hjl.tools@gmail.com> | 2021-04-06 05:36:00 -0700 |
commit | a32452a5442cd05040af53787af0d8b537ac77a6 (patch) | |
tree | 1090d473d81fc8ed8a028d92d22ecb224c264f49 /gcc | |
parent | e5c170e080399fb3d24a38bbfcd66bd4675abe53 (diff) | |
download | gcc-a32452a5442cd05040af53787af0d8b537ac77a6.zip gcc-a32452a5442cd05040af53787af0d8b537ac77a6.tar.gz gcc-a32452a5442cd05040af53787af0d8b537ac77a6.tar.bz2 |
x86: Update memcpy/memset inline strategies for Skylake family CPUs
Simplify memcpy and memset inline strategies to avoid branches for
Skylake family CPUs:
1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
load and store for up to 16 * 16 (256) bytes when the data size is
fixed and known.
2. Inline only if data size is known to be <= 256.
a. Use "rep movsb/stosb" with simple code sequence if the data size
is a constant.
b. Use loop if data size is not a constant.
3. Use memcpy/memset library function if data size is unknown or > 256.
On Cascadelake processor with -march=native -Ofast -flto,
1. Performance impacts of SPEC CPU 2017 rate are:
500.perlbench_r 0.17%
502.gcc_r -0.36%
505.mcf_r 0.00%
520.omnetpp_r 0.08%
523.xalancbmk_r -0.62%
525.x264_r 1.04%
531.deepsjeng_r 0.11%
541.leela_r -1.09%
548.exchange2_r -0.25%
557.xz_r 0.17%
Geomean -0.08%
503.bwaves_r 0.00%
507.cactuBSSN_r 0.69%
508.namd_r -0.07%
510.parest_r 1.12%
511.povray_r 1.82%
519.lbm_r 0.00%
521.wrf_r -1.32%
526.blender_r -0.47%
527.cam4_r 0.23%
538.imagick_r -1.72%
544.nab_r -0.56%
549.fotonik3d_r 0.12%
554.roms_r 0.43%
Geomean 0.02%
2. Significant impacts on eembc benchmarks are:
eembc/idctrn01 9.23%
eembc/nnet_test 29.26%
gcc/
* config/i386/x86-tune-costs.h (skylake_memcpy): Updated.
(skylake_memset): Likewise.
(skylake_cost): Change CLEAR_RATIO to 17.
* config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
Replace m_CANNONLAKE, m_ICELAKE_CLIENT, m_ICELAKE_SERVER,
m_TIGERLAKE and m_SAPPHIRERAPIDS with m_SKYLAKE and m_CORE_AVX512.
gcc/testsuite/
* gcc.target/i386/memcpy-strategy-9.c: New test.
* gcc.target/i386/memcpy-strategy-10.c: Likewise.
* gcc.target/i386/memcpy-strategy-11.c: Likewise.
* gcc.target/i386/memset-strategy-7.c: Likewise.
* gcc.target/i386/memset-strategy-8.c: Likewise.
* gcc.target/i386/memset-strategy-9.c: Likewise.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/i386/x86-tune-costs.h | 27 | ||||
-rw-r--r-- | gcc/config/i386/x86-tune.def | 3 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c | 11 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c | 18 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c | 9 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/memset-strategy-7.c | 11 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/memset-strategy-8.c | 9 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/memset-strategy-9.c | 17 |
8 files changed, 93 insertions, 12 deletions
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index 0e00ff9..ffe810f 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -1822,17 +1822,24 @@ struct processor_costs znver3_cost = { /* skylake_cost should produce code tuned for Skylake familly of CPUs. */ static stringop_algs skylake_memcpy[2] = { - {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, - {libcall, {{16, loop, false}, {512, unrolled_loop, false}, - {-1, libcall, false}}}}; + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}, + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}}; static stringop_algs skylake_memset[2] = { - {libcall, {{6, loop_1_byte, true}, - {24, loop, true}, - {8192, rep_prefix_4_byte, true}, - {-1, libcall, false}}}, - {libcall, {{24, loop, true}, {512, unrolled_loop, false}, - {-1, libcall, false}}}}; + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}, + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}}; static const struct processor_costs skylake_cost = { @@ -1889,7 +1896,7 @@ struct processor_costs skylake_cost = { COSTS_N_INSNS (0), /* cost of movzx */ 8, /* "large" insn */ 17, /* MOVE_RATIO */ - 6, /* CLEAR_RATIO */ + 17, /* CLEAR_RATIO */ {4, 4, 4}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 134916c..eb057a6 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -273,8 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA) move/set sequences of bytes with known size. 
*/ DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB, "prefer_known_rep_movsb_stosb", - m_CANNONLAKE | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_TIGERLAKE - | m_ALDERLAKE | m_SAPPHIRERAPIDS) + m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512) /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of compact prologues and epilogues by issuing a misaligned moves. This diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c new file mode 100644 index 0000000..970aa74 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake -mno-sse" } */ +/* { dg-final { scan-assembler "jmp\tmemcpy" { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler "call\tmemcpy" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "rep movsb" } } */ + +void +foo (char *dest, char *src) +{ + __builtin_memcpy (dest, src, 257); +} diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c new file mode 100644 index 0000000..b604194 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake" } */ +/* { dg-final { scan-assembler-not "jmp\tmemcpy" { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-not "call\tmemcpy" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "rep movsb" } } */ + +typedef unsigned char e_u8; + +#define MAXBC 8 + +void MixColumn(e_u8 a[4][MAXBC], e_u8 BC) +{ + e_u8 b[4][MAXBC]; + int i, j; + + for(i = 0; i < 4; i++) + for(j = 0; j < BC; j++) a[i][j] = b[i][j]; +} diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c new file mode 100644 index 0000000..b0dc748 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake -mno-sse" } */ +/* { dg-final { scan-assembler "rep movsb" } } */ + +void +foo (char *dest, char *src) +{ + __builtin_memcpy (dest, src, 256); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-7.c b/gcc/testsuite/gcc.target/i386/memset-strategy-7.c new file mode 100644 index 0000000..07c2816 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-7.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake -mno-sse" } */ +/* { dg-final { scan-assembler "jmp\tmemset" { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler "call\tmemset" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "rep stosb" } } */ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 257); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-8.c b/gcc/testsuite/gcc.target/i386/memset-strategy-8.c new file mode 100644 index 0000000..52ea882 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-8.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake -mno-sse" } */ +/* { dg-final { scan-assembler "rep stosb" } } */ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 256); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-9.c b/gcc/testsuite/gcc.target/i386/memset-strategy-9.c new file mode 100644 index 0000000..d4db031 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-9.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake" } */ +/* { dg-final { scan-assembler-not "jmp\tmemset" { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-not "call\tmemset" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "rep stosb" } } */ + +typedef unsigned char e_u8; + +#define MAXBC 8 + +void MixColumn(e_u8 a[4][MAXBC], e_u8 BC) +{ + int i, j; + + for(i = 0; i < 4; i++) + for(j = 0; j < BC; j++) a[i][j] = 1; +} |