diff options
author | liuhongt <hongtao.liu@intel.com> | 2024-09-24 15:53:14 +0800 |
---|---|---|
committer | liuhongt <hongtao.liu@intel.com> | 2024-10-10 10:21:29 +0800 |
commit | 9c8cea8feb6cd54ef73113a0b74f1df7b60d09dc (patch) | |
tree | efcd53d6a4f137194d58f73774349bfc3385b5a3 /gcc | |
parent | 8718727509b2d038d00afa3bd5ef8e0df216a287 (diff) | |
download | gcc-9c8cea8feb6cd54ef73113a0b74f1df7b60d09dc.zip gcc-9c8cea8feb6cd54ef73113a0b74f1df7b60d09dc.tar.gz gcc-9c8cea8feb6cd54ef73113a0b74f1df7b60d09dc.tar.bz2 |
Add new microarchitecture tune for SRF/GRR/CWF.
For Crestmont, 4-operand VEX blendv instructions come from MSROM and
are slower than the 3-instruction sequence (op1 & mask) | (op2 & ~mask).
Legacy blendv instructions can still be handled by the decoder.
The patch adds a new tune which is enabled for all processors except
for SRF/CWF. It will use vpand + vpandn + vpor instead of
vpblendvb (similar for vblendvps/vblendvpd) for SRF/CWF.
gcc/ChangeLog:
* config/i386/i386-expand.cc (ix86_expand_sse_movcc): Guard
blendv instruction generation under new tune.
* config/i386/i386.h (TARGET_SSE_MOVCC_USE_BLENDV): New Macro.
* config/i386/x86-tune.def (X86_TUNE_SSE_MOVCC_USE_BLENDV):
New tune.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/i386/i386-expand.cc | 24 | ||||
-rw-r--r-- | gcc/config/i386/i386.h | 2 | ||||
-rw-r--r-- | gcc/config/i386/x86-tune.def | 8 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/sse_movcc_use_blendv.c | 12 |
4 files changed, 34 insertions, 12 deletions
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 3284011..0734399 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -4344,23 +4344,23 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) switch (mode) { case E_V2SFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_mmx_blendvps; break; case E_V4SFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_sse4_1_blendvps; break; case E_V2DFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_sse4_1_blendvpd; break; case E_SFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_sse4_1_blendvss; break; case E_DFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_sse4_1_blendvsd; break; case E_V8QImode: @@ -4368,7 +4368,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) case E_V4HFmode: case E_V4BFmode: case E_V2SImode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) { gen = gen_mmx_pblendvb_v8qi; blend_mode = V8QImode; @@ -4378,14 +4378,14 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) case E_V2HImode: case E_V2HFmode: case E_V2BFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) { gen = gen_mmx_pblendvb_v4qi; blend_mode = V4QImode; } break; case E_V2QImode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_mmx_pblendvb_v2qi; break; case E_V16QImode: @@ -4395,18 +4395,18 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) case E_V4SImode: case E_V2DImode: case E_V1TImode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) { gen = gen_sse4_1_pblendvb; blend_mode = V16QImode; } break; case E_V8SFmode: - if (TARGET_AVX) + if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV) gen = 
gen_avx_blendvps256; break; case E_V4DFmode: - if (TARGET_AVX) + if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV) gen = gen_avx_blendvpd256; break; case E_V32QImode: @@ -4415,7 +4415,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) case E_V16BFmode: case E_V8SImode: case E_V4DImode: - if (TARGET_AVX2) + if (TARGET_AVX2 && TARGET_SSE_MOVCC_USE_BLENDV) { gen = gen_avx2_pblendvb; blend_mode = V32QImode; diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 82177b9..d5d54ee 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -462,6 +462,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC] #define TARGET_SLOW_STC ix86_tune_features[X86_TUNE_SLOW_STC] #define TARGET_USE_RCR ix86_tune_features[X86_TUNE_USE_RCR] +#define TARGET_SSE_MOVCC_USE_BLENDV \ + ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV] /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 3d123da..b815b6d 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -534,6 +534,14 @@ DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5) DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD, "v2df_reduction_prefer_haddpd", m_NONE) +/* X86_TUNE_SSE_MOVCC_USE_BLENDV: Prefer blendv instructions to + 3-instruction sequence (op1 & mask) | (op2 & ~mask) + for vector condition move. + For Crestmont, 4-operand vex blendv instructions come from MSROM + which is slow. 
*/ +DEF_TUNE (X86_TUNE_SSE_MOVCC_USE_BLENDV, + "sse_movcc_use_blendv", ~m_CORE_ATOM) + /*****************************************************************************/ /* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ /*****************************************************************************/ diff --git a/gcc/testsuite/gcc.target/i386/sse_movcc_use_blendv.c b/gcc/testsuite/gcc.target/i386/sse_movcc_use_blendv.c new file mode 100644 index 0000000..ac9f152 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse_movcc_use_blendv.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-march=sierraforest -O2" } */ +/* { dg-final { scan-assembler-not {(?n)vp?blendv(b|ps|pd)} } } */ + +void +foo (int* a, int* b, int* __restrict c) +{ + for (int i = 0; i != 200; i++) + { + c[i] += a[i] > b[i] ? 1 : -1; + } +} |