aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorliuhongt <hongtao.liu@intel.com>2024-09-24 15:53:14 +0800
committerliuhongt <hongtao.liu@intel.com>2024-10-10 10:21:29 +0800
commit9c8cea8feb6cd54ef73113a0b74f1df7b60d09dc (patch)
treeefcd53d6a4f137194d58f73774349bfc3385b5a3 /gcc
parent8718727509b2d038d00afa3bd5ef8e0df216a287 (diff)
downloadgcc-9c8cea8feb6cd54ef73113a0b74f1df7b60d09dc.zip
gcc-9c8cea8feb6cd54ef73113a0b74f1df7b60d09dc.tar.gz
gcc-9c8cea8feb6cd54ef73113a0b74f1df7b60d09dc.tar.bz2
Add new microarchitecture tune for SRF/GRR/CWF.
For Crestmont, 4-operand VEX blendv instructions come from MSROM and are slower than the 3-instruction sequence (op1 & mask) | (op2 & ~mask). Legacy blendv instructions can still be handled by the decoder. The patch adds a new tune which is enabled for all processors except for SRF/CWF. It will use vpand + vpandn + vpor instead of vpblendvb (and similarly for vblendvps/vblendvpd) on SRF/CWF. gcc/ChangeLog: * config/i386/i386-expand.cc (ix86_expand_sse_movcc): Guard blendv instruction generation under new tune. * config/i386/i386.h (TARGET_SSE_MOVCC_USE_BLENDV): New Macro. * config/i386/x86-tune.def (X86_TUNE_SSE_MOVCC_USE_BLENDV): New tune.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/i386/i386-expand.cc24
-rw-r--r--gcc/config/i386/i386.h2
-rw-r--r--gcc/config/i386/x86-tune.def8
-rw-r--r--gcc/testsuite/gcc.target/i386/sse_movcc_use_blendv.c12
4 files changed, 34 insertions, 12 deletions
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 3284011..0734399 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -4344,23 +4344,23 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
switch (mode)
{
case E_V2SFmode:
- if (TARGET_SSE4_1)
+ if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_mmx_blendvps;
break;
case E_V4SFmode:
- if (TARGET_SSE4_1)
+ if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvps;
break;
case E_V2DFmode:
- if (TARGET_SSE4_1)
+ if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvpd;
break;
case E_SFmode:
- if (TARGET_SSE4_1)
+ if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvss;
break;
case E_DFmode:
- if (TARGET_SSE4_1)
+ if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvsd;
break;
case E_V8QImode:
@@ -4368,7 +4368,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
case E_V4HFmode:
case E_V4BFmode:
case E_V2SImode:
- if (TARGET_SSE4_1)
+ if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
{
gen = gen_mmx_pblendvb_v8qi;
blend_mode = V8QImode;
@@ -4378,14 +4378,14 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
case E_V2HImode:
case E_V2HFmode:
case E_V2BFmode:
- if (TARGET_SSE4_1)
+ if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
{
gen = gen_mmx_pblendvb_v4qi;
blend_mode = V4QImode;
}
break;
case E_V2QImode:
- if (TARGET_SSE4_1)
+ if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_mmx_pblendvb_v2qi;
break;
case E_V16QImode:
@@ -4395,18 +4395,18 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
case E_V4SImode:
case E_V2DImode:
case E_V1TImode:
- if (TARGET_SSE4_1)
+ if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
{
gen = gen_sse4_1_pblendvb;
blend_mode = V16QImode;
}
break;
case E_V8SFmode:
- if (TARGET_AVX)
+ if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
gen = gen_avx_blendvps256;
break;
case E_V4DFmode:
- if (TARGET_AVX)
+ if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
gen = gen_avx_blendvpd256;
break;
case E_V32QImode:
@@ -4415,7 +4415,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
case E_V16BFmode:
case E_V8SImode:
case E_V4DImode:
- if (TARGET_AVX2)
+ if (TARGET_AVX2 && TARGET_SSE_MOVCC_USE_BLENDV)
{
gen = gen_avx2_pblendvb;
blend_mode = V32QImode;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 82177b9..d5d54ee 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -462,6 +462,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC]
#define TARGET_SLOW_STC ix86_tune_features[X86_TUNE_SLOW_STC]
#define TARGET_USE_RCR ix86_tune_features[X86_TUNE_USE_RCR]
+#define TARGET_SSE_MOVCC_USE_BLENDV \
+ ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV]
/* Feature tests against the various architecture variations. */
enum ix86_arch_indices {
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 3d123da..b815b6d 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -534,6 +534,14 @@ DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)
DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD,
"v2df_reduction_prefer_haddpd", m_NONE)
+/* X86_TUNE_SSE_MOVCC_USE_BLENDV: Prefer blendv instructions to
+ 3-instruction sequence (op1 & mask) | (op2 & ~mask)
+ for vector conditional move.
+ For Crestmont, 4-operand vex blendv instructions come from MSROM
+ which is slow. */
+DEF_TUNE (X86_TUNE_SSE_MOVCC_USE_BLENDV,
+ "sse_movcc_use_blendv", ~m_CORE_ATOM)
+
/*****************************************************************************/
/* AVX instruction selection tuning (some of SSE flags affects AVX, too) */
/*****************************************************************************/
diff --git a/gcc/testsuite/gcc.target/i386/sse_movcc_use_blendv.c b/gcc/testsuite/gcc.target/i386/sse_movcc_use_blendv.c
new file mode 100644
index 0000000..ac9f152
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse_movcc_use_blendv.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-march=sierraforest -O2" } */
+/* { dg-final { scan-assembler-not {(?n)vp?blendv(b|ps|pd)} } } */
+
+void
+foo (int* a, int* b, int* __restrict c)
+{
+ for (int i = 0; i != 200; i++)
+ {
+ c[i] += a[i] > b[i] ? 1 : -1;
+ }
+}