diff options
author | Roger Sayle <roger@nextmovesoftware.com> | 2024-08-25 09:14:34 -0600 |
---|---|---|
committer | Roger Sayle <roger@nextmovesoftware.com> | 2024-08-25 09:14:34 -0600 |
commit | 07d62a1711f3e3bbdd2146ab5914d3bc5e246509 (patch) | |
tree | f901f84e14c6e5577cc3fd22bd113e049af32c64 /gcc/config/i386/i386-features.cc | |
parent | 70edccf88738ec204036e498a4a50c46e5e4f0c0 (diff) | |
download | gcc-07d62a1711f3e3bbdd2146ab5914d3bc5e246509.zip gcc-07d62a1711f3e3bbdd2146ab5914d3bc5e246509.tar.gz gcc-07d62a1711f3e3bbdd2146ab5914d3bc5e246509.tar.bz2 |
i386: Update STV's gains for TImode arithmetic right shifts on AVX2.
This patch tweaks timode_scalar_chain::compute_convert_gain to better
reflect the expansion of V1TImode arithmetic right shifts by the i386
backend. The comment "see ix86_expand_v1ti_ashiftrt" appears after
"case ASHIFTRT" in compute_convert_gain, and the changes below attempt
to better match the logic used there.
The original motivating example is:
    __int128 m1;
    void foo()
    {
      m1 = (m1 << 8) >> 8;
    }
With -O2 -mavx2, STV currently fails to convert this to vector form
because the arithmetic right shift is costed inappropriately:
Instruction gain -16 for 7: {r103:TI=r101:TI>>0x8;clobber flags:CC;}
Total gain: -3
Chain #1 conversion is not profitable
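This reports that the ASHIFTRT is four instructions worse in vector form
than in scalar form (a gain of -16 is COSTS_N_INSNS (4), because
COSTS_N_INSNS (N) is N*4). That is incorrect: the AVX2 expansion of this
shift requires only three instructions and the scalar form requires two,
so the vector form is just one instruction worse.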
With more accurate costs in timode_scalar_chain::compute_convert_gain
we now see (with -O2 -mavx2):
Instruction gain -4 for 7: {r103:TI=r101:TI>>0x8;clobber flags:CC;}
Total gain: 9
Converting chain #1...
which results in:
    foo:    vmovdqa   m1(%rip), %xmm0
            vpslldq   $1, %xmm0, %xmm0
            vpsrad    $8, %xmm0, %xmm1
            vpsrldq   $1, %xmm0, %xmm0
            vpblendd  $7, %xmm0, %xmm1, %xmm0
            vmovdqa   %xmm0, m1(%rip)
            ret
2024-08-25 Roger Sayle <roger@nextmovesoftware.com>
Uros Bizjak <ubizjak@gmail.com>
gcc/ChangeLog
* config/i386/i386-features.cc (compute_convert_gain)
<case ASHIFTRT>: Update to match ix86_expand_v1ti_ashiftrt.
Diffstat (limited to 'gcc/config/i386/i386-features.cc')
-rw-r--r-- | gcc/config/i386/i386-features.cc | 21 |
1 files changed, 13 insertions, 8 deletions
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 7e80e7b..ca902ec 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -1650,23 +1650,28 @@ timode_scalar_chain::compute_convert_gain () else if (op1val == 64) vcost = COSTS_N_INSNS (3); else if (op1val == 96) - vcost = COSTS_N_INSNS (4); - else if (op1val >= 111) vcost = COSTS_N_INSNS (3); - else if (TARGET_AVX2 && op1val == 32) + else if (op1val >= 111) vcost = COSTS_N_INSNS (3); else if (TARGET_SSE4_1 && op1val == 32) - vcost = COSTS_N_INSNS (4); + vcost = COSTS_N_INSNS (3); + else if (TARGET_SSE4_1 + && (op1val == 8 || op1val == 16 || op1val == 24)) + vcost = COSTS_N_INSNS (3); else if (op1val >= 96) - vcost = COSTS_N_INSNS (5); + vcost = COSTS_N_INSNS (4); + else if (TARGET_SSE4_1 && (op1val == 28 || op1val == 80)) + vcost = COSTS_N_INSNS (4); else if ((op1val & 7) == 0) - vcost = COSTS_N_INSNS (6); + vcost = COSTS_N_INSNS (5); else if (TARGET_AVX2 && op1val < 32) vcost = COSTS_N_INSNS (6); + else if (TARGET_SSE4_1 && op1val < 15) + vcost = COSTS_N_INSNS (6); else if (op1val == 1 || op1val >= 64) - vcost = COSTS_N_INSNS (9); + vcost = COSTS_N_INSNS (8); else - vcost = COSTS_N_INSNS (10); + vcost = COSTS_N_INSNS (9); } igain = scost - vcost; break; |