diff options
author | Roger Sayle <roger@nextmovesoftware.com> | 2023-07-01 08:24:33 +0100 |
---|---|---|
committer | Roger Sayle <roger@nextmovesoftware.com> | 2023-07-01 08:24:33 +0100 |
commit | 620a35b24a2b6edb67720ec42864b571a972fa45 (patch) | |
tree | cece852eb043ca089930e2c88e33aa3684feed86 /gcc | |
parent | 02460c0b8c9000359a09440f9532664a7835f158 (diff) | |
download | gcc-620a35b24a2b6edb67720ec42864b571a972fa45.zip gcc-620a35b24a2b6edb67720ec42864b571a972fa45.tar.gz gcc-620a35b24a2b6edb67720ec42864b571a972fa45.tar.bz2 |
i386: Add STV support for DImode and SImode rotations by constant.
This patch implements scalar-to-vector (STV) support for DImode and SImode
rotations by constant bit counts. Scalar rotations are almost always
optimal on x86, requiring only one or two instructions, but it is also
possible to implement these efficiently with SSE2, requiring only one
or two instructions for SImode rotations and at most 3 instructions for
DImode rotations. This allows GCC to convert rotations to vector form (STV)
with little or no penalty if there are other (net) benefits to converting a chain. An
example of the benefits is shown below, which is based upon the BLAKE2
cryptographic hash function:
unsigned long long a,b,c,d;

unsigned long rot(unsigned long long x, int y)
{
  return (x<<y) | (x>>(64-y));
}

void foo()
{
  d = rot(d ^ a,32);
  c = c + d;
  b = rot(b ^ c,24);
  a = a + b;
  d = rot(d ^ a,16);
  c = c + d;
  b = rot(b ^ c,63);
}
When compiled with -m32 -O2 -msse2, GCC generates the following code for foo:
Before (59 insns, 247 bytes):
foo: pushl %edi
xorl %edx, %edx
pushl %esi
pushl %ebx
subl $16, %esp
movq a, %xmm1
movq d, %xmm0
movq b, %xmm2
pxor %xmm1, %xmm0
psrlq $32, %xmm0
movd %xmm0, %eax
movd %edx, %xmm0
movd %eax, %xmm3
punpckldq %xmm0, %xmm3
movq c, %xmm0
paddq %xmm3, %xmm0
pxor %xmm0, %xmm2
movd %xmm2, %ecx
psrlq $32, %xmm2
movd %xmm2, %ebx
movl %ecx, %eax
shldl $24, %ebx, %ecx
shldl $24, %eax, %ebx
movd %ebx, %xmm4
movd %ecx, %xmm2
punpckldq %xmm4, %xmm2
movdqa .LC0, %xmm4
pand %xmm4, %xmm2
paddq %xmm2, %xmm1
movq %xmm1, a
pxor %xmm3, %xmm1
movd %xmm1, %esi
psrlq $32, %xmm1
movd %xmm1, %edi
movl %esi, %eax
shldl $16, %edi, %esi
shldl $16, %eax, %edi
movd %esi, %xmm1
movd %edi, %xmm3
punpckldq %xmm3, %xmm1
pand %xmm4, %xmm1
movq %xmm1, d
paddq %xmm1, %xmm0
movq %xmm0, c
pxor %xmm2, %xmm0
movd %xmm0, 8(%esp)
psrlq $32, %xmm0
movl 8(%esp), %eax
movd %xmm0, 12(%esp)
movl 12(%esp), %edx
shrdl $1, %edx, %eax
xorl %edx, %edx
movl %eax, b
movl %edx, b+4
addl $16, %esp
popl %ebx
popl %esi
popl %edi
ret
After (32 insns, 165 bytes):
foo: movq a, %xmm1
xorl %edx, %edx
movq d, %xmm0
movq b, %xmm2
movdqa .LC0, %xmm4
pxor %xmm1, %xmm0
psrlq $32, %xmm0
movd %xmm0, %eax
movd %edx, %xmm0
movd %eax, %xmm3
punpckldq %xmm0, %xmm3
movq c, %xmm0
paddq %xmm3, %xmm0
pxor %xmm0, %xmm2
pshufd $68, %xmm2, %xmm2
psrldq $5, %xmm2
pand %xmm4, %xmm2
paddq %xmm2, %xmm1
movq %xmm1, a
pxor %xmm3, %xmm1
pshuflw $147, %xmm1, %xmm1
pand %xmm4, %xmm1
movq %xmm1, d
paddq %xmm1, %xmm0
movq %xmm0, c
pxor %xmm2, %xmm0
pshufd $20, %xmm0, %xmm0
psrlq $1, %xmm0
pshufd $136, %xmm0, %xmm0
pand %xmm4, %xmm0
movq %xmm0, b
ret
2023-07-01 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (compute_convert_gain): Provide
gains/costs for ROTATE and ROTATERT (by an integer constant).
(general_scalar_chain::convert_rotate): New helper function to
convert a DImode or SImode rotation by an integer constant into
SSE vector form.
(general_scalar_chain::convert_insn): Call the new convert_rotate
for ROTATE and ROTATERT.
(general_scalar_to_vector_candidate_p): Consider ROTATE and
ROTATERT to be candidates if the second operand is an integer
constant, valid for a rotation (or shift) in the given mode.
* config/i386/i386-features.h (general_scalar_chain): Add new
helper method convert_rotate.
gcc/testsuite/ChangeLog
* gcc.target/i386/rotate-6.c: New test case.
* gcc.target/i386/sse2-stv-1.c: Likewise.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/i386/i386-features.cc | 116 | ||||
-rw-r--r-- | gcc/config/i386/i386-features.h | 1 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/rotate-6.c | 195 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/sse2-stv-1.c | 24 |
4 files changed, 336 insertions, 0 deletions
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index c676a90..2e751d1 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -582,6 +582,25 @@ general_scalar_chain::compute_convert_gain () igain -= vector_const_cost (XEXP (src, 0)); break; + case ROTATE: + case ROTATERT: + igain += m * ix86_cost->shift_const; + if (smode == DImode) + { + int bits = INTVAL (XEXP (src, 1)); + if ((bits & 0x0f) == 0) + igain -= ix86_cost->sse_op; + else if ((bits & 0x07) == 0) + igain -= 2 * ix86_cost->sse_op; + else + igain -= 3 * ix86_cost->sse_op; + } + else if (INTVAL (XEXP (src, 1)) == 16) + igain -= ix86_cost->sse_op; + else + igain -= 2 * ix86_cost->sse_op; + break; + case AND: case IOR: case XOR: @@ -1178,6 +1197,95 @@ scalar_chain::convert_insn_common (rtx_insn *insn) } } +/* Convert INSN which is an SImode or DImode rotation by a constant + to vector mode. CODE is either ROTATE or ROTATERT with operands + OP0 and OP1. Returns the SET_SRC of the last instruction in the + resulting sequence, which is emitted before INSN. */ + +rtx +general_scalar_chain::convert_rotate (enum rtx_code code, rtx op0, rtx op1, + rtx_insn *insn) +{ + int bits = INTVAL (op1); + rtx pat, result; + + convert_op (&op0, insn); + if (bits == 0) + return op0; + + if (smode == DImode) + { + if (code == ROTATE) + bits = 64 - bits; + if (bits == 32) + { + rtx tmp1 = gen_reg_rtx (V4SImode); + pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0), + GEN_INT (225)); + emit_insn_before (pat, insn); + result = gen_lowpart (V2DImode, tmp1); + } + else if (bits == 16 || bits == 48) + { + rtx tmp1 = gen_reg_rtx (V8HImode); + pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0), + GEN_INT (bits == 16 ? 
57 : 147)); + emit_insn_before (pat, insn); + result = gen_lowpart (V2DImode, tmp1); + } + else if ((bits & 0x07) == 0) + { + rtx tmp1 = gen_reg_rtx (V4SImode); + pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0), + GEN_INT (68)); + emit_insn_before (pat, insn); + rtx tmp2 = gen_reg_rtx (V1TImode); + pat = gen_sse2_lshrv1ti3 (tmp2, gen_lowpart (V1TImode, tmp1), + GEN_INT (bits)); + emit_insn_before (pat, insn); + result = gen_lowpart (V2DImode, tmp2); + } + else + { + rtx tmp1 = gen_reg_rtx (V4SImode); + pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0), + GEN_INT (20)); + emit_insn_before (pat, insn); + rtx tmp2 = gen_reg_rtx (V2DImode); + pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1), + GEN_INT (bits & 31)); + emit_insn_before (pat, insn); + rtx tmp3 = gen_reg_rtx (V4SImode); + pat = gen_sse2_pshufd (tmp3, gen_lowpart (V4SImode, tmp2), + GEN_INT (bits > 32 ? 34 : 136)); + emit_insn_before (pat, insn); + result = gen_lowpart (V2DImode, tmp3); + } + } + else if (bits == 16) + { + rtx tmp1 = gen_reg_rtx (V8HImode); + pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0), GEN_INT (225)); + emit_insn_before (pat, insn); + result = gen_lowpart (V4SImode, tmp1); + } + else + { + if (code == ROTATE) + bits = 32 - bits; + + rtx tmp1 = gen_reg_rtx (V4SImode); + emit_insn_before (gen_sse2_pshufd (tmp1, op0, GEN_INT (224)), insn); + rtx tmp2 = gen_reg_rtx (V2DImode); + pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1), + GEN_INT (bits)); + emit_insn_before (pat, insn); + result = gen_lowpart (V4SImode, tmp2); + } + + return result; +} + /* Convert INSN to vector mode. 
*/ void @@ -1233,6 +1341,12 @@ general_scalar_chain::convert_insn (rtx_insn *insn) PUT_MODE (src, vmode); break; + case ROTATE: + case ROTATERT: + src = convert_rotate (GET_CODE (src), XEXP (src, 0), XEXP (src, 1), + insn); + break; + case NEG: src = XEXP (src, 0); @@ -2006,6 +2120,8 @@ general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode) case ASHIFT: case LSHIFTRT: + case ROTATE: + case ROTATERT: if (!CONST_INT_P (XEXP (src, 1)) || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1)) return false; diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h index 72a9f54..af5acbb 100644 --- a/gcc/config/i386/i386-features.h +++ b/gcc/config/i386/i386-features.h @@ -189,6 +189,7 @@ class general_scalar_chain : public scalar_chain void convert_insn (rtx_insn *insn) final override; void convert_op (rtx *op, rtx_insn *insn) final override; int vector_const_cost (rtx exp); + rtx convert_rotate (enum rtx_code, rtx op0, rtx op1, rtx_insn *insn); }; class timode_scalar_chain : public scalar_chain diff --git a/gcc/testsuite/gcc.target/i386/rotate-6.c b/gcc/testsuite/gcc.target/i386/rotate-6.c new file mode 100644 index 0000000..42c2072 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/rotate-6.c @@ -0,0 +1,195 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -msse2" } */ +/* { dg-require-effective-target sse2 } */ + +/* scalar 64-bit DImode rotations. 
*/ +unsigned long long rot1(unsigned long long x) { return (x>>1) | (x<<63); } +unsigned long long rot2(unsigned long long x) { return (x>>2) | (x<<62); } +unsigned long long rot3(unsigned long long x) { return (x>>3) | (x<<61); } +unsigned long long rot4(unsigned long long x) { return (x>>4) | (x<<60); } +unsigned long long rot5(unsigned long long x) { return (x>>5) | (x<<59); } +unsigned long long rot6(unsigned long long x) { return (x>>6) | (x<<58); } +unsigned long long rot7(unsigned long long x) { return (x>>7) | (x<<57); } +unsigned long long rot8(unsigned long long x) { return (x>>8) | (x<<56); } +unsigned long long rot9(unsigned long long x) { return (x>>9) | (x<<55); } +unsigned long long rot10(unsigned long long x) { return (x>>10) | (x<<54); } +unsigned long long rot15(unsigned long long x) { return (x>>15) | (x<<49); } +unsigned long long rot16(unsigned long long x) { return (x>>16) | (x<<48); } +unsigned long long rot17(unsigned long long x) { return (x>>17) | (x<<47); } +unsigned long long rot20(unsigned long long x) { return (x>>20) | (x<<44); } +unsigned long long rot24(unsigned long long x) { return (x>>24) | (x<<40); } +unsigned long long rot30(unsigned long long x) { return (x>>30) | (x<<34); } +unsigned long long rot31(unsigned long long x) { return (x>>31) | (x<<33); } +unsigned long long rot32(unsigned long long x) { return (x>>32) | (x<<32); } +unsigned long long rot33(unsigned long long x) { return (x>>33) | (x<<31); } +unsigned long long rot34(unsigned long long x) { return (x>>34) | (x<<30); } +unsigned long long rot40(unsigned long long x) { return (x>>40) | (x<<24); } +unsigned long long rot42(unsigned long long x) { return (x>>42) | (x<<22); } +unsigned long long rot48(unsigned long long x) { return (x>>48) | (x<<16); } +unsigned long long rot50(unsigned long long x) { return (x>>50) | (x<<14); } +unsigned long long rot56(unsigned long long x) { return (x>>56) | (x<<8); } +unsigned long long rot58(unsigned long long x) { return (x>>58) 
| (x<<6); } +unsigned long long rot60(unsigned long long x) { return (x>>60) | (x<<4); } +unsigned long long rot61(unsigned long long x) { return (x>>61) | (x<<3); } +unsigned long long rot62(unsigned long long x) { return (x>>62) | (x<<2); } +unsigned long long rot63(unsigned long long x) { return (x>>63) | (x<<1); } + +/* DImode mem-to-mem rotations. These STV with -m32. */ +void mem1(unsigned long long *p) { *p = rot1(*p); } +void mem2(unsigned long long *p) { *p = rot2(*p); } +void mem3(unsigned long long *p) { *p = rot3(*p); } +void mem4(unsigned long long *p) { *p = rot4(*p); } +void mem5(unsigned long long *p) { *p = rot5(*p); } +void mem6(unsigned long long *p) { *p = rot6(*p); } +void mem7(unsigned long long *p) { *p = rot7(*p); } +void mem8(unsigned long long *p) { *p = rot8(*p); } +void mem9(unsigned long long *p) { *p = rot9(*p); } +void mem10(unsigned long long *p) { *p = rot10(*p); } +void mem15(unsigned long long *p) { *p = rot15(*p); } +void mem16(unsigned long long *p) { *p = rot16(*p); } +void mem17(unsigned long long *p) { *p = rot17(*p); } +void mem20(unsigned long long *p) { *p = rot20(*p); } +void mem24(unsigned long long *p) { *p = rot24(*p); } +void mem30(unsigned long long *p) { *p = rot30(*p); } +void mem31(unsigned long long *p) { *p = rot31(*p); } +void mem32(unsigned long long *p) { *p = rot32(*p); } +void mem33(unsigned long long *p) { *p = rot33(*p); } +void mem34(unsigned long long *p) { *p = rot34(*p); } +void mem40(unsigned long long *p) { *p = rot40(*p); } +void mem42(unsigned long long *p) { *p = rot42(*p); } +void mem48(unsigned long long *p) { *p = rot48(*p); } +void mem50(unsigned long long *p) { *p = rot50(*p); } +void mem56(unsigned long long *p) { *p = rot56(*p); } +void mem58(unsigned long long *p) { *p = rot58(*p); } +void mem60(unsigned long long *p) { *p = rot60(*p); } +void mem61(unsigned long long *p) { *p = rot61(*p); } +void mem62(unsigned long long *p) { *p = rot62(*p); } +void mem63(unsigned long long *p) { *p = 
rot63(*p); } + +/* Check that rotN and memN give the same result. */ +typedef unsigned long long (*rotN)(unsigned long long); +typedef void (*memN)(unsigned long long*); + +void eval(rotN s, memN v, unsigned long long x) +{ + unsigned long long r = s(x); + unsigned long long t = x; + v(&t); + + if (t != r) + __builtin_abort (); +} + +void test(rotN s, memN v) +{ + eval(s,v,0x0000000000000000ll); + eval(s,v,0x0000000000000001ll); + eval(s,v,0x0000000000000002ll); + eval(s,v,0x0000000000000004ll); + eval(s,v,0x0000000000000008ll); + eval(s,v,0x0000000000000010ll); + eval(s,v,0x0000000000000020ll); + eval(s,v,0x0000000000000040ll); + eval(s,v,0x0000000000000080ll); + eval(s,v,0x0000000000000100ll); + eval(s,v,0x0000000000000200ll); + eval(s,v,0x0000000000000400ll); + eval(s,v,0x0000000000000800ll); + eval(s,v,0x0000000000001000ll); + eval(s,v,0x0000000000002000ll); + eval(s,v,0x0000000000004000ll); + eval(s,v,0x0000000000008000ll); + eval(s,v,0x0000000000010000ll); + eval(s,v,0x0000000000020000ll); + eval(s,v,0x0000000000040000ll); + eval(s,v,0x0000000000080000ll); + eval(s,v,0x0000000000100000ll); + eval(s,v,0x0000000000200000ll); + eval(s,v,0x0000000000400000ll); + eval(s,v,0x0000000000800000ll); + eval(s,v,0x0000000001000000ll); + eval(s,v,0x0000000002000000ll); + eval(s,v,0x0000000004000000ll); + eval(s,v,0x0000000008000000ll); + eval(s,v,0x0000000010000000ll); + eval(s,v,0x0000000020000000ll); + eval(s,v,0x0000000040000000ll); + eval(s,v,0x0000000080000000ll); + eval(s,v,0x0000000100000000ll); + eval(s,v,0x0000000200000000ll); + eval(s,v,0x0000000400000000ll); + eval(s,v,0x0000000800000000ll); + eval(s,v,0x0000001000000000ll); + eval(s,v,0x0000002000000000ll); + eval(s,v,0x0000004000000000ll); + eval(s,v,0x0000008000000000ll); + eval(s,v,0x0000010000000000ll); + eval(s,v,0x0000020000000000ll); + eval(s,v,0x0000040000000000ll); + eval(s,v,0x0000080000000000ll); + eval(s,v,0x0000100000000000ll); + eval(s,v,0x0000200000000000ll); + eval(s,v,0x0000400000000000ll); + 
eval(s,v,0x0000800000000000ll); + eval(s,v,0x0001000000000000ll); + eval(s,v,0x0002000000000000ll); + eval(s,v,0x0004000000000000ll); + eval(s,v,0x0008000000000000ll); + eval(s,v,0x0010000000000000ll); + eval(s,v,0x0020000000000000ll); + eval(s,v,0x0040000000000000ll); + eval(s,v,0x0080000000000000ll); + eval(s,v,0x0100000000000000ll); + eval(s,v,0x0200000000000000ll); + eval(s,v,0x0400000000000000ll); + eval(s,v,0x0800000000000000ll); + eval(s,v,0x1000000000000000ll); + eval(s,v,0x2000000000000000ll); + eval(s,v,0x4000000000000000ll); + eval(s,v,0x8000000000000000ll); + eval(s,v,0x0123456789abcdefll); + eval(s,v,0x1111111111111111ll); + eval(s,v,0x5555555555555555ll); + eval(s,v,0x8888888888888888ll); + eval(s,v,0xaaaaaaaaaaaaaaaall); + eval(s,v,0xcafebabecafebabell); + eval(s,v,0xdeadbeefdeadbeefll); + eval(s,v,0xfedcba9876543210ll); + eval(s,v,0xffffffffffffffffll); +} + +int main() +{ + test(rot1,mem1); + test(rot2,mem2); + test(rot3,mem3); + test(rot4,mem4); + test(rot5,mem5); + test(rot6,mem6); + test(rot7,mem7); + test(rot8,mem8); + test(rot9,mem9); + test(rot10,mem10); + test(rot15,mem15); + test(rot16,mem16); + test(rot17,mem17); + test(rot20,mem20); + test(rot24,mem24); + test(rot30,mem30); + test(rot31,mem31); + test(rot32,mem32); + test(rot33,mem33); + test(rot34,mem34); + test(rot40,mem40); + test(rot42,mem42); + test(rot48,mem48); + test(rot50,mem50); + test(rot56,mem56); + test(rot58,mem58); + test(rot60,mem60); + test(rot61,mem61); + test(rot62,mem62); + test(rot63,mem63); + return 0; +} + diff --git a/gcc/testsuite/gcc.target/i386/sse2-stv-1.c b/gcc/testsuite/gcc.target/i386/sse2-stv-1.c new file mode 100644 index 0000000..a95d4ed --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-stv-1.c @@ -0,0 +1,24 @@ +/* { dg-do compile { target ia32 } } */ +/* { dg-options "-O2 -msse2" } */ + +unsigned long long a,b,c,d; + +static unsigned long rot(unsigned long long x, int y) +{ + /* Only called with y in 1..63. 
*/ + return (x<<y) | (x>>(64-y)); +} + +void foo() +{ + d = rot(d ^ a,32); + c = c + d; + b = rot(b ^ c,24); + a = a + b; + d = rot(d ^ a,16); + c = c + d; + b = rot(b ^ c,63); +} + +/* { dg-final { scan-assembler-not "shldl" } } */ +/* { dg-final { scan-assembler-not "%\[er\]sp" } } */ |