author		Jakub Jelinek <jakub@redhat.com>	2011-10-18 23:27:40 +0200
committer	Jakub Jelinek <jakub@gcc.gnu.org>	2011-10-18 23:27:40 +0200
commit		b6f9a04a80187b976b37bbf1afb0c8e2eefa943d (patch)
tree		de23d6e3c334ccca26f466c22e7d5d134ca8a089 /gcc
parent		e9d662bb75152bb63ada99d46d30d98197a82160 (diff)
i386.c (ix86_expand_vec_perm): In merge_two use mode SUBREG of operands[0] as target.
	* config/i386/i386.c (ix86_expand_vec_perm): In merge_two use
	mode SUBREG of operands[0] as target.
	(valid_perm_using_mode_p): Don't ignore higher bits of d->perm.
	(expand_vec_pshufb): For V8SImode vmode emit avx2_permvarv8si.
	(expand_vec_perm_1): Handle identity and some broadcast
	permutations.
	(expand_vec_perm_interleave2): Handle also 32-byte modes, using
	vperm2[fi]128 or vpunpck[lh]* followed by single insn permutation.
	For d->testing_p return true earlier to avoid creating more GC
	garbage.
	(expand_vec_perm_vpermq_perm_1): New function.
	(expand_vec_perm_vpshufb2_vpermq): For d->testing_p return true
	earlier to avoid creating more GC garbage.  Fix handling of
	V16HImode.  Avoid some SUBREGs in SET_DEST.
	(expand_vec_perm_broadcast_1): Return false for 32-byte integer
	vector modes.
	(expand_vec_perm_vpshufb4_vpermq2): New function.
	(ix86_expand_vec_perm_builtin_1): Call expand_vec_perm_vpermq_perm_1
	and expand_vec_perm_vpshufb4_vpermq2.

From-SVN: r180169
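For context: these expanders sit behind GCC's constant vector-permute expansion (e.g. __builtin_shuffle in GCC 4.7). A minimal sketch of the new broadcast fast path in expand_vec_perm_1, assuming -mavx2; the function name is illustrative:

typedef char v32qi __attribute__ ((vector_size (32)));

v32qi
broadcast_byte0 (v32qi x)
{
  /* Zero-initialized selector: every result element reads element 0,
     i.e. the broadcast_perm case this patch adds, which can now
     become a single vpbroadcastb.  */
  v32qi sel = { 0 };
  return __builtin_shuffle (x, sel);
}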
Diffstat (limited to 'gcc')
-rw-r--r--	gcc/ChangeLog		 22
-rw-r--r--	gcc/config/i386/i386.c	524
2 files changed, 476 insertions, 70 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 2dc4ba2..d8f5a82 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,25 @@
+2011-10-18  Jakub Jelinek  <jakub@redhat.com>
+
+ * config/i386/i386.c (ix86_expand_vec_perm): In merge_two use
+ mode SUBREG of operands[0] as target.
+ (valid_perm_using_mode_p): Don't ignore higher bits of d->perm.
+ (expand_vec_pshufb): For V8SImode vmode emit avx2_permvarv8si.
+ (expand_vec_perm_1): Handle identity and some broadcast
+ permutations.
+ (expand_vec_perm_interleave2): Handle also 32-byte modes, using
+ vperm2[fi]128 or vpunpck[lh]* followed by single insn permutation.
+ For d->testing_p return true earlier to avoid creating more GC
+ garbage.
+ (expand_vec_perm_vpermq_perm_1): New function.
+ (expand_vec_perm_vpshufb2_vpermq): For d->testing_p return true
+ earlier to avoid creating more GC garbage. Fix handling of
+ V16HImode. Avoid some SUBREGs in SET_DEST.
+ (expand_vec_perm_broadcast_1): Return false for 32-byte integer
+ vector modes.
+ (expand_vec_perm_vpshufb4_vpermq2): New function.
+ (ix86_expand_vec_perm_builtin_1): Call expand_vec_perm_vpermq_perm_1
+ and expand_vec_perm_vpshufb4_vpermq2.
+
2011-10-18  Andrew Stubbs  <ams@codesourcery.com>

	* config/arm/driver-arm.c (host_detect_local_cpu): Close the file
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 2c53423..ec9d39b 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19663,7 +19663,7 @@ ix86_expand_vec_perm (rtx operands[])
mask = expand_simple_binop (maskmode, AND, mask, vt,
NULL_RTX, 0, OPTAB_DIRECT);
- xops[0] = operands[0];
+ xops[0] = gen_lowpart (mode, operands[0]);
xops[1] = gen_lowpart (mode, t2);
xops[2] = gen_lowpart (mode, t1);
xops[3] = gen_rtx_EQ (maskmode, mask, vt);
@@ -35006,8 +35006,7 @@ valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
return false;
else
for (j = 1; j < chunk; ++j)
- if ((d->perm[i] & (d->nelt - 1)) + j
- != (d->perm[i + j] & (d->nelt - 1)))
+ if (d->perm[i] + j != d->perm[i + j])
return false;
return true;
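The valid_perm_using_mode_p change fixes an accepted-but-wrong case: d->perm indices range over both operands (0 .. 2*nelt-1), and masking with nelt - 1 discarded the operand-select bit. For instance, for a two-operand V16QImode permutation validated in V8HImode chunks (nelt == 16, chunk == 2), a selector starting { 0, 17, ... } used to pass because (0 & 15) + 1 == (17 & 15), even though byte 0 of op0 and byte 1 of op1 never form one halfword; comparing the unmasked indices rejects it.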
@@ -35138,6 +35137,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
else if (vmode == V32QImode)
emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+ else
+ emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
}
else
{
@@ -35163,9 +35164,58 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
if (d->op0 == d->op1)
{
int mask = nelt - 1;
+ bool identity_perm = true;
+ bool broadcast_perm = true;
for (i = 0; i < nelt; i++)
- perm2[i] = d->perm[i] & mask;
+ {
+ perm2[i] = d->perm[i] & mask;
+ if (perm2[i] != i)
+ identity_perm = false;
+ if (perm2[i])
+ broadcast_perm = false;
+ }
+
+ if (identity_perm)
+ {
+ if (!d->testing_p)
+ emit_move_insn (d->target, d->op0);
+ return true;
+ }
+ else if (broadcast_perm && TARGET_AVX2)
+ {
+ /* Use vpbroadcast{b,w,d}. */
+ rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
+ switch (d->vmode)
+ {
+ case V32QImode:
+ op = gen_lowpart (V16QImode, op);
+ gen = gen_avx2_pbroadcastv32qi;
+ break;
+ case V16HImode:
+ op = gen_lowpart (V8HImode, op);
+ gen = gen_avx2_pbroadcastv16hi;
+ break;
+ case V8SImode:
+ op = gen_lowpart (V4SImode, op);
+ gen = gen_avx2_pbroadcastv8si;
+ break;
+ case V16QImode:
+ gen = gen_avx2_pbroadcastv16qi;
+ break;
+ case V8HImode:
+ gen = gen_avx2_pbroadcastv8hi;
+ break;
+ /* For other modes prefer other shuffles this function creates. */
+ default: break;
+ }
+ if (gen != NULL)
+ {
+ if (!d->testing_p)
+ emit_insn (gen (d->target, op));
+ return true;
+ }
+ }
if (expand_vselect (d->target, d->op0, perm2, nelt))
return true;
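A sketch of the identity fast path added above (again assuming __builtin_shuffle and a single input): the selector satisfies perm2[i] == i for every i, so expand_vec_perm_1 now emits a plain move instead of a shuffle instruction.

typedef int v8si __attribute__ ((vector_size (32)));

v8si
identity_copy (v8si x)
{
  /* Identity selector: collapses to emit_move_insn (d->target, d->op0).  */
  v8si sel = { 0, 1, 2, 3, 4, 5, 6, 7 };
  return __builtin_shuffle (x, sel);
}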
@@ -35349,93 +35399,210 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
{
struct expand_vec_perm_d dremap, dfinal;
unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
- unsigned contents, h1, h2, h3, h4;
+ unsigned HOST_WIDE_INT contents;
unsigned char remap[2 * MAX_VECT_LEN];
rtx seq;
- bool ok;
+ bool ok, same_halves = false;
- if (d->op0 == d->op1)
- return false;
-
- /* The 256-bit unpck[lh]p[sd] instructions only operate within the 128-bit
- lanes. We can use similar techniques with the vperm2f128 instruction,
- but it requires slightly different logic. */
- if (GET_MODE_SIZE (d->vmode) != 16)
+ if (GET_MODE_SIZE (d->vmode) == 16)
+ {
+ if (d->op0 == d->op1)
+ return false;
+ }
+ else if (GET_MODE_SIZE (d->vmode) == 32)
+ {
+ if (!TARGET_AVX)
+ return false;
+ /* For 32-byte modes allow even d->op0 == d->op1.
+ The lack of cross-lane shuffling in some instructions
+ might prevent a single insn shuffle. */
+ }
+ else
return false;
/* Examine from whence the elements come. */
contents = 0;
for (i = 0; i < nelt; ++i)
- contents |= 1u << d->perm[i];
-
- /* Split the two input vectors into 4 halves. */
- h1 = (1u << nelt2) - 1;
- h2 = h1 << nelt2;
- h3 = h2 << nelt2;
- h4 = h3 << nelt2;
+ contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
memset (remap, 0xff, sizeof (remap));
dremap = *d;
- /* If the elements from the low halves use interleave low, and similarly
- for interleave high. If the elements are from mis-matched halves, we
- can use shufps for V4SF/V4SI or do a DImode shuffle. */
- if ((contents & (h1 | h3)) == contents)
+ if (GET_MODE_SIZE (d->vmode) == 16)
{
- for (i = 0; i < nelt2; ++i)
+ unsigned HOST_WIDE_INT h1, h2, h3, h4;
+
+ /* Split the two input vectors into 4 halves. */
+ h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
+ h2 = h1 << nelt2;
+ h3 = h2 << nelt2;
+ h4 = h3 << nelt2;
+
+ /* If the elements are only from the low halves, use interleave low;
+ similarly for interleave high.  If the elements are from mis-matched
+ halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
+ if ((contents & (h1 | h3)) == contents)
{
- remap[i] = i * 2;
- remap[i + nelt] = i * 2 + 1;
- dremap.perm[i * 2] = i;
- dremap.perm[i * 2 + 1] = i + nelt;
+ /* punpckl* */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i] = i * 2;
+ remap[i + nelt] = i * 2 + 1;
+ dremap.perm[i * 2] = i;
+ dremap.perm[i * 2 + 1] = i + nelt;
+ }
}
- }
- else if ((contents & (h2 | h4)) == contents)
- {
- for (i = 0; i < nelt2; ++i)
+ else if ((contents & (h2 | h4)) == contents)
{
- remap[i + nelt2] = i * 2;
- remap[i + nelt + nelt2] = i * 2 + 1;
- dremap.perm[i * 2] = i + nelt2;
- dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+ /* punpckh* */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i + nelt2] = i * 2;
+ remap[i + nelt + nelt2] = i * 2 + 1;
+ dremap.perm[i * 2] = i + nelt2;
+ dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+ }
}
- }
- else if ((contents & (h1 | h4)) == contents)
- {
- for (i = 0; i < nelt2; ++i)
+ else if ((contents & (h1 | h4)) == contents)
{
- remap[i] = i;
- remap[i + nelt + nelt2] = i + nelt2;
- dremap.perm[i] = i;
- dremap.perm[i + nelt2] = i + nelt + nelt2;
+ /* shufps */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i] = i;
+ remap[i + nelt + nelt2] = i + nelt2;
+ dremap.perm[i] = i;
+ dremap.perm[i + nelt2] = i + nelt + nelt2;
+ }
+ if (nelt != 4)
+ {
+ /* shufpd */
+ dremap.vmode = V2DImode;
+ dremap.nelt = 2;
+ dremap.perm[0] = 0;
+ dremap.perm[1] = 3;
+ }
}
- if (nelt != 4)
+ else if ((contents & (h2 | h3)) == contents)
{
- dremap.vmode = V2DImode;
- dremap.nelt = 2;
- dremap.perm[0] = 0;
- dremap.perm[1] = 3;
+ /* shufps */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i + nelt2] = i;
+ remap[i + nelt] = i + nelt2;
+ dremap.perm[i] = i + nelt2;
+ dremap.perm[i + nelt2] = i + nelt;
+ }
+ if (nelt != 4)
+ {
+ /* shufpd */
+ dremap.vmode = V2DImode;
+ dremap.nelt = 2;
+ dremap.perm[0] = 1;
+ dremap.perm[1] = 2;
+ }
}
+ else
+ return false;
}
- else if ((contents & (h2 | h3)) == contents)
+ else
{
- for (i = 0; i < nelt2; ++i)
+ unsigned int nelt4 = nelt / 4, nzcnt = 0;
+ unsigned HOST_WIDE_INT q[8];
+ unsigned int nonzero_halves[4];
+
+ /* Split the two input vectors into 8 quarters. */
+ q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
+ for (i = 1; i < 8; ++i)
+ q[i] = q[0] << (nelt4 * i);
+ for (i = 0; i < 4; ++i)
+ if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
+ {
+ nonzero_halves[nzcnt] = i;
+ ++nzcnt;
+ }
+
+ if (nzcnt == 1)
+ {
+ gcc_assert (d->op0 == d->op1);
+ nonzero_halves[1] = nonzero_halves[0];
+ same_halves = true;
+ }
+ else if (d->op0 == d->op1)
+ {
+ gcc_assert (nonzero_halves[0] == 0);
+ gcc_assert (nonzero_halves[1] == 1);
+ }
+
+ if (nzcnt <= 2)
+ {
+ if (d->perm[0] / nelt2 == nonzero_halves[1])
+ {
+ /* Attempt to increase the likelihood that the dfinal
+ shuffle will be intra-lane.  */
+ char tmph = nonzero_halves[0];
+ nonzero_halves[0] = nonzero_halves[1];
+ nonzero_halves[1] = tmph;
+ }
+
+ /* vperm2f128 or vperm2i128. */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
+ remap[i + nonzero_halves[0] * nelt2] = i;
+ dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
+ dremap.perm[i] = i + nonzero_halves[0] * nelt2;
+ }
+
+ if (d->vmode != V8SFmode
+ && d->vmode != V4DFmode
+ && d->vmode != V8SImode)
+ {
+ dremap.vmode = V8SImode;
+ dremap.nelt = 8;
+ for (i = 0; i < 4; ++i)
+ {
+ dremap.perm[i] = i + nonzero_halves[0] * 4;
+ dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
+ }
+ }
+ }
+ else if (d->op0 == d->op1)
+ return false;
+ else if (TARGET_AVX2
+ && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
{
- remap[i + nelt2] = i;
- remap[i + nelt] = i + nelt2;
- dremap.perm[i] = i + nelt2;
- dremap.perm[i + nelt2] = i + nelt;
+ /* vpunpckl* */
+ for (i = 0; i < nelt4; ++i)
+ {
+ remap[i] = i * 2;
+ remap[i + nelt] = i * 2 + 1;
+ remap[i + nelt2] = i * 2 + nelt2;
+ remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
+ dremap.perm[i * 2] = i;
+ dremap.perm[i * 2 + 1] = i + nelt;
+ dremap.perm[i * 2 + nelt2] = i + nelt2;
+ dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
+ }
}
- if (nelt != 4)
+ else if (TARGET_AVX2
+ && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
{
- dremap.vmode = V2DImode;
- dremap.nelt = 2;
- dremap.perm[0] = 1;
- dremap.perm[1] = 2;
+ /* vpunpckh* */
+ for (i = 0; i < nelt4; ++i)
+ {
+ remap[i + nelt4] = i * 2;
+ remap[i + nelt + nelt4] = i * 2 + 1;
+ remap[i + nelt2 + nelt4] = i * 2 + nelt2;
+ remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
+ dremap.perm[i * 2] = i + nelt4;
+ dremap.perm[i * 2 + 1] = i + nelt + nelt4;
+ dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
+ dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
+ }
}
+ else
+ return false;
}
- else
- return false;
/* Use the remapping array set up above to move the elements from their
swizzled locations into their final destinations. */
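A worked illustration of the new 32-byte branch (values not from the patch): for V8SImode, nelt == 8 and nelt4 == 2, so contents is a 16-bit mask and the quarter masks are q[0] = 0x0003, q[1] = 0x000c, ..., q[7] = 0xc000, one per 64-bit quarter. A selector such as { 0, 1, 8, 9, 2, 3, 10, 11 } gives contents == 0x0f0f: only two 128-bit halves are populated (the low half of each operand), so nzcnt == 2 and dremap becomes a vperm2f128/vperm2i128-style half selection, leaving the remaining intra-lane reordering to the single-insn dfinal shuffle.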
@@ -35444,7 +35611,15 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
{
unsigned e = remap[d->perm[i]];
gcc_assert (e < nelt);
- dfinal.perm[i] = e;
+ /* If same_halves is true, both halves of the remapped vector are the
+ same. Avoid cross-lane accesses if possible. */
+ if (same_halves && i >= nelt2)
+ {
+ gcc_assert (e < nelt2);
+ dfinal.perm[i] = e + nelt2;
+ }
+ else
+ dfinal.perm[i] = e;
}
dfinal.op0 = gen_reg_rtx (dfinal.vmode);
dfinal.op1 = dfinal.op0;
@@ -35460,6 +35635,9 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
if (!ok)
return false;
+ if (d->testing_p)
+ return true;
+
if (dremap.vmode != dfinal.vmode)
{
dremap.target = gen_lowpart (dremap.vmode, dremap.target);
@@ -35475,6 +35653,83 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
}
/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+ a single vector cross-lane permutation into vpermq followed
+ by any of the single insn permutations. */
+
+static bool
+expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
+{
+ struct expand_vec_perm_d dremap, dfinal;
+ unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
+ unsigned contents[2];
+ bool ok;
+
+ if (!(TARGET_AVX2
+ && (d->vmode == V32QImode || d->vmode == V16HImode)
+ && d->op0 == d->op1))
+ return false;
+
+ contents[0] = 0;
+ contents[1] = 0;
+ for (i = 0; i < nelt2; ++i)
+ {
+ contents[0] |= 1u << (d->perm[i] / nelt4);
+ contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
+ }
+
+ for (i = 0; i < 2; ++i)
+ {
+ unsigned int cnt = 0;
+ for (j = 0; j < 4; ++j)
+ if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
+ return false;
+ }
+
+ if (d->testing_p)
+ return true;
+
+ dremap = *d;
+ dremap.vmode = V4DImode;
+ dremap.nelt = 4;
+ dremap.target = gen_reg_rtx (V4DImode);
+ dremap.op0 = gen_lowpart (V4DImode, d->op0);
+ dremap.op1 = dremap.op0;
+ for (i = 0; i < 2; ++i)
+ {
+ unsigned int cnt = 0;
+ for (j = 0; j < 4; ++j)
+ if ((contents[i] & (1u << j)) != 0)
+ dremap.perm[2 * i + cnt++] = j;
+ for (; cnt < 2; ++cnt)
+ dremap.perm[2 * i + cnt] = 0;
+ }
+
+ dfinal = *d;
+ dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
+ dfinal.op1 = dfinal.op0;
+ for (i = 0, j = 0; i < nelt; ++i)
+ {
+ if (i == nelt2)
+ j = 2;
+ dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
+ if ((d->perm[i] / nelt4) == dremap.perm[j])
+ ;
+ else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
+ dfinal.perm[i] |= nelt4;
+ else
+ gcc_unreachable ();
+ }
+
+ ok = expand_vec_perm_1 (&dremap);
+ gcc_assert (ok);
+
+ ok = expand_vec_perm_1 (&dfinal);
+ gcc_assert (ok);
+
+ return true;
+}
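An illustration of the splitting above (indices not from the patch): contents[h] records which 64-bit quarters half h of the result reads, and each half may use at most two. If, say, the low half reads only quarters 0 and 2 and the high half only quarters 1 and 3, dremap.perm becomes { 0, 2, 1, 3 }, i.e. one vpermq, after which every byte the final single-insn shuffle needs already sits within its own 128-bit lane.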
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
a two vector permutation using 2 intra-lane interleave insns
and cross-lane shuffle for 32-byte vectors. */
@@ -35621,6 +35876,9 @@ expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
|| (d->vmode != V32QImode && d->vmode != V16HImode))
return false;
+ if (d->testing_p)
+ return true;
+
nelt = d->nelt;
eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
@@ -35635,12 +35893,12 @@ expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
for (i = 0; i < nelt; ++i)
{
unsigned j, e = d->perm[i] & (nelt / 2 - 1);
- unsigned which = ((d->perm[i] ^ i) & (nelt / 2));
+ unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
for (j = 0; j < eltsz; ++j)
{
rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
- rperm[!which][(i * eltsz + j) ^ (which ^ (nelt / 2))] = m128;
+ rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
}
}
@@ -35652,10 +35910,9 @@ expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
/* Swap the 128-bit lanes of h into hp. */
- hp = gen_reg_rtx (V32QImode);
+ hp = gen_reg_rtx (V4DImode);
op = gen_lowpart (V4DImode, h);
- emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, hp), op,
- const2_rtx, GEN_INT (3), const0_rtx,
+ emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
const1_rtx));
vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
@@ -35666,7 +35923,7 @@ expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
op = gen_lowpart (V32QImode, d->target);
- emit_insn (gen_iorv32qi3 (op, l, hp));
+ emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
return true;
}
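A worked example of the V16HImode fix above: nelt == 16 and eltsz == 2, so a cross-lane element has ((d->perm[i] ^ i) & 8) set, but the pshufb masks are indexed in bytes. Scaling by eltsz makes the XOR offset 8 * 2 == 16 bytes (a full 128-bit lane), and the zeroing entry in the opposite mask likewise sits a fixed 16 bytes away rather than nelt / 2 == 8, which coincided with the right byte offset only for V32QImode.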
@@ -35994,6 +36251,15 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
gcc_assert (ok);
return true;
+ case V32QImode:
+ case V16HImode:
+ case V8SImode:
+ case V4DImode:
+ /* For AVX2 broadcasts of the first element vpbroadcast* or
+ vpermq should be used by expand_vec_perm_1. */
+ gcc_assert (!TARGET_AVX2 || d->perm[0]);
+ return false;
+
default:
gcc_unreachable ();
}
@@ -36018,6 +36284,117 @@ expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
return expand_vec_perm_broadcast_1 (d);
}
+/* Implement arbitrary permutation of two V32QImode and V16HImode operands
+ with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
+ all the shorter instruction sequences. */
+
+static bool
+expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
+{
+ rtx rperm[4][32], vperm, l[2], h[2], op, m128;
+ unsigned int i, nelt, eltsz;
+ bool used[4];
+
+ if (!TARGET_AVX2
+ || d->op0 == d->op1
+ || (d->vmode != V32QImode && d->vmode != V16HImode))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ nelt = d->nelt;
+ eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+ /* Generate 4 permutation masks.  If the required element is within
+ the same lane, it is shuffled in.  If the required element is from
+ the other lane, force a zero by setting bit 7 in the permutation
+ mask.  In the cross-lane masks an element is non-negative only if
+ it is requested from the other lane; it is also placed at the
+ lane-swapped position, so that the result of vpshufb can have its
+ two V2TImode halves swapped.  */
+ m128 = GEN_INT (-128);
+ for (i = 0; i < 32; ++i)
+ {
+ rperm[0][i] = m128;
+ rperm[1][i] = m128;
+ rperm[2][i] = m128;
+ rperm[3][i] = m128;
+ }
+ used[0] = false;
+ used[1] = false;
+ used[2] = false;
+ used[3] = false;
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+ unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
+ unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
+
+ for (j = 0; j < eltsz; ++j)
+ rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
+ used[which] = true;
+ }
+
+ for (i = 0; i < 2; ++i)
+ {
+ if (!used[2 * i + 1])
+ {
+ h[i] = NULL_RTX;
+ continue;
+ }
+ vperm = gen_rtx_CONST_VECTOR (V32QImode,
+ gen_rtvec_v (32, rperm[2 * i + 1]));
+ vperm = force_reg (V32QImode, vperm);
+ h[i] = gen_reg_rtx (V32QImode);
+ op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
+ emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
+ }
+
+  /* Swap the 128-bit lanes of h[X].  */
+ for (i = 0; i < 2; ++i)
+ {
+ if (h[i] == NULL_RTX)
+ continue;
+ op = gen_reg_rtx (V4DImode);
+ emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
+ const2_rtx, GEN_INT (3), const0_rtx,
+ const1_rtx));
+ h[i] = gen_lowpart (V32QImode, op);
+ }
+
+ for (i = 0; i < 2; ++i)
+ {
+ if (!used[2 * i])
+ {
+ l[i] = NULL_RTX;
+ continue;
+ }
+ vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
+ vperm = force_reg (V32QImode, vperm);
+ l[i] = gen_reg_rtx (V32QImode);
+ op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
+ emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
+ }
+
+ for (i = 0; i < 2; ++i)
+ {
+ if (h[i] && l[i])
+ {
+ op = gen_reg_rtx (V32QImode);
+ emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
+ l[i] = op;
+ }
+ else if (h[i])
+ l[i] = h[i];
+ }
+
+ gcc_assert (l[0] && l[1]);
+ op = gen_lowpart (V32QImode, d->target);
+ emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
+ return true;
+}
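To summarize the mask indexing in expand_vec_perm_vpshufb4_vpermq2 (a reading aid, not part of the patch): each destination byte picks one of the four pshufb masks via which == (from op1 ? 2 : 0) + (crosses a 128-bit lane ? 1 : 0). Masks 0 and 2 drive the in-lane vpshufbs (l[0], l[1]); masks 1 and 3 build their bytes at lane-swapped positions so that one vpermq per used mask can swap the lanes back (h[0], h[1]), and the vpor tree then combines whichever of the up-to-four intermediates exist.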
+
/* The guts of ix86_expand_vec_perm_builtin, also used by the ok hook.
With all of the interface bits taken care of, perform the expansion
in D and return true on success. */
@@ -36043,6 +36420,9 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_broadcast (d))
return true;
+ if (expand_vec_perm_vpermq_perm_1 (d))
+ return true;
+
/* Try sequences of three instructions. */
if (expand_vec_perm_pshufb2 (d))
@@ -36072,6 +36452,10 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_even_odd (d))
return true;
+ /* Even longer sequences. */
+ if (expand_vec_perm_vpshufb4_vpermq2 (d))
+ return true;
+
return false;
}
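A hypothetical testcase for the new four-vpshufb path, assuming GCC 4.7-era __builtin_shuffle, -mavx2, and an illustrative selector: a two-operand V32QImode permutation that mixes both operands and both 128-bit lanes, which none of the shorter sequences handles, should now be expandable via expand_vec_perm_vpshufb4_vpermq2.

typedef char v32qi __attribute__ ((vector_size (32)));

v32qi
mix (v32qi a, v32qi b)
{
  /* Values >= 32 select from b; several entries cross a 128-bit lane.  */
  v32qi sel = {  0, 33, 18, 51,  4, 37, 22, 55,
                 8, 41, 26, 59, 12, 45, 30, 63,
                16, 49,  2, 35, 20, 53,  6, 39,
                24, 57, 10, 43, 28, 61, 14, 47 };
  return __builtin_shuffle (a, b, sel);
}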