diff options
author | Artjoms Sinkarovs <artyom.shinkaroff@gmail.com> | 2011-10-03 08:13:26 -0700 |
---|---|---|
committer | Richard Henderson <rth@gcc.gnu.org> | 2011-10-03 08:13:26 -0700 |
commit | f90e8e2eae9a83d22efd7922673116a97ebf5290 (patch) | |
tree | f49377e311033773555fb0a2471ab04986fefafe /gcc/config | |
parent | e4a5b262e7bc64b22a34ada24b5d83b6c13dbe40 (diff) | |
download | gcc-f90e8e2eae9a83d22efd7922673116a97ebf5290.zip gcc-f90e8e2eae9a83d22efd7922673116a97ebf5290.tar.gz gcc-f90e8e2eae9a83d22efd7922673116a97ebf5290.tar.bz2 |
Vector shuffling patch from Artem Shinkarov.
From-SVN: r179462
Diffstat (limited to 'gcc/config')
-rw-r--r-- | gcc/config/i386/i386-protos.h | 1 | ||||
-rw-r--r-- | gcc/config/i386/i386.c | 154 | ||||
-rw-r--r-- | gcc/config/i386/sse.md | 19 |
3 files changed, 169 insertions, 5 deletions
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 707f217..99327ed 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -123,6 +123,7 @@ extern bool ix86_expand_int_movcc (rtx[]); extern bool ix86_expand_fp_movcc (rtx[]); extern bool ix86_expand_fp_vcond (rtx[]); extern bool ix86_expand_int_vcond (rtx[]); +extern bool ix86_expand_vshuffle (rtx[]); extern void ix86_expand_sse_unpack (rtx[], bool, bool); extern bool ix86_expand_int_addcc (rtx[]); extern rtx ix86_expand_call (rtx, rtx, rtx, rtx, rtx, bool); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 7e89dbd..9b079af 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -19254,6 +19254,147 @@ ix86_expand_int_vcond (rtx operands[]) return true; } +bool +ix86_expand_vshuffle (rtx operands[]) +{ /* Expand a variable vector shuffle. OPERANDS[0] is the destination, OPERANDS[1] and OPERANDS[2] are the input vectors and OPERANDS[3] is the selector mask. Requires SSSE3 or AVX and a 128-bit vector mode (asserted below). Returns true on the one-operand paths and the result of ix86_expand_int_vcond on the two-operand paths. */ + rtx target = operands[0]; + rtx op0 = operands[1]; + rtx op1 = operands[2]; + rtx mask = operands[3]; + rtx new_mask, vt, t1, t2, w_vector; + enum machine_mode mode = GET_MODE (op0); + enum machine_mode maskmode = GET_MODE (mask); + enum machine_mode maskinner = GET_MODE_INNER (mode); + rtx vec[16]; + int w, i, j; + bool one_operand_shuffle = op0 == op1; + + gcc_assert ((TARGET_SSSE3 || TARGET_AVX) && GET_MODE_BITSIZE (mode) == 128); + + /* Number of elements in the vector. */ + w = GET_MODE_BITSIZE (maskmode) / GET_MODE_BITSIZE (maskinner); + + /* generate w_vector = {w, w, ...} */ + for (i = 0; i < w; i++) + vec[i] = GEN_INT (w); + w_vector = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); + + /* mask = mask & {w-1, w-1, w-1,...} */ + for (i = 0; i < w; i++) + vec[i] = GEN_INT (w - 1); + + vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); + new_mask = expand_simple_binop (maskmode, AND, mask, vt, + NULL_RTX, 0, OPTAB_DIRECT); + + /* If the original vector mode is V16QImode, we can just + use pshufb directly. 
*/ + if (mode == V16QImode && one_operand_shuffle) + { + t1 = gen_reg_rtx (V16QImode); + emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, new_mask)); + emit_insn (gen_rtx_SET (VOIDmode, target, t1)); + return true; + } + else if (mode == V16QImode) + { + rtx xops[6]; + + t1 = gen_reg_rtx (V16QImode); + t2 = gen_reg_rtx (V16QImode); + emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, new_mask)); + emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, new_mask)); + + /* mask = mask & {w, w, ...} */ + mask = expand_simple_binop (V16QImode, AND, mask, w_vector, + NULL_RTX, 0, OPTAB_DIRECT); + xops[0] = target; + xops[1] = operands[1]; + xops[2] = operands[2]; + xops[3] = gen_rtx_EQ (mode, mask, w_vector); + xops[4] = t1; + xops[5] = t2; /* NOTE(review): the EQ in xops[3] names mask/w_vector while xops[4]/xops[5] carry the two shuffled vectors and xops[1]/xops[2] the unshuffled inputs; confirm this matches the operand layout ix86_expand_int_vcond expects. */ + + return ix86_expand_int_vcond (xops); + } + + /* mask = mask * {w, w, ...}. NOTE(review): the pshufb selector built below needs byte offsets, i.e. index * (16/w); scaling by w equals that only when w == 4 -- confirm for the other element widths. */ + new_mask = expand_simple_binop (maskmode, MULT, new_mask, w_vector, + NULL_RTX, 0, OPTAB_DIRECT); + + /* Convert mask to vector of chars. */ + new_mask = simplify_gen_subreg (V16QImode, new_mask, maskmode, 0); + new_mask = force_reg (V16QImode, new_mask); + + /* Build a helper mask which we will use in pshufb; element i owns the 16/w bytes starting at byte i*(16/w) + (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12} + (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...} + ... */ + for (i = 0; i < w; i++) + for (j = 0; j < 16/w; j++) + vec[i*(16/w)+j] = GEN_INT (i*16/w); + vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); + vt = force_reg (V16QImode, vt); + + t1 = gen_reg_rtx (V16QImode); + emit_insn (gen_ssse3_pshufbv16qi3 (t1, new_mask, vt)); + new_mask = t1; + + /* Convert it into the byte positions by doing + new_mask = new_mask + {0,1,..,16/w-1, 0,1,..,16/w-1, ...} */ + for (i = 0; i < w; i++) + for (j = 0; j < 16/w; j++) + vec[i*(16/w)+j] = GEN_INT (j); + + vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); + new_mask = expand_simple_binop (V16QImode, PLUS, new_mask, vt, + NULL_RTX, 0, OPTAB_DIRECT); + + t1 = gen_reg_rtx (V16QImode); + + /* Convert OP0 to vector of chars. 
*/ + op0 = simplify_gen_subreg (V16QImode, op0, mode, 0); + op0 = force_reg (V16QImode, op0); + emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, new_mask)); + + if (one_operand_shuffle) + { + /* Convert it back from vector of chars to the original mode. */ + t1 = simplify_gen_subreg (mode, t1, V16QImode, 0); + emit_insn (gen_rtx_SET (VOIDmode, target, t1)); + return true; + } + else + { + rtx xops[6]; + + t2 = gen_reg_rtx (V16QImode); + + /* Convert OP1 to vector of chars. */ + op1 = simplify_gen_subreg (V16QImode, op1, mode, 0); + op1 = force_reg (V16QImode, op1); + emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, new_mask)); /* The OP1 shuffle goes to T2; T1 already holds the OP0 shuffle. */ + + /* mask = mask & {w, w, ...}; MASK and W_VECTOR are both in maskmode. */ + mask = expand_simple_binop (maskmode, AND, mask, w_vector, + NULL_RTX, 0, OPTAB_DIRECT); + + t1 = simplify_gen_subreg (mode, t1, V16QImode, 0); + t2 = simplify_gen_subreg (mode, t2, V16QImode, 0); + + xops[0] = target; + xops[1] = operands[1]; + xops[2] = operands[2]; + xops[3] = gen_rtx_EQ (mode, mask, w_vector); + xops[4] = t1; + xops[5] = t2; /* NOTE(review): same operand-layout question as in the V16QImode branch above. */ + + return ix86_expand_int_vcond (xops); + } + + return false; +} + /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is true if we should do zero extension, else sign extension. HIGH_P is true if we want the N/2 high elements, else the low elements. */ @@ -31472,6 +31613,9 @@ struct expand_vec_perm_d static bool expand_vec_perm_1 (struct expand_vec_perm_d *d); static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d); +static int extract_vec_perm_cst (struct expand_vec_perm_d *, tree); +static bool ix86_vectorize_builtin_vec_perm_ok (tree vec_type, tree mask); + /* Get a vector mode of the same size as the original but with elements twice as wide. This is only guaranteed to apply to integral vectors. 
*/ @@ -33103,7 +33247,7 @@ void ix86_emit_i387_round (rtx op0, rtx op1) res = gen_reg_rtx (outmode); half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode); - + /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */ /* scratch = fxam(op1) */ @@ -35262,10 +35406,10 @@ ix86_vectorize_builtin_vec_perm_ok (tree vec_type, tree mask) vec_mask = extract_vec_perm_cst (&d, mask); - /* This hook is cannot be called in response to something that the - user does (unlike the builtin expander) so we shouldn't ever see - an error generated from the extract. */ - gcc_assert (vec_mask > 0 && vec_mask <= 3); + /* Check whether the mask can be applied to the vector type; only the values 1..3 that the former assert accepted are usable. */ + if (vec_mask <= 0 || vec_mask > 3) + return false; + one_vec = (vec_mask != 3); /* Implementable with shufps or pshufd. */ diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index baa22a6..251cdde 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -230,6 +230,12 @@ (V4SF "V4SF") (V2DF "V2DF") (TI "TI")]) +;; Mapping of 128bit vector modes to the integer vector mode of the shuffle mask +(define_mode_attr sseshuffint + [(V16QI "V16QI") (V8HI "V8HI") + (V4SI "V4SI") (V2DI "V2DI") + (V4SF "V4SI") (V2DF "V2DI")]) + ;; Mapping of vector float modes to an integer mode of the same size (define_mode_attr sseintvecmode [(V8SF "V8SI") (V4DF "V4DI") @@ -6216,6 +6222,19 @@ DONE; }) +(define_expand "vshuffle<mode>" + [(match_operand:V_128 0 "register_operand" "") + (match_operand:V_128 1 "register_operand" "") + (match_operand:V_128 2 "register_operand" "") + (match_operand:<sseshuffint> 3 "register_operand" "")] + "TARGET_SSSE3 || TARGET_AVX" +{ + bool ok = ix86_expand_vshuffle (operands); + gcc_assert (ok); + DONE; +}) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel bitwise logical operations |