-rw-r--r-- | gcc/config/aarch64/aarch64.c              |   7
-rw-r--r-- | gcc/config/arm/arm.c                      |   9
-rw-r--r-- | gcc/config/gcn/gcn.c                      |   7
-rw-r--r-- | gcc/config/i386/i386-expand.c             |  26
-rw-r--r-- | gcc/config/i386/sse.md                    | 102
-rw-r--r-- | gcc/config/ia64/ia64.c                    |   9
-rw-r--r-- | gcc/config/mips/mips.c                    |   9
-rw-r--r-- | gcc/config/rs6000/rs6000.c                |  10
-rw-r--r-- | gcc/config/sparc/sparc.c                  |   6
-rw-r--r-- | gcc/optabs.c                              |   8
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr95905-2.c |  42
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr95905-3.c |  82
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr95905-4.c |  82
13 files changed, 388 insertions, 11 deletions
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 7536b75..c19dc6c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -21084,8 +21084,11 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
   d.vmode = vmode;
   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
   d.target = target;
-  d.op0 = op0;
-  d.op1 = op1;
+  d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
+  if (op0 == op1)
+    d.op1 = d.op0;
+  else
+    d.op1 = op1 ? force_reg (vmode, op1) : NULL_RTX;
   d.testing_p = !target;

   if (!d.testing_p)
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index c8e2571..bebccc1 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -31482,6 +31482,15 @@ arm_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, rtx op1,
     return false;

   d.target = target;
+  if (op0)
+    {
+      rtx nop0 = force_reg (vmode, op0);
+      if (op0 == op1)
+        op1 = nop0;
+      op0 = nop0;
+    }
+  if (op1)
+    op1 = force_reg (vmode, op1);
   d.op0 = op0;
   d.op1 = op1;

diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
index b08f4b3..3b1762e 100644
--- a/gcc/config/gcn/gcn.c
+++ b/gcc/config/gcn/gcn.c
@@ -3982,13 +3982,14 @@ gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst,
   for (unsigned int i = 0; i < nelt; ++i)
     perm[i] = sel[i] & (2 * nelt - 1);

+  src0 = force_reg (vmode, src0);
+  src1 = force_reg (vmode, src1);
+
   /* Make life a bit easier by swapping operands if necessary so that
      the first element always comes from src0.  */
   if (perm[0] >= nelt)
     {
-      rtx temp = src0;
-      src0 = src1;
-      src1 = temp;
+      std::swap (src0, src1);

       for (unsigned int i = 0; i < nelt; ++i)
         if (perm[i] < nelt)
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index d793e5a..280645f 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -19929,6 +19929,32 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,

   two_args = canonicalize_perm (&d);

+  /* If one of the operands is a zero vector, try to match pmovzx.  */
+  if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
+    {
+      struct expand_vec_perm_d dzero = d;
+      if (d.op0 == CONST0_RTX (vmode))
+        {
+          d.op1 = dzero.op1 = force_reg (vmode, d.op1);
+          std::swap (dzero.op0, dzero.op1);
+          for (i = 0; i < nelt; ++i)
+            dzero.perm[i] ^= nelt;
+        }
+      else
+        d.op0 = dzero.op0 = force_reg (vmode, d.op0);
+
+      if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
+                                  dzero.perm, nelt, dzero.testing_p))
+        return true;
+    }
+
+  /* Force operands into registers.  */
+  rtx nop0 = force_reg (vmode, d.op0);
+  if (d.op0 == d.op1)
+    d.op1 = nop0;
+  d.op0 = nop0;
+  d.op1 = force_reg (vmode, d.op1);
+
   if (ix86_expand_vec_perm_const_1 (&d))
     return true;

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2a260c1c..7f03fc4 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17611,6 +17611,23 @@
    (set_attr "prefix" "maybe_evex")
    (set_attr "mode" "OI")])

+(define_insn_and_split "*avx2_zero_extendv16qiv16hi2_1"
+  [(set (match_operand:V32QI 0 "register_operand" "=v")
+        (vec_select:V32QI
+          (vec_concat:V64QI
+            (match_operand:V32QI 1 "nonimmediate_operand" "vm")
+            (match_operand:V32QI 2 "const0_operand" "C"))
+          (match_parallel 3 "pmovzx_parallel"
+            [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V16HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16HImode, operands[0], V32QImode);
+  operands[1] = lowpart_subreg (V16QImode, operands[1], V32QImode);
+})
+
 (define_expand "<insn>v16qiv16hi2"
   [(set (match_operand:V16HI 0 "register_operand")
         (any_extend:V16HI
@@ -17628,6 +17645,23 @@
    (set_attr "prefix" "evex")
    (set_attr "mode" "XI")])

+(define_insn_and_split "*avx512bw_zero_extendv32qiv32hi2_1"
+  [(set (match_operand:V64QI 0 "register_operand" "=v")
+        (vec_select:V64QI
+          (vec_concat:V128QI
+            (match_operand:V64QI 1 "nonimmediate_operand" "vm")
+            (match_operand:V64QI 2 "const0_operand" "C"))
+          (match_parallel 3 "pmovzx_parallel"
+            [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX512BW"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V32HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V32HImode, operands[0], V64QImode);
+  operands[1] = lowpart_subreg (V32QImode, operands[1], V64QImode);
+})
+
 (define_expand "<insn>v32qiv32hi2"
   [(set (match_operand:V32HI 0 "register_operand")
         (any_extend:V32HI
@@ -17883,6 +17917,23 @@
           (match_operand:V16HI 1 "nonimmediate_operand")))]
   "TARGET_AVX512F")

+(define_insn_and_split "avx512f_zero_extendv16hiv16si2_1"
+  [(set (match_operand:V32HI 0 "register_operand" "=v")
+        (vec_select:V32HI
+          (vec_concat:V64HI
+            (match_operand:V32HI 1 "nonimmediate_operand" "vm")
+            (match_operand:V32HI 2 "const0_operand" "C"))
+          (match_parallel 3 "pmovzx_parallel"
+            [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX512F"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V16SI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16SImode, operands[0], V32HImode);
+  operands[1] = lowpart_subreg (V16HImode, operands[1], V32HImode);
+})
+
 (define_insn "avx2_<code>v8hiv8si2<mask_name>"
   [(set (match_operand:V8SI 0 "register_operand" "=v")
         (any_extend:V8SI
@@ -17900,6 +17951,23 @@
           (match_operand:V8HI 1 "nonimmediate_operand")))]
   "TARGET_AVX2")

+(define_insn_and_split "avx2_zero_extendv8hiv8si2_1"
+  [(set (match_operand:V16HI 0 "register_operand" "=v")
+        (vec_select:V16HI
+          (vec_concat:V32HI
+            (match_operand:V16HI 1 "nonimmediate_operand" "vm")
+            (match_operand:V16HI 2 "const0_operand" "C"))
+          (match_parallel 3 "pmovzx_parallel"
+            [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V8SI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V8SImode, operands[0], V16HImode);
+  operands[1] = lowpart_subreg (V8HImode, operands[1], V16HImode);
+})
+
 (define_insn "sse4_1_<code>v4hiv4si2<mask_name>"
   [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
         (any_extend:V4SI
@@ -18275,6 +18343,23 @@
    (set_attr "prefix" "evex")
    (set_attr "mode" "XI")])

+(define_insn_and_split "*avx512f_zero_extendv8siv8di2_1"
+  [(set (match_operand:V16SI 0 "register_operand" "=v")
+        (vec_select:V16SI
+          (vec_concat:V32SI
+            (match_operand:V16SI 1 "nonimmediate_operand" "vm")
+            (match_operand:V16SI 2 "const0_operand" "C"))
+          (match_parallel 3 "pmovzx_parallel"
+            [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX512F"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V8DI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V8DImode, operands[0], V16SImode);
+  operands[1] = lowpart_subreg (V8SImode, operands[1], V16SImode);
+})
+
 (define_expand "<insn>v8siv8di2"
   [(set (match_operand:V8DI 0 "register_operand" "=v")
         (any_extend:V8DI
@@ -18292,6 +18377,23 @@
    (set_attr "prefix_extra" "1")
    (set_attr "mode" "OI")])

+(define_insn_and_split "*avx2_zero_extendv4siv4di2_1"
+  [(set (match_operand:V8SI 0 "register_operand" "=v")
+        (vec_select:V8SI
+          (vec_concat:V16SI
+            (match_operand:V8SI 1 "nonimmediate_operand" "vm")
+            (match_operand:V8SI 2 "const0_operand" "C"))
+          (match_parallel 3 "pmovzx_parallel"
+            [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V4DI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V4DImode, operands[0], V8SImode);
+  operands[1] = lowpart_subreg (V4SImode, operands[1], V8SImode);
+})
+
 (define_expand "<insn>v4siv4di2"
   [(set (match_operand:V4DI 0 "register_operand" "=v")
         (any_extend:V4DI
diff --git a/gcc/config/ia64/ia64.c b/gcc/config/ia64/ia64.c
index 8ddacce..f1a6de1 100644
--- a/gcc/config/ia64/ia64.c
+++ b/gcc/config/ia64/ia64.c
@@ -11759,6 +11759,15 @@ ia64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
   unsigned int i, nelt, which;

   d.target = target;
+  if (op0)
+    {
+      rtx nop0 = force_reg (vmode, op0);
+      if (op0 == op1)
+        op1 = nop0;
+      op0 = nop0;
+    }
+  if (op1)
+    op1 = force_reg (vmode, op1);
   d.op0 = op0;
   d.op1 = op1;

diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index 8556ebb..ebb04b7 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -21624,6 +21624,15 @@ mips_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
   bool ok;

   d.target = target;
+  if (op0)
+    {
+      rtx nop0 = force_reg (vmode, op0);
+      if (op0 == op1)
+        op1 = nop0;
+      op0 = nop0;
+    }
+  if (op1)
+    op1 = force_reg (vmode, op1);
   d.op0 = op0;
   d.op1 = op1;

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 67681d1..b9e90ae 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -22946,6 +22946,16 @@ rs6000_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
   if (TARGET_ALTIVEC && testing_p)
     return true;

+  if (op0)
+    {
+      rtx nop0 = force_reg (vmode, op0);
+      if (op0 == op1)
+        op1 = nop0;
+      op0 = nop0;
+    }
+  if (op1)
+    op1 = force_reg (vmode, op1);
+
   /* Check for ps_merge* or xxpermdi insns.  */
   if ((vmode == V2DFmode || vmode == V2DImode) && VECTOR_MEM_VSX_P (vmode))
     {
diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c
index 8a5a269..f355793 100644
--- a/gcc/config/sparc/sparc.c
+++ b/gcc/config/sparc/sparc.c
@@ -12942,6 +12942,12 @@ sparc_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
   if (vmode != V8QImode)
     return false;

+  rtx nop0 = force_reg (vmode, op0);
+  if (op0 == op1)
+    op1 = nop0;
+  op0 = nop0;
+  op1 = force_reg (vmode, op1);
+
   unsigned int i, mask;
   for (i = mask = 0; i < 8; ++i)
     mask |= (sel[i] & 0xf) << (28 - i*4);
diff --git a/gcc/optabs.c b/gcc/optabs.c
index 6f671fd..f4614a3 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -6070,11 +6070,8 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,

   if (targetm.vectorize.vec_perm_const != NULL)
     {
-      v0 = force_reg (mode, v0);
       if (single_arg_p)
         v1 = v0;
-      else
-        v1 = force_reg (mode, v1);

       if (targetm.vectorize.vec_perm_const (mode, target, v0, v1, indices))
         return target;
@@ -6095,6 +6092,11 @@
       return gen_lowpart (mode, target_qi);
     }

+  v0 = force_reg (mode, v0);
+  if (single_arg_p)
+    v1 = v0;
+  v1 = force_reg (mode, v1);
+
   /* Otherwise expand as a fully variable permuation.  */

   /* The optabs are only defined for selectors with the same width
diff --git a/gcc/testsuite/gcc.target/i386/pr95905-2.c b/gcc/testsuite/gcc.target/i386/pr95905-2.c
index 7cd20a3..231335c 100644
--- a/gcc/testsuite/gcc.target/i386/pr95905-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr95905-2.c
@@ -1,9 +1,9 @@
 /* PR target/95905 */
 /* { dg-do compile } */
 /* { dg-options "-O2 -msse4.1" } */
-/* { dg-final { scan-assembler "\tv?pmovzxbw\t" } } */
-/* { dg-final { scan-assembler "\tv?pmovzxwd\t" } } */
-/* { dg-final { scan-assembler "\tv?pmovzxdq\t" } } */
+/* { dg-final { scan-assembler-times "\tv?pmovzxbw\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tv?pmovzxwd\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tv?pmovzxdq\t" 4 } } */
 typedef unsigned char V1 __attribute__((vector_size (16)));
 typedef unsigned short V2 __attribute__((vector_size (16)));
 typedef unsigned int V3 __attribute__((vector_size (16)));
@@ -44,3 +44,39 @@ f6 (V3 *x)
 {
   return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 4, 1, 5 });
 }
+
+V1
+f7 (V1 x)
+{
+  return __builtin_shuffle ((V1) {}, x, (V1) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V2
+f8 (V2 x)
+{
+  return __builtin_shuffle ((V2) {}, x, (V2) { 8, 0, 9, 1, 10, 2, 11, 3 });
+}
+
+V3
+f9 (V3 x)
+{
+  return __builtin_shuffle ((V3) {}, x, (V3) { 4, 0, 5, 1 });
+}
+
+V1
+f10 (V1 *x)
+{
+  return __builtin_shuffle ((V1) {}, *x, (V1) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V2
+f11 (V2 *x)
+{
+  return __builtin_shuffle ((V2) {}, *x, (V2) { 8, 0, 9, 1, 10, 2, 11, 3 });
+}
+
+V3
+f12 (V3 *x)
+{
+  return __builtin_shuffle ((V3) {}, *x, (V3) { 4, 0, 5, 1 });
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr95905-3.c b/gcc/testsuite/gcc.target/i386/pr95905-3.c
new file mode 100644
index 0000000..b7b4bc5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95905-3.c
@@ -0,0 +1,82 @@
+/* PR target/95905 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2" } */
+/* { dg-final { scan-assembler-times "\tvpmovzxbw\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tvpmovzxwd\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tvpmovzxdq\t" 4 } } */
+
+typedef unsigned char V1 __attribute__((vector_size (32)));
+typedef unsigned short V2 __attribute__((vector_size (32)));
+typedef unsigned int V3 __attribute__((vector_size (32)));
+
+V1
+f1 (V1 x)
+{
+  return __builtin_shuffle (x, (V1) {}, (V1) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
+}
+
+V2
+f2 (V2 x)
+{
+  return __builtin_shuffle (x, (V2) {}, (V2) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V3
+f3 (V3 x)
+{
+  return __builtin_shuffle (x, (V3) {}, (V3) { 0, 8, 1, 9, 2, 10, 3, 11 });
+}
+
+V1
+f4 (V1 *x)
+{
+  return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
+}
+
+V2
+f5 (V2 *x)
+{
+  return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V3
+f6 (V3 *x)
+{
+  return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 8, 1, 9, 2, 10, 3, 11 });
+}
+
+V1
+f7 (V1 x)
+{
+  return __builtin_shuffle ((V1) {}, x, (V1) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
+}
+
+V2
+f8 (V2 x)
+{
+  return __builtin_shuffle ((V2) {}, x, (V2) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V3
+f9 (V3 x)
+{
+  return __builtin_shuffle ((V3) {}, x, (V3) { 8, 0, 9, 1, 10, 2, 11, 3 });
+}
+
+V1
+f10 (V1 *x)
+{
+  return __builtin_shuffle ((V1) {}, *x, (V1) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
+}
+
+V2
+f11 (V2 *x)
+{
+  return __builtin_shuffle ((V2) {}, *x, (V2) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V3
+f12 (V3 *x)
+{
+  return __builtin_shuffle ((V3) {}, *x, (V3) { 8, 0, 9, 1, 10, 2, 11, 3 });
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr95905-4.c b/gcc/testsuite/gcc.target/i386/pr95905-4.c
new file mode 100644
index 0000000..43cdf7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95905-4.c
@@ -0,0 +1,82 @@
+/* PR target/95905 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw" } */
+/* { dg-final { scan-assembler-times "\tvpmovzxbw\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tvpmovzxwd\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tvpmovzxdq\t" 4 } } */
+
+typedef unsigned char V1 __attribute__((vector_size (64)));
+typedef unsigned short V2 __attribute__((vector_size (64)));
+typedef unsigned int V3 __attribute__((vector_size (64)));
+
+V1
+f1 (V1 x)
+{
+  return __builtin_shuffle (x, (V1) {}, (V1) { 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 });
+}
+
+V2
+f2 (V2 x)
+{
+  return __builtin_shuffle (x, (V2) {}, (V2) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
+}
+
+V3
+f3 (V3 x)
+{
+  return __builtin_shuffle (x, (V3) {}, (V3) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V1
+f4 (V1 *x)
+{
+  return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 });
+}
+
+V2
+f5 (V2 *x)
+{
+  return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
+}
+
+V3
+f6 (V3 *x)
+{
+  return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V1
+f7 (V1 x)
+{
+  return __builtin_shuffle ((V1) {}, x, (V1) { 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 });
+}
+
+V2
+f8 (V2 x)
+{
+  return __builtin_shuffle ((V2) {}, x, (V2) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
+}
+
+V3
+f9 (V3 x)
+{
+  return __builtin_shuffle ((V3) {}, x, (V3) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V1
+f10 (V1 *x)
+{
+  return __builtin_shuffle ((V1) {}, *x, (V1) { 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 });
+}
+
+V2
+f11 (V2 *x)
+{
+  return __builtin_shuffle ((V2) {}, *x, (V2) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
+}
+
+V3
+f12 (V3 *x)
+{
+  return __builtin_shuffle ((V3) {}, *x, (V3) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
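
For illustration only (not part of the patch): the testcases above exercise the new matching by interleaving a vector with a zero vector, which produces the same bit pattern as a widening zero extension of the low half. A minimal standalone sketch of that shape, with a hypothetical function name and assuming an x86 target built with -O2 -msse4.1, would be:

/* Illustrative sketch, not part of the patch.  { x[0], 0, x[1], 0, x[2], 0,
   x[3], 0 } has the same bits (on little-endian x86) as zero-extending the
   low four 16-bit lanes to 32 bits, so with the patterns above this shuffle
   can be emitted as a single pmovzxwd instead of a generic permutation.  */
typedef unsigned short V __attribute__((vector_size (16)));

V
zext_lo_halves (V x)
{
  return __builtin_shuffle (x, (V) {}, (V) { 0, 8, 1, 9, 2, 10, 3, 11 });
}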