 gcc/config/riscv/riscv-protos.h                               |   2 +-
 gcc/config/riscv/riscv-v.cc                                   | 148 +++++++++-
 gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-interleave-1.c |  17 +
 gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-interleave-2.c |  18 +
 gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-interleave-3.c |  19 +
 gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-interleave-4.c |  19 +
 6 files changed, 211 insertions(+), 12 deletions(-)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index bfbd2bf..a6f204f 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -543,7 +543,7 @@ void expand_tuple_move (rtx *);
 bool expand_block_move (rtx, rtx, rtx);
 machine_mode preferred_simd_mode (scalar_mode);
 machine_mode get_mask_mode (machine_mode);
-void expand_vec_series (rtx, rtx, rtx);
+void expand_vec_series (rtx, rtx, rtx, rtx = 0);
 void expand_vec_init (rtx, rtx);
 void expand_vec_perm (rtx, rtx, rtx, rtx);
 void expand_select_vl (rtx *);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 71cb756..9b99d0a 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -432,6 +432,7 @@ public:
   bool single_step_npatterns_p () const;
   bool npatterns_all_equal_p () const;
+  bool interleaved_stepped_npatterns_p () const;
 
   machine_mode new_mode () const { return m_new_mode; }
   scalar_mode inner_mode () const { return m_inner_mode; }
@@ -668,6 +669,27 @@ rvv_builder::single_step_npatterns_p () const
   return true;
 }
 
+/* Return true if the permutation consists of two
+   interleaved patterns with a constant step each.
+   TODO: We currently only support NPATTERNS = 2.  */
+bool
+rvv_builder::interleaved_stepped_npatterns_p () const
+{
+  if (npatterns () != 2 || nelts_per_pattern () != 3)
+    return false;
+  for (unsigned int i = 0; i < npatterns (); i++)
+    {
+      poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
+      poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
+      poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
+      poly_int64 diff1 = ele1 - ele0;
+      poly_int64 diff2 = ele2 - ele1;
+      if (maybe_ne (diff1, diff2))
+        return false;
+    }
+  return true;
+}
+
 /* Return true if all elements of NPATTERNS are equal.
 
    E.g. NPATTERNS = 4:
@@ -955,10 +977,15 @@ get_repeating_sequence_dup_machine_mode (const rvv_builder &builder,
   return get_vector_mode (inner_mode, dup_nunit).require ();
 }
 
-/* Expand series const vector.  */
+/* Expand series const vector.  If VID is NULL_RTX, we use vid.v
+   instructions to generate the sequence for VID:
+
+     VID = { 0, 1, 2, 3, ... }
+
+   Otherwise, we use the VID argument directly.  */
 
 void
-expand_vec_series (rtx dest, rtx base, rtx step)
+expand_vec_series (rtx dest, rtx base, rtx step, rtx vid)
 {
   machine_mode mode = GET_MODE (dest);
   poly_int64 nunits_m1 = GET_MODE_NUNITS (mode) - 1;
@@ -968,14 +995,18 @@ expand_vec_series (rtx dest, rtx base, rtx step)
 
   /* VECT_IV = BASE + I * STEP.  */
 
   /* Step 1: Generate I = { 0, 1, 2, ... } by vid.v.  */
-  rtx vid = gen_reg_rtx (mode);
-  rtx op[] = {vid};
-  emit_vlmax_insn (code_for_pred_series (mode), NULLARY_OP, op);
+  bool reverse_p = !vid && rtx_equal_p (step, constm1_rtx)
+                   && poly_int_rtx_p (base, &value)
+                   && known_eq (nunits_m1, value);
+  if (!vid)
+    {
+      vid = gen_reg_rtx (mode);
+      rtx op[] = {vid};
+      emit_vlmax_insn (code_for_pred_series (mode), NULLARY_OP, op);
+    }
 
   rtx step_adj;
-  if (rtx_equal_p (step, constm1_rtx)
-      && poly_int_rtx_p (base, &value)
-      && known_eq (nunits_m1, value))
+  if (reverse_p)
     {
       /* Special case: {nunits - 1, nunits - 2, ... , 0}.
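
For reference, the shape the new predicate accepts can be modelled outside
the compiler.  The following is a minimal host-side sketch, not compiler
code: int64_t stands in for poly_int64, a flat array stands in for the
builder (indexed the same way elt () is indexed above, pattern-major), and
the name interleaved_stepped_p is hypothetical.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Host-side model of rvv_builder::interleaved_stepped_npatterns_p ():
   elt[j * npatterns + i] is step j of pattern i.  */
static bool
interleaved_stepped_p (const int64_t *elt, unsigned npatterns,
                       unsigned nelts_per_pattern)
{
  if (npatterns != 2 || nelts_per_pattern != 3)
    return false;
  for (unsigned i = 0; i < npatterns; i++)
    {
      int64_t ele0 = elt[i];
      int64_t ele1 = elt[npatterns + i];
      int64_t ele2 = elt[npatterns * 2 + i];
      if (ele1 - ele0 != ele2 - ele1)  /* Each pattern's step must be constant.  */
        return false;
    }
  return true;
}

int
main (void)
{
  /* { 200, 100, 201, 103, 202, 106 }: pattern 0 steps by 1, pattern 1
     by 3, so the predicate accepts (cf. slp-interleave-3.c below).  */
  int64_t ok[] = { 200, 100, 201, 103, 202, 106 };
  /* Pattern 1's step changes from 3 to 4, so it rejects.  */
  int64_t bad[] = { 200, 100, 201, 103, 202, 107 };
  printf ("%d %d\n", interleaved_stepped_p (ok, 2, 3),
          interleaved_stepped_p (bad, 2, 3));  /* Prints "1 0".  */
  return 0;
}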
@@ -1246,13 +1277,108 @@ expand_const_vector (rtx target, rtx src)
                                          BINARY_OP, add_ops);
             }
         }
+      else if (builder.interleaved_stepped_npatterns_p ())
+        {
+          rtx base1 = builder.elt (0);
+          rtx base2 = builder.elt (1);
+          poly_int64 step1
+            = rtx_to_poly_int64 (builder.elt (builder.npatterns ()))
+              - rtx_to_poly_int64 (base1);
+          poly_int64 step2
+            = rtx_to_poly_int64 (builder.elt (builder.npatterns () + 1))
+              - rtx_to_poly_int64 (base2);
+
+          /* For { 1, 0, 2, 0, ... , n - 1, 0 }, we can use a larger EEW
+             integer vector mode to generate such a vector efficiently.
+
+             E.g. EEW = 16, { 2, 0, 4, 0, ... }
+
+             can be interpreted as:
+
+               EEW = 32, { 2, 4, ... }  */
+          unsigned int new_smode_bitsize = builder.inner_bits_size () * 2;
+          scalar_int_mode new_smode;
+          machine_mode new_mode;
+          poly_uint64 new_nunits
+            = exact_div (GET_MODE_NUNITS (builder.mode ()), 2);
+          if (int_mode_for_size (new_smode_bitsize, 0).exists (&new_smode)
+              && get_vector_mode (new_smode, new_nunits).exists (&new_mode))
+            {
+              rtx tmp = gen_reg_rtx (new_mode);
+              base1 = gen_int_mode (rtx_to_poly_int64 (base1), new_smode);
+              expand_vec_series (tmp, base1, gen_int_mode (step1, new_smode));
+
+              if (rtx_equal_p (base2, const0_rtx) && known_eq (step2, 0))
+                /* { 1, 0, 2, 0, ... }.  */
+                emit_move_insn (target, gen_lowpart (mode, tmp));
+              else if (known_eq (step2, 0))
+                {
+                  /* { 1, 1, 2, 1, ... }.  */
+                  rtx scalar = expand_simple_binop (
+                    new_smode, ASHIFT,
+                    gen_int_mode (rtx_to_poly_int64 (base2), new_smode),
+                    gen_int_mode (builder.inner_bits_size (), new_smode),
+                    NULL_RTX, false, OPTAB_DIRECT);
+                  rtx tmp2 = gen_reg_rtx (new_mode);
+                  rtx and_ops[] = {tmp2, tmp, scalar};
+                  emit_vlmax_insn (code_for_pred_scalar (AND, new_mode),
+                                   BINARY_OP, and_ops);
+                  emit_move_insn (target, gen_lowpart (mode, tmp2));
+                }
+              else
+                {
+                  /* { 1, 3, 2, 6, ... }.  */
+                  rtx tmp2 = gen_reg_rtx (new_mode);
+                  base2 = gen_int_mode (rtx_to_poly_int64 (base2), new_smode);
+                  expand_vec_series (tmp2, base2,
+                                     gen_int_mode (step1, new_smode));
+                  rtx shifted_tmp2 = expand_simple_binop (
+                    new_mode, ASHIFT, tmp2,
+                    gen_int_mode (builder.inner_bits_size (), Pmode), NULL_RTX,
+                    false, OPTAB_DIRECT);
+                  rtx tmp3 = gen_reg_rtx (new_mode);
+                  rtx ior_ops[] = {tmp3, tmp, shifted_tmp2};
+                  emit_vlmax_insn (code_for_pred (IOR, new_mode), BINARY_OP,
+                                   ior_ops);
+                  emit_move_insn (target, gen_lowpart (mode, tmp3));
+                }
+            }
+          else
+            {
+              rtx vid = gen_reg_rtx (mode);
+              expand_vec_series (vid, const0_rtx, const1_rtx);
+              /* Transform into { 0, 0, 1, 1, 2, 2, ... }.  */
+              rtx shifted_vid
+                = expand_simple_binop (mode, LSHIFTRT, vid, const1_rtx,
+                                       NULL_RTX, false, OPTAB_DIRECT);
+              rtx tmp1 = gen_reg_rtx (mode);
+              rtx tmp2 = gen_reg_rtx (mode);
+              expand_vec_series (tmp1, base1,
+                                 gen_int_mode (step1, builder.inner_mode ()),
+                                 shifted_vid);
+              expand_vec_series (tmp2, base2,
+                                 gen_int_mode (step2, builder.inner_mode ()),
+                                 shifted_vid);
+
+              /* Transform into { 0, 1, 0, 1, 0, 1, ... }.  */
+              rtx and_vid = gen_reg_rtx (mode);
+              rtx and_ops[] = {and_vid, vid, const1_rtx};
+              emit_vlmax_insn (code_for_pred_scalar (AND, mode), BINARY_OP,
+                               and_ops);
+              rtx mask = gen_reg_rtx (builder.mask_mode ());
+              expand_vec_cmp (mask, EQ, and_vid, CONST1_RTX (mode));
+
+              rtx ops[] = {target, tmp1, tmp2, mask};
+              emit_vlmax_insn (code_for_pred_merge (mode), MERGE_OP, ops);
+            }
+        }
       else if (npatterns == 1 && nelts_per_pattern == 3)
         {
           /* Generate the following CONST_VECTOR:
             { base0, base1, base1 + step, base1 + step * 2, ... }  */
-          rtx base0 = CONST_VECTOR_ELT (src, 0);
-          rtx base1 = CONST_VECTOR_ELT (src, 1);
-          rtx step = CONST_VECTOR_ELT (src, 2);
+          rtx base0 = builder.elt (0);
+          rtx base1 = builder.elt (1);
+          rtx step = builder.elt (2);
           /* Step 1 - { base1, base1 + step, base1 + step * 2, ... }  */
           rtx tmp = gen_reg_rtx (mode);
           expand_vec_series (tmp, base1, step);
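
The larger-EEW path above relies on the in-register element layout: viewed
at half the EEW, each doubled-EEW lane reads back as { low half, high
half }, so one wider series already encodes two interleaved narrower
patterns.  A minimal host-side sketch follows, assuming a little-endian
host so that uint64_t/uint32_t arrays reproduce the EEW=64/EEW=32 overlay;
it shows the { x, 0, x + step, 0, ... } case and the two-series
shift-and-OR case that slp-interleave-3.c exercises.

#include <assert.h>
#include <stdint.h>
#include <string.h>

int
main (void)
{
  enum { N = 4 };               /* N EEW=64 lanes overlay 2*N EEW=32 lanes.  */
  uint64_t wide[N];
  uint32_t narrow[2 * N];

  /* { 1, 0, 2, 0, ... }: a single EEW=64 series (base 1, step 1) already
     has zeros in every high half; gen_lowpart is just the reinterpreted
     view.  */
  for (int i = 0; i < N; i++)
    wide[i] = 1 + (uint64_t) i;
  memcpy (narrow, wide, sizeof wide);
  for (int i = 0; i < N; i++)
    assert (narrow[2 * i] == 1 + (uint32_t) i && narrow[2 * i + 1] == 0);

  /* { 200, 100, 201, 103, ... } (slp-interleave-3.c): expand the second
     series as well, shift it into the high halves, and OR the two,
     analogous to the vsll.vx/vor.vv pair the test expects.  */
  for (int i = 0; i < N; i++)
    {
      uint64_t lo = 200 + 1 * (uint64_t) i;   /* base1 = 200, step1 = 1 */
      uint64_t hi = 100 + 3 * (uint64_t) i;   /* base2 = 100, step2 = 3 */
      wide[i] = lo | (hi << 32);              /* 32 = inner_bits_size () */
    }
  memcpy (narrow, wide, sizeof wide);
  assert (narrow[0] == 200 && narrow[1] == 100);
  assert (narrow[2] == 201 && narrow[3] == 103);
  return 0;
}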
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-interleave-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-interleave-1.c
new file mode 100644
index 0000000..9f37143
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-interleave-1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl1024b -mabi=lp64d -fno-vect-cost-model --param=riscv-autovec-lmul=m8 -O3 -fdump-tree-optimized-details" } */
+
+struct S { int a, b; } s[8];
+
+void
+foo ()
+{
+  int i;
+  for (i = 0; i < 8; i++)
+    {
+      s[i].b = 0;
+      s[i].a = i;
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "\{ 0, 0, 1, 0, 2, 0, ... \}" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-interleave-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-interleave-2.c
new file mode 100644
index 0000000..6cc390c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-interleave-2.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl1024b -mabi=lp64d -fno-vect-cost-model --param=riscv-autovec-lmul=m8 -O3 -fdump-tree-optimized-details" } */
+
+struct S { int a, b; } s[8];
+
+void
+foo ()
+{
+  int i;
+  for (i = 0; i < 8; i++)
+    {
+      s[i].b = 1;
+      s[i].a = i;
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "\{ 0, 1, 1, 1, 2, 1, ... \}" 1 "optimized" } } */
+/* { dg-final { scan-assembler-times {slli\t[a-x0-9]+,\s*[a-x0-9]+,\s*32} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-interleave-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-interleave-3.c
new file mode 100644
index 0000000..326d66e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-interleave-3.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl1024b -mabi=lp64d -fno-vect-cost-model --param=riscv-autovec-lmul=m8 -O3 -fdump-tree-optimized-details" } */
+
+struct S { int a, b; } s[8];
+
+void
+foo ()
+{
+  int i;
+  for (i = 0; i < 8; i++)
+    {
+      s[i].b = i*3 + 100;
+      s[i].a = i + 200;
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "\{ 200, 100, 201, 103, 202, 106, ... \}" 1 "optimized" } } */
+/* { dg-final { scan-assembler-times {vsll\.vx} 1 } } */
+/* { dg-final { scan-assembler-times {vor\.vv} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-interleave-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-interleave-4.c
new file mode 100644
index 0000000..2bb73eb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-interleave-4.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zve32f_zvl1024b -mabi=lp64d -fno-vect-cost-model --param=riscv-autovec-lmul=m8 -O3 -fdump-tree-optimized-details" } */
+
+struct S { int a, b; } s[8];
+
+void
+foo ()
+{
+  int i;
+  for (i = 0; i < 8; i++)
+    {
+      s[i].b = i*3 + 100;
+      s[i].a = i + 200;
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "\{ 200, 100, 201, 103, 202, 106, ... \}" 1 "optimized" } } */
+/* { dg-final { scan-assembler-times {vand\.vi} 1 } } */
+/* { dg-final { scan-assembler-times {vmseq\.vi} 1 } } */
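
A note on slp-interleave-4.c: Zve32f caps the element width at 32 bits, so
for the 32-bit pattern no doubled-EEW integer mode exists and
expand_const_vector presumably takes the vid-based merge fallback instead;
the vand.vi/vmseq.vi scans pin down that path.  Below is a scalar sketch of
that fallback, with each vlmax vector operation modelled by a loop over
elements.  It assumes the merge picks the second series where the mask is
set, which is what the target constant requires; the exact operand order is
defined by the pred_merge pattern.

#include <assert.h>
#include <stdint.h>

/* Scalar model of the merge fallback: vid >> 1 gives each series its
   per-pair index { 0, 0, 1, 1, ... }, vid & 1 selects the pattern
   { 0, 1, 0, 1, ... }, and the merge picks one expanded series per lane.  */
int
main (void)
{
  enum { N = 8 };
  int32_t vid[N], tmp1[N], tmp2[N], out[N];

  for (int i = 0; i < N; i++)
    vid[i] = i;                         /* vid.v */
  for (int i = 0; i < N; i++)
    {
      int32_t j = vid[i] >> 1;          /* shifted_vid */
      tmp1[i] = 200 + 1 * j;            /* series 1: base1 + step1 * j */
      tmp2[i] = 100 + 3 * j;            /* series 2: base2 + step2 * j */
      /* mask[i] = ((vid[i] & 1) == 1); merge takes tmp2 where set.  */
      out[i] = (vid[i] & 1) ? tmp2[i] : tmp1[i];
    }
  /* { 200, 100, 201, 103, ... } as in the tree dump scan.  */
  assert (out[0] == 200 && out[1] == 100);
  assert (out[2] == 201 && out[3] == 103);
  return 0;
}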