author     Yuri Rumyantsev <ysrumyan@gmail.com>    2016-02-02 09:46:26 +0000
committer  Ilya Enkovich <ienkovich@gcc.gnu.org>   2016-02-02 09:46:26 +0000
commit     2d4dc2233b4a3aef3edeb23d6acaaa8a5327c137
tree       406eb48f3246bc92d314640f0b58301e7c2c6257
parent     65c98fdec722720ae9b2a6765232fe4535e06d4e
re PR middle-end/68542 (10% 481.wrf performance regression)
gcc/
2016-02-02 Yuri Rumyantsev <ysrumyan@gmail.com>
PR middle-end/68542
* config/i386/i386.c (ix86_expand_branch): Add support for conditional
branch with vector comparison.
* config/i386/sse.md (VI48_AVX): New mode iterator.
(define_expand "cbranch<mode>4): Add support for conditional branch
with vector comparison.
* tree-vect-loop.c (optimize_mask_stores): New function.
* tree-vect-stmts.c (vectorizable_mask_load_store): Initialize
has_mask_store field of vect_info.
* tree-vectorizer.c (vectorize_loops): Invoke optimize_mask_stores for
vectorized loops having masked stores after vec_info destroy.
* tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
corresponding macros.
(optimize_mask_stores): Add prototype.
gcc/testsuite
2016-02-02 Yuri Rumyantsev <ysrumyan@gmail.com>
PR middle-end/68542
* gcc.dg/vect/vect-mask-store-move-1.c: New test.
* gcc.target/i386/avx2-vect-mask-store-move1.c: New test.
From-SVN: r233068
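
As an informal illustration (not part of the commit), the effect of the new
optimize_mask_stores function can be sketched in scalar C: after the loop from
the test cases below has been if-converted and vectorized, each vector
iteration first tests whether the whole mask is zero and, if so, skips the
masked stores and the statements that only feed them. The block width VF, the
helper name foo_blocked and the explicit mask array are illustrative
assumptions, not code from the patch (the tail loop is also omitted).

/* Scalar sketch of the transformation; VF, foo_blocked and the mask array
   are assumptions for illustration only.  */
#define VF 8    /* assumed vectorization factor */

void
foo_blocked (int *p1, int *p2, const int *p3, const int *c, int n)
{
  for (int i = 0; i + VF <= n; i += VF)
    {
      /* The per-lane mask that if-conversion computes for this iteration.  */
      int mask[VF], any = 0;
      for (int k = 0; k < VF; k++)
        {
          mask[k] = (c[i + k] != 0);
          any |= mask[k];
        }
      /* New conditional branch on the vector mask: when every lane is false,
         skip the masked stores and their value producers entirely.  */
      if (!any)
        continue;
      for (int k = 0; k < VF; k++)
        if (mask[k])
          {
            p1[i + k] += 1;
            p2[i + k] = p3[i + k] + 2;
          }
    }
}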
 gcc/ChangeLog                                               |  17
 gcc/config/i386/i386.c                                      |  24
 gcc/config/i386/sse.md                                      |  21
 gcc/testsuite/ChangeLog                                     |   6
 gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c          |  19
 gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c  |  79
 gcc/tree-vect-loop.c                                        | 192
 gcc/tree-vect-stmts.c                                       |   1
 gcc/tree-vectorizer.c                                       |   6
 gcc/tree-vectorizer.h                                       |   5
 10 files changed, 370 insertions(+), 0 deletions(-)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b061339..be546dc 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,20 @@
+2016-02-02  Yuri Rumyantsev  <ysrumyan@gmail.com>
+
+	PR middle-end/68542
+	* config/i386/i386.c (ix86_expand_branch): Add support for conditional
+	branch with vector comparison.
+	* config/i386/sse.md (VI48_AVX): New mode iterator.
+	(define_expand "cbranch<mode>4): Add support for conditional branch
+	with vector comparison.
+	* tree-vect-loop.c (optimize_mask_stores): New function.
+	* tree-vect-stmts.c (vectorizable_mask_load_store): Initialize
+	has_mask_store field of vect_info.
+	* tree-vectorizer.c (vectorize_loops): Invoke optimaze_mask_stores for
+	vectorized loops having masked stores after vec_info destroy.
+	* tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
+	correspondent macros.
+	(optimize_mask_stores): Add prototype.
+
 2016-02-02  Alan Modra  <amodra@gmail.com>
 
 	PR target/69548
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index b500233..af5cf5a 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21684,6 +21684,30 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
   machine_mode mode = GET_MODE (op0);
   rtx tmp;
 
+  /* Handle special case - vector comparsion with boolean result, transform
+     it using ptest instruction.  */
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    {
+      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
+      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
+
+      gcc_assert (code == EQ || code == NE);
+      /* Generate XOR since we can't check that one operand is zero vector.  */
+      tmp = gen_reg_rtx (mode);
+      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
+      tmp = gen_lowpart (p_mode, tmp);
+      emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
+                              gen_rtx_UNSPEC (CCmode,
+                                              gen_rtvec (2, tmp, tmp),
+                                              UNSPEC_PTEST)));
+      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
+      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+                                  gen_rtx_LABEL_REF (VOIDmode, label),
+                                  pc_rtx);
+      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
+      return;
+    }
+
   switch (mode)
     {
     case SFmode:
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6744cce..7f89679 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -305,6 +305,10 @@
    (V8SI "TARGET_AVX") (V4DI "TARGET_AVX")
    (V8SF "TARGET_AVX") (V4DF"TARGET_AVX")])
 
+(define_mode_iterator VI48_AVX
+  [V4SI V2DI
+   (V8SI "TARGET_AVX") (V4DI "TARGET_AVX")])
+
 (define_mode_iterator VI8
   [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI])
 
@@ -18225,6 +18229,23 @@
     (match_operand:<avx512fmaskmode> 2 "register_operand")))]
   "TARGET_AVX512BW")
 
+(define_expand "cbranch<mode>4"
+  [(set (reg:CC FLAGS_REG)
+        (compare:CC (match_operand:VI48_AVX 1 "register_operand")
+                    (match_operand:VI48_AVX 2 "nonimmediate_operand")))
+   (set (pc) (if_then_else
+               (match_operator 0 "bt_comparison_operator"
+                [(reg:CC FLAGS_REG) (const_int 0)])
+               (label_ref (match_operand 3))
+               (pc)))]
+  "TARGET_SSE4_1"
+{
+  ix86_expand_branch (GET_CODE (operands[0]),
+                      operands[1], operands[2], operands[3]);
+  DONE;
+})
+
+
 (define_insn_and_split "avx_<castmode><avxsizesuffix>_<castmode>"
   [(set (match_operand:AVX256MODE2P 0 "nonimmediate_operand" "=x,m")
        (unspec:AVX256MODE2P
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index c96277c..7f348ae 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2016-02-02  Yuri Rumyantsev  <ysrumyan@gmail.com>
+
+	PR middle-end/68542
+	* gcc.dg/vect/vect-mask-store-move-1.c: New test.
+	* gcc.target/i386/avx2-vect-mask-store-move1.c: New test.
+
 2016-02-02  Alan Modra  <amodra@gmail.com>
 
 	PR target/69548
diff --git a/gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c b/gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c
new file mode 100644
index 0000000..e575f6d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-additional-options "-mavx2" { target { i?86-*-* x86_64-*-* } } } */
+
+#define N 256
+int p1[N], p2[N], p3[N];
+int c[N];
+void foo (int n)
+{
+  int i;
+  for (i=0; i<n; i++)
+    if (c[i])
+      {
+        p1[i] += 1;
+        p2[i] = p3[i] +2;
+      }
+}
+
+/* { dg-final { scan-tree-dump-times "Move stmt to created bb" 6 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c
new file mode 100644
index 0000000..2a10560
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c
@@ -0,0 +1,79 @@
+/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#define N 32
+int *p1, *p2, *p3;
+int c[N];
+int p1ref[N], p2ref[N];
+
+__attribute__((noinline, noclone)) void foo (int n)
+{
+  int i;
+  for (i=0; i<n; i++)
+    if (c[i])
+      {
+        p1[i] += 1;
+        p2[i] = p3[i] +2;
+      }
+}
+
+void init ()
+{
+  p1ref[0]=1; p2ref[0]=2;
+  p1ref[1]=3; p2ref[1]=5;
+  p1ref[2]=5; p2ref[2]=8;
+  p1ref[3]=7; p2ref[3]=11;
+  p1ref[4]=9; p2ref[4]=14;
+  p1ref[5]=11; p2ref[5]=17;
+  p1ref[6]=13; p2ref[6]=20;
+  p1ref[7]=15; p2ref[7]=23;
+  p1ref[8]=16; p2ref[8]=8;
+  p1ref[9]=18; p2ref[9]=9;
+  p1ref[10]=20; p2ref[10]=10;
+  p1ref[11]=22; p2ref[11]=11;
+  p1ref[12]=24; p2ref[12]=12;
+  p1ref[13]=26; p2ref[13]=13;
+  p1ref[14]=28; p2ref[14]=14;
+  p1ref[15]=30; p2ref[15]=15;
+  p1ref[16]=33; p2ref[16]=50;
+  p1ref[17]=35; p2ref[17]=53;
+  p1ref[18]=37; p2ref[18]=56;
+  p1ref[19]=39; p2ref[19]=59;
+  p1ref[20]=41; p2ref[20]=62;
+  p1ref[21]=43; p2ref[21]=65;
+  p1ref[22]=45; p2ref[22]=68;
+  p1ref[23]=47; p2ref[23]=71;
+  p1ref[24]=48; p2ref[24]=24;
+  p1ref[25]=50; p2ref[25]=25;
+  p1ref[26]=52; p2ref[26]=26;
+  p1ref[27]=54; p2ref[27]=27;
+  p1ref[28]=56; p2ref[28]=28;
+  p1ref[29]=58; p2ref[29]=29;
+  p1ref[30]=60; p2ref[30]=30;
+  p1ref[31]=62; p2ref[31]=31;
+}
+
+static void
+avx2_test (void)
+{
+  int * P = malloc (N * 3 * sizeof (int));
+  int i;
+
+  p1 = &P[0];
+  p2 = &P[N];
+  p3 = &P[2 * N];
+  for (i=0; i<N; i++) {
+    p1[i] = i + i;
+    p3[i] = i * 3;
+    p2[i] = i;
+    c[i] = (i >> 3) & 1? 0: 1;
+  }
+  init ();
+  foo (N);
+  for (i=0; i<N;i++)
+    if (p1[i] != p1ref[i] || p2[i] != p2ref[i])
+      abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "Move stmt to created bb" 6 "vect" } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index b8303ad..976e192 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -6938,3 +6938,195 @@ vect_transform_loop (loop_vec_info loop_vinfo)
       vect_free_slp_instance (instance);
   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
 }
+
+/* The code below is trying to perform simple optimization - revert
+   if-conversion for masked stores, i.e. if the mask of a store is zero
+   do not perform it and all stored value producers also if possible.
+   For example,
+     for (i=0; i<n; i++)
+       if (c[i])
+         {
+           p1[i] += 1;
+           p2[i] = p3[i] +2;
+         }
+   this transformation will produce the following semi-hammock:
+
+   if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
+     {
+       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
+       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
+       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
+       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
+       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
+       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
+     }
+*/
+
+void
+optimize_mask_stores (struct loop *loop)
+{
+  basic_block *bbs = get_loop_body (loop);
+  unsigned nbbs = loop->num_nodes;
+  unsigned i;
+  basic_block bb;
+  gimple_stmt_iterator gsi;
+  gimple *stmt, *stmt1 = NULL;
+  auto_vec<gimple *> worklist;
+
+  vect_location = find_loop_location (loop);
+  /* Pick up all masked stores in loop if any.  */
+  for (i = 0; i < nbbs; i++)
+    {
+      bb = bbs[i];
+      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
+           gsi_next (&gsi))
+        {
+          stmt = gsi_stmt (gsi);
+          if (is_gimple_call (stmt)
+              && gimple_call_internal_p (stmt)
+              && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+            worklist.safe_push (stmt);
+        }
+    }
+
+  free (bbs);
+  if (worklist.is_empty ())
+    return;
+
+  /* Loop has masked stores.  */
+  while (!worklist.is_empty ())
+    {
+      gimple *last, *last_store;
+      edge e, efalse;
+      tree mask;
+      basic_block store_bb, join_bb;
+      gimple_stmt_iterator gsi_to;
+      tree vdef, new_vdef;
+      gphi *phi;
+      tree vectype;
+      tree zero;
+
+      last = worklist.pop ();
+      mask = gimple_call_arg (last, 2);
+      bb = gimple_bb (last);
+      /* Create new bb.  */
+      e = split_block (bb, last);
+      join_bb = e->dest;
+      store_bb = create_empty_bb (bb);
+      add_bb_to_loop (store_bb, loop);
+      e->flags = EDGE_TRUE_VALUE;
+      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
+      /* Put STORE_BB to likely part.  */
+      efalse->probability = PROB_UNLIKELY;
+      store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
+      make_edge (store_bb, join_bb, EDGE_FALLTHRU);
+      if (dom_info_available_p (CDI_DOMINATORS))
+        set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_NOTE, vect_location,
+                         "Create new block %d to sink mask stores.",
+                         store_bb->index);
+      /* Create vector comparison with boolean result.  */
+      vectype = TREE_TYPE (mask);
+      zero = build_zero_cst (vectype);
+      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
+      gsi = gsi_last_bb (bb);
+      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
+      /* Create new PHI node for vdef of the last masked store:
+         .MEM_2 = VDEF <.MEM_1>
+         will be converted to
+         .MEM.3 = VDEF <.MEM_1>
+         and new PHI node will be created in join bb
+         .MEM_2 = PHI <.MEM_1, .MEM_3>
+      */
+      vdef = gimple_vdef (last);
+      new_vdef = make_ssa_name (gimple_vop (cfun), last);
+      gimple_set_vdef (last, new_vdef);
+      phi = create_phi_node (vdef, join_bb);
+      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
+
+      /* Put all masked stores with the same mask to STORE_BB if possible.  */
+      while (true)
+        {
+          gimple_stmt_iterator gsi_from;
+          /* Move masked store to STORE_BB.  */
+          last_store = last;
+          gsi = gsi_for_stmt (last);
+          gsi_from = gsi;
+          /* Shift GSI to the previous stmt for further traversal.  */
+          gsi_prev (&gsi);
+          gsi_to = gsi_start_bb (store_bb);
+          gsi_move_before (&gsi_from, &gsi_to);
+          /* Setup GSI_TO to the non-empty block start.  */
+          gsi_to = gsi_start_bb (store_bb);
+          if (dump_enabled_p ())
+            {
+              dump_printf_loc (MSG_NOTE, vect_location,
+                               "Move stmt to created bb\n");
+              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
+            }
+          /* Move all stored value producers if possible.  */
+          while (!gsi_end_p (gsi))
+            {
+              tree lhs;
+              imm_use_iterator imm_iter;
+              use_operand_p use_p;
+              bool res;
+              stmt1 = gsi_stmt (gsi);
+              /* Do not consider statements writing to memory.  */
+              if (gimple_vdef (stmt1))
+                break;
+              gsi_from = gsi;
+              gsi_prev (&gsi);
+              lhs = gimple_get_lhs (stmt1);
+              if (!lhs)
+                break;
+
+              /* LHS of vectorized stmt must be SSA_NAME.  */
+              if (TREE_CODE (lhs) != SSA_NAME)
+                break;
+
+              /* Skip scalar statements.  */
+              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
+                continue;
+
+              /* Check that LHS does not have uses outside of STORE_BB.  */
+              res = true;
+              FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
+                {
+                  gimple *use_stmt;
+                  use_stmt = USE_STMT (use_p);
+                  if (gimple_bb (use_stmt) != store_bb)
+                    {
+                      res = false;
+                      break;
+                    }
+                }
+              if (!res)
+                break;
+
+              if (gimple_vuse (stmt1)
+                  && gimple_vuse (stmt1) != gimple_vuse (last_store))
+                break;
+
+              /* Can move STMT1 to STORE_BB.  */
+              if (dump_enabled_p ())
+                {
+                  dump_printf_loc (MSG_NOTE, vect_location,
+                                   "Move stmt to created bb\n");
+                  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
+                }
+              gsi_move_before (&gsi_from, &gsi_to);
+              /* Shift GSI_TO for further insertion.  */
+              gsi_prev (&gsi_to);
+            }
+          /* Put other masked stores with the same mask to STORE_BB.  */
+          if (worklist.is_empty ()
+              || gimple_call_arg (worklist.last (), 2) != mask
+              || worklist.last () != stmt1)
+            break;
+          last = worklist.pop ();
+        }
+      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
+    }
+}
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index fa4a364..1aade9e 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -2023,6 +2023,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
     {
       tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE;
       prev_stmt_info = NULL;
+      LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true;
       for (i = 0; i < ncopies; i++)
        {
          unsigned align, misalign;
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index e1c2ced..2b25b45 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -604,12 +604,18 @@ vectorize_loops (void)
   for (i = 1; i < vect_loops_num; i++)
     {
       loop_vec_info loop_vinfo;
+      bool has_mask_store;
 
       loop = get_loop (cfun, i);
       if (!loop)
        continue;
       loop_vinfo = (loop_vec_info) loop->aux;
+      has_mask_store = false;
+      if (loop_vinfo)
+        has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo);
       destroy_loop_vec_info (loop_vinfo, true);
+      if (has_mask_store)
+        optimize_mask_stores (loop);
       loop->aux = NULL;
     }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 0a3f5d7..bd1d55a 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -333,6 +333,9 @@ typedef struct _loop_vec_info : public vec_info {
      loop version without if-conversion.  */
   struct loop *scalar_loop;
 
+  /* Mark loops having masked stores.  */
+  bool has_mask_store;
+
 } *loop_vec_info;
 
 /* Access Functions.  */
@@ -368,6 +371,7 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_PEELING_FOR_NITER(L)    (L)->peeling_for_niter
 #define LOOP_VINFO_NO_DATA_DEPENDENCIES(L) (L)->no_data_dependencies
 #define LOOP_VINFO_SCALAR_LOOP(L)          (L)->scalar_loop
+#define LOOP_VINFO_HAS_MASK_STORE(L)       (L)->has_mask_store
 #define LOOP_VINFO_SCALAR_ITERATION_COST(L) (L)->scalar_cost_vec
 #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost
 
@@ -1013,6 +1017,7 @@ extern void vect_get_vec_defs (tree, tree, gimple *, vec<tree> *,
                               vec<tree> *, slp_tree, int);
 extern tree vect_gen_perm_mask_any (tree, const unsigned char *);
 extern tree vect_gen_perm_mask_checked (tree, const unsigned char *);
+extern void optimize_mask_stores (struct loop*);
 
 /* In tree-vect-data-refs.c.  */
 extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);
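
A note on the i386 side of the patch: with the new cbranch<mode>4 expander,
ix86_expand_branch turns an integer-vector equality branch into an XOR of the
two operands followed by a PTEST of the result against itself, branching on
ZF. Below is a minimal intrinsics sketch of that idiom; it is my illustration,
not code from the commit, and assumes an AVX2 target with immintrin.h.

#include <immintrin.h>

/* Sketch of the XOR + PTEST idiom: VPTEST sets ZF when (diff & diff) == 0,
   i.e. when the two inputs are equal in every lane.  */
static inline int
vectors_equal_256 (__m256i a, __m256i b)
{
  __m256i diff = _mm256_xor_si256 (a, b);   /* vpxor              */
  return _mm256_testz_si256 (diff, diff);   /* vptest, branch on ZF */
}

In the vectorizer this is what makes the cheap "is the whole mask zero?" guard
around the sunk masked stores a single compare-and-branch on SSE4.1/AVX
targets.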