Diffstat (limited to 'gcc/tree-vectorizer.c')
-rw-r--r-- | gcc/tree-vectorizer.c | 97
1 files changed, 93 insertions, 4 deletions
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 20c867c..372334d 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -1345,6 +1345,13 @@ new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
   STMT_VINFO_IN_PATTERN_P (res) = false;
   STMT_VINFO_RELATED_STMT (res) = NULL;
   STMT_VINFO_DATA_REF (res) = NULL;
+
+  STMT_VINFO_DR_BASE_ADDRESS (res) = NULL;
+  STMT_VINFO_DR_OFFSET (res) = NULL;
+  STMT_VINFO_DR_INIT (res) = NULL;
+  STMT_VINFO_DR_STEP (res) = NULL;
+  STMT_VINFO_DR_ALIGNED_TO (res) = NULL;
+
   if (TREE_CODE (stmt) == PHI_NODE && is_loop_header_bb_p (bb_for_stmt (stmt)))
     STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type;
   else
@@ -1655,21 +1662,103 @@ get_vectype_for_scalar_type (tree scalar_type)
 enum dr_alignment_support
 vect_supportable_dr_alignment (struct data_reference *dr)
 {
-  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
+  tree stmt = DR_STMT (dr);
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   enum machine_mode mode = (int) TYPE_MODE (vectype);
+  struct loop *vect_loop = LOOP_VINFO_LOOP (STMT_VINFO_LOOP_VINFO (stmt_info));
+  bool nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
+  bool invariant_in_outerloop = false;

   if (aligned_access_p (dr))
     return dr_aligned;

+  if (nested_in_vect_loop)
+    {
+      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
+      invariant_in_outerloop =
+        (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
+    }
+
   /* Possibly unaligned access.  */
+
+  /* We can choose between using the implicit realignment scheme (generating
+     a misaligned_move stmt) and the explicit realignment scheme (generating
+     aligned loads with a REALIGN_LOAD).  There are two variants to the
+     explicit realignment scheme: optimized, and unoptimized.
+     We can optimize the realignment only if the step between consecutive
+     vector loads is equal to the vector size.  Since the vector memory
+     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
+     is guaranteed that the misalignment amount remains the same throughout
+     the execution of the vectorized loop.  Therefore, we can create the
+     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
+     at the loop preheader.
+
+     However, in the case of outer-loop vectorization, when vectorizing a
+     memory access in the inner-loop nested within the LOOP that is now being
+     vectorized, while it is guaranteed that the misalignment of the
+     vectorized memory access will remain the same in different outer-loop
+     iterations, it is *not* guaranteed that it will remain the same
+     throughout the execution of the inner-loop.  This is because the
+     inner-loop advances with the original scalar step (and not in steps of
+     VS).  If the inner-loop step happens to be a multiple of VS, then the
+     misalignment remains fixed and we can use the optimized realignment
+     scheme.  For example:
+
+      for (i=0; i<N; i++)
+        for (j=0; j<M; j++)
+          s += a[i+j];
+
+     When vectorizing the i-loop in the above example, the step between
+     consecutive vector loads is 1, and so the misalignment does not remain
+     fixed across the execution of the inner-loop, and the realignment cannot
+     be optimized (as illustrated in the following pseudo vectorized loop):
+
+      for (i=0; i<N; i+=4)
+        for (j=0; j<M; j++){
+          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
+                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
+                         // (assuming that we start from an aligned address).
+        }
+
+     We therefore have to use the unoptimized realignment scheme:
+
+      for (i=0; i<N; i+=4)
+        for (j=k; j<M; j+=4)
+          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
+                         // that the misalignment of the initial address is
+                         // 0).
+
+     The loop can then be vectorized as follows:
+
+      for (k=0; k<4; k++){
+        rt = get_realignment_token (&vp[k]);
+        for (i=0; i<N; i+=4){
+          v1 = vp[i+k];
+          for (j=k; j<M; j+=4){
+            v2 = vp[i+j+VS-1];
+            va = REALIGN_LOAD <v1,v2,rt>;
+            vs += va;
+            v1 = v2;
+          }
+        }
+      } */
+
   if (DR_IS_READ (dr))
     {
-      if (optab_handler (vec_realign_load_optab, mode)->insn_code != CODE_FOR_nothing
+      if (optab_handler (vec_realign_load_optab, mode)->insn_code !=
+                                                            CODE_FOR_nothing
 	  && (!targetm.vectorize.builtin_mask_for_load
 	      || targetm.vectorize.builtin_mask_for_load ()))
-	return dr_unaligned_software_pipeline;
+	{
+	  if (nested_in_vect_loop
+	      && TREE_INT_CST_LOW (DR_STEP (dr)) != UNITS_PER_SIMD_WORD)
+	    return dr_explicit_realign;
+	  else
+	    return dr_explicit_realign_optimized;
+	}

-      if (optab_handler (movmisalign_optab, mode)->insn_code != CODE_FOR_nothing)
+      if (optab_handler (movmisalign_optab, mode)->insn_code !=
+                                                            CODE_FOR_nothing)
 	/* Can't software pipeline the loads, but can at least do them.  */
 	return dr_unaligned_supported;
     }
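The comment added to vect_supportable_dr_alignment argues that the misalignment of the inner-loop access stays fixed only when the inner-loop step is a multiple of the vector size. The following standalone sketch, which is not part of the patch, replays that argument; VS, N and M are arbitrary values chosen for the demonstration. It prints the element misalignment (i + j) mod VS for the access pattern from the example in the comment, first with an inner-loop step of 1 and then with a step of VS.

/* Standalone illustration (not from the patch): the misalignment of a[i+j]
   is invariant across the inner loop only when the inner-loop step is a
   multiple of the vector size.  VS, N and M are assumed values.  */
#include <stdio.h>

#define VS 4   /* vector size in elements (assumption for the example) */
#define N  8
#define M  8

int
main (void)
{
  /* Outer-loop vectorization: i advances in steps of VS.  */

  /* Inner-loop step of 1 element: the misalignment cycles 0,1,2,3,...,
     so a single realignment token computed in the preheader cannot be
     reused across the inner loop.  */
  for (int i = 0; i < N; i += VS)
    for (int j = 0; j < M; j++)
      printf ("step 1:  i=%d j=%d  misalign=%d\n", i, j, (i + j) % VS);

  /* Inner-loop step of VS elements: the misalignment stays fixed across
     the inner loop, so the optimized realignment scheme applies.  */
  for (int i = 0; i < N; i += VS)
    for (int j = 0; j < M; j += VS)
      printf ("step VS: i=%d j=%d  misalign=%d\n", i, j, (i + j) % VS);

  return 0;
}

The second loop prints the same misalignment on every iteration (0 here, since the starting address is assumed aligned), which corresponds to the dr_explicit_realign_optimized case; the first loop corresponds to dr_explicit_realign, the case the patch detects by comparing DR_STEP against UNITS_PER_SIMD_WORD.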