diff options
author | Jan Hubicka <jh@suse.cz> | 2006-12-20 21:46:15 +0100 |
---|---|---|
committer | Jan Hubicka <hubicka@gcc.gnu.org> | 2006-12-20 20:46:15 +0000 |
commit | 2a4f771a6e735f6b1924349c39f05f4f420c2a41 (patch) | |
tree | 391ddf419840ba85f6d4c66c576620192d6584b6 /gcc/config | |
parent | 75a4c3c194dda02369ca0b8ebcfe9f962f01dd02 (diff) | |
download | gcc-2a4f771a6e735f6b1924349c39f05f4f420c2a41.zip gcc-2a4f771a6e735f6b1924349c39f05f4f420c2a41.tar.gz gcc-2a4f771a6e735f6b1924349c39f05f4f420c2a41.tar.bz2 |
re PR rtl-optimization/30213 (Wrong code with optimized memset() (possible bug in RTL bbro optimizer))
PR target/30213
* i386.c (expand_setmem_epilogue): Fix formatting.
(smallest_pow2_greater_than): New function.
(ix86_expand_movmem): Improve comments; avoid re-computing of
epilogue size.
(promote_duplicated_reg_to_size): Break out from ...
(expand_setmem): ... this one; reorganize promotion code;
improve comments; avoid recomputation of epilogue size.
From-SVN: r120083
Diffstat (limited to 'gcc/config')
-rw-r--r-- | gcc/config/i386/i386.c | 187 |
1 files changed, 137 insertions, 50 deletions
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 747e31e..ee0cdf4 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -13159,6 +13159,7 @@ static void expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size) { rtx dest; + if (GET_CODE (count) == CONST_INT) { HOST_WIDE_INT countval = INTVAL (count); @@ -13491,8 +13492,40 @@ decide_alignment (int align, return desired_align; } +/* Return thre smallest power of 2 greater than VAL. */ +static int +smallest_pow2_greater_than (int val) +{ + int ret = 1; + while (ret <= val) + ret <<= 1; + return ret; +} + /* Expand string move (memcpy) operation. Use i386 string operations when - profitable. expand_clrmem contains similar code. */ + profitable. expand_clrmem contains similar code. The code depends upon + architecture, block size and alignment, but always has the same + overall structure: + + 1) Prologue guard: Conditional that jumps up to epilogues for small + blocks that can be handled by epilogue alone. This is faster but + also needed for correctness, since prologue assume the block is larger + than the desrired alignment. + + Optional dynamic check for size and libcall for large + blocks is emitted here too, with -minline-stringops-dynamically. + + 2) Prologue: copy first few bytes in order to get destination aligned + to DESIRED_ALIGN. It is emitted only when ALIGN is less than + DESIRED_ALIGN and and up to DESIRED_ALIGN - ALIGN bytes can be copied. + We emit either a jump tree on power of two sized blocks, or a byte loop. + + 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks + with specified algorithm. + + 4) Epilogue: code copying tail of the block that is too small to be + handled by main body (or up to size guarded by prologue guard). 
*/ + int ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, rtx expected_align_exp, rtx expected_size_exp) @@ -13505,7 +13538,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, HOST_WIDE_INT align = 1; unsigned HOST_WIDE_INT count = 0; HOST_WIDE_INT expected_size = -1; - int size_needed = 0; + int size_needed = 0, epilogue_size_needed; int desired_align = 0; enum stringop_alg alg; int dynamic_check; @@ -13519,9 +13552,10 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, if (GET_CODE (count_exp) == CONST_INT) count = expected_size = INTVAL (count_exp); if (GET_CODE (expected_size_exp) == CONST_INT && count == 0) - { - expected_size = INTVAL (expected_size_exp); - } + expected_size = INTVAL (expected_size_exp); + + /* Step 0: Decide on preferred algorithm, desired alignment and + size of chunks to be copied by main loop. */ alg = decide_alg (count, expected_size, false, &dynamic_check); desired_align = decide_alignment (align, alg, expected_size); @@ -13559,6 +13593,10 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, break; } + epilogue_size_needed = size_needed; + + /* Step 1: Prologue guard. */ + /* Alignment code needs count to be in register. */ if (GET_CODE (count_exp) == CONST_INT && desired_align > align) { @@ -13568,17 +13606,22 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, count_exp = force_reg (mode, count_exp); } gcc_assert (desired_align >= 1 && align >= 1); + /* Ensure that alignment prologue won't copy past end of block. */ if ((size_needed > 1 || (desired_align > 1 && desired_align > align)) && !count) { - int size = MAX (size_needed - 1, desired_align - align); + epilogue_size_needed = MAX (size_needed - 1, desired_align - align); + + /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes. + Make sure it is power of 2. 
*/ + epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed); label = gen_label_rtx (); emit_cmp_and_jump_insns (count_exp, - GEN_INT (size), - LEU, 0, GET_MODE (count_exp), 1, label); - if (expected_size == -1 || expected_size < size) + GEN_INT (epilogue_size_needed), + LTU, 0, GET_MODE (count_exp), 1, label); + if (expected_size == -1 || expected_size < epilogue_size_needed) predict_jump (REG_BR_PROB_BASE * 60 / 100); else predict_jump (REG_BR_PROB_BASE * 20 / 100); @@ -13597,8 +13640,8 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, emit_label (hot_label); } + /* Step 2: Alignment prologue. */ - /* Alignment prologue. */ if (desired_align > align) { /* Except for the first move in epilogue, we no longer know @@ -13617,7 +13660,8 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, label = NULL; } - /* Main body. */ + /* Step 3: Main loop. */ + switch (alg) { case libcall: @@ -13665,25 +13709,31 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, dst = change_address (dst, BLKmode, destreg); } - /* Epilogue to copy the remaining bytes. */ + /* Step 4: Epilogue to copy the remaining bytes. */ + if (label) { - if (size_needed < desired_align - align) + /* When the main loop is done, COUNT_EXP might hold original count, + while we want to copy only COUNT_EXP & SIZE_NEEDED bytes. + Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED + bytes. Compensate if needed. 
*/ + + if (size_needed < epilogue_size_needed) { tmp = expand_simple_binop (GET_MODE (count_exp), AND, count_exp, GEN_INT (size_needed - 1), count_exp, 1, OPTAB_DIRECT); - size_needed = desired_align - align + 1; if (tmp != count_exp) emit_move_insn (count_exp, tmp); } emit_label (label); LABEL_NUSES (label) = 1; } - if (count_exp != const0_rtx && size_needed > 1) + + if (count_exp != const0_rtx && epilogue_size_needed > 1) expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp, - size_needed); + epilogue_size_needed); if (jump_around_label) emit_label (jump_around_label); return 1; @@ -13761,8 +13811,30 @@ promote_duplicated_reg (enum machine_mode mode, rtx val) } } +/* Duplicate value VAL using promote_duplicated_reg into maximal size that will + be needed by main loop copying SIZE_NEEDED chunks and prologue getting + alignment from ALIGN to DESIRED_ALIGN. */ +static rtx +promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align) +{ + rtx promoted_val; + + if (TARGET_64BIT + && (size_needed > 4 || (desired_align > align && desired_align > 4))) + promoted_val = promote_duplicated_reg (DImode, val); + else if (size_needed > 2 || (desired_align > align && desired_align > 2)) + promoted_val = promote_duplicated_reg (SImode, val); + else if (size_needed > 1 || (desired_align > align && desired_align > 1)) + promoted_val = promote_duplicated_reg (HImode, val); + else + promoted_val = val; + + return promoted_val; +} + /* Expand string clear operation (bzero). Use i386 string operations when - profitable. expand_movmem contains similar code. */ + profitable. See expand_movmem comment for explanation of individual + steps performd. 
*/ int ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, rtx expected_align_exp, rtx expected_size_exp) @@ -13774,10 +13846,10 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, HOST_WIDE_INT align = 1; unsigned HOST_WIDE_INT count = 0; HOST_WIDE_INT expected_size = -1; - int size_needed = 0; + int size_needed = 0, epilogue_size_needed; int desired_align = 0; enum stringop_alg alg; - rtx promoted_val = val_exp; + rtx promoted_val = NULL; bool force_loopy_epilogue = false; int dynamic_check; @@ -13792,6 +13864,9 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, if (GET_CODE (expected_size_exp) == CONST_INT && count == 0) expected_size = INTVAL (expected_size_exp); + /* Step 0: Decide on preferred algorithm, desired alignment and + size of chunks to be copied by main loop. */ + alg = decide_alg (count, expected_size, true, &dynamic_check); desired_align = decide_alignment (align, alg, expected_size); @@ -13826,6 +13901,10 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, size_needed = 1; break; } + epilogue_size_needed = size_needed; + + /* Step 1: Prologue guard. */ + /* Alignment code needs count to be in register. */ if (GET_CODE (count_exp) == CONST_INT && desired_align > align) { @@ -13834,20 +13913,33 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, mode = DImode; count_exp = force_reg (mode, count_exp); } + /* Do the cheap promotion to allow better CSE across the + main loop and epilogue (ie one load of the big constant in the + front of all code. */ + if (GET_CODE (val_exp) == CONST_INT) + promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, + desired_align, align); /* Ensure that alignment prologue won't copy past end of block. 
*/ if ((size_needed > 1 || (desired_align > 1 && desired_align > align)) && !count) { - int size = MAX (size_needed - 1, desired_align - align); - /* To improve performance of small blocks, we jump around the promoting - code, so we need to use QImode accesses in epilogue. */ - if (GET_CODE (val_exp) != CONST_INT && size_needed > 1) - force_loopy_epilogue = true; + epilogue_size_needed = MAX (size_needed - 1, desired_align - align); + + /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes. + Make sure it is power of 2. */ + epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed); + + /* To improve performance of small blocks, we jump around the VAL + promoting mode. This mean that if the promoted VAL is not constant, + we might not use it in the epilogue and have to use byte + loop variant. */ + if (epilogue_size_needed > 2 && !promoted_val) + force_loopy_epilogue = true; label = gen_label_rtx (); emit_cmp_and_jump_insns (count_exp, - GEN_INT (size), - LEU, 0, GET_MODE (count_exp), 1, label); - if (expected_size == -1 || expected_size <= size) + GEN_INT (epilogue_size_needed), + LTU, 0, GET_MODE (count_exp), 1, label); + if (expected_size == -1 || expected_size <= epilogue_size_needed) predict_jump (REG_BR_PROB_BASE * 60 / 100); else predict_jump (REG_BR_PROB_BASE * 20 / 100); @@ -13863,30 +13955,15 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, emit_jump (jump_around_label); emit_label (hot_label); } - if (TARGET_64BIT - && (size_needed > 4 || (desired_align > align && desired_align > 4))) - promoted_val = promote_duplicated_reg (DImode, val_exp); - else if (size_needed > 2 || (desired_align > align && desired_align > 2)) - promoted_val = promote_duplicated_reg (SImode, val_exp); - else if (size_needed > 1 || (desired_align > align && desired_align > 1)) - promoted_val = promote_duplicated_reg (HImode, val_exp); - else - promoted_val = val_exp; + + /* Step 2: Alignment prologue. 
*/ + + /* Do the expensive promotion once we branched off the small blocks. */ + if (!promoted_val) + promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, + desired_align, align); gcc_assert (desired_align >= 1 && align >= 1); - if ((size_needed > 1 || (desired_align > 1 && desired_align > align)) - && !count && !label) - { - int size = MAX (size_needed - 1, desired_align - align); - label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, - GEN_INT (size), - LEU, 0, GET_MODE (count_exp), 1, label); - if (expected_size == -1 || expected_size <= size) - predict_jump (REG_BR_PROB_BASE * 60 / 100); - else - predict_jump (REG_BR_PROB_BASE * 20 / 100); - } if (desired_align > align) { /* Except for the first move in epilogue, we no longer know @@ -13903,6 +13980,9 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, LABEL_NUSES (label) = 1; label = NULL; } + + /* Step 3: Main loop. */ + switch (alg) { case libcall: @@ -13940,8 +14020,15 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, else dst = change_address (dst, BLKmode, destreg); + /* Step 4: Epilogue to copy the remaining bytes. */ + if (label) { + /* When the main loop is done, COUNT_EXP might hold original count, + while we want to copy only COUNT_EXP & SIZE_NEEDED bytes. + Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED + bytes. Compensate if needed. */ + if (size_needed < desired_align - align) { tmp = @@ -13955,7 +14042,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, emit_label (label); LABEL_NUSES (label) = 1; } - if (count_exp != const0_rtx && size_needed > 1) + if (count_exp != const0_rtx && epilogue_size_needed > 1) { if (force_loopy_epilogue) expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp, |