author     Shreya Munnangi <smunnangi1@ventanamicro.com>   2025-08-11 21:42:50 -0600
committer  Jeff Law <jlaw@ventanamicro.com>                2025-08-11 21:46:04 -0600
commit     1b5b461428fb6a43ef91e3dc330d6f59b6d88618
tree       bdeb62a8eb273569e08b97d185af8e5305891e6d
parent     9992c0a0e1b455ad5c68d7261b4bc9bfc2461f70
Improve initial code generation for addsi/adddi
This is a patch primarily from Shreya, though I think she cribbed some
code from Philipp that we had internally within Ventana, and I made
some minor adjustments as well.
So the basic idea here is similar to her work on logical ops --
specifically, when we can generate more efficient code at expansion
time, do so.  In some cases the net is better code; in other cases it
lessens our reliance on mvconst_internal; and finally it provides
infrastructure that I think will help address an issue Paul Antoine
reported a little while back.
The most obvious case is using paired addis at initial code generation
for some constants.  It will also use a shNadd insn when the cost to
synthesize the original value is higher than the cost of the
right-shifted value.  Finally, it will negate the constant and use
"sub" if the negated constant is cheaper than the original constant.
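To make that concrete, here's roughly what I'd expect for a few
constants.  These sequences are illustrative -- my reading of the new
expansion paths assuming rv64gc_zba -- not captured compiler output:

  /* 4094 = 2047 + 2047, so SUM_OF_TWO_S12 holds and two addis beat
     synthesizing the constant (lui+addi) and then adding it.  */
  long f1 (long x) { return x + 4094; }
  /*  addi a0,a0,2047
      addi a0,a0,2047  */

  /* 4100 = 1025 << 2 and 1025 is a single li, so load the shifted
     constant and fold the shift back in with sh2add (Zba).  */
  long f2 (long x) { return x + 4100; }
  /*  li     a5,1025
      sh2add a0,a5,a0  */

  /* 0x80000001 is odd (no shNadd) and takes three insns to load on
     rv64, but its negation is just lui+addi, so load -C and sub.  */
  long f3 (long x) { return x + 0x80000001L; }
  /*  lui  a5,0x80000
      addi a5,a5,-1
      sub  a0,a0,a5  */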
There's more work to do in here, particularly WRT 32-bit objects for
rv64; Shreya is looking at that right now.  There may also be cases
where another shNadd or addi would be profitable.  We haven't really
explored those cases in any detail; while there may be cases to
handle, it's unclear how often they occur in practice.
I don't want to remove the define_insn_and_split for the paired addi
cases yet. I think that likely happens as a side effect of fixing Paul
Antoine's issue.
Bootstrapped and regression tested on a BPI & Pioneer box. Will
obviously wait for the pre-commit tester before moving forward.
Jeff
PR target/120603
gcc/
* config/riscv/riscv-protos.h (synthesize_add): Add prototype.
* config/riscv/riscv.cc (synthesize_add): New function.
* config/riscv/riscv.md (addsi3): Allow any constant as operands[2]
in the expander. Force the constant into a register as needed for
TARGET_64BIT. Use synthesize_add for !TARGET_64BIT.
(*adddi3): Renamed from adddi3.
(adddi3): New expander. Use synthesize_add.
gcc/testsuite/
* gcc.target/riscv/add-synthesis-1.c: New test.
Co-authored-by: Jeff Law <jlaw@ventanamicro.com>
Co-authored-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
 gcc/config/riscv/riscv-protos.h                  |  1
 gcc/config/riscv/riscv.cc                        | 94
 gcc/config/riscv/riscv.md                        | 28
 gcc/testsuite/gcc.target/riscv/add-synthesis-1.c | 40
 4 files changed, 159 insertions, 4 deletions
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 539321f..b497325 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -143,6 +143,7 @@ extern void riscv_expand_sstrunc (rtx, rtx);
 extern int riscv_register_move_cost (machine_mode, reg_class_t, reg_class_t);
 extern bool synthesize_ior_xor (rtx_code, rtx [3]);
 extern bool synthesize_and (rtx [3]);
+extern bool synthesize_add (rtx [3]);
 #ifdef RTX_CODE
 extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx,
                                   bool *invert_ptr = 0);
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index c336584..4935367 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -15353,6 +15353,100 @@ synthesize_and (rtx operands[3])
   return true;
 }
 
+/* Synthesize OPERANDS[0] = OPERANDS[1] + OPERANDS[2].
+
+   OPERANDS[0] and OPERANDS[1] will be a REG and may be the same
+   REG.
+
+   OPERANDS[2] is a CONST_INT.
+
+   Return TRUE if the operation was fully synthesized and the caller
+   need not generate additional code.  Return FALSE if the operation
+   was not synthesized and the caller is responsible for emitting the
+   proper sequence.  */
+
+bool
+synthesize_add (rtx operands[3])
+{
+  /* Trivial cases that don't need synthesis.  */
+  if (SMALL_OPERAND (INTVAL (operands[2])))
+    return false;
+
+  int budget1 = riscv_const_insns (operands[2], true);
+  int budget2 = riscv_const_insns (GEN_INT (-INTVAL (operands[2])), true);
+
+  HOST_WIDE_INT ival = INTVAL (operands[2]);
+
+  /* If we can emit two addi insns then that's better than synthesizing
+     the constant into a temporary, then adding the temporary to the
+     other input.  The exception is when the constant can be loaded
+     in a single instruction, which can issue whenever it's convenient.  */
+  if (SUM_OF_TWO_S12 (ival) && budget1 >= 2)
+    {
+      HOST_WIDE_INT saturated = HOST_WIDE_INT_M1U << (IMM_BITS - 1);
+
+      if (ival >= 0)
+        saturated = ~saturated;
+
+      ival -= saturated;
+
+      rtx x = gen_rtx_PLUS (word_mode, operands[1], GEN_INT (saturated));
+      emit_insn (gen_rtx_SET (operands[0], x));
+      rtx output = gen_rtx_PLUS (word_mode, operands[0], GEN_INT (ival));
+      emit_insn (gen_rtx_SET (operands[0], output));
+      return true;
+    }
+
+  /* If we can shift the constant by 1, 2, or 3 bit positions
+     and the result is a cheaper constant, then do so.  */
+  ival = INTVAL (operands[2]);
+  if (TARGET_ZBA
+      && (((ival % 2) == 0
+           && budget1 > riscv_const_insns (GEN_INT (ival >> 1), true))
+          || ((ival % 4) == 0
+              && budget1 > riscv_const_insns (GEN_INT (ival >> 2), true))
+          || ((ival % 8) == 0
+              && budget1 > riscv_const_insns (GEN_INT (ival >> 3), true))))
+    {
+      // Load the shifted constant into a temporary.
+      int shct = ctz_hwi (ival);
+
+      /* We can handle shifting up to 3 bit positions via shNadd.  */
+      if (shct > 3)
+        shct = 3;
+
+      /* The adjusted constant may still need synthesis, so do not copy
+         it directly into a register.  Let the expander handle it.  */
+      rtx tmp = force_reg (word_mode, GEN_INT (ival >> shct));
+
+      /* Generate the shift-add of the temporary and operands[1]
+         into the final destination.  */
+      rtx x = gen_rtx_ASHIFT (word_mode, tmp, GEN_INT (shct));
+      rtx output = gen_rtx_PLUS (word_mode, x, operands[1]);
+      emit_insn (gen_rtx_SET (operands[0], output));
+      return true;
+    }
+
+  /* If the negated constant is cheaper than the original, then negate
+     the constant and use sub.  */
+  if (budget2 < budget1)
+    {
+      // Load -INTVAL (operands[2]) into a temporary.
+      rtx tmp = force_reg (word_mode, GEN_INT (-INTVAL (operands[2])));
+
+      // Subtract operands[2] from operands[1].
+      rtx output = gen_rtx_MINUS (word_mode, operands[1], tmp);
+      emit_insn (gen_rtx_SET (operands[0], output));
+      return true;
+    }
+
+  /* No add synthesis was found.  Synthesize the constant into
+     a temporary and use that.  */
+  rtx x = force_reg (word_mode, operands[2]);
+  x = gen_rtx_PLUS (word_mode, operands[1], x);
+  emit_insn (gen_rtx_SET (operands[0], x));
+  return true;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 578dd43..a72604e 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -712,14 +712,17 @@
    (set_attr "mode" "SI")])
 
 (define_expand "addsi3"
-  [(set (match_operand:SI 0 "register_operand" "=r,r")
-        (plus:SI (match_operand:SI 1 "register_operand" " r,r")
-                 (match_operand:SI 2 "arith_operand"    " r,I")))]
+  [(set (match_operand:SI 0 "register_operand")
+        (plus:SI (match_operand:SI 1 "register_operand")
+                 (match_operand:SI 2 "reg_or_const_int_operand")))]
   ""
 {
   if (TARGET_64BIT)
     {
       rtx t = gen_reg_rtx (DImode);
+
+      if (CONST_INT_P (operands[2]) && !SMALL_OPERAND (operands[2]))
+        operands[2] = force_reg (SImode, operands[2]);
       emit_insn (gen_addsi3_extended (t, operands[1], operands[2]));
       t = gen_lowpart (SImode, t);
       SUBREG_PROMOTED_VAR_P (t) = 1;
@@ -727,9 +730,26 @@
       emit_move_insn (operands[0], t);
       DONE;
     }
+
+  /* We may be able to find a faster sequence.  If so, then we are
+     done.  Otherwise let expansion continue normally.  */
+  if (CONST_INT_P (operands[2]) && synthesize_add (operands))
+    DONE;
+})
+
+(define_expand "adddi3"
+  [(set (match_operand:DI 0 "register_operand")
+        (plus:DI (match_operand:DI 1 "register_operand")
+                 (match_operand:DI 2 "reg_or_const_int_operand")))]
+  "TARGET_64BIT"
+{
+  /* We may be able to find a faster sequence.  If so, then we are
+     done.  Otherwise let expansion continue normally.  */
+  if (CONST_INT_P (operands[2]) && synthesize_add (operands))
+    DONE;
 })
 
-(define_insn "adddi3"
+(define_insn "*adddi3"
   [(set (match_operand:DI 0 "register_operand" "=r,r")
         (plus:DI (match_operand:DI 1 "register_operand" " r,r")
                  (match_operand:DI 2 "arith_operand"    " r,I")))]
diff --git a/gcc/testsuite/gcc.target/riscv/add-synthesis-1.c b/gcc/testsuite/gcc.target/riscv/add-synthesis-1.c
new file mode 100644
index 0000000..247096c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/add-synthesis-1.c
@@ -0,0 +1,40 @@
+/* { dg-options "-march=rv32gcb -mabi=ilp32d" { target { rv32 } } } */
+/* { dg-options "-march=rv64gcb -mabi=lp64d" { target { rv64 } } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+#if __riscv_xlen == 64
+#define TYPE long
+#else
+#define TYPE int
+#endif
+
+#define T(C) TYPE foo_##C (TYPE x) { return x + C; }
+#define TM(C) TYPE foo_M##C (TYPE x) { return x + -C; }
+
+/* These cases were selected because they can all be synthesized
+   at expansion time without synthesizing the constant directly.
+
+   That makes the assembler scan testing simpler.  I've verified
+   by hand that cases that should synthesize the constant do in
+   fact still generate code that way.  */
+T (2050)
+T (4094)
+T (4100)
+T (8200)
+
+TM (2049)
+TM (4096)
+TM (4100)
+TM (8200)
+
+#if __riscv_xlen == 64
+TM (0x200000000)
+#endif
+
+/* We have 4 (rv32) or 5 (rv64) tests which should use shNadd insns
+   and 4 which use paired addi insns.  */
+/* { dg-final { scan-assembler-times "sh.add\t" 4 { target { rv32 } } } } */
+/* { dg-final { scan-assembler-times "sh.add\t" 5 { target { rv64 } } } } */
+/* { dg-final { scan-assembler-times "addi\t" 8 } } */
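As a footnote, the SUM_OF_TWO_S12 split in synthesize_add above is easy
to sanity check outside the compiler.  A standalone sketch of that
arithmetic (mine, for illustration only -- not part of the patch;
IMM_BITS is 12, matching the RISC-V I-type immediate):

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

#define IMM_BITS 12

/* Mirror the paired-addi split: peel off a saturated signed 12-bit
   immediate; the remainder also fits in 12 bits whenever the original
   constant is a sum of two signed 12-bit values.  */
static void
split (int64_t ival)
{
  /* UINT64_MAX << 11 is 0xfff...f800, i.e. -2048 once signed.  */
  int64_t saturated = (int64_t) (UINT64_MAX << (IMM_BITS - 1));

  if (ival >= 0)
    saturated = ~saturated;   /* 2047 for non-negative constants.  */

  printf ("%" PRId64 " = addi %" PRId64 ", then addi %" PRId64 "\n",
          ival, saturated, ival - saturated);
}

int
main (void)
{
  split (4094);   /* 4094 = addi 2047, then addi 2047 */
  split (2050);   /* 2050 = addi 2047, then addi 3 */
  split (-2049);  /* -2049 = addi -2048, then addi -1 */
  return 0;
}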