author     Shreya Munnangi <smunnangi1@ventanamicro.com>   2025-08-11 21:42:50 -0600
committer  Jeff Law <jlaw@ventanamicro.com>                2025-08-11 21:46:04 -0600
commit     1b5b461428fb6a43ef91e3dc330d6f59b6d88618
tree       bdeb62a8eb273569e08b97d185af8e5305891e6d
parent     9992c0a0e1b455ad5c68d7261b4bc9bfc2461f70
Improve initial code generation for addsi/adddi
This is a patch primarily from Shreya, though I think she cribbed some
code from Philipp that we had internally within Ventana, and I made
some minor adjustments as well.
So the basic idea here is similar to her work on logical ops --
specifically, when we can generate more efficient code at expansion
time, do so.  In some cases the net is better code; in other cases it
lessens our reliance on mvconst_internal; and finally it provides
infrastructure that I think will help address an issue Paul Antoine
reported a little while back.
The most obvious case is using paired addis at initial code generation
for some constants.  It will also use a shNadd insn when the cost to
synthesize the original value is higher than the cost of the
right-shifted value.  Finally, it will negate the constant and use
"sub" if the negated constant is cheaper than the original constant.
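To make that concrete, here's roughly what I'd expect for a few
constants.  These sequences are illustrative -- my reading of the new
expansion paths assuming rv64gc_zba -- not captured compiler output:

  /* 4094 = 2047 + 2047, so SUM_OF_TWO_S12 holds and two addis beat
     synthesizing the constant (lui+addi) and then adding it.  */
  long f1 (long x) { return x + 4094; }
  /*  addi a0,a0,2047
      addi a0,a0,2047  */

  /* 4100 = 1025 << 2 and 1025 is a single li, so load the shifted
     constant and fold the shift back in with sh2add (Zba).  */
  long f2 (long x) { return x + 4100; }
  /*  li     a5,1025
      sh2add a0,a5,a0  */

  /* 0x80000001 is odd (no shNadd) and takes three insns to load on
     rv64, but its negation is just lui+addi, so load -C and sub.  */
  long f3 (long x) { return x + 0x80000001L; }
  /*  lui  a5,0x80000
      addi a5,a5,-1
      sub  a0,a0,a5  */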
There's more work to do in here, particularly WRT 32-bit objects for
rv64; Shreya is looking at that right now.  There may also be cases
where another shNadd or addi would be profitable.  We haven't really
explored those cases in any detail; while there may be cases to
handle, it's unclear how often they occur in practice.
I don't want to remove the define_insn_and_split for the paired addi
cases yet. I think that likely happens as a side effect of fixing Paul
Antoine's issue.
Bootstrapped and regression tested on a BPI & Pioneer box. Will
obviously wait for the pre-commit tester before moving forward.
Jeff
PR target/120603
gcc/
* config/riscv/riscv-protos.h (synthesize_add): Add prototype.
* config/riscv/riscv.cc (synthesize_add): New function.
* config/riscv/riscv.md (addsi3): Allow any constant as operands[2]
in the expander. Force the constant into a register as needed for
TARGET_64BIT. Use synthesize_add for !TARGET_64BIT.
(*adddi3): Renamed from adddi3.
(adddi3): New expander. Use synthesize_add.
gcc/testsuite/
* gcc.target/riscv/add-synthesis-1.c: New test.
Co-authored-by: Jeff Law <jlaw@ventanamicro.com>
Co-authored-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
 gcc/config/riscv/riscv-protos.h                  |  1
 gcc/config/riscv/riscv.cc                        | 94
 gcc/config/riscv/riscv.md                        | 28
 gcc/testsuite/gcc.target/riscv/add-synthesis-1.c | 40
 4 files changed, 159 insertions, 4 deletions
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 539321f..b497325 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -143,6 +143,7 @@ extern void riscv_expand_sstrunc (rtx, rtx);
 extern int riscv_register_move_cost (machine_mode, reg_class_t, reg_class_t);
 extern bool synthesize_ior_xor (rtx_code, rtx [3]);
 extern bool synthesize_and (rtx [3]);
+extern bool synthesize_add (rtx [3]);
 #ifdef RTX_CODE
 extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx,
                                   bool *invert_ptr = 0);
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index c336584..4935367 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -15353,6 +15353,100 @@ synthesize_and (rtx operands[3])
   return true;
 }
 
+/* Synthesize OPERANDS[0] = OPERANDS[1] + OPERANDS[2].
+
+   OPERANDS[0] and OPERANDS[1] will be a REG and may be the same
+   REG.
+
+   OPERANDS[2] is a CONST_INT.
+
+   Return TRUE if the operation was fully synthesized and the caller
+   need not generate additional code.  Return FALSE if the operation
+   was not synthesized and the caller is responsible for emitting the
+   proper sequence.  */
+
+bool
+synthesize_add (rtx operands[3])
+{
+  /* Trivial cases that don't need synthesis.  */
+  if (SMALL_OPERAND (INTVAL (operands[2])))
+    return false;
+
+  int budget1 = riscv_const_insns (operands[2], true);
+  int budget2 = riscv_const_insns (GEN_INT (-INTVAL (operands[2])), true);
+
+  HOST_WIDE_INT ival = INTVAL (operands[2]);
+
+  /* If we can emit two addi insns then that's better than synthesizing
+     the constant into a temporary, then adding the temporary to the
+     other input.  The exception is when the constant can be loaded
+     in a single instruction, which can issue whenever it's convenient.  */
+  if (SUM_OF_TWO_S12 (ival) && budget1 >= 2)
+    {
+      HOST_WIDE_INT saturated = HOST_WIDE_INT_M1U << (IMM_BITS - 1);
+
+      if (ival >= 0)
+        saturated = ~saturated;
+
+      ival -= saturated;
+
+      rtx x = gen_rtx_PLUS (word_mode, operands[1], GEN_INT (saturated));
+      emit_insn (gen_rtx_SET (operands[0], x));
+      rtx output = gen_rtx_PLUS (word_mode, operands[0], GEN_INT (ival));
+      emit_insn (gen_rtx_SET (operands[0], output));
+      return true;
+    }
+
+  /* If we can shift the constant by 1, 2, or 3 bit positions
+     and the result is a cheaper constant, then do so.  */
+  ival = INTVAL (operands[2]);
+  if (TARGET_ZBA
+      && (((ival % 2) == 0
+           && budget1 > riscv_const_insns (GEN_INT (ival >> 1), true))
+          || ((ival % 4) == 0
+              && budget1 > riscv_const_insns (GEN_INT (ival >> 2), true))
+          || ((ival % 8) == 0
+              && budget1 > riscv_const_insns (GEN_INT (ival >> 3), true))))
+    {
+      // Load the shifted constant into a temporary.
+      int shct = ctz_hwi (ival);
+
+      /* We can handle shifting up to 3 bit positions via shNadd.  */
+      if (shct > 3)
+        shct = 3;
+
+      /* The adjusted constant may still need synthesis, so do not copy
+         it directly into a register.  Let the expander handle it.  */
+      rtx tmp = force_reg (word_mode, GEN_INT (ival >> shct));
+
+      /* Generate the shift-add of the temporary and operands[1]
+         into the final destination.  */
+      rtx x = gen_rtx_ASHIFT (word_mode, tmp, GEN_INT (shct));
+      rtx output = gen_rtx_PLUS (word_mode, x, operands[1]);
+      emit_insn (gen_rtx_SET (operands[0], output));
+      return true;
+    }
+
+  /* If the negated constant is cheaper than the original, then negate
+     the constant and use sub.  */
+  if (budget2 < budget1)
+    {
+      // Load -INTVAL (operands[2]) into a temporary.
+      rtx tmp = force_reg (word_mode, GEN_INT (-INTVAL (operands[2])));
+
+      // Subtract operands[2] from operands[1].
+      rtx output = gen_rtx_MINUS (word_mode, operands[1], tmp);
+      emit_insn (gen_rtx_SET (operands[0], output));
+      return true;
+    }
+
+  /* No add synthesis was found.  Synthesize the constant into
+     a temporary and use that.  */
+  rtx x = force_reg (word_mode, operands[2]);
+  x = gen_rtx_PLUS (word_mode, operands[1], x);
+  emit_insn (gen_rtx_SET (operands[0], x));
+  return true;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 578dd43..a72604e 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -712,14 +712,17 @@
    (set_attr "mode" "SI")])
 
 (define_expand "addsi3"
-  [(set (match_operand:SI 0 "register_operand" "=r,r")
-        (plus:SI (match_operand:SI 1 "register_operand" " r,r")
-                 (match_operand:SI 2 "arith_operand"    " r,I")))]
+  [(set (match_operand:SI 0 "register_operand")
+        (plus:SI (match_operand:SI 1 "register_operand")
+                 (match_operand:SI 2 "reg_or_const_int_operand")))]
   ""
 {
   if (TARGET_64BIT)
     {
       rtx t = gen_reg_rtx (DImode);
+
+      if (CONST_INT_P (operands[2]) && !SMALL_OPERAND (operands[2]))
+        operands[2] = force_reg (SImode, operands[2]);
       emit_insn (gen_addsi3_extended (t, operands[1], operands[2]));
       t = gen_lowpart (SImode, t);
       SUBREG_PROMOTED_VAR_P (t) = 1;
@@ -727,9 +730,26 @@
       emit_move_insn (operands[0], t);
       DONE;
     }
+
+  /* We may be able to find a faster sequence.  If so, then we are
+     done.  Otherwise let expansion continue normally.  */
+  if (CONST_INT_P (operands[2]) && synthesize_add (operands))
+    DONE;
+})
+
+(define_expand "adddi3"
+  [(set (match_operand:DI 0 "register_operand")
+        (plus:DI (match_operand:DI 1 "register_operand")
+                 (match_operand:DI 2 "reg_or_const_int_operand")))]
+  "TARGET_64BIT"
+{
+  /* We may be able to find a faster sequence.  If so, then we are
+     done.  Otherwise let expansion continue normally.  */
+  if (CONST_INT_P (operands[2]) && synthesize_add (operands))
+    DONE;
 })
 
-(define_insn "adddi3"
+(define_insn "*adddi3"
   [(set (match_operand:DI 0 "register_operand" "=r,r")
         (plus:DI (match_operand:DI 1 "register_operand" " r,r")
                  (match_operand:DI 2 "arith_operand"    " r,I")))]
diff --git a/gcc/testsuite/gcc.target/riscv/add-synthesis-1.c b/gcc/testsuite/gcc.target/riscv/add-synthesis-1.c
new file mode 100644
index 0000000..247096c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/add-synthesis-1.c
@@ -0,0 +1,40 @@
+/* { dg-options "-march=rv32gcb -mabi=ilp32d" { target { rv32 } } } */
+/* { dg-options "-march=rv64gcb -mabi=lp64d" { target { rv64 } } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+#if __riscv_xlen == 64
+#define TYPE long
+#else
+#define TYPE int
+#endif
+
+#define T(C) TYPE foo_##C (TYPE x) { return x + C; }
+#define TM(C) TYPE foo_M##C (TYPE x) { return x + -C; }
+
+/* These cases were selected because they can all be synthesized
+   at expansion time without synthesizing the constant directly.
+
+   That makes the assembler scan testing simpler.  I've verified
+   by hand that cases that should synthesize the constant do in
+   fact still generate code that way.  */
+T (2050)
+T (4094)
+T (4100)
+T (8200)
+
+TM (2049)
+TM (4096)
+TM (4100)
+TM (8200)
+
+#if __riscv_xlen == 64
+TM (0x200000000)
+#endif
+
+/* We have 4 (rv32) or 5 (rv64) tests which should use shNadd insns
+   and 4 which use paired addi insns.  */
+/* { dg-final { scan-assembler-times "sh.add\t" 4 { target { rv32 } } } } */
+/* { dg-final { scan-assembler-times "sh.add\t" 5 { target { rv64 } } } } */
+/* { dg-final { scan-assembler-times "addi\t" 8 } } */
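As a footnote, the SUM_OF_TWO_S12 split in synthesize_add above is easy
to sanity check outside the compiler.  A standalone sketch of that
arithmetic (mine, for illustration only -- not part of the patch;
IMM_BITS is 12, matching the RISC-V I-type immediate):

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

#define IMM_BITS 12

/* Mirror the paired-addi split: peel off a saturated signed 12-bit
   immediate; the remainder also fits in 12 bits whenever the original
   constant is a sum of two signed 12-bit values.  */
static void
split (int64_t ival)
{
  /* UINT64_MAX << 11 is 0xfff...f800, i.e. -2048 once signed.  */
  int64_t saturated = (int64_t) (UINT64_MAX << (IMM_BITS - 1));

  if (ival >= 0)
    saturated = ~saturated;   /* 2047 for non-negative constants.  */

  printf ("%" PRId64 " = addi %" PRId64 ", then addi %" PRId64 "\n",
          ival, saturated, ival - saturated);
}

int
main (void)
{
  split (4094);   /* 4094 = addi 2047, then addi 2047 */
  split (2050);   /* 2050 = addi 2047, then addi 3 */
  split (-2049);  /* -2049 = addi -2048, then addi -1 */
  return 0;
}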