about summary refs log tree commit diff
path: root/gcc/config/ia64
diff options
context:
space:
mode:
authorZack Weinberg <zack@gcc.gnu.org>2003-10-29 00:55:43 +0000
committerZack Weinberg <zack@gcc.gnu.org>2003-10-29 00:55:43 +0000
commitb38ba46301ac50a5c0d8c93953aa150d748bff8b (patch)
treeef98c209a92bd82779ac5da489d6ce4b266ce19a /gcc/config/ia64
parent1e8fee4a4272759c129f4f884d0288b98cb4943a (diff)
downloadgcc-b38ba46301ac50a5c0d8c93953aa150d748bff8b.zip
gcc-b38ba46301ac50a5c0d8c93953aa150d748bff8b.tar.gz
gcc-b38ba46301ac50a5c0d8c93953aa150d748bff8b.tar.bz2
ia64.md (UNSPEC_SETF_EXP,UNSPEC_FR_SQRT_RECIP_APPROX): New constants.
	* ia64.md (UNSPEC_SETF_EXP, UNSPEC_FR_SQRT_RECIP_APPROX):
	New constants.
	(*sqrt_approx): New instruction pattern for approximate square roots.
	(*setf_exp_xf): New instruction pattern for exponentiation.
	(*maddxf4_alts_truncsf): New instruction pattern for truncation.
	(sqrtsf2_internal_thr): New define_and_split implementing
	throughput-optimized inline calculation of SFmode square root.
	(sqrtdf2_internal_thr): Likewise for DFmode.
	(sqrtxf2_internal_thr): Likewise for XFmode.
	(sqrtsf2, sqrtdf2, sqrtxf2): New expanders to choose between
	latency- and throughput-optimized square root algorithms.
	* ia64.h (MASK_INLINE_SQRT_LAT, MASK_INLINE_SQRT_THR,
	TARGET_INLINE_SQRT_LAT, TARGET_INLINE_SQRT_THR,
	TARGET_INLINE_SQRT): New macros.
	(TARGET_SWITCHES): Add -minline-sqrt-min-latency and
	-minline-sqrt-max-throughput.
	* ia64.c (ia64_override_options): If both -minline-sqrt-min-latency
	and -minline-sqrt-max-throughput are given, notify the user
	that both options cannot be used simultaneously.
	If -minline-sqrt-min-latency is given, notify the user that
	this mode is not yet implemented.
	(rtx_needs_barrier): Reformat initial comment to obey
	72-character width limit.  Support UNSPEC_SETF_EXP and
	UNSPEC_FR_SQRT_RECIP_APPROX.

From-SVN: r73027
Diffstat (limited to 'gcc/config/ia64')
-rw-r--r--gcc/config/ia64/ia64.c20
-rw-r--r--gcc/config/ia64/ia64.h15
-rw-r--r--gcc/config/ia64/ia64.md475
3 files changed, 507 insertions, 3 deletions
diff --git a/gcc/config/ia64/ia64.c b/gcc/config/ia64/ia64.c
index 7b0069d..a25c4c5 100644
--- a/gcc/config/ia64/ia64.c
+++ b/gcc/config/ia64/ia64.c
@@ -4487,6 +4487,18 @@ ia64_override_options (void)
target_flags &= ~MASK_INLINE_INT_DIV_THR;
}
+ if (TARGET_INLINE_SQRT_LAT && TARGET_INLINE_SQRT_THR)
+ {
+ warning ("cannot optimize square root for both latency and throughput");
+ target_flags &= ~MASK_INLINE_SQRT_THR;
+ }
+
+ if (TARGET_INLINE_SQRT_LAT)
+ {
+ warning ("not yet implemented: latency-optimized inline square root");
+ target_flags &= ~MASK_INLINE_SQRT_LAT;
+ }
+
if (ia64_fixed_range_string)
fix_range (ia64_fixed_range_string);
@@ -4896,9 +4908,9 @@ set_src_needs_barrier (rtx x, struct reg_flags flags, int pred, rtx cond)
return need_barrier;
}
-/* Handle an access to rtx X of type FLAGS using predicate register PRED.
- Return 1 is this access creates a dependency with an earlier instruction
- in the same group. */
+/* Handle an access to rtx X of type FLAGS using predicate register
+ PRED. Return 1 if this access creates a dependency with an earlier
+ instruction in the same group. */
static int
rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
@@ -5124,7 +5136,9 @@ rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
case UNSPEC_FR_SPILL:
case UNSPEC_FR_RESTORE:
case UNSPEC_GETF_EXP:
+ case UNSPEC_SETF_EXP:
case UNSPEC_ADDP4:
+ case UNSPEC_FR_SQRT_RECIP_APPROX:
need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 0), flags, pred);
break;
diff --git a/gcc/config/ia64/ia64.h b/gcc/config/ia64/ia64.h
index 8ccda53..44ef6c6 100644
--- a/gcc/config/ia64/ia64.h
+++ b/gcc/config/ia64/ia64.h
@@ -87,6 +87,10 @@ extern int target_flags;
#define MASK_INLINE_INT_DIV_THR 0x00001000 /* inline div, max throughput. */
+#define MASK_INLINE_SQRT_LAT 0x00002000 /* inline sqrt, min latency. */
+
+#define MASK_INLINE_SQRT_THR 0x00004000 /* inline sqrt, max throughput. */
+
#define MASK_DWARF2_ASM 0x40000000 /* test dwarf2 line info via gas. */
#define MASK_EARLY_STOP_BITS 0x00002000 /* tune stop bits for the model. */
@@ -127,6 +131,13 @@ extern int target_flags;
#define TARGET_INLINE_INT_DIV \
(target_flags & (MASK_INLINE_INT_DIV_LAT | MASK_INLINE_INT_DIV_THR))
+#define TARGET_INLINE_SQRT_LAT (target_flags & MASK_INLINE_SQRT_LAT)
+
+#define TARGET_INLINE_SQRT_THR (target_flags & MASK_INLINE_SQRT_THR)
+
+#define TARGET_INLINE_SQRT \
+ (target_flags & (MASK_INLINE_SQRT_LAT | MASK_INLINE_SQRT_THR))
+
#define TARGET_DWARF2_ASM (target_flags & MASK_DWARF2_ASM)
extern int ia64_tls_size;
@@ -186,6 +197,10 @@ extern int ia64_tls_size;
N_("Generate inline integer division, optimize for latency") }, \
{ "inline-int-divide-max-throughput", MASK_INLINE_INT_DIV_THR, \
N_("Generate inline integer division, optimize for throughput") },\
+ { "inline-sqrt-min-latency", MASK_INLINE_SQRT_LAT, \
+ N_("Generate inline square root, optimize for latency") }, \
+ { "inline-sqrt-max-throughput", MASK_INLINE_SQRT_THR, \
+ N_("Generate inline square root, optimize for throughput") }, \
{ "dwarf2-asm", MASK_DWARF2_ASM, \
N_("Enable Dwarf 2 line debug info via GNU as")}, \
{ "no-dwarf2-asm", -MASK_DWARF2_ASM, \
diff --git a/gcc/config/ia64/ia64.md b/gcc/config/ia64/ia64.md
index 547b797..ec66fd5 100644
--- a/gcc/config/ia64/ia64.md
+++ b/gcc/config/ia64/ia64.md
@@ -74,6 +74,8 @@
(UNSPEC_ADDP4 24)
(UNSPEC_PROLOGUE_USE 25)
(UNSPEC_RET_ADDR 26)
+ (UNSPEC_SETF_EXP 27)
+ (UNSPEC_FR_SQRT_RECIP_APPROX 28)
])
(define_constants
@@ -2757,6 +2759,155 @@
operands[10] = CONST1_RTX (XFmode);
}
[(set_attr "predicable" "no")])
+
+;; Inline square root.
+
+(define_insn "*sqrt_approx"
+ [(set (match_operand:XF 0 "fr_register_operand" "=f")
+ (div:XF (const_int 1)
+ (sqrt:XF (match_operand:XF 2 "fr_register_operand" "f"))))
+ (set (match_operand:BI 1 "register_operand" "=c")
+ (unspec:BI [(match_dup 2)] UNSPEC_FR_SQRT_RECIP_APPROX))
+ (use (match_operand:SI 3 "const_int_operand" "")) ]
+ ""
+ "frsqrta.s%3 %0, %1 = %2"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "no")])
+
+(define_insn "*setf_exp_xf"
+ [(set (match_operand:XF 0 "fr_register_operand" "=f")
+ (unspec:XF [(match_operand:DI 1 "register_operand" "r")]
+ UNSPEC_SETF_EXP))]
+ ""
+ "setf.exp %0 = %1"
+ [(set_attr "itanium_class" "frfr")])
+
+(define_expand "sqrtsf2"
+ [(set (match_operand:SF 0 "fr_register_operand" "=&f")
+ (sqrt:SF (match_operand:SF 1 "fr_register_operand" "f")))]
+ "TARGET_INLINE_SQRT"
+{
+ rtx insn;
+ if (TARGET_INLINE_SQRT_LAT)
+#if 0
+ insn = gen_sqrtsf2_internal_lat (operands[0], operands[1]);
+#else
+ abort ();
+#endif
+ else
+ insn = gen_sqrtsf2_internal_thr (operands[0], operands[1]);
+ emit_insn (insn);
+ DONE;
+})
+
+;; Latency-optimized square root.
+;; FIXME: Implement.
+
+;; Throughput-optimized square root.
+
+(define_insn_and_split "sqrtsf2_internal_thr"
+ [(set (match_operand:SF 0 "fr_register_operand" "=&f")
+ (sqrt:SF (match_operand:SF 1 "fr_register_operand" "f")))
+ ;; Register r2 in optimization guide.
+ (clobber (match_scratch:DI 2 "=r"))
+ ;; Register f8 in optimization guide
+ (clobber (match_scratch:XF 3 "=&f"))
+ ;; Register f9 in optimization guide
+ (clobber (match_scratch:XF 4 "=&f"))
+ ;; Register f10 in optimization guide
+ (clobber (match_scratch:XF 5 "=&f"))
+ ;; Register p6 in optimization guide.
+ (clobber (match_scratch:BI 6 "=c"))]
+ "TARGET_INLINE_SQRT_THR"
+ "#"
+ "&& reload_completed"
+ [ ;; exponent of +1/2 in r2
+ (set (match_dup 2) (const_int 65534))
+ ;; +1/2 in f8
+ (set (match_dup 3)
+ (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
+ ;; Step 1
+ ;; y0 = 1/sqrt(a) in f7
+ (parallel [(set (match_dup 7)
+ (div:XF (const_int 1)
+ (sqrt:XF (match_dup 8))))
+ (set (match_dup 6)
+ (unspec:BI [(match_dup 8)]
+ UNSPEC_FR_SQRT_RECIP_APPROX))
+ (use (const_int 0))])
+ ;; Step 2
+ ;; H0 = 1/2 * y0 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 7))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 3
+ ;; S0 = a * y0 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 7)
+ (plus:XF (mult:XF (match_dup 8) (match_dup 7))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 4
+ ;; d = 1/2 - S0 * H0 in f10
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 5)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 4)))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 5
+ ;; d' = d + 1/2 * d in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 5))
+ (match_dup 5)))
+ (use (const_int 1))]))
+ ;; Step 6
+ ;; e = d + d * d' in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 3))
+ (match_dup 5)))
+ (use (const_int 1))]))
+ ;; Step 7
+ ;; S1 = S0 + e * S0 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 0)
+ (float_truncate:SF
+ (plus:XF (mult:XF (match_dup 3) (match_dup 7))
+ (match_dup 7))))
+ (use (const_int 1))]))
+ ;; Step 8
+ ;; H1 = H0 + e * H0 in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 4))
+ (match_dup 4)))
+ (use (const_int 1))]))
+ ;; Step 9
+ ;; d1 = a - S1 * S1 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 10
+ ;; S = S1 + d1 * H1 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 0)
+ (float_truncate:SF
+ (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+ (match_dup 7))))
+ (use (const_int 0))]))]
+{
+ /* Generate 82-bit versions of the input and output operands. */
+ operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
+ operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
+ /* Generate required floating-point constants. */
+ operands[9] = CONST0_RTX (XFmode);
+}
+ [(set_attr "predicable" "no")])
;; ::::::::::::::::::::
;; ::
@@ -3102,6 +3253,155 @@
operands[10] = CONST1_RTX (XFmode);
}
[(set_attr "predicable" "no")])
+
+;; Inline square root.
+
+(define_expand "sqrtdf2"
+ [(set (match_operand:DF 0 "fr_register_operand" "=&f")
+ (sqrt:DF (match_operand:DF 1 "fr_register_operand" "f")))]
+ "TARGET_INLINE_SQRT"
+{
+ rtx insn;
+ if (TARGET_INLINE_SQRT_LAT)
+#if 0
+ insn = gen_sqrtdf2_internal_lat (operands[0], operands[1]);
+#else
+ abort ();
+#endif
+ else
+ insn = gen_sqrtdf2_internal_thr (operands[0], operands[1]);
+ emit_insn (insn);
+ DONE;
+})
+
+;; Latency-optimized square root.
+;; FIXME: Implement.
+
+;; Throughput-optimized square root.
+
+(define_insn_and_split "sqrtdf2_internal_thr"
+ [(set (match_operand:DF 0 "fr_register_operand" "=&f")
+ (sqrt:DF (match_operand:DF 1 "fr_register_operand" "f")))
+ ;; Register r2 in optimization guide.
+ (clobber (match_scratch:DI 2 "=r"))
+ ;; Register f8 in optimization guide
+ (clobber (match_scratch:XF 3 "=&f"))
+ ;; Register f9 in optimization guide
+ (clobber (match_scratch:XF 4 "=&f"))
+ ;; Register f10 in optimization guide
+ (clobber (match_scratch:XF 5 "=&f"))
+ ;; Register p6 in optimization guide.
+ (clobber (match_scratch:BI 6 "=c"))]
+ "TARGET_INLINE_SQRT_THR"
+ "#"
+ "&& reload_completed"
+ [ ;; exponent of +1/2 in r2
+ (set (match_dup 2) (const_int 65534))
+ ;; +1/2 in f10
+ (set (match_dup 5)
+ (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
+ ;; Step 1
+ ;; y0 = 1/sqrt(a) in f7
+ (parallel [(set (match_dup 7)
+ (div:XF (const_int 1)
+ (sqrt:XF (match_dup 8))))
+ (set (match_dup 6)
+ (unspec:BI [(match_dup 8)]
+ UNSPEC_FR_SQRT_RECIP_APPROX))
+ (use (const_int 0))])
+ ;; Step 2
+ ;; H0 = 1/2 * y0 in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 7))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 3
+ ;; G0 = a * y0 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 7)
+ (plus:XF (mult:XF (match_dup 8) (match_dup 7))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 4
+ ;; r0 = 1/2 - G0 * H0 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 3)))
+ (match_dup 5)))
+ (use (const_int 1))]))
+ ;; Step 5
+ ;; H1 = H0 + r0 * H0 in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 6
+ ;; G1 = G0 + r0 * G0 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 7)
+ (plus:XF (mult:XF (match_dup 4) (match_dup 7))
+ (match_dup 7)))
+ (use (const_int 1))]))
+ ;; Step 7
+ ;; r1 = 1/2 - G1 * H1 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 3)))
+ (match_dup 5)))
+ (use (const_int 1))]))
+ ;; Step 8
+ ;; H2 = H1 + r1 * H1 in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 9
+ ;; G2 = G1 + r1 * G1 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 7)
+ (plus:XF (mult:XF (match_dup 4) (match_dup 7))
+ (match_dup 7)))
+ (use (const_int 1))]))
+ ;; Step 10
+ ;; d2 = a - G2 * G2 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 11
+ ;; G3 = G2 + d2 * H2 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 7)
+ (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+ (match_dup 7)))
+ (use (const_int 1))]))
+ ;; Step 12
+ ;; d3 = a - G3 * G3 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 13
+ ;; S = G3 + d3 * H2 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 0)
+ (float_truncate:DF
+ (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+ (match_dup 7))))
+ (use (const_int 0))]))]
+{
+ /* Generate 82-bit versions of the input and output operands. */
+ operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
+ operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
+ /* Generate required floating-point constants. */
+ operands[9] = CONST0_RTX (XFmode);
+}
+ [(set_attr "predicable" "no")])
;; ::::::::::::::::::::
;; ::
@@ -3292,6 +3592,17 @@
"fma.s%4 %0 = %F1, %F2, %F3"
[(set_attr "itanium_class" "fmac")])
+(define_insn "*maddxf4_alts_truncsf"
+ [(set (match_operand:SF 0 "fr_register_operand" "=f")
+ (float_truncate:SF
+ (plus:XF (mult:XF (match_operand:XF 1 "xfreg_or_fp01_operand" "fG")
+ (match_operand:XF 2 "xfreg_or_fp01_operand" "fG"))
+ (match_operand:XF 3 "xfreg_or_fp01_operand" "fG"))))
+ (use (match_operand:SI 4 "const_int_operand" ""))]
+ ""
+ "fma.s.s%4 %0 = %F1, %F2, %F3"
+ [(set_attr "itanium_class" "fmac")])
+
(define_insn "*maddxf4_alts_truncdf"
[(set (match_operand:DF 0 "fr_register_operand" "=f")
(float_truncate:DF
@@ -3591,6 +3902,170 @@
"operands[6] = CONST1_RTX (XFmode);"
[(set_attr "predicable" "no")])
+;; Inline square root.
+
+(define_expand "sqrtxf2"
+ [(set (match_operand:XF 0 "fr_register_operand" "=&f")
+ (sqrt:XF (match_operand:XF 1 "fr_register_operand" "f")))]
+ "TARGET_INLINE_SQRT"
+{
+ rtx insn;
+ if (TARGET_INLINE_SQRT_LAT)
+#if 0
+ insn = gen_sqrtxf2_internal_lat (operands[0], operands[1]);
+#else
+ abort ();
+#endif
+ else
+ insn = gen_sqrtxf2_internal_thr (operands[0], operands[1]);
+ emit_insn (insn);
+ DONE;
+})
+
+;; Latency-optimized square root.
+;; FIXME: Implement.
+
+;; Throughput-optimized square root.
+
+(define_insn_and_split "sqrtxf2_internal_thr"
+ [(set (match_operand:XF 0 "fr_register_operand" "=&f")
+ (sqrt:XF (match_operand:XF 1 "fr_register_operand" "f")))
+ ;; Register r2 in optimization guide.
+ (clobber (match_scratch:DI 2 "=r"))
+ ;; Register f8 in optimization guide
+ (clobber (match_scratch:XF 3 "=&f"))
+ ;; Register f9 in optimization guide
+ (clobber (match_scratch:XF 4 "=&f"))
+ ;; Register f10 in optimization guide
+ (clobber (match_scratch:XF 5 "=&f"))
+ ;; Register f11 in optimization guide
+ (clobber (match_scratch:XF 6 "=&f"))
+ ;; Register p6 in optimization guide.
+ (clobber (match_scratch:BI 7 "=c"))]
+ "TARGET_INLINE_SQRT_THR"
+ "#"
+ "&& reload_completed"
+ [ ;; exponent of +1/2 in r2
+ (set (match_dup 2) (const_int 65534))
+ ;; +1/2 in f8. The Intel manual mistakenly specifies f10.
+ (set (match_dup 3)
+ (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
+ ;; Step 1
+ ;; y0 = 1/sqrt(a) in f7
+ (parallel [(set (match_dup 8)
+ (div:XF (const_int 1)
+ (sqrt:XF (match_dup 9))))
+ (set (match_dup 7)
+ (unspec:BI [(match_dup 9)]
+ UNSPEC_FR_SQRT_RECIP_APPROX))
+ (use (const_int 0))])
+ ;; Step 2
+ ;; H0 = 1/2 * y0 in f9
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 8))
+ (match_dup 10)))
+ (use (const_int 1))]))
+ ;; Step 3
+ ;; S0 = a * y0 in f7
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 8)
+ (plus:XF (mult:XF (match_dup 9) (match_dup 8))
+ (match_dup 10)))
+ (use (const_int 1))]))
+ ;; Step 4
+ ;; d0 = 1/2 - S0 * H0 in f10
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 5)
+ (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 5
+ ;; H1 = H0 + d0 * H0 in f9
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 4))
+ (match_dup 4)))
+ (use (const_int 1))]))
+ ;; Step 6
+ ;; S1 = S0 + d0 * S0 in f7
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 8)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 8))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 7
+ ;; d1 = 1/2 - S1 * H1 in f10
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 5)
+ (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 8
+ ;; H2 = H1 + d1 * H1 in f9
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 4))
+ (match_dup 4)))
+ (use (const_int 1))]))
+ ;; Step 9
+ ;; S2 = S1 + d1 * S1 in f7
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 8)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 8))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 10
+ ;; d2 = 1/2 - S2 * H2 in f10
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 5)
+ (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 11
+ ;; e2 = a - S2 * S2 in f8
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 8)))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 12
+ ;; S3 = S2 + e2 * H2 in f7
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 8)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 4))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 13
+ ;; H3 = H2 + d2 * H2 in f9
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 4))
+ (match_dup 4)))
+ (use (const_int 1))]))
+ ;; Step 14
+ ;; e3 = a - S3 * S3 in f8
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 8)))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 15
+ ;; S = S3 + e3 * H3 in f7
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 0)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 4))
+ (match_dup 8)))
+ (use (const_int 0))]))]
+{
+ /* Generate 82-bit versions of the input and output operands. */
+ operands[8] = gen_rtx_REG (XFmode, REGNO (operands[0]));
+ operands[9] = gen_rtx_REG (XFmode, REGNO (operands[1]));
+ /* Generate required floating-point constants. */
+ operands[10] = CONST0_RTX (XFmode);
+}
+ [(set_attr "predicable" "no")])
+
;; ??? frcpa works like cmp.foo.unc.
(define_insn "*recip_approx"