diff options
Diffstat (limited to 'libgcc/config/avr')
-rw-r--r-- | libgcc/config/avr/libf7/libf7-asm.sx | 251 | ||||
-rw-r--r-- | libgcc/config/avr/libf7/libf7.c | 36 |
2 files changed, 184 insertions, 103 deletions
diff --git a/libgcc/config/avr/libf7/libf7-asm.sx b/libgcc/config/avr/libf7/libf7-asm.sx index 01d1fa3..b00f759 100644 --- a/libgcc/config/avr/libf7/libf7-asm.sx +++ b/libgcc/config/avr/libf7/libf7-asm.sx @@ -1287,6 +1287,189 @@ ENDF div #endif /* F7MOD_div_ */ +#ifdef F7MOD_sqrt_approx_ +;; ReMainder +#define MX 16 +#define M0 17 +#define M1 26 +#define M2 27 +#define M3 14 +#define M4 15 +#define M5 TMP +#define M6 r29 + +#define AA ZERO +#define One r13 +#define Bits r28 +#define Bytes M6 + +;; Extend C[] by 0b01 at the low end. +#define CX (0b01 << 6) + +;;; Compute square-root of const f7_t *R22 for a positive number. +DEFUN sqrt_approx + ;; 7 = Y, R17...R13 + do_prologue_saves 7 + + wmov ZL, r22 ; Input const f7_t* + wmov YL, r24 ; Output f7_t* + F7call load_mant + ldi CA, 0xff + + ;; The paper-pencil method for the mantissa consumes bits in pairs and + ;; expects the input as Q-format 2.*, but mant is in 1.*. This means + ;; we have to shift one to the right. If expo is odd, then we shift + ;; one to the left and subtract one from expo in order to compensate + ;; and to get an even exponent. + + ;; Divide expo by 2 because we are doing sqrt. + ldd XH, Z+Expo+1 + ldd XL, Z+Expo+0 + asr XH + ror XL + ;; Store expo to result. + wmov ZL, YL + std Z+Expo+0, XL + std Z+Expo+1, XH + + brcs 1f + ;; Expo was even. Do >>=1 in order to get Q2.* as explained above. + LSR C6 $ ror C5 $ ror C4 $ ror C3 + ror C2 $ ror C1 $ ror C0 $ ror CA +1: + ;; For odd expo, >>=1 to get Q2.* and <<=1 to get an even expo cancel out. + ;; And the right-shift of the exponent implicitly subtracted 1 from it + ;; as needed. + F7call store_mant.with_flags + + ;; Let Z point one past the mantissa's MSB. + adiw ZL, Off + F7_MANT_BYTES + + ;; Clear the result mantissa. + .global __clr_8 + XCALL __clr_8 + ;; Clear ReMainder. M6 === Bytes will be zero when Bytes is down to 0. + clr M5 + wmov M3, C3 + wmov M1, C1 + wmov MX, CA + + clr One + inc One + + ;; "+1" because .flags extends the mantissa at the low end. + ldi Bytes, 1 + F7_MANT_BYTES +.Loop_bytes: + ld AA, -Z + ldi Bits, 8 +.Loop_bits: + ;; Shift top 2 bits of MX into M[]. + LSL MX $ rol M0 $ rol M1 $ rol M2 $ rol M3 + LSL MX $ rol M0 $ rol M1 $ rol M2 $ rol M3 + + ;; "Take down" 2 bits from A[] to MX.7 and MX.6 + mov MX, AA + andi MX, 0xc0 + lsl AA + lsl AA + + ;; Compare remainder against current result extended by 0b01. + CPI MX, CX + cpc M0, C0 + cpc M1, C1 + cpc M2, C2 + cpc M3, C3 + brcs 1f + ;; If the extended result fits, subtract it from M and set the + ;; next result bit to 1. + SUBI MX, CX + sbc M0, C0 + sbc M1, C1 + sbc M2, C2 + sbc M3, C3 +1: + ;; If it doesn't fit, set the next result bit to 0 (and don't subtract). + rol C0 + eor C0, One + rol C1 + rol C2 + rol C3 + + subi Bits, 2 + brne .Loop_bits + ;; AA (=== ZERO) is zero again. + + dec Bytes + brne .Loop_bytes + ;; B6 (=== Bytes) is zero now. + + ;; Now we consumed all the 64 bits of the extended mantissa, but we + ;; only expanded 64 / 2 = 32 bits of the result, which is currently + ;; held in C3 ... C0. Do the same like above, but on all bytes. + ;; Shift in 00's because the mantissa is exhausted. + + ;; "-1" because flags is part of the mantissa, which is already consumed. + ldi Bits, 8 * (F7_MANT_BYTES - 1) +.Loop2_bits: + ;; Shift top 2 bits of MX into M[]. +.Ltwice: + LSL MX + rol M0 + rol M1 + rol M2 + rol M3 + rol M4 + rol M5 + rol M6 + subi Bits, 0x80 + brmi .Ltwice + + ;; "Take down" two 0's to MX.7 and MX.6 + ; clr MX ;; MX is already zero. + + ;; Compare remainder against current result extended by 0b01. + CPI MX, CX + cpc M0, C0 + cpc M1, C1 + cpc M2, C2 + cpc M3, C3 + cpc M4, C4 + cpc M5, C5 + cpc M6, C6 + brcs 1f + ;; If the extended result fits, subtract it from M and set the + ;; next result bit to 1. + SUBI MX, CX + sbc M0, C0 + sbc M1, C1 + sbc M2, C2 + sbc M3, C3 + sbc M4, C4 + sbc M5, C5 + sbc M6, C6 +1: + ;; If it doesn't fit, set the next result bit to 0 (and don't subtract). + rol C0 + eor C0, One + rol C1 + rol C2 + rol C3 + rol C4 + rol C5 + rol C6 + + subi Bits, 2 + brne .Loop2_bits + + ;; Set flags. + st Z, ZERO + F7call store_mant + + do_epilogue_restores 7 +ENDF sqrt_approx +#endif /* F7MOD_sqrt_approx_ */ + + #if defined (F7MOD_sqrt16_) && defined (__AVR_HAVE_MUL__) #define Mask C6 @@ -1348,74 +1531,6 @@ ENDF sqrt16_round #undef Q1 #endif /* F7MOD_sqrt16_ && MUL */ -#ifdef F7MOD_sqrt_approx_ -DEFUN sqrt_approx - push r17 - push r16 - wmov XL, r24 - wmov ZL, r22 - - ;; C[] = 0. - .global __clr_8 - XCALL __clr_8 - - ldd C5, Z+5+Off - ldd C6, Z+6+Off - - ldd Carry, Z+0+Expo - ldd TMP, Z+1+Expo - wmov ZL, XL - - st Z, ZERO - - asr TMP - ror Carry - std Z+1+Expo, TMP - std Z+0+Expo, Carry - - ;; Re-interpreting our Q-format 1.xx mantissa as Q2.yy, we have to shift - ;; the mantissa to the right by 1. As we need an even exponent, multiply - ;; the mantissa by 2 for odd exponents, i.e. only right-shift if .expo - ;; is even. - - brcs 1f - lsr C6 - ror C5 - -1: - F7call sqrt16_round - - ;; sqrt16_round() returns: C = 0: error in [0, -1/2 LSB). - ;; C = 1: error in [1/2 LSB, 0) - - brcc 2f - ;; Undo the round-up from sqrt16_round(); this will transform to - ;; error in [-1/2 LSB, -1 LSB). - sbiw C5, 1 - ;; Together with the correct bit C4.7, the error is in [0, -1/2 LSB). - ori C4, 1 << 7 - -2: ;; Setting C4.6 adds 1/4 LSB and the error is now in [1/4 LSB, -1/4 LSB) - ;; in either case. - ori C4, 1 << 6 - - ;; ???????????? - ;; sqrt16_round() runs on integers which means that it computes the - ;; square root of mant * 2^14 if we regard mant as Q-format 2.yy, - ;; i.e. 2 integral bits. The result is sqrt(mant) * 2^7, - ;; and in order to get the same scaling like the input, .expo has to - ;; be adjusted by 7. ??????????????? - - ldi Carry, 8 - F7call normalize.store_with_flags - - pop r16 - pop r17 - ret - -ENDF sqrt_approx -#endif /* F7MOD_sqrt_approx_ */ - #undef CA #undef C0 diff --git a/libgcc/config/avr/libf7/libf7.c b/libgcc/config/avr/libf7/libf7.c index 49baac7..da2a4b6 100644 --- a/libgcc/config/avr/libf7/libf7.c +++ b/libgcc/config/avr/libf7/libf7.c @@ -1188,40 +1188,6 @@ f7_t* f7_ldexp (f7_t *cc, const f7_t *aa, int delta) #ifdef F7MOD_sqrt_ -static void sqrt_worker (f7_t *cc, const f7_t *rr) -{ - f7_t tmp7, *tmp = &tmp7; - f7_t aa7, *aa = &aa7; - - // aa in [1/2, 2) => aa->expo in { -1, 0 }. - int16_t a_expo = -(rr->expo & 1); - int16_t c_expo = (rr->expo - a_expo) >> 1; // FIXME: r_expo = INT_MAX??? - - __asm ("" : "+r" (aa)); - - f7_copy (aa, rr); - aa->expo = a_expo; - - // No use of rr or *cc past this point: We may use cc as temporary. - // Approximate square-root of A by X <-- (X + A / X) / 2. - - f7_sqrt_approx_asm (cc, aa); - - // Iterate X <-- (X + A / X) / 2. - // 3 Iterations with 16, 32, 58 bits of precision for the quotient. - - for (uint8_t prec = 16; (prec & 0x80) == 0; prec <<= 1) - { - f7_divx (tmp, aa, cc, (prec & 64) ? 2 + F7_MANT_BITS : prec); - f7_Iadd (cc, tmp); - // This will never underflow because |c_expo| is small. - cc->expo--; - } - - // Similar: |c_expo| is small, hence no ldexp needed. - cc->expo += c_expo; -} - F7_WEAK void f7_sqrt (f7_t *cc, const f7_t *aa) { @@ -1236,7 +1202,7 @@ void f7_sqrt (f7_t *cc, const f7_t *aa) if (f7_class_zero (a_class)) return f7_clr (cc); - sqrt_worker (cc, aa); + f7_sqrt_approx_asm (cc, aa); } #endif // F7MOD_sqrt_ |