diff options
Diffstat (limited to 'sysdeps/ia64/fpu/e_powl.S')
-rw-r--r-- | sysdeps/ia64/fpu/e_powl.S | 4080 |
1 files changed, 1720 insertions, 2360 deletions
diff --git a/sysdeps/ia64/fpu/e_powl.S b/sysdeps/ia64/fpu/e_powl.S index d286e9a..3f93f60 100644 --- a/sysdeps/ia64/fpu/e_powl.S +++ b/sysdeps/ia64/fpu/e_powl.S @@ -1,10 +1,10 @@ .file "powl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,61 +20,70 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // -// ********************************************************************* +//********************************************************************* // // Function: powl(x,y), where -// y +// y // powl(x,y) = x , for double extended precision x and y values // -// ********************************************************************* +//********************************************************************* // -// History: -// 2/02/00 (Hand Optimized) -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// History: +// 02/02/00 (Hand Optimized) +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. 
-// 1/22/01 Corrected results for powl(1,inf), powl(1,nan), and +// 01/22/01 Corrected results for powl(1,inf), powl(1,nan), and // powl(snan,0) to be 1 per C99, not nan. Fixed many flag settings. -// 2/06/01 Call __libm_error support if over/underflow when y=2. -// -// ********************************************************************* +// 02/06/01 Call __libm_error support if over/underflow when y=2. +// 04/17/01 Support added for y close to 1 and x a non-special value. +// Shared software under/overflow detection for all paths +// 02/07/02 Corrected sf3 setting to disable traps +// 05/13/02 Improved performance of all paths +// 02/10/03 Reordered header: .section, .global, .proc, .align; +// used data8 for long double table values +// 04/17/03 Added missing mutex directive +// 10/13/03 Corrected .endp names to match .proc names +// +//********************************************************************* // // Resources Used: // -// Floating-Point Registers: -// f8 (Input and Return Value) -// f9-f15,f32-f63,f99 +// Floating-Point Registers: +// f8 (Input x and Return Value) +// f9 (Input y) +// f10-f15,f32-f79 // // General Purpose Registers: -// Locals r32 - r61 +// Locals r14-24,r32-r65 // Parameters to __libm_error_support r62,r63,r64,r65 // // Predicate Registers: p6-p15 // -// ********************************************************************* +//********************************************************************* // // Special Cases and IEEE special conditions: // // Denormal fault raised on denormal inputs -// Overflow exceptions raised when appropriate for pow -// Underflow exceptions raised when appropriate for pow +// Overflow exceptions raised when appropriate for pow +// Underflow exceptions raised when appropriate for pow // (Error Handling Routine called for overflow and Underflow) // Inexact raised when appropriate by algorithm // @@ -102,8 +111,8 @@ // 22. 
X or Y denorm/unorm and denorm/unorm operand trap is enabled, // generate denorm/unorm fault except if invalid or div_0 raised. // -// ********************************************************************* -// +//********************************************************************* +// // Algorithm // ========= // @@ -113,23 +122,23 @@ // If Y = 0.5, return sqrt(X). // // Compute log(X) to extra precision. -// +// // ker_log_80( X, logX_hi, logX_lo, Safe ); // -// ...logX_hi + logX_lo approximates log(X) to roughly 80 +// ...logX_hi + logX_lo approximates log(X) to roughly 80 // ...significant bits of accuracy. // // Compute Y*log(X) to extra precision. // // P_hi := Y * logX_hi -// P_lo := Y * logX_hi - P_hi ...using FMA -// P_lo := Y * logX_lo + P_lo ...using FMA +// P_lo := Y * logX_hi - P_hi ...using FMA +// P_lo := Y * logX_lo + P_lo ...using FMA // // Compute exp(P_hi + P_lo) // -// Flag := 2; +// Flag := 2; // Expo_Range := 2; (assuming double-extended power function) -// ker_exp_64( P_hi, P_lo, Flag, Expo_Range, +// ker_exp_64( P_hi, P_lo, Flag, Expo_Range, // Z_hi, Z_lo, scale, Safe ) // // scale := sgn * scale @@ -138,7 +147,7 @@ // return scale*Z_hi + (scale*Z_lo) // quickly // Else -// take necessary precaution in computing +// take necessary precaution in computing // scale*Z_hi + (scale*Z_lo) // to set possible exceptions correctly. // End If @@ -152,8 +161,8 @@ // If Y is qNaN, return Y without exception. // If X is qNaN, return X without exception. // -// At this point, X is real and Y is +-inf. -// Thus |X| can only be 1, strictly bigger than 1, or +// At this point, X is real and Y is +-inf. +// Thus |X| can only be 1, strictly bigger than 1, or // strictly less than 1. // // If |X| < 1, then @@ -169,8 +178,8 @@ // ...Note that Y is real, finite, non-zero, and not +1. // // If X is qNaN, return X without exception. -// -// If X is +-0, +// +// If X is +-0, // return ( Y > 0 ? 
+0 : +inf ) // // If X is +inf @@ -180,11 +189,11 @@ // return -0 ** -Y // return ( Y > 0 ? +inf : +0 ) // -// Case_Invalid +// Case_Invalid // // Return 0 * inf to generate a quiet NaN together // with an invalid exception. -// +// // Implementation // ============== // @@ -193,15 +202,15 @@ // // STAGE 1 // ------- -// This stage contains two threads. +// This stage contains two threads. // // Stage1.Thread1 // // fclass.m X_excep, X_ok = X, (NatVal or s/qNaN) or -// +-0, +-infinity +// +-0, +-infinity // // fclass.nm X_unsupp, X_supp = X, (NatVal or s/qNaN) or -// +-(0, unnorm, norm, infinity) +// +-(0, unnorm, norm, infinity) // // X_norm := fnorm( X ) with traps disabled // @@ -209,26 +218,26 @@ // If (X_unsupp) goto Filtering (Step 2) // // Stage1.Thread2 -// .............. +// .............. // // fclass.m Y_excep, Y_ok = Y, (NatVal or s/qNaN) or -// +-0, +-infinity +// +-0, +-infinity // // fclass.nm Y_unsupp, Y_supp = Y, (NatVal or s/qNaN) or -// +-(0, unnorm, norm, infinity) +// +-(0, unnorm, norm, infinity) // // Y_norm := fnorm( Y ) with traps disabled // // If (Y_excep) goto Filtering (Step 2) // If (Y_unsupp) goto Filtering (Step 2) // -// +// // STAGE 2 // ------- // This stage contains two threads. // -// Stage2.Thread1 -// .............. +// Stage2.Thread1 +// .............. // // Set X_lt_0 if X < 0 (using fcmp) // sgn := +1.0 @@ -245,14 +254,14 @@ // This stage contains two threads. // // -// Stage3.Thread1 -// .............. +// Stage3.Thread1 +// .............. // // X := fnorm(X) in prevailing traps // // -// Stage3.Thread2 -// .............. +// Stage3.Thread2 +// .............. // // Y := fnorm(Y) in prevailing traps // @@ -262,60 +271,56 @@ // Go to Case_Normal. 
// -#include "libm_support.h" - -#ifdef _LIBC -.rodata -#else -.data -#endif - -// Inv_L, L_hi, L_lo -.align 64 -Constants_exp_64_Arg: -ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object) -data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000 -data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000 -data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg) - -.align 64 -Constants_exp_64_Exponents: -ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object) -data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF -data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF -data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF -data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF -data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF -data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF -ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents) - -.align 64 -Constants_exp_64_A: -ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object) -// Reversed -data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000 -data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000 -data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_exp_64_A) - -.align 64 -Constants_exp_64_P: -ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object) -// Reversed -data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000 -data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000 -data4 0x7474C518,0x88888888,0x00003FF8,0x00000000 -data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000 -data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000 -data4 0x000004C7,0x80000000,0x00003FFE,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_exp_64_P) - -.align 64 -Constants_exp_64_T1: -ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object) -data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29 -data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5 + +// ************* DO NOT CHANGE ORDER OF THESE TABLES ******************** + +// double-extended 1/ln(2) +// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88 +// 3fff b8aa 3b29 5c17 f0bc +// For speed the significand will be loaded directly with a movl and 
setf.sig +// and the exponent will be bias+63 instead of bias+0. Thus subsequent +// computations need to scale appropriately. +// The constant 2^12/ln(2) is needed for the computation of N. This is also +// obtained by scaling the computations. +// +// Two shifting constants are loaded directly with movl and setf.d. +// 1. RSHF_2TO51 = 1.1000..00 * 2^(63-12) +// This constant is added to x*1/ln2 to shift the integer part of +// x*2^12/ln2 into the rightmost bits of the significand. +// The result of this fma is N_signif. +// 2. RSHF = 1.1000..00 * 2^(63) +// This constant is subtracted from N_signif * 2^(-51) to give +// the integer part of N, N_fix, as a floating-point number. +// The result of this fms is float_N. +RODATA + +.align 16 +// L_hi, L_lo +LOCAL_OBJECT_START(Constants_exp_64_Arg) +data8 0xB17217F400000000,0x00003FF2 // L_hi = hi part log(2)/2^12 +data8 0xF473DE6AF278ECE6,0x00003FD4 // L_lo = lo part log(2)/2^12 +LOCAL_OBJECT_END(Constants_exp_64_Arg) + +LOCAL_OBJECT_START(Constants_exp_64_A) +// Reversed +data8 0xAAAAAAABB1B736A0,0x00003FFA +data8 0xAAAAAAAB90CD6327,0x00003FFC +data8 0xFFFFFFFFFFFFFFFF,0x00003FFD +LOCAL_OBJECT_END(Constants_exp_64_A) + +LOCAL_OBJECT_START(Constants_exp_64_P) +// Reversed +data8 0xD00D6C8143914A8A,0x00003FF2 +data8 0xB60BC4AC30304B30,0x00003FF5 +data8 0x888888887474C518,0x00003FF8 +data8 0xAAAAAAAA8DAE729D,0x00003FFA +data8 0xAAAAAAAAAAAAAF61,0x00003FFC +data8 0x80000000000004C7,0x00003FFE +LOCAL_OBJECT_END(Constants_exp_64_P) + +LOCAL_OBJECT_START(Constants_exp_64_T1) +data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29 +data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5 data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA @@ -330,274 +335,263 @@ data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5 data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177 data4 
0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C -ASM_SIZE_DIRECTIVE(Constants_exp_64_T1) - -.align 64 -Constants_exp_64_T2: -ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object) -data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4 -data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7 -data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E -data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349 -data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987 -data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA -data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610 -data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A -data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8 -data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA -data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50 -data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA -data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07 -data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269 -data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE +LOCAL_OBJECT_END(Constants_exp_64_T1) + +LOCAL_OBJECT_START(Constants_exp_64_T2) +data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4 +data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7 +data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E +data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349 +data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987 +data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA +data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610 +data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A +data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8 +data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA +data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50 +data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA +data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07 +data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269 +data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37 -ASM_SIZE_DIRECTIVE(Constants_exp_64_T2) - -.align 64 -Constants_exp_64_W1: -ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object) -data4 
0x00000000,0x00000000,0x171EC4B4,0xBE384454 -data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6 -data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA -data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50 -data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2 -data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE -data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B -data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04 -data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419 -data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376 -data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A -data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB -data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E -data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA -data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08 -data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B -data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75 -data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79 -data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7 -data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087 -data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB -data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643 -data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C -data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D -data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873 -data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F -data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861 -data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0 -data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC -data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB -data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB -data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148 -ASM_SIZE_DIRECTIVE(Constants_exp_64_W1) - -.align 64 -Constants_exp_64_W2: -ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object) -data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25 -data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8 -data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A -data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E -data4 
0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9 -data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2 -data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0 -data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509 -data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33 -data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D -data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87 -data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3 -data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9 -data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F -data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82 -data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4 -data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D -data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030 -data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29 -data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED -data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B -data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893 -data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35 -data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C -data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313 -data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE -data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426 -data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550 -data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4 -data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31 -data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE -data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9 -ASM_SIZE_DIRECTIVE(Constants_exp_64_W2) - -.align 64 -Constants_log_80_P: -ASM_TYPE_DIRECTIVE(Constants_log_80_P,@object) -// 1/2, P_8, P_7, ..., P_1 -data4 0x00000000, 0x80000000, 0x00003FFE, 0x00000000 -data4 0x3B1042BC, 0xCCCE8B88, 0x0000BFFB, 0x00000000 -data4 0xCADC2149, 0xE38997B7, 0x00003FFB, 0x00000000 -data4 0xB1ACB090, 0xFFFFFFFE, 0x0000BFFB, 0x00000000 -data4 0x06481C81, 0x92492498, 0x00003FFC, 0x00000000 -data4 0xAAAAB0EF, 0xAAAAAAAA, 0x0000BFFC, 0x00000000 -data4 0xCCC91416, 0xCCCCCCCC, 0x00003FFC, 0x00000000 -data4 0x00000000, 0x80000000, 0x0000BFFD, 
0x00000000 -data4 0xAAAAAAAB, 0xAAAAAAAA, 0x00003FFD -ASM_SIZE_DIRECTIVE(Constants_log_80_P) - -.align 64 -Constants_log_80_Q: -ASM_TYPE_DIRECTIVE(Constants_log_80_Q,@object) -// log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1 -data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 -data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 -data4 0xA51BE0AF,0x92492453,0x00003FFC,0x00000000 -data4 0xA0CFD29F,0xAAAAAB73,0x0000BFFC,0x00000000 -data4 0xCCCE3872,0xCCCCCCCC,0x00003FFC,0x00000000 -data4 0xFFFFB4FB,0xFFFFFFFF,0x0000BFFC,0x00000000 -data4 0xAAAAAAAB,0xAAAAAAAA,0x00003FFD,0x00000000 -data4 0x00000000,0x80000000,0x0000BFFE,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_log_80_Q) - -.align 64 -Constants_log_80_Z_G_H_h1: -ASM_TYPE_DIRECTIVE(Constants_log_80_Z_G_H_h1,@object) -// Z1 - 16 bit fixed, G1 and H1 IEEE single, h1 IEEE double +LOCAL_OBJECT_END(Constants_exp_64_T2) + +LOCAL_OBJECT_START(Constants_exp_64_W1) +data8 0x0000000000000000, 0xBE384454171EC4B4 +data8 0xBE6947414AA72766, 0xBE5D32B6D42518F8 +data8 0x3E68D96D3A319149, 0xBE68F4DA62415F36 +data8 0xBE6DDA2FC9C86A3B, 0x3E6B2E50F49228FE +data8 0xBE49C0C21188B886, 0x3E64BFC21A4C2F1F +data8 0xBE6A2FBB2CB98B54, 0x3E5DC5DE9A55D329 +data8 0x3E69649039A7AACE, 0x3E54728B5C66DBA5 +data8 0xBE62B0DBBA1C7D7D, 0x3E576E0409F1AF5F +data8 0x3E6125001A0DD6A1, 0xBE66A419795FBDEF +data8 0xBE5CDE8CE1BD41FC, 0xBE621376EA54964F +data8 0x3E6370BE476E76EE, 0x3E390D1A3427EB92 +data8 0x3E1336DE2BF82BF8, 0xBE5FF1CBD0F7BD9E +data8 0xBE60A3550CEB09DD, 0xBE5CA37E0980F30D +data8 0xBE5C541B4C082D25, 0xBE5BBECA3B467D29 +data8 0xBE400D8AB9D946C5, 0xBE5E2A0807ED374A +data8 0xBE66CB28365C8B0A, 0x3E3AAD5BD3403BCA +data8 0x3E526055C7EA21E0, 0xBE442C75E72880D6 +data8 0x3E58B2BB85222A43, 0xBE5AAB79522C42BF +data8 0xBE605CB4469DC2BC, 0xBE589FA7A48C40DC +data8 0xBE51C2141AA42614, 0xBE48D087C37293F4 +data8 0x3E367A1CA2D673E0, 0xBE51BEBB114F7A38 +data8 0xBE6348E5661A4B48, 0xBDF526431D3B9962 +data8 0x3E3A3B5E35A78A53, 0xBE46C46C1CECD788 +data8 0xBE60B7EC7857D689, 
0xBE594D3DD14F1AD7 +data8 0xBE4F9C304C9A8F60, 0xBE52187302DFF9D2 +data8 0xBE5E4C8855E6D68F, 0xBE62140F667F3DC4 +data8 0xBE36961B3BF88747, 0x3E602861C96EC6AA +data8 0xBE3B5151D57FD718, 0x3E561CD0FC4A627B +data8 0xBE3A5217CA913FEA, 0x3E40A3CC9A5D193A +data8 0xBE5AB71310A9C312, 0x3E4FDADBC5F57719 +data8 0x3E361428DBDF59D5, 0x3E5DB5DB61B4180D +data8 0xBE42AD5F7408D856, 0x3E2A314831B2B707 +LOCAL_OBJECT_END(Constants_exp_64_W1) + +LOCAL_OBJECT_START(Constants_exp_64_W2) +data8 0x0000000000000000, 0xBE641F2537A3D7A2 +data8 0xBE68DD57AD028C40, 0xBE5C77D8F212B1B6 +data8 0x3E57878F1BA5B070, 0xBE55A36A2ECAE6FE +data8 0xBE620608569DFA3B, 0xBE53B50EA6D300A3 +data8 0x3E5B5EF2223F8F2C, 0xBE56A0D9D6DE0DF4 +data8 0xBE64EEF3EAE28F51, 0xBE5E5AE2367EA80B +data8 0x3E47CB1A5FCBC02D, 0xBE656BA09BDAFEB7 +data8 0x3E6E70C6805AFEE7, 0xBE6E0509A3415EBA +data8 0xBE56856B49BFF529, 0x3E66DD3300508651 +data8 0x3E51165FC114BC13, 0x3E53333DC453290F +data8 0x3E6A072B05539FDA, 0xBE47CD877C0A7696 +data8 0xBE668BF4EB05C6D9, 0xBE67C3E36AE86C93 +data8 0xBE533904D0B3E84B, 0x3E63E8D9556B53CE +data8 0x3E212C8963A98DC8, 0xBE33138F032A7A22 +data8 0x3E530FA9BC584008, 0xBE6ADF82CCB93C97 +data8 0x3E5F91138370EA39, 0x3E5443A4FB6A05D8 +data8 0x3E63DACD181FEE7A, 0xBE62B29DF0F67DEC +data8 0x3E65C4833DDE6307, 0x3E5BF030D40A24C1 +data8 0x3E658B8F14E437BE, 0xBE631C29ED98B6C7 +data8 0x3E6335D204CF7C71, 0x3E529EEDE954A79D +data8 0x3E5D9257F64A2FB8, 0xBE6BED1B854ED06C +data8 0x3E5096F6D71405CB, 0xBE3D4893ACB9FDF5 +data8 0xBDFEB15801B68349, 0x3E628D35C6A463B9 +data8 0xBE559725ADE45917, 0xBE68C29C042FC476 +data8 0xBE67593B01E511FA, 0xBE4A4313398801ED +data8 0x3E699571DA7C3300, 0x3E5349BE08062A9E +data8 0x3E5229C4755BB28E, 0x3E67E42677A1F80D +data8 0xBE52B33F6B69C352, 0xBE6B3550084DA57F +data8 0xBE6DB03FD1D09A20, 0xBE60CBC42161B2C1 +data8 0x3E56ED9C78A2B771, 0xBE508E319D0FA795 +data8 0xBE59482AFD1A54E9, 0xBE2A17CEB07FD23E +data8 0x3E68BF5C17365712, 0x3E3956F9B3785569 +LOCAL_OBJECT_END(Constants_exp_64_W2) + 
+LOCAL_OBJECT_START(Constants_log_80_P) +// P_8, P_7, ..., P_1 +data8 0xCCCE8B883B1042BC, 0x0000BFFB // P_8 +data8 0xE38997B7CADC2149, 0x00003FFB // P_7 +data8 0xFFFFFFFEB1ACB090, 0x0000BFFB // P_6 +data8 0x9249249806481C81, 0x00003FFC // P_5 +data8 0x0000000000000000, 0x00000000 // Pad for bank conflicts +data8 0xAAAAAAAAAAAAB0EF, 0x0000BFFC // P_4 +data8 0xCCCCCCCCCCC91416, 0x00003FFC // P_3 +data8 0x8000000000000000, 0x0000BFFD // P_2 +data8 0xAAAAAAAAAAAAAAAB, 0x00003FFD // P_1 +LOCAL_OBJECT_END(Constants_log_80_P) + +LOCAL_OBJECT_START(Constants_log_80_Q) +// log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1 +data8 0xB172180000000000,0x00003FFE +data8 0x82E308654361C4C6,0x0000BFE2 +data8 0x92492453A51BE0AF,0x00003FFC +data8 0xAAAAAB73A0CFD29F,0x0000BFFC +data8 0xCCCCCCCCCCCE3872,0x00003FFC +data8 0xFFFFFFFFFFFFB4FB,0x0000BFFC +data8 0xAAAAAAAAAAAAAAAB,0x00003FFD +data8 0x8000000000000000,0x0000BFFE +LOCAL_OBJECT_END(Constants_log_80_Q) + +LOCAL_OBJECT_START(Constants_log_80_Z_G_H_h1) +// Z1 - 16 bit fixed, G1 and H1 IEEE single, h1 IEEE double data4 0x00008000,0x3F800000,0x00000000,0x00000000 -data4 0x00000000,0x00000000,0x00000000,0x00000000 +data4 0x00000000,0x00000000,0x00000000,0x00000000 data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000 data4 0xEBA0E0D1,0x8B1D330B,0x00003FDA,0x00000000 data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000 data4 0x9EADD553,0xE2AF365E,0x00003FE2,0x00000000 data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000 -data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000 +data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000 data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000 -data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000 -data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000 -data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000 -data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000 +data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000 +data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000 +data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000 +data4 
0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000 data4 0x457978A1,0x8718789F,0x00003FE2,0x00000000 -data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000 -data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000 -data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000 -data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000 -data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000 -data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000 -data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000 -data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000 -data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000 -data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000 +data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000 +data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000 +data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000 +data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000 +data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000 +data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000 +data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000 +data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000 +data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000 +data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000 data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000 -data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000 -data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000 -data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000 -data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000 -data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000 +data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000 +data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000 +data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000 +data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000 +data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000 data4 0x00004211,0x3F042108,0x3F29516A,0x00000000 -data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_log_80_Z_G_H_h1) - -.align 64 -Constants_log_80_Z_G_H_h2: -ASM_TYPE_DIRECTIVE(Constants_log_80_Z_G_H_h2,@object) -// Z2 - 16 bit fixed, G2 and H2 IEEE single, h2 IEEE double -data4 
0x00008000,0x3F800000,0x00000000,0x00000000 -data4 0x00000000,0x00000000,0x00000000,0x00000000 -data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000 +data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000 +LOCAL_OBJECT_END(Constants_log_80_Z_G_H_h1) + +LOCAL_OBJECT_START(Constants_log_80_Z_G_H_h2) +// Z2 - 16 bit fixed, G2 and H2 IEEE single, h2 IEEE double +data4 0x00008000,0x3F800000,0x00000000,0x00000000 +data4 0x00000000,0x00000000,0x00000000,0x00000000 +data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000 data4 0x211398BF,0xAD08B116,0x00003FDB,0x00000000 -data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000 -data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000 -data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000 -data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000 -data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000 -data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000 -data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000 +data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000 +data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000 +data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000 +data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000 +data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000 +data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000 +data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000 data4 0x049EB22F,0xD1B87D3C,0x00003FDE,0x00000000 -data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000 -data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000 +data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000 +data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000 data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000 -data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000 -data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000 -data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000 -data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000 -data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000 -data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000 -data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000 -data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000 -data4 
0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000 -data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000 -data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000 -data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000 -data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000 -data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000 -data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000 -data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000 +data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000 +data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000 +data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000 +data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000 +data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000 +data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000 +data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000 +data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000 +data4 0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000 +data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000 +data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000 +data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000 +data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000 +data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000 +data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000 +data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000 data4 0xBE11C424,0xA49C8DB5,0x0000BFE0,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_log_80_Z_G_H_h2) - -.align 64 -Constants_log_80_h3_G_H: -ASM_TYPE_DIRECTIVE(Constants_log_80_h3_G_H,@object) -// h3 IEEE double extended, H3 and G3 IEEE single -data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00 +LOCAL_OBJECT_END(Constants_log_80_Z_G_H_h2) + +LOCAL_OBJECT_START(Constants_log_80_h3_G_H) +// h3 IEEE double extended, H3 and G3 IEEE single +data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00 data4 0x9B7FAD21,0x90051030,0x00003FD8,0x3F7FF400 -data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00 -data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400 +data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00 +data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400 data4 
0xCA964D95,0xCE65C1D8,0x0000BFD8,0x3F7FDC00 -data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400 -data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08 -data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408 -data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10 -data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410 -data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18 +data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400 +data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08 +data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408 +data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10 +data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410 +data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18 data4 0x2F053150,0xB25CA912,0x0000BFDA,0x3F7FA420 -data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20 -data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428 -data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30 -data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438 -data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40 -data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448 -data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50 -data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458 -data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68 -data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470 -data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78 +data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20 +data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428 +data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30 +data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438 +data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40 +data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448 +data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50 +data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458 +data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68 +data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470 +data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78 data4 0x1F34A7EB,0x9A995A97,0x0000BFDC,0x3F7F4488 -data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90 -data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0 -data4 
0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8 -data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8 -data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8 -data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8 -data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0 -data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0 -data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here -data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D +data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90 +data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0 +data4 0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8 +data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8 +data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8 +data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8 +data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0 +data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0 +data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here +data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D data4 0x3A87F20B,0x3A97F68B,0x3AA7EB86,0x3AB7E101 -data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED -data4 0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766 -data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6 -data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620 -data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D -ASM_SIZE_DIRECTIVE(Constants_log_80_h3_G_H) - -.align 64 -Constant_half: -ASM_TYPE_DIRECTIVE(Constant_half,@object) -data4 0x00000000,0x80000000,0x00003FFE -ASM_SIZE_DIRECTIVE(Constant_half) - -GR_Expo_Range = r32 -GR_Flag = r33 +data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED +data4 0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766 +data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6 +data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620 +data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D +LOCAL_OBJECT_END(Constants_log_80_h3_G_H) + +GR_sig_inv_ln2 = r14 +GR_rshf_2to51 = r15 +GR_exp_2tom51 = r16 +GR_rshf = r17 +GR_exp_half = r18 +GR_sign_mask = r19 +GR_exp_square_oflow = r20 +GR_exp_square_uflow = r21 +GR_exp_ynear1_oflow = r22 
+GR_exp_ynear1_uflow = r23 +GR_signif_Z = r24 + +GR_signexp_x = r32 + +GR_exp_x = r33 + GR_Table_Ptr = r34 GR_Table_Ptr1 = r35 -GR_BIAS = r35 GR_Index1 = r36 -GR_sign_mask = r36 GR_Index2 = r37 GR_Expo_X = r37 -GR_signif_Z = r38 GR_M = r38 GR_X_0 = r39 @@ -620,45 +614,49 @@ GR_k = r44 GR_Big_Pos_Exp = r45 +GR_exp_pos_max = r46 -GR_BIAS_p_k = r47 -GR_BIASed_exp_y = r47 +GR_exp_bias_p_k = r47 -GR_Big_Neg_Exp = r48 GR_Index3 = r48 GR_temp = r48 GR_vsm_expo = r49 -GR_y_sign = r49 GR_T1_ptr = r50 +GR_P_ptr1 = r50 GR_T2_ptr = r51 +GR_P_ptr2 = r51 GR_N_fix = r52 GR_exp_y = r53 GR_signif_y = r54 -GR_exp_and_sign_y = r55 +GR_signexp_y = r55 +GR_fraction_y = r55 GR_low_order_bit = r56 -GR_get_exp_mask = r57 -GR_exponent_zero = r58 - -// ** Registers for unwind support +GR_exp_mask = r57 +GR_exp_bias = r58 +GR_y_sign = r59 +GR_table_base = r60 +GR_ptr_exp_Arg = r61 +GR_Delta_Exp = r62 +GR_Special_Exp = r63 +GR_exp_neg_max = r64 +GR_Big_Neg_Exp = r65 + +//** Registers for unwind support GR_SAVE_PFS = r59 GR_SAVE_B0 = r60 GR_SAVE_GP = r61 -GR_Parameter_X = r62 -GR_Parameter_Y = r63 -GR_Parameter_RESULT = r64 -GR_Parameter_TAG = r65 +GR_Parameter_X = r62 +GR_Parameter_Y = r63 +GR_Parameter_RESULT = r64 +GR_Parameter_TAG = r65 -FR_X = f8 -FR_Y = f9 -FR_RESULT = f99 - -// ** +//** FR_Input_X = f8 -FR_Output = f8 +FR_Result = f8 FR_Input_Y = f9 FR_Neg = f10 @@ -671,7 +669,6 @@ FR_poly_hi = f11 FR_Sgn = f12 -FR_Neg_X = f13 FR_half_W = f13 FR_X_cor = f14 @@ -698,13 +695,11 @@ FR_Scale = f36 FR_G_1 = f37 FR_G = f37 FR_Wsq = f37 -FR_L_Inv = f37 FR_temp = f37 FR_H_1 = f38 FR_H = f38 FR_W4 = f38 -FR_float_N = f38 FR_h = f39 FR_h_1 = f39 @@ -720,9 +715,7 @@ FR_L_lo = f41 FR_A_1 = f41 FR_h_2 = f42 -FR_P_6 = f42 -FR_abs_W = f43 FR_W1 = f43 FR_G_3 = f44 @@ -740,7 +733,6 @@ FR_H_3 = f47 FR_float_N = f48 -FR_P_4 = f49 FR_A_2 = f49 FR_Q_4 = f50 @@ -768,7 +760,6 @@ FR_Two = f56 FR_Big = f57 FR_neg_2_mK = f58 -FR_NBig = f58 FR_r = f59 @@ -777,1652 +768,1253 @@ FR_poly_lo = f60 FR_poly = f61 
FR_P_5 = f62 +FR_Result_small = f62 FR_rsq = f63 -FR_Result = f99 -FR_Result_small = f100 -FR_Result_big = f101 +FR_Delta = f64 -.section .text -.proc powl# -.global powl# -.align 64 +FR_save_Input_X = f65 +FR_norm_X = f66 +FR_norm_Y = f67 +FR_Y_lo_2 = f68 -powl: -{ .mfi -alloc GR_Expo_Range = ar.pfs,0,30,4,0 -(p0) fclass.m.unc p7, p13 = FR_Input_Y, 0x1E7 -nop.i 0 -} -{ .mfi -(p0) getf.exp GR_exp_and_sign_y = FR_Input_Y +FR_P_6 = f69 +FR_Result_big = f69 + +FR_RSHF_2TO51 = f70 +FR_INV_LN2_2TO63 = f71 +FR_2TOM51 = f72 +FR_RSHF = f73 +FR_TMP1 = f74 +FR_TMP2 = f75 +FR_TMP3 = f76 +FR_Tscale = f77 +FR_P_4 = f78 +FR_NBig = f79 + + +.section .text +GLOBAL_LIBM_ENTRY(powl) // -// Save State +// Get significand of x. It is the critical path. // -(p0) fclass.m.unc p6, p12 = FR_Input_X, 0x1E7 -nop.i 0 -};; { .mfi -(p0) getf.sig GR_signif_y = FR_Input_Y -(p0) fcmp.eq.unc.s1 p12, p13 = FR_Input_X, f1 -nop.i 0 + getf.sig GR_signif_Z = FR_Input_X // Get significand of x + fclass.m p11, p12 = FR_Input_X, 0x0b // Test x unorm + nop.i 999 } { .mfi - nop.m 999 -// -// Check for y = 1 -// Identify EM unsupporteds. 
-// Load FR_half = .5 -// -(p0) fadd.s1 FR_Two = f1, f1 -// -// Load 1/2 in GP register -// -nop.i 0 + nop.m 999 + fnorm.s1 FR_norm_X = FR_Input_X // Normalize x + mov GR_exp_half = 0xffff - 1 // Exponent for 0.5 } ;; -{ .mmi - nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constant_half#), gp - nop.i 999 +{ .mfi + alloc r32 = ar.pfs,0,30,4,0 + fclass.m p7, p0 = FR_Input_Y, 0x1E7 // Test y natval, nan, inf, zero + mov GR_exp_pos_max = 0x13fff // Max exponent for pos oflow test +} +{ .mfi + addl GR_table_base = @ltoff(Constants_exp_64_Arg#), gp // Ptr to tables + fnorm.s1 FR_norm_Y = FR_Input_Y // Normalize y + mov GR_exp_neg_max = 0x33fff // Max exponent for neg oflow test } ;; -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] - nop.m 999 - nop.i 999 +{ .mfi + getf.exp GR_signexp_y = FR_Input_Y // Get sign and exp of y +(p12) fclass.m p11, p0 = FR_Input_Y, 0x0b // Test y unorm + mov GR_sign_mask = 0x20000 // Sign mask +} +{ .mfi + ld8 GR_table_base = [GR_table_base] // Get base address for tables + fadd.s1 FR_Two = f1, f1 // Form 2.0 for square test + mov GR_exp_mask = 0x1FFFF // Exponent mask } ;; -{ .mlx -(p0) ldfe FR_Half =[GR_Table_Ptr],0 -(p0) movl GR_get_exp_mask = 0x1FFFF ;; +{ .mfi + getf.sig GR_signif_y = FR_Input_Y // Get significand of y + fclass.m p6, p0 = FR_Input_X, 0x1E7 // Test x natval, nan, inf, zero + nop.i 999 } +;; { .mfi - nop.m 999 -(p0) fclass.nm.unc p9, p15 = FR_Input_Y, 0x1FF -// -// Create FR_Two = 2 -// Get exp and significand of Y -// Crate Masks -// sgn = 1 -// -(p0) and GR_exp_y = GR_get_exp_mask,GR_exp_and_sign_y + getf.exp GR_signexp_x = FR_Input_X // Get signexp of x + fmerge.s FR_save_Input_X = FR_Input_X, FR_Input_X + extr.u GR_Index1 = GR_signif_Z, 59, 4 // Extract upper 4 signif bits of x } -{ .mlx - nop.m 999 -(p0) movl GR_exponent_zero = 0xFFFF ;; +{ .mfb + setf.exp FR_Half = GR_exp_half // Load half + nop.f 999 +(p11) br.cond.spnt POWL_DENORM // Branch if x or y denorm/unorm } +;; + +// Return here from POWL_DENORM +POWL_COMMON: { 
.mfi - nop.m 999 -(p0) mov FR_Sgn = f1 - nop.i 999 + setf.exp FR_Big = GR_exp_pos_max // Form big pos value for oflow test + fclass.nm p11, p0 = FR_Input_Y, 0x1FF // Test Y unsupported + shl GR_Index1 = GR_Index1,5 // Adjust index1 pointer x 32 } { .mfi - nop.m 999 -(p0) fcmp.eq.unc.s1 p10, p11 = FR_Input_Y, f1 - nop.i 999 ;; + add GR_Table_Ptr = 0x7c0, GR_table_base // Constants_log_80_Z_G_H_h1 + fma.s1 FR_Sgn = f1,f1,f0 // Assume result positive + mov GR_exp_bias = 0xFFFF // Form exponent bias } -{ .mfb - nop.m 999 +;; + // // Identify NatVals, NaNs, Infs, and Zeros. -// Load Half // -(p0) fclass.nm.unc p8, p14 = FR_Input_X, 0x1FF -// -// Remove sign bit from exponent of y. -// Check for x = 1 -// -(p6) br.cond.spnt L(POWL_64_SPECIAL) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p7) br.cond.spnt L(POWL_64_SPECIAL) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p8) br.cond.spnt L(POWL_64_UNSUPPORT) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.spnt L(POWL_64_UNSUPPORT) ;; -} -{ .mfi -(p0) cmp.lt.unc p9, p0 = GR_exp_y,GR_exponent_zero -(p0) fcmp.lt.unc.s1 p6, p13 = FR_Input_X, f0 // +// Remove sign bit from exponent of y. +// Check for x = 1 // Branch on Infs, Nans, Zeros, and Natvals // Check to see that exponent < 0 // -(p0) sub GR_exp_y = GR_exp_y,GR_exponent_zero -} -// x not zero, is y ==2? 
{ .mfi - nop.m 999 -(p11) fcmp.eq.unc.s1 p7, p14 = FR_Input_Y, FR_Two - nop.i 999 ;; + setf.exp FR_NBig = GR_exp_neg_max // Form big neg value for oflow test + fclass.nm p8, p0 = FR_Input_X, 0x1FF // Test X unsupported + and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent of y } { .mfb - nop.m 999 -(p9) fcmp.lt.unc.s1 p9, p0 = FR_Input_X, f0 -(p7) br.cond.spnt L(POWL_64_SQUARE) ;; // Branch if x not zero and y=2 -} -{ .mfi - nop.m 999 -(p6) fmerge.ns FR_Neg_X = FR_Input_X, FR_Input_X - nop.i 999 ;; + add GR_Index1 = GR_Index1,GR_Table_Ptr + nop.f 999 +(p6) br.cond.spnt POWL_64_SPECIAL // Branch if x natval, nan, inf, zero } -{ .mfi - nop.m 999 -(p10) fmpy.s0 FR_Result = FR_Input_X, f1 -// -// For y = 1, compute result = x -// For x = 1, compute 1 -// When Y is one return X and possible raise -// denormal operand exception. -// Remove exponent BIAS +;; + +// load Z_1 from Index1 + +// There is logic starting here to determine if y is an integer when x < 0. +// If 0 < |y| < 1 then clearly y is not an integer. +// If |y| > 1, then the significand of y is shifted left by the size of +// the exponent of y. This preserves the lsb of the integer part + the +// fractional bits. The lsb of the integer can be tested to determine if +// the integer is even or odd. The fractional bits can be tested. If zero, +// then y is an integer. // -(p6) shl GR_exp_and_sign_y= GR_signif_y,GR_exp_y ;; -} { .mfi -(p9) or GR_exp_and_sign_y = 0xF,GR_signif_y -(p12) fma.s0 FR_Result = FR_Input_Y, f0, f1 - nop.i 999 ;; + ld2 GR_Z_1 =[GR_Index1],4 // Load Z_1 + fmerge.s FR_Z = f0, FR_norm_X // Z = |x| + extr.u GR_X_0 = GR_signif_Z, 49, 15 // Extract X_0 from significand } -{ .mii - nop.m 999 -(p6) extr.u GR_exp_y = GR_exp_and_sign_y,63,1 ;; -(p6) cmp.ne.unc p9, p0 = GR_exp_y, r0 +{ .mfb + cmp.lt p9, p0 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1 + nop.f 999 +(p7) br.cond.spnt POWL_64_SPECIAL // Branch if y natval, nan, inf, zero } -{ .mii - nop.m 999 -// -// Both predicates can be set. 
-// Don't consider y's < 1. -// -(p6) shl GR_signif_y= GR_exp_and_sign_y,1 ;; -// -// Is shift off integer part of y. -// Get y's even or odd bit. -// -(p6) cmp.ne.unc p8, p0 = GR_signif_y, r0 +;; + +{ .mfb + ldfs FR_G_1 = [GR_Index1],4 // Load G_1 + fcmp.eq.s1 p10, p0 = FR_Input_Y, f1 // Test Y = +1.0 +(p8) br.cond.spnt POWL_64_UNSUPPORT // Branch if x unsupported } -{ .mib - nop.m 999 - nop.i 999 +;; + // -// Is the fractional part of the y = 0? -// Is the integer even or odd. +// X_0 = High order 15 bit of Z // -(p10) br.cond.spnt L(POWL_64_RETURN) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p12) br.cond.spnt L(POWL_64_RETURN) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p8) br.cond.spnt L(POWL_64_XNEG) ;; +{ .mfb + ldfs FR_H_1 = [GR_Index1],8 // Load H_1 +(p9) fcmp.lt.unc.s1 p9, p0 = FR_Input_X, f0 // Test x<0, 0 <|y|<1 +(p11) br.cond.spnt POWL_64_UNSUPPORT // Branch if y unsupported } +;; + { .mfi - nop.m 999 -(p9) fmerge.ns FR_Sgn = FR_Sgn, FR_Sgn - nop.i 999 + ldfe FR_h_1 = [GR_Index1] // Load h_1 + fcmp.eq.s1 p7, p0 = FR_Input_Y, FR_Two // Test y = 2.0 + pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // X_1 = X_0 * Z_1 (bits 15-30) + // Wait 4 cycles to use result } { .mfi - nop.m 999 -(p0) fcmp.eq.unc.s0 p11, p0 = FR_Input_Y, FR_Half - nop.i 999 ;; + add GR_Table_Ptr = 0x9c0, GR_table_base // Constants_log_80_Z_G_H_h2 + nop.f 999 + sub GR_exp_y = GR_exp_y,GR_exp_bias // Get true exponent of y } +;; + // -// Raise possible denormal operand exception for both -// X and Y. +// Branch for (x < 0) and Y not an integer. // { .mfb - nop.m 999 -// -// Branch for (x < 0) and Y not an integer. -// -(p0) fcmp.eq.unc.s0 p12, p0 = FR_Input_X, f1 -// -// For x < 0 and y integer, make x positive -// For x < 0 and y odd integer,, set sign = -1. 
-// -(p11) br.cond.spnt L(POWL_64_SQRT) ;; -} -{ .mmf -(p0) cmp.eq.unc p15, p14 = r0, r0 - nop.m 999 -(p13) fnorm.s1 FR_Z = FR_Input_X ;; -} -{ .mfi - nop.m 999 -(p6) fnorm.s1 FR_Z = FR_Neg_X - nop.i 999 + nop.m 999 + fcmp.lt.s1 p6, p0 = FR_Input_X, f0 // Test x < 0 +(p9) br.cond.spnt POWL_64_XNEG // Branch if x < 0, 0 < |y| < 1 } ;; -// -// Branch to embedded sqrt(x) -// -// -// Computes ln( x ) to extra precision -// Input FR 1: FR_X -// Output FR 2: FR_Y_hi -// Output FR 3: FR_Y_lo -// Output PR 1: PR_Safe -// - -{ .mmi +{ .mfi nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_Z_G_H_h1#), gp + fcmp.eq.s1 p12, p0 = FR_Input_X, f1 // Test x=+1.0 nop.i 999 } +{ .mfb + nop.m 999 + fsub.s1 FR_W = FR_Z, f1 // W = Z - 1 +(p7) br.cond.spnt POWL_64_SQUARE // Branch if y=2 +} ;; -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] +{ .mfi nop.m 999 - nop.i 999 +(p10) fmpy.s0 FR_Result = FR_Input_X, f1 // If y=+1.0, result=x +(p6) shl GR_fraction_y= GR_signif_y,GR_exp_y // Get lsb of int + fraction + // Wait 4 cycles to use result } ;; - -{ .mlx - nop.m 999 -(p0) movl GR_BIAS = 0x000000000000FFFF ;; -} { .mfi - nop.m 999 -(p0) fsub.s1 FR_W = FR_Z, f1 - nop.i 999 ;; -} -// -// Z = Norm(X) - both + and - case -// Set Safe = True -// -{ .mmb -(p0) getf.sig GR_signif_Z = FR_Z -(p0) getf.exp GR_N = FR_Z - nop.b 999 ;; -} -{ .mii - nop.m 999 -// -// Get significand of Z -// W = Z - 1 -// -(p0) extr.u GR_Index1 = GR_signif_Z, 59, 4 ;; -// -// Index1 = High order 4 bits of Z -// X_0 = High order 15 bit of Z -// -(p0) shl GR_Index1 = GR_Index1,5 ;; -} -{ .mfi - nop.m 999 -// -// Add offset to Index1 ptr. -// -(p0) fabs FR_abs_W = FR_W -// -// BIAS = 0x000...FFFF -// Adjust Index1 ptr ( x 32) . 
// -(p0) add GR_Index1 = GR_Index1,GR_Table_Ptr + nop.m 999 +(p12) fma.s0 FR_Result = FR_Input_Y, f0, f1 // If x=1.0, result=1, chk denorm + extr.u GR_Index2 = GR_X_1, 6, 4 // Extract index2 } -{ .mmi - nop.m 999 ;; -(p0) ld2 GR_Z_1 =[GR_Index1],4 -(p0) extr.u GR_X_0 = GR_signif_Z, 49, 15 +;; + +// +// N = exponent of Z +// +{ .mib + getf.exp GR_N = FR_Z // Get exponent of Z (also x) + shl GR_Index2=GR_Index2,5 // Index2 x 32 bytes +(p10) br.ret.spnt b0 // Exit if y=+1.0 } ;; -{ .mmi - nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_Z_G_H_h2#), gp +{ .mib + add GR_Index2 = GR_Index2, GR_Table_Ptr // Pointer to table 2 nop.i 999 +(p12) br.ret.spnt b0 // Exit if x=+1.0 } ;; { .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] - nop.m 999 + ld2 GR_Z_2 =[GR_Index2],4 // Load Z_2 +;; + ldfs FR_G_2 = [GR_Index2],4 // Load G_2 nop.i 999 } ;; - -{ .mmi -(p0) ldfs FR_G_1 = [GR_Index1],4 ;; -(p0) ldfs FR_H_1 = [GR_Index1],8 - nop.i 999 ;; +{ .mii + ldfs FR_H_2 = [GR_Index2],8 // Load H_2 +(p6) tbit.nz.unc p9, p0 = GR_fraction_y, 63 // Test x<0 and y odd integer + add GR_Table_Ptr = 0xbcc, GR_table_base // Constants_log_80_h3_G_H, G_3 } +;; + // -// Adjust Index2 (x 32). +// For x < 0 and y odd integer, set sign = -1.
// { .mfi -(p0) ldfe FR_h_1 = [GR_Index1],0 - nop.f 999 -(p0) pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 ;; -} -{ .mmi - nop.m 999 ;; -// -// load Z_1 from Index1 -// abs_W = |W| -// Point to Table2 -// -(p0) getf.exp GR_M = FR_abs_W -// -// M = M - BIAS -// Load G_1 -// N = exponent of Z -// - nop.i 999;; + getf.exp GR_M = FR_W // Get signexp of W + nop.f 999 + pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // X_2 = X_1 * Z_2 (bits 15-30) } -{ .mmi - nop.m 999 - nop.m 999 - nop.i 999;; +{ .mfi + ldfe FR_h_2 = [GR_Index2] // Load h_2 +(p9) fnma.s1 FR_Sgn = f1, f1, f0 // If x<0, y odd int, result negative + sub GR_N = GR_N, GR_exp_bias // Get true exponent of x = N } -{ .mmi - nop.m 999 - nop.m 999 - nop.i 999;; +;; + +{ .mfi + add GR_Table_Ptr1 = 0xdc0, GR_table_base // Ptr to H_3 + fcmp.eq.s0 p11, p0 = FR_Input_Y, FR_Half // Test y=0.5, also set denorm +(p6) shl GR_fraction_y= GR_fraction_y, 1 // Shift left 1 to get fraction } -{ .mmi - nop.m 999 - nop.m 999 -(p0) extr.u GR_Index2 = GR_X_1, 6, 4 ;; +;; + +{ .mmb + setf.sig FR_float_N = GR_N +(p6) cmp.ne.unc p8, p0 = GR_fraction_y, r0 // Test x<0 and y not integer +(p8) br.cond.spnt POWL_64_XNEG // Branch if x<0 and y not int } -{ .mii - nop.m 999 -// -// Extract Index2 -// Load H_1 -// Is -8 > M ? +;; + // -(p0) shl GR_Index2=GR_Index2,5 ;; -(p0) add GR_Index2 = GR_Index2, GR_Table_Ptr -} +// Raise possible denormal operand exception for both X and Y. 
+// Set pointers in case |x| near 1 +// Branch to embedded sqrt(x) if y=0.5 // -// M = exponent of abs_W -// X_1 = X_0 * Z_1 -// -{ .mii -(p0) sub GR_M = GR_M, GR_BIAS - nop.i 999 ;; -(p0) cmp.gt.unc p7, p14 = -8, GR_M +{ .mfi + add GR_P_ptr1 = 0x6b0, GR_table_base // Constants_log_80_P, P8, NEAR path + fcmp.eq.s0 p12, p0 = FR_Input_X, FR_Input_Y // Dummy to set denormal + add GR_P_ptr2 = 0x700, GR_table_base // Constants_log_80_P, P4, NEAR path } -{ .mib - nop.m 999 - nop.i 999 -(p7) br.cond.spnt L(LOGL80_NEAR) ;; +{ .mfb + cmp.eq p15, p14 = r0, r0 // Assume result safe (no over/under) + fsub.s1 FR_Delta = FR_Input_Y,f1 // Delta = y - 1.0 +(p11) br.cond.spnt POWL_64_SQRT // Branch if y=0.5 } +;; + // -// Load h_1 -// Possible branch out. -// Add offset of table to Index2 +// Computes ln( x ) to extra precision +// Input FR 1: FR_X +// Output FR 2: FR_Y_hi +// Output FR 3: FR_Y_lo +// Output PR 1: PR_Safe // { .mfi -(p0) ld2 GR_Z_2 =[GR_Index2],4 -(p0) fmerge.se FR_S = f1,FR_Z -(p0) sub GR_N = GR_N, GR_BIAS + and GR_M = GR_exp_mask, GR_M // Mask to get exponent of W + nop.f 999 + extr.u GR_Index3 = GR_X_2, 1, 5 // Get index3 } ;; { .mmi - nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_h3_G_H#), gp - nop.i 999 + shladd GR_Table_Ptr1 = GR_Index3,2,GR_Table_Ptr1 // Ptr to H_3 + shladd GR_Index3 = GR_Index3,4,GR_Table_Ptr // Ptr to G_3 + sub GR_M = GR_M, GR_exp_bias // Get true exponent of W } ;; -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] - nop.m 999 - nop.i 999 +{ .mib + ldfs FR_G_3 = [GR_Index3],-12 // Load G_3 + cmp.gt p7, p14 = -8, GR_M // Test if |x-1| < 2^-8 +(p7) br.cond.spnt LOGL80_NEAR // Branch if |x-1| < 2^-8 } ;; -// -// load Z_2 -// N - BIAS -// Point to Table 3. -// S = merging of Z and 1.0 -// -{ .mmi -(p0) ldfs FR_G_2 = [GR_Index2],4 -(p0) setf.sig FR_float_N = GR_N -(p0) add GR_Table_Ptr1 = 0x200,GR_Table_Ptr ;; -} -// -// load G_2 -// X_2 = X_1 * Z_2 -// Add offset to Table 2 ptr. 
-// float_N = significand of N -// -{ .mmi -(p0) ldfs FR_H_2 = [GR_Index2],8 ;; -// -// load H_2 -// G = G * G_2 -// -(p0) ldfe FR_h_2 = [GR_Index2],0 -(p0) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 ;; -} -{ .mmi - nop.m 999 - nop.m 999 - nop.i 999;; -} -{ .mmi - nop.m 999 - nop.m 999 - nop.i 999;; -} -{ .mmi - nop.m 999 - nop.m 999 - nop.i 999;; +// Here if |x-1| >= 2^-8 +{ .mmf + ldfs FR_H_3 = [GR_Table_Ptr1] // Load H_3 + nop.m 999 + nop.f 999 } -{ .mii - nop.m 999 - nop.i 999 ;; -(p0) extr.u GR_Index3 = GR_X_2, 1, 5 ;; +;; + +{ .mfi + ldfe FR_h_3 = [GR_Index3] // Load h_3 + fmerge.se FR_S = f1,FR_Z // S = merge of 1.0 and signif(Z) + nop.i 999 } { .mfi -(p0) shladd GR_Table_Ptr1 = GR_Index3,2,GR_Table_Ptr1 - nop.f 999 + add GR_Table_Ptr = 0x740, GR_table_base // Constants_log_80_Q + fmpy.s1 FR_G = FR_G_1, FR_G_2 // G = G_1 * G_2 + nop.i 999 +} +;; + // -// h = h_1 + h_2 -// Adjust Index3 +// Begin Loading Q's - load log2_hi part // -(p0) shladd GR_Index3 = GR_Index3,4,GR_Table_Ptr ;; -} -{ .mmb - nop.m 999 -(p0) ldfe FR_h_3 = [GR_Index3],12 - nop.b 999 ;; -} -{ .mmf -(p0) ldfs FR_H_3 = [GR_Table_Ptr1],0 +{ .mfi + ldfe FR_log2_hi = [GR_Table_Ptr],16 // Load log2_hi + fadd.s1 FR_H = FR_H_1, FR_H_2 // H = H_1 + H_2 + nop.i 999 +};; + // -// float_N = Make N a fp number -// Load h_3 -// Get pointer to Q table. 
+// h = h_1 + h_2 // -(p0) ldfs FR_G_3 = [GR_Index3],0 -(p0) fmpy.s1 FR_G = FR_G_1, FR_G_2 +{ .mfi + ldfe FR_log2_lo = [GR_Table_Ptr],16 // Load log2_lo + fadd.s1 FR_h = FR_h_1, FR_h_2 // h = h_1 + h_2 + nop.i 999 } ;; -{ .mmi - nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_Q#), gp +{ .mfi + ldfe FR_Q_6 = [GR_Table_Ptr],16 // Load Q_6 + fcvt.xf FR_float_N = FR_float_N nop.i 999 } ;; -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] - nop.m 999 +{ .mfi + ldfe FR_Q_5 = [GR_Table_Ptr],16 // Load Q_5 + nop.f 999 nop.i 999 } ;; - - -{ .mfi -(p0) ldfe FR_log2_hi = [GR_Table_Ptr],16 -(p0) fadd.s1 FR_H = FR_H_1, FR_H_2 - nop.i 999 ;; -} -{ .mmf - nop.m 999 -// -// G = G_1 * G_2 * G_3 -// -(p0) ldfe FR_log2_lo = [GR_Table_Ptr],16 -// -// load h_2 -// H = H_1 + H_2 -// Get Index3 // -(p0) fadd.s1 FR_h = FR_h_1, FR_h_2 ;; -} -// -// Load log2_lo part -// r = G*S -1 +// G = G_1 * G_2 * G_3 // { .mfi -(p0) ldfe FR_Q_6 = [GR_Table_Ptr],16 -// -// Load H_3 -// -(p0) fcvt.xf FR_float_N = FR_float_N - nop.i 999 ;; + ldfe FR_Q_4 = [GR_Table_Ptr],16 // Load Q_4 + fmpy.s1 FR_G = FR_G, FR_G_3 + nop.i 999 } +;; + // -// Load Q_6 +// H = H_1 + H_2 + H_3 // -{ .mmi -(p0) ldfe FR_Q_5 = [GR_Table_Ptr],16 ;; -(p0) ldfe FR_Q_4 = [GR_Table_Ptr],16 - nop.i 999 ;; -} -{ .mmi -(p0) ldfe FR_Q_3 = [GR_Table_Ptr],16 ;; -(p0) ldfe FR_Q_2 = [GR_Table_Ptr],16 - nop.i 999 ;; +{ .mfi + ldfe FR_Q_3 = [GR_Table_Ptr],16 // Load Q_3 + fadd.s1 FR_H = FR_H, FR_H_3 + nop.i 999 } -{ .mmf - nop.m 999 -// -// poly_lo = Q_5 + r * Q_6 -// Load Q_2 -// rsq = r * r +;; + // -(p0) ldfe FR_Q_1 = [GR_Table_Ptr],16 +// Y_lo = poly + Y_lo // -// h = h_1 + h_2 + h_3 -// H = H_1 + H_2 + H_3 -// Load G_3. 
-// Begin Loading Q's - load log2_hi part +// h = h_1 + h_2 + h_3 // -(p0) fmpy.s1 FR_G = FR_G, FR_G_3 -} { .mfi - nop.m 999 -(p0) fadd.s1 FR_H = FR_H, FR_H_3 - nop.i 999 + ldfe FR_Q_2 = [GR_Table_Ptr],16 // Load Q_2 + fadd.s1 FR_h = FR_h, FR_h_3 + nop.i 999 } ;; // -// Y_lo = poly + Y_lo +// GS_hi = G*S +// r = G*S -1 // - -{ .mmi - nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Arg#), gp +{ .mfi + ldfe FR_Q_1 = [GR_Table_Ptr],16 // Load Q_1 + fmpy.s1 FR_GS_hi = FR_G, FR_S nop.i 999 } -;; - -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] +{ .mfi nop.m 999 + fms.s1 FR_r = FR_G, FR_S, f1 nop.i 999 } ;; - -{ .mfi - nop.m 999 -(p0) fadd.s1 FR_h = FR_h, FR_h_3 - nop.i 999 ;; -} -{ .mfi - nop.m 999 // -// Load Q_5 +// poly_lo = Q_5 + r * Q_6 // -(p0) fmpy.s1 FR_GS_hi = FR_G, FR_S - nop.i 999 -} -{ .mfi - nop.m 999 -(p0) fms.s1 FR_r = FR_G, FR_S, f1 - nop.i 999 ;; -} { .mfi - nop.m 999 -(p0) fma.s1 FR_poly_lo = FR_r, FR_Q_6, FR_Q_5 - nop.i 999 + getf.exp GR_Delta_Exp = FR_Delta // Get signexp of y-1 for exp calc + fma.s1 FR_poly_lo = FR_r, FR_Q_6, FR_Q_5 + nop.i 999 } -{ .mfi - nop.m 999 // -// GS_hi = G*S -// Load Q_4 +// r_cor = GS_hi -1 // -(p0) fsub.s1 FR_r_cor = FR_GS_hi, f1 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fms.s1 FR_GS_lo = FR_G, FR_S, FR_GS_hi - nop.i 999 -} { .mfi - nop.m 999 -(p0) fma.s1 FR_poly = FR_r, FR_Q_2, FR_Q_1 - nop.i 999 ;; + nop.m 999 + fsub.s1 FR_r_cor = FR_GS_hi, f1 + nop.i 999 } -{ .mfi - nop.m 999 +;; + // -// Load Q_3 -// r_cor = GS_hi -1 // GS_lo = G*S - GS_hi // -(p0) fmpy.s1 FR_rsq = FR_r, FR_r - nop.i 999 -} { .mfi - nop.m 999 -(p0) fma.s1 FR_G = FR_float_N, FR_log2_hi, FR_H - nop.i 999 ;; + nop.m 999 + fms.s1 FR_GS_lo = FR_G, FR_S, FR_GS_hi + nop.i 999 } -{ .mfi - nop.m 999 +;; + // -// poly = poly_hi + rsq * poly_lo -// Tbl = float_N*log2_hi + H +// rsq = r * r // -(p0) fma.s1 FR_Y_lo = FR_float_N, FR_log2_lo, FR_h - nop.i 999 ;; -} { .mfi - nop.m 999 -// -// r_cor = r_cor - r -// poly_hi = r * Q_2 + Q_1 -// -(p0) fma.s1 
FR_poly_lo = FR_r, FR_poly_lo, FR_Q_4 - nop.i 999 + nop.m 999 + fmpy.s1 FR_rsq = FR_r, FR_r + nop.i 999 } -{ .mfi - nop.m 999 // -// Load Q_1 +// G = float_N*log2_hi + H // -(p0) fsub.s1 FR_r_cor = FR_r_cor, FR_r - nop.i 999 ;; -} { .mfi - nop.m 999 -// -// Y_lo = float_N*log2_lo + h -// -(p0) fadd.s1 FR_Y_hi = FR_G, FR_r - nop.i 999 ;; + nop.m 999 + fma.s1 FR_G = FR_float_N, FR_log2_hi, FR_H + nop.i 999 } -{ .mfi - nop.m 999 +;; + // -// poly_lo = Q_4 + r * poly_lo;; -// r_cor = r_cor + GS_lo;; +// Y_lo = float_N*log2_lo + h // -(p0) fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_3 - nop.i 999 -} { .mfi - nop.m 999 -(p0) fadd.s1 FR_r_cor = FR_r_cor, FR_GS_lo - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fadd.s1 FR_r_cor = FR_r_cor, FR_Y_lo - nop.i 999 + nop.m 999 + fma.s1 FR_Y_lo = FR_float_N, FR_log2_lo, FR_h + nop.i 999 } -{ .mfi - nop.m 999 +;; + // -// poly_lo = Q_3 + r * poly_lo;; +// poly_lo = Q_4 + r * poly_lo +// r_cor = r_cor - r // -(p0) fma.s1 FR_poly = FR_rsq, FR_poly_lo, FR_poly - nop.i 999 ;; -} { .mfi - nop.m 999 -(p0) fsub.s1 FR_Y_lo = FR_G, FR_Y_hi - nop.i 999 -} -{ .mmi -(p0) ldfe FR_L_Inv = [GR_Table_Ptr],16 ;; -(p0) ldfe FR_L_hi = [GR_Table_Ptr],16 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_4 + nop.i 999 } { .mfi -(p0) ldfe FR_L_lo = [GR_Table_Ptr],16 - nop.f 999 - nop.i 999 ;; + nop.m 999 + fsub.s1 FR_r_cor = FR_r_cor, FR_r + nop.i 999 } -{ .mfi - nop.m 999 +;; + // -// Y_hi = Tbl + r -// r_cor = r_cor + Y_lo +// poly_hi = r * Q_2 + Q_1 +// Y_hi = G + r // -(p0) fma.s1 FR_poly = FR_rsq, FR_poly, FR_r_cor - nop.i 999 ;; -} { .mfi - nop.m 999 -// Y_lo = Tbl - Y_hi -// poly = rsq * poly + r_cor -// -(p0) fadd.s1 FR_Y_lo = FR_Y_lo, FR_r - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly = FR_r, FR_Q_2, FR_Q_1 + nop.i 999 } -{ .mfb - nop.m 999 -// -// Y_lo = Y_lo + r -// -(p0) fadd.s1 FR_Y_lo = FR_Y_lo, FR_poly -// -// Load L_Inv -// Load L_hi -// Load L_lo -// all long before they are needed. 
-// They are used in LOGL_RETURN PATH -// -br.cond.sptk L(LOGL_RETURN) ;; +{ .mfi + nop.m 999 + fadd.s1 FR_Y_hi = FR_G, FR_r + nop.i 999 } -L(LOGL80_NEAR): +;; + // -// Branch LOGL80_NEAR +// poly_lo = Q_3 + r * poly_lo +// r_cor = r_cor + GS_lo // - -{ .mmi +{ .mfi nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_P#), gp + fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_3 nop.i 999 } -;; - -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] +{ .mfi nop.m 999 + fadd.s1 FR_r_cor = FR_r_cor, FR_GS_lo nop.i 999 } ;; -{ .mfi - nop.m 999 -(p0) fmpy.s1 FR_Wsq = FR_W, FR_W -(p0) add GR_Table_Ptr1 = 0x50,GR_Table_Ptr -} // -// Adjust ptr to 1/2 -// Adjust Ptr1 to P_4 +// Y_lo = G - Y_hi // -{ .mmi -(p0) ldfe FR_Half = [GR_Table_Ptr],16 ;; -(p0) ldfe FR_P_4 = [GR_Table_Ptr1],16 - nop.i 999 +{ .mfi + nop.m 999 + fsub.s1 FR_Y_lo_2 = FR_G, FR_Y_hi + nop.i 999 } +;; + // -// Load 1/2 +// r_cor = r_cor + Y_lo +// poly = poly_hi + rsq * poly_lo // -{ .mmi -(p0) ldfe FR_P_8 = [GR_Table_Ptr],16 ;; -(p0) ldfe FR_P_3 = [GR_Table_Ptr1],16 - nop.i 999 +{ .mfi + add GR_Table_Ptr = 0x0, GR_table_base // Constants_exp_64_Arg + fadd.s1 FR_r_cor = FR_r_cor, FR_Y_lo + nop.i 999 } -{ .mmi -(p0) ldfe FR_P_7 = [GR_Table_Ptr],16 ;; -(p0) ldfe FR_P_2 = [GR_Table_Ptr1],16 - nop.i 999 +{ .mfi + nop.m 999 + fma.s1 FR_poly = FR_rsq, FR_poly_lo, FR_poly + nop.i 999 } +;; + // -// Load P_7 -// half_W = .5 * W -// Load P_3 -// -{ .mmi -(p0) ldfe FR_P_6 = [GR_Table_Ptr],16 ;; -(p0) ldfe FR_P_1 = [GR_Table_Ptr1],16 - nop.i 999 ;; -} +// Load L_hi +// Load L_lo +// all long before they are needed. 
+// They are used in LOGL_RETURN PATH // -// Load P_6 -// Wsq = w * w -// poly = w*P_4 + P_3 -// Load P_2 +// Y_lo = Y_lo + r +// poly = rsq * poly + r_cor // { .mfi -(p0) ldfe FR_P_5 = [GR_Table_Ptr],16 -// -// Load P_5 -// poly_lo = w * P_8 + P_7 -// Y_hi = w - (1/2)w*w -// Load P_1 -// -(p0) fmpy.s1 FR_W4 = FR_Wsq, FR_Wsq - nop.i 999 + ldfe FR_L_hi = [GR_Table_Ptr],16 // Load L_hi + fadd.s1 FR_Y_lo = FR_Y_lo_2, FR_r + nop.i 999 } { .mfi - nop.m 999 -(p0) fmpy.s1 FR_W3 = FR_Wsq, FR_W - nop.i 999 + nop.m 999 + fma.s1 FR_poly = FR_rsq, FR_poly, FR_r_cor + nop.i 999 } ;; +{ .mfb + ldfe FR_L_lo = [GR_Table_Ptr],16 // Load L_lo + fadd.s1 FR_Y_lo = FR_Y_lo, FR_poly + br.cond.sptk LOGL_RETURN // Branch to common code +} +;; + + +LOGL80_NEAR: +// Here if |x-1| < 2^-8 // -// Y_lo = W3 * poly + Y_lo +// Branch LOGL80_NEAR // +{ .mmf + ldfe FR_P_8 = [GR_P_ptr1],16 // Load P_8 + ldfe FR_P_4 = [GR_P_ptr2],16 // Load P_4 + fmpy.s1 FR_Wsq = FR_W, FR_W +} +;; + { .mmi - nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Arg#), gp + ldfe FR_P_7 = [GR_P_ptr1],16 // Load P_7 + ldfe FR_P_3 = [GR_P_ptr2],16 // Load P_3 nop.i 999 } ;; { .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] - nop.m 999 + ldfe FR_P_6 = [GR_P_ptr1],16 // Load P_6 + ldfe FR_P_2 = [GR_P_ptr2],16 // Load P_2 nop.i 999 } ;; - { .mmi -(p0) ldfe FR_L_Inv = [GR_Table_Ptr],16 ;; -(p0) ldfe FR_L_hi = [GR_Table_Ptr],16 - nop.i 999 ;; -} -{ .mfi -(p0) ldfe FR_L_lo = [GR_Table_Ptr],16 -// -// Load P_8 -// Load P_4 -// -(p0) fmpy.s1 FR_half_W = FR_Half, FR_W - nop.i 999 ;; + ldfe FR_P_5 = [GR_P_ptr1],16 // Load P_5 + ldfe FR_P_1 = [GR_P_ptr2],16 // Load P_1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s1 FR_poly_lo = FR_W, FR_P_8,FR_P_7 - nop.i 999 + getf.exp GR_Delta_Exp = FR_Delta // Get signexp of y-1 for exp calc + fmpy.s1 FR_W4 = FR_Wsq, FR_Wsq + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 FR_poly = FR_W, FR_P_4, FR_P_3 - nop.i 999 ;; + add GR_Table_Ptr = 0x0, GR_table_base // Constants_exp_64_Arg + fmpy.s1 
FR_W3 = FR_Wsq, FR_W + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fnma.s1 FR_Y_hi = FR_W, FR_half_W, FR_W - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_half_W = FR_Half, FR_W + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// W4 = Wsq * Wsq -// poly = w *poly + P_2 -// -(p0) fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_6 - nop.i 999 + ldfe FR_L_hi = [GR_Table_Ptr],16 + fma.s1 FR_poly_lo = FR_W, FR_P_8,FR_P_7 + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 FR_poly = FR_W, FR_poly, FR_P_2 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly = FR_W, FR_P_4, FR_P_3 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fsub.s1 FR_Y_lo = FR_W, FR_Y_hi - nop.i 999 ;; + ldfe FR_L_lo = [GR_Table_Ptr],16 + fnma.s1 FR_Y_hi = FR_W, FR_half_W, FR_W + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// poly = w * poly + P_1 -// w3 = wsq * w -// -(p0) fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_5 - nop.i 999 + nop.m 999 + fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_6 + nop.i 999 } { .mfi - nop.m 999 -// -// poly_lo = w * poly_lo + P_6 -// Y_lo = W - Y_hi -// -(p0) fma.s1 FR_poly = FR_W, FR_poly, FR_P_1 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly = FR_W, FR_poly, FR_P_2 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fnma.s1 FR_Y_lo = FR_W, FR_half_W, FR_Y_lo - nop.i 999 ;; + nop.m 999 + fsub.s1 FR_Y_lo = FR_W, FR_Y_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// poly_lo = w * poly_lo + -// Y_lo = Y_lo - w * (1/2)w -// -(p0) fma.s1 FR_poly = FR_poly_lo, FR_W4, FR_poly - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_5 + nop.i 999 } { .mfi - nop.m 999 -// -// Y_lo = (W-Y_hi) - w * (1/2)w -// poly = W4* poly_lo + poly -// -(p0) fma.s1 FR_Y_lo = FR_poly, FR_W3, FR_Y_lo - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly = FR_W, FR_poly, FR_P_1 + nop.i 999 } -L(LOGL_RETURN): +;; + { .mfi -(p0) add GR_Expo_Range = 0x2,r0 -// -// Load L_Inv -// Load L_hi -// Load L_lo -// all long before they are needed. -// -// -// kernel_log_80 computed ln(X) -// and return logX_hi and logX_lo as results. 
-// PR_pow_Safe set as well. -// -(p0) fmpy.s1 FR_X_lo = FR_Input_Y, FR_logx_lo -// -// Compute Y * (logX_hi + logX_lo) -// P_hi -> X -// P_lo -> X_cor -// (Manipulate names so that inputs are in -// the place kernel_exp expects them) -// Set GR_Flag to 2 -// Set GR_Expo_Range to Double -// -// This function computes exp( x + x_cor) -// Input FR 1: FR_X -// Input FR 2: FR_X_cor -// Input GR 1: GR_Flag -// Input GR 2: GR_Expo_Range -// Output FR 3: FR_Y_hi -// Output FR 4: FR_Y_lo -// Output FR 5: FR_Scale -// Output PR 1: PR_Safe -// -(p0) cmp.eq.unc p15, p0 = r0, r0 + nop.m 999 + fnma.s1 FR_Y_lo = FR_W, FR_half_W, FR_Y_lo + nop.i 999 } ;; -{ .mmi -(p0) addl GR_W1_ptr = @ltoff(Constants_exp_64_W1#), gp -(p0) addl GR_W2_ptr = @ltoff(Constants_exp_64_W2#), gp -(p0) add GR_Flag = 0x2,r0 +{ .mfi + nop.m 999 + fma.s1 FR_poly = FR_poly_lo, FR_W4, FR_poly + nop.i 999 } ;; -{ .mmi - ld8 GR_W1_ptr = [GR_W1_ptr] - ld8 GR_W2_ptr = [GR_W2_ptr] -(p0) cmp.ne.unc p7, p0 = 0x1, GR_Flag +{ .mfi + nop.m 999 + fma.s1 FR_Y_lo = FR_poly, FR_W3, FR_Y_lo + nop.i 999 } ;; -{ .mlx - nop.m 999 -(p0) movl GR_Mask = 0x1FFFF ;; -} +LOGL_RETURN: +// Common code for completion of both logx paths -{ .mlx - nop.m 999 -(p0) movl GR_BIAS = 0x0FFFF ;; -} -{ .mfi - nop.m 999 // -// X_lo = Y * logX_lo +// L_hi, L_lo already loaded. // -(p0) fma.s1 FR_P_hi = FR_Input_Y, FR_logx_hi,FR_X_lo - nop.i 999 ;; -} -{ .mfi - nop.m 999 // -// Set Safe=True -// Flag is always 2 for this routine +// kernel_log_80 computed ln(X) +// and return logX_hi and logX_lo as results. +// PR_pow_Safe set as well. 
// -(p0) fmpy.s1 FR_float_N = FR_X, FR_L_Inv - nop.i 999 -} -{ .mfi - nop.m 999 // -// X_hi = Y * logX_hi + X_lo -// Set GR_Flag = 2 for exp(x + xcor) +// Compute Y * (logX_hi + logX_lo) +// P_hi -> X +// P_lo -> X_cor +// (Manipulate names so that inputs are in +// the place kernel_exp expects them) // -(p0) fms.s1 FR_P_lo= FR_Input_Y, FR_logx_hi, FR_P_hi - nop.i 999 ;; +// This function computes exp( x + x_cor) +// Input FR 1: FR_X +// Input FR 2: FR_X_cor +// Output FR 3: FR_Y_hi +// Output FR 4: FR_Y_lo +// Output FR 5: FR_Scale +// Output PR 1: PR_Safe +// +// P15 is True +// +// Load constants used in computing N using right-shift technique +{ .mlx + mov GR_exp_2tom51 = 0xffff-51 + movl GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 } -{ .mmi - nop.m 999 ;; -(p0) getf.exp GR_Expo_X = FR_X - nop.i 999 ;; +{ .mlx + add GR_Special_Exp = -50,GR_exp_bias + movl GR_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51) } -{ .mfi -(p0) and GR_Expo_X = GR_Expo_X, GR_Mask +;; + // -// Calculate unBIASed exponent of X // Point to Table of W1s // Point to Table of W2s // -(p0) fcvt.fx.s1 FR_N = FR_float_N - nop.i 999 ;; -} +{ .mmi + add GR_W1_ptr = 0x2b0, GR_table_base // Constants_exp_64_W1 + add GR_W2_ptr = 0x4b0, GR_table_base // Constants_exp_64_W2 + cmp.le p6,p0= GR_Delta_Exp,GR_Special_Exp +};; + +// Form two constants we need +// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128 +// 1.1000..000 * 2^(63+63-12) to right shift int(N) into the significand + { .mfi - nop.m 999 -(p0) fadd.s1 FR_P_lo = FR_P_lo, FR_X_lo -// -// Float_N = X * L_Inv -// Create exponent BIAS -// Get BIASed exponent of X -// -(p0) sub GR_Expo_X = GR_Expo_X, GR_BIAS ;; + setf.sig FR_INV_LN2_2TO63 = GR_sig_inv_ln2 // form 1/ln2 * 2^63 + nop.f 999 + and GR_Delta_Exp=GR_Delta_Exp,GR_exp_mask // Get exponent of y-1 } -{ .mib -(p0) cmp.gt.unc p9, p0 = -6, GR_Expo_X - nop.i 999 -// -// N = fcvt.fx(float_N) -// If -6 > Expo_X, set P9 -// -(p9) br.cond.spnt L(EXPL_SMALL) +{ .mlx + setf.d 
FR_RSHF_2TO51 = GR_rshf_2to51 // Form const 1.1000 * 2^(63+51) + movl GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift } ;; -// -// If expo_X < -6 goto exp_small -// -{ .mmi +{ .mfi nop.m 999 -(p0) addl GR_T1_ptr = @ltoff(Constants_exp_64_T1#), gp -(p0) cmp.lt.unc p10, p0 = 14, GR_Expo_X + fmpy.s1 FR_X_lo = FR_Input_Y, FR_logx_lo // logx_lo is Y_lo + cmp.eq p15, p0= r0, r0 // Set p15, assume safe +};; + +{ .mmi + setf.exp FR_2TOM51 = GR_exp_2tom51 // Form 2^-51 for scaling float_N + setf.d FR_RSHF = GR_rshf // Form right shift const 1.1000 * 2^63 + add GR_Table_Ptr1 = 0x50, GR_table_base // Constants_exp_64_P for + // EXPL_SMALL path } ;; { .mmi - ld8 GR_T1_ptr = [GR_T1_ptr] - nop.m 999 + ldfe FR_P_6 = [GR_Table_Ptr1],16 // Load P_6 for EXPL_SMALL path +;; + ldfe FR_P_5 = [GR_Table_Ptr1],16 // Load P_5 for EXPL_SMALL path nop.i 999 } ;; -{ .mib - nop.m 999 - nop.i 999 -// -// If 14 < Expo_X, set P10 -// Create pointer to T1 table -// -(p10) br.cond.spnt L(EXPL_HUGE) ;; +{ .mfi + ldfe FR_P_4 = [GR_Table_Ptr1],16 // Load P_4 for EXPL_SMALL path + fma.s1 FR_P_hi = FR_Input_Y, FR_logx_hi,FR_X_lo // logx_hi ix Y_hi + nop.i 999 } - +;; { .mmi -(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Exponents#), gp -(p0) addl GR_T2_ptr = @ltoff(Constants_exp_64_T2#), gp + ldfe FR_P_3 = [GR_Table_Ptr1],16 // Load P_3 for EXPL_SMALL path +;; + ldfe FR_P_2 = [GR_Table_Ptr1],16 // Load P_2 for EXPL_SMALL path nop.i 999 } ;; -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] - ld8 GR_T2_ptr = [GR_T2_ptr] +// N = X * Inv_log2_by_2^12 +// By adding 1.10...0*2^63 we shift and get round_int(N_signif) in significand. +// We actually add 1.10...0*2^51 to X * Inv_log2 to do the same thing. 
+{ .mfi + ldfe FR_P_1 = [GR_Table_Ptr1] // Load P_1 for EXPL_SMALL path + fma.s1 FR_N = FR_X, FR_INV_LN2_2TO63, FR_RSHF_2TO51 nop.i 999 } +{ .mfb + nop.m 999 + fms.s1 FR_P_lo= FR_Input_Y, FR_logx_hi, FR_P_hi // P_hi is X +(p6) br.cond.spnt POWL_Y_ALMOST_1 // Branch if |y-1| < 2^-50 +} ;; - { .mmi -(p0) shladd GR_Table_Ptr = GR_Expo_Range,4,GR_Table_Ptr ;; -// -// Adjust T1_ptr by x 4 for single-precision values -// Adjust T2_ptr by x 4 for single-precision values -// -(p0) ld8 GR_Big_Pos_Exp = [GR_Table_Ptr],8 - nop.i 999 ;; -} -// -// Load double W1 -// Load +max exponent -// -{ .mfi -(p0) ld8 GR_Big_Neg_Exp = [GR_Table_Ptr],0 -// -// If 14 < Expo_X, goto exp_huge -// -(p0) fcvt.xf FR_float_N = FR_N - nop.i 999 + getf.exp GR_Expo_X = FR_X + add GR_T1_ptr = 0x0b0, GR_table_base // Constants_exp_64_T1 + add GR_T2_ptr = 0x1b0, GR_table_base // Constants_exp_64_T2 } ;; -// -// Load double W2 -// Load -max exponent -// Load ptr to A's -// +// float_N = round_int(N) +// The signficand of N contains the rounded integer part of X * 2^12/ln2, +// as a twos complement number in the lower bits (that is, it may be negative). +// That twos complement number (called N) is put into GR_N_fix. -{ .mmi -(p0) getf.sig GR_N_fix = FR_N -(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_A#), gp +// Since N is scaled by 2^51, it must be multiplied by 2^-51 +// before the shift constant 1.10000 * 2^63 is subtracted to yield float_N. 
+// Thus, float_N contains the floating point version of N + + +{ .mfi + add GR_Table_Ptr = 0x20, GR_table_base // Constants_exp_64_A + fms.s1 FR_float_N = FR_N, FR_2TOM51, FR_RSHF // Form float_N nop.i 999 } -;; +// Create low part of Y(ln(x)_hi + ln(x)_lo) as P_lo +{ .mfi + mov GR_Big_Pos_Exp = 0x3ffe // 16382, largest safe exponent + fadd.s1 FR_P_lo = FR_P_lo, FR_X_lo + mov GR_Big_Neg_Exp = -0x3ffd // -16381 smallest safe exponent +};; -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] +{ .mfi nop.m 999 - nop.i 999 + fmpy.s1 FR_rsq = FR_X, FR_X // rsq = X*X for EXPL_SMALL path + mov GR_vsm_expo = -70 // Exponent for very small path +} +{ .mfi + nop.m 999 + fma.s1 FR_poly_lo = FR_P_6, FR_X, FR_P_5 // poly_lo for EXPL_SMALL path + add GR_temp = 0x1,r0 // For tiny signif if small path } ;; // -// Load single T1 -// Load single T2 -// W_1_p1 = W_1 + 1 -// -{ .mmi -(p0) ldfe FR_A_3 = [GR_Table_Ptr],16 ;; -// -// Load A_3 -// if k > big_pos_exp, set p14 and Safe=False -// -(p0) ldfe FR_A_2 = [GR_Table_Ptr],16 -(p0) extr.u GR_M1 = GR_N_fix, 6, 6 -} -{ .mmi - nop.m 999 ;; -(p0) shladd GR_W1_ptr = GR_M1,3,GR_W1_ptr -// -// float_N = fcvt.xf(N) -// N_fix = significand of N -// Create pointer to T2 table -// -(p0) extr.u GR_M2 = GR_N_fix, 0, 6 -} -// -// r = r + X_cor -// Adjust W1_ptr by x 8 for double-precision values -// Adjust W2_ptr by x 8 for double-precision values -// Adjust Table_ptr by Expo_Rangex16 +// If expo_X < -6 goto exp_small // { .mmi -(p0) shladd GR_T1_ptr = GR_M1,2,GR_T1_ptr ;; -(p0) ldfd FR_W1 = [GR_W1_ptr],0 -(p0) shladd GR_W2_ptr = GR_M2,3,GR_W2_ptr + getf.sig GR_N_fix = FR_N + ldfe FR_A_3 = [GR_Table_Ptr],16 // Load A_3 + and GR_Expo_X = GR_Expo_X, GR_exp_mask // Get exponent of X } -// -// Load ptr to A's -// +;; + { .mfi -(p0) ldfs FR_T1 = [GR_T1_ptr],0 -(p0) fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_X -(p0) shladd GR_T2_ptr = GR_M2,2,GR_T2_ptr ;; + ldfe FR_A_2 = [GR_Table_Ptr],16 // Load A_2 + nop.f 999 + sub GR_Expo_X = GR_Expo_X, GR_exp_bias // Get true 
exponent of X } -{ .mmi -(p0) ldfd FR_W2 = [GR_W2_ptr],0 -(p0) ldfs FR_T2 = [GR_T2_ptr],0 +;; + // -// r = x - L_hi * float_N -// M2 = extr.u(N_fix,0,6) -// M1 = extr.u(N_fix,6,6) +// If -6 > Expo_X, set P9 and branch // -(p0) extr GR_k = GR_N_fix, 12, 52 ;; +{ .mfb + cmp.gt p9, p0 = -6, GR_Expo_X + fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_X // r = X - L_hi * float_N +(p9) br.cond.spnt EXPL_SMALL // Branch if |X| < 2^-6 } +;; + // -// Load A_1 -// poly = A_3 * r + A_2 -// rsq = r*r +// If 14 <= Expo_X, set P10 // -{ .mii -(p0) add GR_BIAS_p_k = GR_BIAS, GR_k -(p0) cmp.gt.unc p14,p15 = GR_k,GR_Big_Pos_Exp ;; -(p15) cmp.lt p14,p15 = GR_k,GR_Big_Neg_Exp +{ .mib + cmp.le p10, p0 = 14, GR_Expo_X + nop.i 999 +(p10) br.cond.spnt EXPL_HUGE // Branch if |X| >= 2^14 } +;; + // -// BIAS_p_K = BIAS + k -// T = T1 * T2 +// Load single T1 +// Load single T2 +// W_1_p1 = W_1 + 1 // -{ .mfi -(p0) setf.exp FR_Scale = GR_BIAS_p_k - nop.f 999 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r - nop.i 999 +{ .mmi + nop.m 999 + nop.m 999 + extr.u GR_M1 = GR_N_fix, 6, 6 // Extract index M_1 } +;; + // -// W = W_1_p1 * W2 + W1 +// k = extr.u(N_fix,0,6) // -{ .mfi -(p0) ldfe FR_A_1 = [GR_Table_Ptr],16 - nop.f 999 - nop.i 999 ;; +{ .mmi + shladd GR_W1_ptr = GR_M1,3,GR_W1_ptr // Point to W1 + shladd GR_T1_ptr = GR_M1,2,GR_T1_ptr // Point to T1 + extr.u GR_M2 = GR_N_fix, 0, 6 // Extract index M_2 } -{ .mfi - nop.m 999 -(p0) fadd.s1 FR_W_1_p1 = FR_W1, f1 - nop.i 999 ;; +;; + +// N_fix is only correct up to 50 bits because of our right shift technique. +// Actually in the normal path we will have restricted K to about 14 bits. +// Somewhat arbitrarily we extract 32 bits. +{ .mmi + ldfd FR_W1 = [GR_W1_ptr] + shladd GR_W2_ptr = GR_M2,3,GR_W2_ptr // Point to W2 + extr GR_k = GR_N_fix, 12, 32 // Extract k } +;; + { .mfi - nop.m 999 -// -// k = extr.u(N_fix,0,6) -// r = r - N * L_lo -// Load ptr to Table of exponent thresholds. 
-// -(p0) fadd.s1 FR_r = FR_r, FR_X_cor - nop.i 999 + ldfs FR_T1 = [GR_T1_ptr] + fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r + shladd GR_T2_ptr = GR_M2,2,GR_T2_ptr // Point to T2 } { .mfi - nop.m 999 -(p0) fmpy.s1 FR_T = FR_T1, FR_T2 - nop.i 999 ;; + add GR_exp_bias_p_k = GR_exp_bias, GR_k + nop.f 999 + cmp.gt p14,p15 = GR_k,GR_Big_Pos_Exp } -{ .mfi - nop.m 999 +;; + // -// if k < big_neg_exp, set p14 and Safe=False -// Load A_2 +// if k < big_neg_exp, set p14 and Safe=False // -(p0) fma.s1 FR_W = FR_W2, FR_W_1_p1, FR_W1 - nop.i 999 ;; +{ .mmi + ldfs FR_T2 = [GR_T2_ptr] +(p15) cmp.lt p14,p15 = GR_k,GR_Big_Neg_Exp + nop.i 999 } -{ .mfi - nop.m 999 -(p0) fma.s1 FR_poly = FR_r, FR_A_3, FR_A_2 - nop.i 999 +;; + +{ .mmi + setf.exp FR_Scale = GR_exp_bias_p_k + ldfd FR_W2 = [GR_W2_ptr] + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fmpy.s1 FR_rsq = FR_r, FR_r - nop.i 999 ;; + ldfe FR_A_1 = [GR_Table_Ptr],16 + fadd.s1 FR_r = FR_r, FR_X_cor + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) mov FR_Y_hi = FR_T - nop.i 999 ;; + nop.m 999 + fadd.s1 FR_W_1_p1 = FR_W1, f1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Scale = set_exp(BIAS_p_k) -// poly = r * poly + A_1 -// -(p0) fadd.s1 FR_Wp1 = FR_W, f1 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly = FR_r, FR_A_3, FR_A_2 + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 FR_poly = FR_r, FR_poly, FR_A_1 - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_rsq = FR_r, FR_r + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s1 FR_poly = FR_rsq, FR_poly,FR_r - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_T = FR_T1, FR_T2 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Wp1 = W + 1 -// poly = rsq * poly + rk -// -(p0) fma.s1 FR_Y_lo = FR_Wp1, FR_poly, FR_W - nop.i 999 ;; -} -{ .mfb - nop.m 999 -// -// Y_lo = poly * Wp1 + W -// Y_hi = T -// -(p0) fmpy.s1 FR_Y_lo = FR_Y_lo, FR_T -// -// Y_lo = T * Y_lo -// -(p0) br.cond.sptk L(EXPL_RETURN) ;; + nop.m 999 + fma.s1 FR_W = FR_W2, FR_W_1_p1, FR_W1 + nop.i 999 } +;; -L(EXPL_SMALL): - -// -// r4 = rsq * rsq -// - -{ .mmi +{ .mfi 
nop.m 999 -(p0) addl GR_Table_Ptr1 = @ltoff(Constants_exp_64_P), gp + fma.s1 FR_TMP1 = FR_Scale, FR_Sgn, f0 nop.i 999 } ;; -{ .mmi - ld8 GR_Table_Ptr1 = [GR_Table_Ptr1] +{ .mfi nop.m 999 + fma.s1 FR_poly = FR_r, FR_poly, FR_A_1 nop.i 999 } ;; -{ .mmf - nop.m 999 -(p0) ldfe FR_P_6 = [GR_Table_Ptr1],16 -// -// Return -// -(p0) fadd.s1 FR_r = FR_X,f0 ;; +{ .mfi + nop.m 999 + fma.s1 FR_TMP2 = FR_T, f1, f0 // TMP2 = Y_hi = T + nop.i 999 } +;; -{ .mmi +{ .mfi nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Exponents#), gp + fadd.s1 FR_Wp1 = FR_W, f1 nop.i 999 } ;; -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] -(p0) ldfe FR_P_5 = [GR_Table_Ptr1],16 +{ .mfi + nop.m 999 + fma.s1 FR_poly = FR_rsq, FR_poly,FR_r nop.i 999 } ;; -// -// Is input very small? -// Load P_5 -// -{ .mii -(p0) ldfe FR_P_4 = [GR_Table_Ptr1],16 -(p0) add GR_Table_Ptr = 0x040,GR_Table_Ptr ;; -(p0) shladd GR_Table_Ptr = GR_Expo_Range,3,GR_Table_Ptr ;; -} -{ .mmb -(p0) ldfe FR_P_3 = [GR_Table_Ptr1],16 -// -// Adjust ptr. 
-// -(p0) ld8 GR_vsm_expo = [GR_Table_Ptr],0 - nop.b 999 ;; -} { .mfi - nop.m 999 -// -// r = X (don't seem to need X_Cor) -// Load the threshold exponents -// -(p0) fmpy.s1 FR_rsq = FR_r, FR_r - nop.i 999 ;; + nop.m 999 + fma.s1 FR_Tscale = FR_T, FR_TMP1, f0 // Scale * Sgn * T + nop.i 999 } -// -// Load the negative integer -// Load P_5 -// { .mfi -(p0) cmp.lt.unc p12, p0 = GR_Expo_X, GR_vsm_expo - nop.f 999 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_Y_lo = FR_Wp1, FR_poly, FR_W + nop.i 999 } +;; + { .mfb - nop.m 999 -// -// rsq = r * r -// Offset into exponents -// -(p0) fmpy.s1 FR_r4 = FR_rsq, FR_rsq -(p12) br.cond.spnt L(EXPL_VERY_SMALL) ;; + nop.m 999 + fmpy.s1 FR_TMP3 = FR_Y_lo, FR_Tscale + br.cond.sptk POWL_64_SHARED } -{ .mfi -(p0) ldfe FR_P_2 = [GR_Table_Ptr1],16 -// -// Load p4,p3,p2,p1 -// -(p0) fma.s1 FR_poly_lo = FR_P_6, FR_r, FR_P_5 +;; + + +EXPL_SMALL: +// Here if |ylogx| < 2^-6 // -// Y_lo = r4 * poly_lo + poly_hi -// Scale = 1.0 +// Begin creating lsb to perturb final result // -(p0) add GR_temp = 0x1,r0 ;; +{ .mfi + setf.sig FR_temp = GR_temp + fma.s1 FR_poly_lo = FR_poly_lo, FR_X, FR_P_4 + cmp.lt p12, p0 = GR_Expo_X, GR_vsm_expo // Test |ylogx| < 2^-70 } -{ .mmf - nop.m 999 -(p0) ldfe FR_P_1 = [GR_Table_Ptr1],0 -(p0) mov FR_Scale = f1 +{ .mfi + nop.m 999 + fma.s1 FR_poly_hi = FR_P_2, FR_X, FR_P_1 + nop.i 999 } -// -// Begin creating lsb to perturb final result -// +;; + { .mfi -(p0) setf.sig FR_temp = GR_temp -(p0) mov FR_Y_hi = f1 - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_TMP2 = f1, f1 + nop.i 999 } { .mfi - nop.m 999 -// -// poly_lo = p_5 + p_6 * r -// poly_hi = p_1 + p_2 * r -// -(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_P_4 - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_TMP1 = FR_Sgn, f1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// poly_lo = p_4 + poly_lo * r -// poly_hi = r + poly_hi * rsq -// -(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_P_3 - nop.i 999 + nop.m 999 + fmpy.s1 FR_r4 = FR_rsq, FR_rsq +(p12) cmp.eq p15, p0 = r0, r0 // Set safe if 
|ylogx| < 2^-70 } +{ .mfb + nop.m 999 +(p12) fmpy.s1 FR_TMP3 = FR_Sgn, FR_X +(p12) br.cond.spnt POWL_64_SHARED // Branch if |ylogx| < 2^-70 +} +;; + { .mfi - nop.m 999 -(p0) fma.s1 FR_poly_hi = FR_P_2, FR_r, FR_P_1 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly_lo = FR_poly_lo, FR_X, FR_P_3 + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, FR_r - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, FR_X + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// poly_lo = p_3 + poly_lo * r -// Y_hi = 1, always -// -(p0) fma.s1 FR_Y_lo = FR_poly_lo, FR_r4, FR_poly_hi - nop.i 999 ;; + nop.m 999 + fma.s1 FR_Y_lo = FR_poly_lo, FR_r4, FR_poly_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Set lsb in fp register -// -(p0) for FR_temp = FR_Y_lo,FR_temp - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_TMP3 = FR_Y_lo, FR_TMP1 // Add sign info + nop.i 999 } -{ .mfb - nop.m 999 +;; + // // Toggle on last bit of Y_lo -// -(p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_temp -// // Set lsb of Y_lo to 1 // -(p0) br.cond.sptk L(EXPL_RETURN) ;; -} -L(EXPL_VERY_SMALL): { .mfi - nop.m 999 -(p0) mov FR_Y_lo = FR_r -(p0) cmp.eq.unc p15, p0 = r0, r0 + nop.m 999 + for FR_temp = FR_Y_lo,FR_temp + nop.i 999 } -{ .mfi - nop.m 999 -(p0) mov FR_Scale = f1 - nop.i 999 -};; +;; + { .mfb - nop.m 999 -(p0) mov FR_Y_hi = f1 -// -// If flag_not_1, -// Y_hi = 1.0 -// Y_lo = X + X_cor -// PR_Safe = true -// -(p0) br.cond.sptk L(EXPL_RETURN) ;; + nop.m 999 + fmerge.se FR_TMP3 = FR_TMP3,FR_temp + br.cond.sptk POWL_64_SHARED } -L(EXPL_HUGE): +;; + + +EXPL_HUGE: +// Here if |ylogx| >= 2^14 { .mfi - nop.m 999 -// -// Return for flag=2 -// -(p0) fcmp.gt.unc.s1 p12, p13 = FR_X, f0 -(p0) cmp.eq.unc p14, p15 = r0, r0 ;; + mov GR_temp = 0x0A1DC // If X < 0, exponent -24100 + fcmp.gt.s1 p12, p13 = FR_X, f0 // Test X > 0 + cmp.eq p14, p15 = r0, r0 // Set Safe to false } -{ .mlx - nop.m 999 -// -// Set Safe to false -// Is x > 0 -// -(p12) movl GR_Mask = 0x15DC0 ;; -} -{ .mlx -(p12) setf.exp FR_Y_hi = 
GR_Mask -(p13) movl GR_Mask = 0xA240 ;; +;; + +{ .mmi +(p12) mov GR_Mask = 0x15DC0 // If X > 0, exponent +24000 +(p13) mov GR_Mask = 0x0A240 // If X < 0, exponent -24000 + nop.i 999 } -{ .mlx -(p13) setf.exp FR_Y_hi = GR_Mask -// -// x > 0: Create mask for Y_hi = 2**(24,000) -// x <= 0: Create mask for Y_hi = 2**(-24,000) -// -(p13) movl GR_temp = 0xA1DC ;; +;; + +{ .mmf + setf.exp FR_TMP2 = GR_Mask // Form Y_hi = TMP2 +(p13) setf.exp FR_Y_lo = GR_temp // If X < 0, Y_lo = 2^-24100 +(p12) mov FR_Y_lo = f1 // IF X > 0, Y_lo = 1.0 } +;; + { .mfi -(p13) setf.exp FR_Y_lo = GR_temp -// -// x < =0: Create mask for 2**(-24,100) -// x <= 0: Y_lo = w**(-24,100) -// -(p12) mov FR_Y_lo = f1 - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_TMP1 = FR_TMP2, FR_Sgn // TMP1 = Y_hi * Sgn + nop.i 999 } -{ .mfi - nop.m 999 -(p12) mov FR_Scale = FR_Y_hi - nop.i 999 ;; +;; + +{ .mfb + nop.m 999 + fmpy.s1 FR_TMP3 = FR_Y_lo,FR_TMP1 // TMP3 = Y_lo * (Y_hi * Sgn) + br.cond.sptk POWL_64_SHARED } -{ .mfi - nop.m 999 +;; + +POWL_Y_ALMOST_1: +// Here if delta = |y-1| < 2^-50 // -// x > 0: Y_lo = 1.0 -// x > 0: Scale = 2**(24,000) +// x**(1 + delta) = x * e (ln(x)*delta) = x ( 1 + ln(x) * delta) // -(p13) mov FR_Scale = FR_Y_hi - nop.i 999 ;; -} -L(EXPL_RETURN): +// Computation will be safe for 2^-16381 <= x < 2^16383 + { .mfi - nop.m 999 -// -// Scale = 2**(24,000) -// -// -// exp(y *ln(x)) almost complete -// FR_Scale is Scale -// f34 is Z_hi -// f35 is Z_lo -// -(p0) fmpy.s1 FR_Sgn = FR_Scale, FR_Sgn - nop.i 999 ;; + mov GR_exp_ynear1_oflow = 0xffff + 16383 + fma.s1 FR_TMP1 = FR_Input_X,FR_Delta,f0 + and GR_exp_x = GR_exp_mask, GR_signexp_x } +;; + { .mfi - nop.m 999 -// -// sgn * scale -// -(p0) fmpy.s1 FR_Y_lo = FR_Y_lo,FR_Sgn - nop.i 999 ;; + cmp.lt p15, p14 = GR_exp_x, GR_exp_ynear1_oflow + fma.s1 FR_TMP2 = FR_logx_hi,f1,FR_X_lo + mov GR_exp_ynear1_uflow = 0xffff - 16381 } +;; + { .mfb - nop.m 999 -// -// Z_lo * (sgn * scale) +(p15) cmp.ge p15, p14 = GR_exp_x, GR_exp_ynear1_uflow + fma.s1 FR_TMP3 
= FR_Input_X,f1,f0 + br.cond.sptk POWL_64_SHARED +};; + +POWL_64_SQUARE: // -(p0) fma.s0 FR_Result = FR_Y_hi, FR_Sgn, FR_Y_lo +// Here if x not zero and y=2. // -// Z_hi * (sgn * scale) + Z_lo +// Setup for multipath code // -(p15) br.cond.sptk L(POWL_64_RETURN) ;; -} { .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x01 - nop.i 999 -} -{ .mlx - nop.m 999 -// -// Z_hi * (sgn * scale) + Z_lo with wre & td -// Z_hi * (sgn * scale) + Z_lo with fz & td -// -(p0) movl GR_T1_ptr = 0x00000000013FFF ;; + mov GR_exp_square_oflow = 0xffff + 8192 // Exponent where x*x overflows + fmerge.se FR_TMP1 = FR_Input_X, FR_Input_X + and GR_exp_x = GR_exp_mask, GR_signexp_x // Get exponent of x } +;; + { .mfi - nop.m 999 -(p0) fma.s3 FR_Result_small = FR_Y_hi, FR_Sgn, FR_Y_lo - nop.i 999 + cmp.lt p15, p14 = GR_exp_x, GR_exp_square_oflow // Decide safe/unsafe + fmerge.se FR_TMP2 = FR_Input_X, FR_Input_X + mov GR_exp_square_uflow = 0xffff - 8191 // Exponent where x*x underflows } +;; + { .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x40 - nop.i 999 ;; +(p15) cmp.ge p15, p14 = GR_exp_x, GR_exp_square_uflow // Decide safe/unsafe + fma.s1 FR_TMP3 = f0,f0,f0 + nop.i 999 } -{ .mfi - nop.m 999 +;; + // -// Return if no danger of over of underflow. +// This is the shared path that will set overflow and underflow. // -(p0) fsetc.s2 0x7F,0x42 - nop.i 999;; -} -{ .mfi - nop.m 999 +POWL_64_SHARED: + // -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + FZ + TD (Underflows) +// Return if no danger of over or underflow. 
// -(p0) fma.s2 FR_Result_big = FR_Y_hi, FR_Sgn, FR_Y_lo - nop.i 999 ;; +{ .mfb + nop.m 999 + fma.s0 FR_Result = FR_TMP1, FR_TMP2, FR_TMP3 +(p15) br.ret.sptk b0 // Main path return if certain no over/underflow } +;; + // -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + FZ + TD (Underflows) +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S2 user supplied status + FZ + TD (Underflows) // // // If (Safe) is true, then @@ -2430,973 +2022,742 @@ L(EXPL_RETURN): // No overflow or underflow here, but perhaps inexact. // Return // Else -// Determine if overflow or underflow was raised. -// Fetch +/- overflow threshold for IEEE single, double, -// double extended -// -{ .mfi -(p0) setf.exp FR_Big = GR_T1_ptr -(p0) fsetc.s2 0x7F,0x40 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fclass.m.unc p11, p0 = FR_Result_small, 0x00F - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fmerge.ns FR_NBig = FR_Big, FR_Big - nop.i 999 -} -{ .mfi - nop.m 999 -// -// Create largest double exponent + 1. -// Create smallest double exponent - 1. 
-// Identify denormals -// -(p0) fcmp.ge.unc.s1 p8, p0 = FR_Result_big , FR_Big - nop.i 999 ;; -} -{ .mii - nop.m 999 - nop.i 999 ;; -// -// fcmp: resultS2 <= - overflow threshold -// fclass: resultS3 is denorm/unorm/0 -// -(p8) mov GR_Parameter_TAG = 18 ;; -} -{ .mfb - nop.m 999 -// -// fcmp: resultS2 >= + overflow threshold -// -(p0) fcmp.le.unc.s1 p9, p0 = FR_Result_big, FR_NBig -(p8) br.cond.spnt __libm_error_region ;; -} -{ .mii - nop.m 999 - nop.i 999 ;; -(p9) mov GR_Parameter_TAG = 18 -} -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.spnt __libm_error_region ;; -} -// -// Report that pow overflowed - either +Inf, or -Inf -// -{ .mmb -(p11) mov GR_Parameter_TAG = 19 - nop.m 999 -(p11) br.cond.spnt __libm_error_region ;; -} -{ .mib - nop.m 999 - nop.i 999 -// -// Report that pow underflowed -// -(p0) br.cond.sptk L(POWL_64_RETURN) ;; -} - +// Determine if overflow or underflow was raised. +// Fetch +/- overflow threshold for IEEE double extended -L(POWL_64_SQUARE): -// Here if x not zero and y=2. -// Must call __libm_error_support for overflow or underflow -// -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + FZ + TD (Underflows) -// { .mfi - nop.m 999 -(p0) fma.s0 FR_Result = FR_Input_X, FR_Input_X, f0 - nop.i 999 -} -{ .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x01 - nop.i 999 -} -{ .mlx - nop.m 999 -(p0) movl GR_T1_ptr = 0x00000000013FFF ;; -} -{ .mfi - nop.m 999 -(p0) fma.s3 FR_Result_small = FR_Input_X, FR_Input_X, f0 - nop.i 999 -} -{ .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x40 - nop.i 999 ;; + nop.m 999 + fsetc.s2 0x7F,0x41 // For underflow test, set S2=User+TD+FTZ + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Return if no danger of over of underflow. 
-// -(p0) fsetc.s2 0x7F,0x42 - nop.i 999;; + nop.m 999 + fma.s2 FR_Result_small = FR_TMP1, FR_TMP2, FR_TMP3 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s2 FR_Result_big = FR_Input_X, FR_Input_X, f0 - nop.i 999 ;; + nop.m 999 + fsetc.s2 0x7F,0x42 // For overflow test, set S2=User+TD+WRE + nop.i 999 } -// -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + FZ + TD (Underflows) -// -// -// If (Safe) is true, then -// Compute result using user supplied status field. -// No overflow or underflow here, but perhaps inexact. -// Return -// Else -// Determine if overflow or underflow was raised. -// Fetch +/- overflow threshold for IEEE single, double, -// double extended -// +;; + { .mfi -(p0) setf.exp FR_Big = GR_T1_ptr -(p0) fsetc.s2 0x7F,0x40 - nop.i 999 ;; + nop.m 999 + fma.s2 FR_Result_big = FR_TMP1, FR_TMP2,FR_TMP3 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p11, p0 = FR_Result_small, 0x00F - nop.i 999 ;; + nop.m 999 + fsetc.s2 0x7F,0x40 // Reset S2=User + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fmerge.ns FR_NBig = FR_Big, FR_Big - nop.i 999 + nop.m 999 + fclass.m p11, p0 = FR_Result_small, 0x00F // Test small result unorm/zero + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Create largest double exponent + 1. -// Create smallest double exponent - 1. 
-// Identify denormals -// -(p0) fcmp.ge.unc.s1 p8, p0 = FR_Result_big , FR_Big - nop.i 999 ;; -} -{ .mii - nop.m 999 - nop.i 999 ;; -// -// fcmp: resultS2 <= - overflow threshold -// fclass: resultS3 is denorm/unorm/0 -// -(p8) mov GR_Parameter_TAG = 18 ;; + nop.m 999 + fcmp.ge.s1 p8, p0 = FR_Result_big , FR_Big // Test >= + oflow threshold + nop.i 999 } +;; + { .mfb - nop.m 999 -// -// fcmp: resultS2 >= + overflow threshold -// -(p0) fcmp.le.unc.s1 p9, p0 = FR_Result_big, FR_NBig -(p8) br.cond.spnt __libm_error_region ;; -} -{ .mii - nop.m 999 - nop.i 999 ;; -(p9) mov GR_Parameter_TAG = 18 -} -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.spnt __libm_error_region ;; -} -// -// Report that pow overflowed - either +Inf, or -Inf -// -{ .mmb -(p11) mov GR_Parameter_TAG = 19 - nop.m 999 -(p11) br.cond.spnt __libm_error_region ;; -} -{ .mib - nop.m 999 - nop.i 999 -// -// Report that pow underflowed -// -(p0) br.cond.sptk L(POWL_64_RETURN) ;; +(p11) mov GR_Parameter_TAG = 19 // Set tag for underflow + fcmp.le.s1 p9, p0 = FR_Result_big, FR_NBig // Test <= - oflow threshold +(p11) br.cond.spnt __libm_error_region // Branch if pow underflowed } +;; +{ .mfb +(p8) mov GR_Parameter_TAG = 18 // Set tag for overflow + nop.f 999 +(p8) br.cond.spnt __libm_error_region // Branch if pow +overflow +} +;; +{ .mbb +(p9) mov GR_Parameter_TAG = 18 // Set tag for overflow +(p9) br.cond.spnt __libm_error_region // Branch if pow -overflow + br.ret.sptk b0 // Branch if result really ok +} +;; -L(POWL_64_SPECIAL): +POWL_64_SPECIAL: +// Here if x or y is NatVal, nan, inf, or zero { .mfi - nop.m 999 -(p0) fcmp.eq.s1 p15, p0 = FR_Input_X, f1 // Is x=+1 - nop.i 999 ;; + nop.m 999 + fcmp.eq.s1 p15, p0 = FR_Input_X, f1 // Test x=+1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p14, p0 = FR_Input_Y, 0x023 - nop.i 999 ;; + nop.m 999 + fclass.m p8, p0 = FR_Input_X, 0x143 // Test x natval, snan + nop.i 999 } +;; { .mfi - nop.m 999 -(p15) fcmp.eq.unc.s0 p6,p0 = FR_Input_Y, f0 // If x=1, 
flag invalid if y=SNaN - nop.i 999 + nop.m 999 +(p15) fcmp.eq.unc.s0 p6,p0 = FR_Input_Y, f0 // If x=1, flag invalid if y=SNaN + nop.i 999 } { .mfb - nop.m 999 -(p15) fmpy.s0 FR_Result = f1,f1 // If x=1, result=1 -(p15) br.cond.spnt L(POWL_64_RETURN) ;; // Exit if x=1 + nop.m 999 +(p15) fmpy.s0 FR_Result = f1,f1 // If x=1, result=1 +(p15) br.ret.spnt b0 // Exit if x=1 } +;; { .mfi - nop.m 999 -(p0) fclass.m.unc p13, p0 = FR_Input_X, 0x023 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fclass.m.unc p8, p0 = FR_Input_X, 0x143 - nop.i 999 + nop.m 999 + fclass.m p6, p0 = FR_Input_Y, 0x007 // Test y zero + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p9, p0 = FR_Input_Y, 0x143 - nop.i 999 ;; + nop.m 999 + fclass.m p9, p0 = FR_Input_Y, 0x143 // Test y natval, snan + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p10, p0 = FR_Input_X, 0x083 - nop.i 999 + nop.m 999 + fclass.m p10, p0 = FR_Input_X, 0x083 // Test x qnan + nop.i 999 } { .mfi - nop.m 999 -(p0) fclass.m.unc p11, p0 = FR_Input_Y, 0x083 - nop.i 999 ;; + nop.m 999 +(p8) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X // If x=snan, result=qnan +(p6) cmp.ne p8,p0 = r0,r0 // Don't exit if x=snan, y=0 ==> result=+1 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p6, p0 = FR_Input_Y, 0x007 - nop.i 999 + nop.m 999 +(p6) fclass.m.unc p15, p0 = FR_Input_X,0x007 // Test x=0, y=0 + nop.i 999 } -{ .mfi - nop.m 999 -(p0) fcmp.eq.unc.s1 p7, p0 = FR_Input_Y, f1 - nop.i 999 ;; +{ .mfb + nop.m 999 +(p9) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X // If y=snan, result=qnan +(p8) br.ret.spnt b0 // Exit if x=snan, y not 0, + // result=qnan } +;; + { .mfi - nop.m 999 -// -// set p13 if x +/- Inf -// set p14 if y +/- Inf -// set p8 if x Natval or +/-SNaN -// set p9 if y Natval or +/-SNaN -// set p10 if x QNaN -// set p11 if y QNaNs -// set p6 if y is +/-0 -// set p7 if y is 1 -// -(p8) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X -(p6) cmp.ne p8,p0 = r0,r0 ;; // Don't exit if x=snan, y=0 ==> result=+1 -} -{ .mfb - nop.m 999 
-(p9) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X -(p8) br.cond.spnt L(POWL_64_RETURN) ;; + nop.m 999 + fcmp.eq.s1 p7, p0 = FR_Input_Y, f1 // Test y +1.0 + nop.i 999 } { .mfb - nop.m 999 -(p10) fmpy.s0 FR_Result = FR_Input_X, f0 -(p9) br.cond.spnt L(POWL_64_RETURN) ;; -} -{ .mfi - nop.m 999 -// -// Produce result for SNaN and NatVals and return -// -(p6) fclass.m.unc p15, p0 = FR_Input_X,0x007 - nop.i 999 + nop.m 999 +(p10) fmpy.s0 FR_Result = FR_Input_X, f0 // If x=qnan, result=qnan +(p9) br.ret.spnt b0 // Exit if y=snan, result=qnan } +;; + { .mfi - nop.m 999 -// -// If Y +/- 0, set p15 if x +/- 0 -// -(p6) fclass.m.unc p8, p0 = FR_Input_X,0x0C3 - nop.i 999 ;; + nop.m 999 +(p6) fclass.m.unc p8, p0 = FR_Input_X,0x0C3 // Test x=nan, y=0 + nop.i 999 } +;; { .mfi - nop.m 999 -(p6) fcmp.eq.s0 p9,p0 = FR_Input_X, f0 // If y=0, flag if x denormal - nop.i 999 + nop.m 999 +(p6) fcmp.eq.s0 p9,p0 = FR_Input_X, f0 // If y=0, flag if x denormal + nop.i 999 } { .mfi - nop.m 999 -(p6) fadd.s0 FR_Result = f1, f0 - nop.i 999 ;; + nop.m 999 +(p6) fadd.s0 FR_Result = f1, f0 // If y=0, result=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Set p8 if y = +/-0 and X is a QNaN/SNaN -// If y = +/-0, let result = 1.0 -// -(p7) fmpy.s0 FR_Result = FR_Input_X,f1 -// -// If y == 1, result = x * 1 -// -(p15) mov GR_Parameter_TAG = 20 -} -{ .mib - nop.m 999 - nop.i 999 -(p15) br.cond.spnt __libm_error_region ;; -} -{ .mib - nop.m 999 -// -// If x and y are both zero, result = 1.0 and call error -// support. -// -(p8) mov GR_Parameter_TAG = 23 -(p8) br.cond.spnt __libm_error_region ;; + nop.m 999 + fclass.m p11, p0 = FR_Input_Y, 0x083 // Test y qnan + nop.i 999 } -{ .mib - nop.m 999 - nop.i 999 -// -// If y = +/-0 and x is a QNaN, result = 1.0 and call error -// support. 
-// -(p6) br.cond.spnt L(POWL_64_RETURN) ;; +{ .mfb +(p15) mov GR_Parameter_TAG = 20 // Error tag for x=0, y=0 +(p7) fmpy.s0 FR_Result = FR_Input_X,f1 // If y=1, result=x +(p15) br.cond.spnt __libm_error_region // Branch if x=0, y=0, result=1 } +;; -// If x=0, y=-inf, go to the X_IS_ZERO path { .mfb - nop.m 999 -(p14) fcmp.eq.unc.s1 p0,p14 = FR_Input_X,f0 -(p7) br.cond.spnt L(POWL_64_RETURN) ;; +(p8) mov GR_Parameter_TAG = 23 // Error tag for x=nan, y=0 + fclass.m p14, p0 = FR_Input_Y, 0x023 // Test y inf +(p8) br.cond.spnt __libm_error_region // Branch if x=snan, y=0, + // result=1 } +;; -{ .mfi - nop.m 999 -// -// Produce all results for x**0 and x**1 -// Let all the result x ** 0 == 1 and return -// Let all x ** 1 == x and return -// -(p10) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X - nop.i 999 ;; -} { .mfb - nop.m 999 -(p11) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X -(p10) br.cond.spnt L(POWL_64_RETURN) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p11) br.cond.spnt L(POWL_64_RETURN) ;; -} -{ .mib - nop.m 999 - nop.i 999 -// -// Return result for x or y QNaN input with QNaN result -// -(p14) br.cond.spnt L(POWL_64_Y_IS_INF) ;; + nop.m 999 + fclass.m p13, p0 = FR_Input_X, 0x023 // Test x inf +(p6) br.ret.spnt b0 // Exit y=0, x not nan or 0, + // result=1 } -{ .mib - nop.m 999 - nop.i 999 -(p13) br.cond.spnt L(POWL_64_X_IS_INF) ;; +;; + +{ .mfb + nop.m 999 +(p14) fcmp.eq.unc.s1 p0,p14 = FR_Input_X,f0 // Test x not 0, y=inf +(p7) br.ret.spnt b0 // Exit y=1, x not snan, + // result=x } -L(POWL_64_X_IS_ZERO): -{ .mmb -(p0) getf.sig GR_signif_y = FR_Input_Y -(p0) getf.exp GR_BIASed_exp_y = FR_Input_Y - nop.b 999 ;; +;; + +{ .mfb + nop.m 999 +(p10) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X // If x=qnan, y not snan, + // result=qnan +(p10) br.ret.spnt b0 // Exit x=qnan, y not snan, + // result=qnan } -{ .mlx - nop.m 999 -(p0) movl GR_Mask = 0x1FFFF +;; + +{ .mfb + nop.m 999 +(p11) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X // If y=qnan, x not nan or 1, + // result=qnan +(p11) 
br.ret.spnt b0 // Exit y=qnan, x not nan or 1, + // result=qnan } -{ .mlx - nop.m 999 -(p0) movl GR_y_sign = 0x20000 ;; +;; + +{ .mbb + nop.m 999 +(p14) br.cond.spnt POWL_64_Y_IS_INF // Branch if y=inf, x not 1 or nan +(p13) br.cond.spnt POWL_64_X_IS_INF // Branch if x=inf, y not 1 or nan } -// -// Get BIASed exp and significand of y +;; + + +POWL_64_X_IS_ZERO: +// Here if x=0, y not nan or 1 or inf or 0 + +// There is logic starting here to determine if y is an integer when x = 0. +// If 0 < |y| < 1 then clearly y is not an integer. +// If |y| > 1, then the significand of y is shifted left by the size of +// the exponent of y. This preserves the lsb of the integer part + the +// fractional bits. The lsb of the integer can be tested to determine if +// the integer is even or odd. The fractional bits can be tested. If zero, +// then y is an integer. // { .mfi -(p0) and GR_exp_y = GR_Mask,GR_BIASed_exp_y - nop.f 999 -(p0) and GR_y_sign = GR_y_sign,GR_BIASed_exp_y -} -{ .mlx - nop.m 999 -(p0) movl GR_BIAS = 0xFFFF ;; + and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent of y + nop.f 999 + and GR_y_sign = GR_sign_mask,GR_signexp_y // Get sign of y } -{ .mfi -(p0) cmp.lt.unc p9, p8 = GR_exp_y,GR_BIAS - nop.f 999 +;; + // // Maybe y is < 1 already, so // can never be an integer. -// Remove sign bit from exponent. 
-// -(p0) sub GR_exp_y = GR_exp_y,GR_BIAS ;; -} -{ .mii - nop.m 999 - nop.i 999 ;; -// -// Remove exponent BIAS // -(p8) shl GR_exp_y= GR_signif_y,GR_exp_y ;; -} { .mfi -(p9) or GR_exp_y= 0xF,GR_signif_y - nop.f 999 - nop.i 999 ;; + cmp.lt p9, p8 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1 + nop.f 999 + sub GR_exp_y = GR_exp_y,GR_exp_bias // Get true exponent of y } -{ .mii - nop.m 999 +;; + // // Shift significand of y looking for nonzero bits // For y > 1, shift signif_y exp_y bits to the left -// For y < 1, turn on 4 low order bits of significand of y +// For y < 1, turn on 4 low order bits of significand of y // so that the fraction will always be non-zero // -(p0) shl GR_signif_y= GR_exp_y,1 ;; -(p0) extr.u GR_low_order_bit = GR_exp_y,63,1 +{ .mmi +(p9) or GR_exp_y= 0xF,GR_signif_y // Force nonzero fraction if y<1 +;; + nop.m 999 +(p8) shl GR_exp_y= GR_signif_y,GR_exp_y // Get lsb of int + fraction + // Wait 4 cycles to use result } +;; + +{ .mmi + nop.m 999 +;; + nop.m 999 + nop.i 999 +} +;; + +{ .mmi + nop.m 999 +;; + nop.m 999 + shl GR_fraction_y= GR_exp_y,1 // Shift left 1 to get fraction +} +;; + // // Integer part of y shifted off. // Get y's low even or odd bit - y might not be an int. // { .mii -(p0) cmp.eq.unc p13,p0 = GR_signif_y, r0 -(p0) cmp.eq.unc p8,p9 = GR_y_sign, r0 ;; -// -// Is y an int? -// Is y positive -// -(p13) cmp.ne.unc p13,p0 = GR_low_order_bit, r0 ;; + cmp.eq p13,p0 = GR_fraction_y, r0 // Test for y integer + cmp.eq p8,p0 = GR_y_sign, r0 // Test for y > 0 +;; +(p13) tbit.nz.unc p13,p0 = GR_exp_y, 63 // Test if y an odd integer +} +;; + +{ .mfi +(p13) cmp.eq.unc p13,p14 = GR_y_sign, r0 // Test y pos odd integer +(p8) fcmp.eq.s0 p12,p0 = FR_Input_Y, f0 // If x=0 and y>0 flag if y denormal + nop.i 999 } +;; + // -// Is y and int and odd? 
+// Return +/-0 when x=+/-0 and y is positive odd integer // { .mfb -(p13) cmp.eq.unc p13,p14 = GR_y_sign, r0 -(p8) fcmp.eq.s0 p12,p0 = FR_Input_Y, f0 // If x=0 and y>0 flag if y denormal - nop.b 999 ;; + nop.m 999 +(p13) mov FR_Result = FR_Input_X // If x=0, y pos odd int, result=x +(p13) br.ret.spnt b0 // Exit x=0, y pos odd int, result=x } -{ .mfb - nop.m 999 +;; + // -// Is y and int and odd and positive? +// Return +/-inf when x=+/-0 and y is negative odd int // -(p13) mov FR_Result = FR_Input_X -(p13) br.cond.sptk L(POWL_64_RETURN) ;; +{ .mfb +(p14) mov GR_Parameter_TAG = 21 +(p14) frcpa.s0 FR_Result, p0 = f1, FR_Input_X // Result +-inf, set Z flag +(p14) br.cond.spnt __libm_error_region } -{ .mfi - nop.m 999 +;; + // -// Return +/-0 when x=+/-0 and y is and odd pos. int +// Return +0 when x=+/-0 and y positive and not an odd integer // -(p14) frcpa.s0 FR_Result, p10 = f1, FR_Input_X -(p14) mov GR_Parameter_TAG = 21 -} -{ .mib - nop.m 999 - nop.i 999 -(p14) br.cond.spnt __libm_error_region ;; +{ .mfb + nop.m 999 +(p8) mov FR_Result = f0 // If x=0, y>0 and not odd integer, result=+0 +(p8) br.ret.sptk b0 // Exit x=0, y>0 and not odd integer, result=+0 } +;; -{ .mfb - nop.m 999 // -// Return +/-0 when x=+/-Inf and y is and odd neg int -// and raise dz exception +// Return +inf when x=+/-0 and y is negative and not odd int // -(p8) mov FR_Result = f0 -(p8) br.cond.sptk L(POWL_64_RETURN) ;; +{ .mfb + mov GR_Parameter_TAG = 21 + frcpa.s0 FR_Result, p10 = f1,f0 // Result +inf, raise Z flag + br.cond.sptk __libm_error_region } -{ .mfi - nop.m 999 +;; + + +POWL_64_X_IS_INF: // -// Return +0 when x=+/-0 and y > 0 and not odd. 
+// Here if x=inf, y not 1 or nan // -(p9) frcpa.s0 FR_Result, p10 = f1,f0 -(p9) mov GR_Parameter_TAG = 21 -} -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.sptk __libm_error_region ;; -} -L(POWL_64_X_IS_INF): { .mfi -(p0) getf.exp GR_exp_y = FR_Input_Y -(p0) fclass.m.unc p13, p0 = FR_Input_X,0x022 -(p0) mov GR_Mask = 0x1FFFF ;; + and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent y + fclass.m p13, p0 = FR_Input_X,0x022 // Test x=-inf + nop.i 999 } +;; { .mfi -(p0) getf.sig GR_signif_y = FR_Input_Y -(p0) fcmp.eq.s0 p9,p0 = FR_Input_Y, f0 // Flag if y denormal - nop.i 999 ;; + and GR_y_sign = GR_sign_mask,GR_signexp_y // Get sign of y + fcmp.eq.s0 p9,p0 = FR_Input_Y, f0 // Dummy to set flag if y denorm + nop.i 999 } +;; // -// Get exp and significand of y -// Create exponent mask and sign mask +// Maybe y is < 1 already, so +// isn't an int. // -{ .mlx -(p0) and GR_low_order_bit = GR_Mask,GR_exp_y -(p0) movl GR_BIAS = 0xFFFF +{ .mfi +(p13) cmp.lt.unc p9, p8 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1 if x=-inf + fclass.m p11, p0 = FR_Input_X,0x021 // Test x=+inf + sub GR_exp_y = GR_exp_y,GR_exp_bias // Get true exponent y } -{ .mmi - nop.m 999 ;; +;; + // -// Remove sign bit from exponent. +// Shift significand of y looking for nonzero bits +// For y > 1, shift signif_y exp_y bits to the left +// For y < 1, turn on 4 low order bits of significand of y +// so that the fraction will always be non-zero // -(p0) cmp.lt.unc p9, p8 = GR_low_order_bit,GR_BIAS +{ .mmi +(p9) or GR_exp_y= 0xF,GR_signif_y // Force nonzero fraction if y<1 +;; +(p11) cmp.eq.unc p14,p12 = GR_y_sign, r0 // Test x=+inf, y>0 +(p8) shl GR_exp_y= GR_signif_y,GR_exp_y // Get lsb of int + fraction + // Wait 4 cycles to use result +} +;; + // -// Maybe y is < 1 already, so -// isn't an int. 
+// Return +inf for x=+inf, y > 0 +// Return +0 for x=+inf, y < 0 // -(p0) sub GR_low_order_bit = GR_low_order_bit,GR_BIAS +{ .mfi + nop.m 999 +(p12) mov FR_Result = f0 // If x=+inf, y<0, result=+0 + nop.i 999 } -{ .mlx - nop.m 999 -(p0) movl GR_sign_mask = 0x20000 ;; +{ .mfb + nop.m 999 +(p14) fma.s0 FR_Result = FR_Input_X,f1,f0 // If x=+inf, y>0, result=+inf +(p11) br.ret.sptk b0 // Exit x=+inf } -{ .mfi -(p0) and GR_sign_mask = GR_sign_mask,GR_exp_y +;; + // -// Return +Inf when x=+/-0 and y < 0 and not odd and raise -// divide-by-zero exception. +// Here only if x=-inf. Wait until can use result of shl... // -(p0) fclass.m.unc p11, p0 = FR_Input_X,0x021 - nop.i 999 ;; -} { .mmi - nop.m 999 ;; -// -// Is shift off integer part of y. -// Get y's even or odd bit - y might not be an int. -// -(p11) cmp.eq.unc p11,p12 = GR_sign_mask, r0 -// -// Remove exponent BIAS -// -(p8) shl GR_exp_y = GR_signif_y,GR_low_order_bit ;; + nop.m 999 +;; + nop.m 999 + nop.i 999 } +;; + { .mfi -(p9) or GR_exp_y = 0xF,GR_signif_y -// -// Is y positive or negative when x is +Inf? -// Is y and int when x = -Inf -// -(p11) mov FR_Result = FR_Input_X - nop.i 999 ;; + cmp.eq p8,p9 = GR_y_sign, r0 // Test y pos + nop.f 999 + shl GR_fraction_y = GR_exp_y,1 // Shift left 1 to get fraction } -{ .mfi - nop.m 999 -(p12) mov FR_Result = f0 - nop.i 999 ;; +;; + +{ .mmi + cmp.eq p13,p0 = GR_fraction_y, r0 // Test y integer +;; + nop.m 999 +(p13) tbit.nz.unc p13,p0 = GR_exp_y, 63 // Test y odd integer } -{ .mii - nop.m 999 +;; + // -// Shift signficand looking for nonzero bits -// For y non-ints, upset the significand. +// Is y even or odd? 
// -(p0) shl GR_signif_y = GR_exp_y,1 ;; -(p13) cmp.eq.unc p13,p0 = GR_signif_y, r0 -} { .mii - nop.m 999 -(p0) extr.u GR_low_order_bit = GR_exp_y,63,1 ;; -(p13) cmp.ne.unc p13,p0 = GR_low_order_bit, r0 -} -{ .mib - nop.m 999 - nop.i 999 -(p11) br.cond.sptk L(POWL_64_RETURN) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p12) br.cond.sptk L(POWL_64_RETURN) ;; +(p13) cmp.eq.unc p14,p10 = GR_y_sign, r0 // Test x=-inf, y pos odd int +(p13) cmp.ne.and p8,p9 = r0,r0 // If y odd int, turn off p8,p9 + nop.i 999 } +;; + // -// Return Inf for y > 0 -// Return +0 for y < 0 -// Is y even or odd? +// Return -0 for x = -inf and y < 0 and odd int. +// Return -Inf for x = -inf and y > 0 and odd int. // -{ .mii -(p13) cmp.eq.unc p13,p10 = GR_sign_mask, r0 -(p0) cmp.eq.unc p8,p9 = GR_sign_mask, r0 ;; - nop.i 999 +{ .mfi + nop.m 999 +(p10) fmerge.ns FR_Result = f0, f0 // If x=-inf, y neg odd int, result=-0 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 +(p14) fmpy.s0 FR_Result = FR_Input_X,f1 // If x=-inf, y pos odd int, result=-inf + nop.i 999 +} +;; + // -// For x = -inf, y is and int, positive -// and odd -// Is y positive in general? +// Return Inf for x = -inf and y > 0 not an odd int. +// Return +0 for x = -inf and y < 0 not an odd int. // -(p13) mov FR_Result = FR_Input_X - nop.i 999 ;; +.pred.rel "mutex",p8,p9 +{ .mfi + nop.m 999 +(p8) fmerge.ns FR_Result = FR_Input_X, FR_Input_X // If x=-inf, y>0 not odd int + // result=+inf + nop.i 999 } { .mfb - nop.m 999 -(p10) fmerge.ns FR_Result = f0, f0 -(p13) br.cond.sptk L(POWL_64_RETURN) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p10) br.cond.sptk L(POWL_64_RETURN) ;; + nop.m 999 +(p9) fmpy.s0 FR_Result = f0,f0 // If x=-inf, y<0 not odd int + // result=+0 + br.ret.sptk b0 // Exit for x=-inf } -{ .mfi - nop.m 999 +;; + + +POWL_64_Y_IS_INF: +// Here if y=inf, x not 1 or nan // -// Return -Inf for x = -inf and y > 0 and odd int. -// Return -0 for x = -inf and y < 0 and odd int. 
+// For y = +Inf and |x| < 1 returns 0 +// For y = +Inf and |x| > 1 returns Inf +// For y = -Inf and |x| < 1 returns Inf +// For y = -Inf and |x| > 1 returns 0 +// For y = Inf and |x| = 1 returns 1 // -(p8) fmerge.ns FR_Result = FR_Input_X, FR_Input_X - nop.i 999 ;; -} -{ .mfb - nop.m 999 -(p9) mov FR_Result = f0 -(p8) br.cond.sptk L(POWL_64_RETURN) ;; +{ .mfi + nop.m 999 + fclass.m p8, p0 = FR_Input_Y, 0x021 // Test y=+inf + nop.i 999 } -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.sptk L(POWL_64_RETURN) ;; +;; + +{ .mfi + nop.m 999 + fclass.m p9, p0 = FR_Input_Y, 0x022 // Test y=-inf + nop.i 999 } -L(POWL_64_Y_IS_INF): +;; + { .mfi - nop.m 999 -// -// Return Inf for x = -inf and y > 0 not an odd int. -// Return +0 for x = -inf and y < 0 and not an odd int. -// -(p0) fclass.m.unc p8, p0 = FR_Input_Y, 0x021 - nop.i 999 + nop.m 999 + fabs FR_X = FR_Input_X // Form |x| + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p9, p0 = FR_Input_Y, 0x022 - nop.i 999 ;; + nop.m 999 + fcmp.eq.s0 p10,p0 = FR_Input_X, f0 // flag if x denormal + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fabs FR_X = FR_Input_X - nop.i 999 ;; + nop.m 999 +(p8) fcmp.lt.unc.s1 p6, p0 = FR_X, f1 // Test y=+inf, |x|<1 + nop.i 999 } +;; { .mfi - nop.m 999 -(p0) fcmp.eq.s0 p10,p0 = FR_Input_X, f0 // flag if x denormal - nop.i 999 ;; + nop.m 999 +(p8) fcmp.gt.unc.s1 p7, p0 = FR_X, f1 // Test y=+inf, |x|>1 + nop.i 999 } +;; { .mfi - nop.m 999 -// -// Find y = +/- Inf -// Compute |x| -// -(p8) fcmp.lt.unc.s1 p6, p0 = FR_X, f1 - nop.i 999 + nop.m 999 +(p9) fcmp.lt.unc.s1 p12, p0 = FR_X, f1 // Test y=-inf, |x|<1 + nop.i 999 } { .mfi - nop.m 999 -(p8) fcmp.gt.unc.s1 p7, p0 = FR_X, f1 - nop.i 999 ;; + nop.m 999 +(p6) fmpy.s0 FR_Result = f0,f0 // If y=+inf, |x|<1, result=+0 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p9) fcmp.lt.unc.s1 p12, p0 = FR_X, f1 - nop.i 999 + nop.m 999 +(p9) fcmp.gt.unc.s1 p13, p0 = FR_X, f1 // Test y=-inf, |x|>1 + nop.i 999 } { .mfi - nop.m 999 -(p9) fcmp.gt.unc.s1 p13, p0 = FR_X, 
f1 - nop.i 999 ;; + nop.m 999 +(p7) fmpy.s0 FR_Result = FR_Input_Y, f1 // If y=+inf, |x|>1, result=+inf + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// For y = +Inf and |x| < 1 returns 0 -// For y = +Inf and |x| > 1 returns Inf -// For y = -Inf and |x| < 1 returns Inf -// For y = -Inf and |x| > 1 returns 0 -// -(p6) mov FR_Result = f0 - nop.i 999 ;; + nop.m 999 + fcmp.eq.s1 p14, p0 = FR_X, f1 // Test y=inf, |x|=1 + nop.i 999 } { .mfi - nop.m 999 -(p7) mov FR_Result = FR_Input_Y - nop.i 999 ;; + nop.m 999 +(p12) fnma.s0 FR_Result = FR_Input_Y, f1, f0 // If y=-inf, |x|<1, result=+inf + nop.i 999 } +;; + { .mfi - nop.m 999 -(p12) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_Y - nop.i 999 ;; + nop.m 999 +(p13) mov FR_Result = f0 // If y=-inf, |x|>1, result=+0 + nop.i 999 } +;; + { .mfb - nop.m 999 -(p13) mov FR_Result = f0 -// -// Produce x ** +/- Inf results -// -(p6) br.cond.spnt L(POWL_64_RETURN) ;; + nop.m 999 +(p14) fmpy.s0 FR_Result = f1,f1 // If y=inf, |x|=1, result=+1 + br.ret.sptk b0 // Common return for y=inf } -{ .mib - nop.m 999 - nop.i 999 -(p7) br.cond.spnt L(POWL_64_RETURN) ;; +;; + + +// Here if x or y denorm/unorm +POWL_DENORM: +{ .mmi + getf.sig GR_signif_Z = FR_norm_X // Get significand of x +;; + getf.exp GR_signexp_y = FR_norm_Y // Get sign and exp of y + nop.i 999 } -{ .mib - nop.m 999 - nop.i 999 -(p12) br.cond.spnt L(POWL_64_RETURN) ;; +;; + +{ .mfi + getf.sig GR_signif_y = FR_norm_Y // Get significand of y + nop.f 999 + nop.i 999 } +;; + { .mib - nop.m 999 - nop.i 999 -(p13) br.cond.spnt L(POWL_64_RETURN) ;; + getf.exp GR_signexp_x = FR_norm_X // Get sign and exp of x + extr.u GR_Index1 = GR_signif_Z, 59, 4 // Extract upper 4 signif bits of x + br.cond.sptk POWL_COMMON // Branch back to main path } -{ .mfb - nop.m 999 +;; + + +POWL_64_UNSUPPORT: // -// +/-1 ** +/-Inf, result is +1 +// Raise exceptions for specific +// values - pseudo NaN and +// infinities. 
+// Return NaN and raise invalid // -(p0) fmpy.s0 FR_Result = f1,f1 -(p0) br.cond.sptk L(POWL_64_RETURN) ;; -} -L(POWL_64_UNSUPPORT): { .mfb - nop.m 999 + nop.m 999 + fmpy.s0 FR_Result = FR_Input_X,f0 + br.ret.sptk b0 +} +;; + +POWL_64_XNEG: // -// Return NaN and raise invalid +// Raise invalid for x < 0 and +// y not an integer // -(p0) fmpy.s0 FR_Result = FR_Input_X,f0 -// -// Raise exceptions for specific -// values - pseudo NaN and -// infinities. -// -(p0) br.cond.sptk L(POWL_64_RETURN) ;; -} -L(POWL_64_XNEG): { .mfi - nop.m 999 -(p0) frcpa.s0 FR_Result, p8 = f0, f0 -// -// Raise invalid for x < 0 and -// y not an integer and -// -(p0) mov GR_Parameter_TAG = 22 + nop.m 999 + frcpa.s0 FR_Result, p8 = f0, f0 + mov GR_Parameter_TAG = 22 } { .mib - nop.m 999 - nop.i 999 -(p0) br.cond.sptk __libm_error_region ;; + nop.m 999 + nop.i 999 + br.cond.sptk __libm_error_region } -L(POWL_64_SQRT): +;; + +POWL_64_SQRT: { .mfi - nop.m 999 -(p0) frsqrta.s0 FR_Result,p10 = FR_Input_X - nop.i 999 ;; + nop.m 999 + frsqrta.s0 FR_Result,p10 = FR_save_Input_X + nop.i 999 ;; } { .mfi - nop.m 999 -(p10) fma.s1 f62=FR_Half,FR_Input_X,f0 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 f62=FR_Half,FR_save_Input_X,f0 + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (2) -// h = 1/2 * a in f9 -// -(p10) fma.s1 f63=FR_Result,FR_Result,f0 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 f63=FR_Result,FR_Result,f0 + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (3) -// t1 = y0 * y0 in f10 -// -(p10) fnma.s1 f32=f63,f62,f11 - nop.i 999 ;; + nop.m 999 +(p10) fnma.s1 f32=f63,f62,FR_Half + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (4) -// t2 = 1/2 - t1 * h in f10 -// -(p10) fma.s1 f33=f32,FR_Result,FR_Result - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 f33=f32,FR_Result,FR_Result + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (5) -// y1 = y0 + t2 * y0 in f13 -// -(p10) fma.s1 f34=f33,f62,f0 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 f34=f33,f62,f0 + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (6) -// t3 = 
y1 * h in f10 -// -(p10) fnma.s1 f35=f34,f33,f11 - nop.i 999 ;; + nop.m 999 +(p10) fnma.s1 f35=f34,f33,FR_Half + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (7) -// t4 = 1/2 - t3 * y1 in f10 -// -(p10) fma.s1 f63=f35,f33,f33 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 f63=f35,f33,f33 + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (8) -// y2 = y1 + t4 * y1 in f13 -// -(p10) fma.s1 f32=FR_Input_X,f63,f0 - nop.i 999 + nop.m 999 +(p10) fma.s1 f32=FR_save_Input_X,f63,f0 + nop.i 999 } { .mfi - nop.m 999 -// -// Step (9) -// S = a * y2 in f10 -// -(p10) fma.s1 FR_Result=f63,f62,f0 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 FR_Result=f63,f62,f0 + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (10) -// t5 = y2 * h in f9 -// -(p10) fma.s1 f33=f11,f63,f0 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 f33=f11,f63,f0 + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (11) -// H = 1/2 * y2 in f11 -// -(p10) fnma.s1 f34=f32,f32,f8 - nop.i 999 + nop.m 999 +(p10) fnma.s1 f34=f32,f32,FR_save_Input_X + nop.i 999 } { .mfi - nop.m 999 -// -// Step (12) -// d = a - S * S in f12 -// -(p10) fnma.s1 f35=FR_Result,f63,f11 - nop.i 999 ;; + nop.m 999 +(p10) fnma.s1 f35=FR_Result,f63,FR_Half + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (13) -// t6 = 1/2 - t5 * y2 in f7 -// -(p10) fma.s1 f62=f33,f34,f32 - nop.i 999 + nop.m 999 +(p10) fma.s1 f62=f33,f34,f32 + nop.i 999 } { .mfi - nop.m 999 -// -// Step (14) -// S1 = S + d * H in f13 -// -(p10) fma.s1 f63=f33,f35,f33 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 f63=f33,f35,f33 + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (15) -// H1 = H + t6 * h in f7 -// -(p10) fnma.s1 f32=f62,f62,FR_Input_X - nop.i 999 ;; + nop.m 999 +(p10) fnma.s1 f32=f62,f62,FR_save_Input_X + nop.i 999 ;; } { .mfb - nop.m 999 -// -// Step (16) -// d1 = a - S1 * S1 -// -(p10) fma.s0 FR_Result=f32,f63,f62 -// -// Step (17) -// R = S1 + d1 * H1 -// -(p10) br.cond.sptk L(POWL_64_RETURN) ;; -} -{ .mib - nop.m 999 - nop.i 999 -// -// Do the Newton-Raphson iteration from the EAS. 
-// -(p0) br.cond.sptk L(POWL_64_RETURN) ;; + nop.m 999 +(p10) fma.s0 FR_Result=f32,f63,f62 + br.ret.sptk b0 // Exit for x > 0, y = 0.5 } -// -// Take care of the degenerate cases. -// +;; + +GLOBAL_LIBM_END(powl) -L(POWL_64_RETURN): -{ .mfb - nop.m 999 -(p0) mov FR_Output = FR_Result -(p0) br.ret.sptk b0 ;; -} -.endp powl -ASM_SIZE_DIRECTIVE(powl) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value @@ -3411,32 +2772,32 @@ __libm_error_region: mov GR_SAVE_GP=gp // Save gp };; { .mmi - stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack + stfe [GR_Parameter_Y] = FR_Input_Y,16 // Save Parameter 2 on stack add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 mov GR_SAVE_B0=b0 // Save b0 };; .body { .mib - stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + stfe [GR_Parameter_X] = FR_save_Input_X // Store Parameter 1 on stack add GR_Parameter_RESULT = 0,GR_Parameter_Y nop.b 0 // Parameter 3 address } { .mib - stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + stfe [GR_Parameter_Y] = FR_Result // Store Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; { .mmi - ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack + ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp - add sp = 64,sp // Restore stack pointer - mov b0 = GR_SAVE_B0 // Restore return address + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address };; { .mib mov gp = GR_SAVE_GP // Restore gp @@ -3444,7 +2805,6 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) 
+LOCAL_LIBM_END(__libm_error_region#) .type __libm_error_support#,@function .global __libm_error_support# |