#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
// f10 holds the reciprocal approximation and, at the end, the quotient.
// f11 holds the quotient approximation.  f12 and f13 are error terms.
// The result is returned in fret0.
//
// Every refinement step is predicated on p6, which frcpa writes.  When p6
// is clear the refinement is skipped and f10 is returned exactly as frcpa
// produced it.

	.text
	.align 16
	.global __divdf3
	.proc __divdf3
__divdf3:
	// Initial reciprocal approximation of the divisor.
	frcpa f10, p6 = farg0, farg1
	;;
	// f11 = dividend * recip, f12 = 1 - divisor * recip.
(p6)	fma.s1 f11 = farg0, f10, f0
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
	// Refine the quotient and the reciprocal, squaring the error term.
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fma.s1 f13 = f12, f12, f0
(p6)	fma.s1 f10 = f12, f10, f10
	;;
(p6)	fma.s1 f11 = f13, f11, f11
(p6)	fma.s1 f12 = f13, f13, f0
(p6)	fma.s1 f10 = f13, f10, f10
	;;
(p6)	fma.d.s1 f11 = f12, f11, f11
(p6)	fma.s1 f10 = f12, f10, f10
	;;
	// f8 = dividend - divisor * quotient (the remainder).
(p6)	fnma.d.s1 f8 = farg1, f11, farg0
	;;
	// Final correction: quotient += remainder * recip.
(p6)	fma.d f10 = f8, f10, f11
	;;
	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdf3
#endif

#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
// f10 holds the reciprocal approximation and, at the end, the quotient.
// f8 is reused for the quotient approximation and f9 for the error term,
// so the original arguments are overwritten during the refinement.
// The result is returned in fret0.

	.text
	.align 16
	.global __divsf3
	.proc __divsf3
__divsf3:
	// Initial reciprocal approximation of the divisor.
	frcpa f10, p6 = farg0, farg1
	;;
	// f8 = dividend * recip, f9 = 1 - divisor * recip.
(p6)	fma.s1 f8 = farg0, f10, f0
(p6)	fnma.s1 f9 = farg1, f10, f1
	;;
	// Refine the quotient, squaring the error term each step.
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f0
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f0
	;;
(p6)	fma.d.s1 f8 = f9, f8, f8
	;;
	// Round the refined quotient to single precision.
(p6)	fma.s f10 = f8, f1, f0
	;;
	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsf3
#endif

#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
// to get more than the 64 bits of precision that we need for DImode.
//
// Must use max precision for the reciprocal computations to get 64 bits of
// precision.
//
// r32/f8 holds the dividend.  r33/f9 holds the divisor.
// f10 holds the reciprocal approximation.  f11 holds the quotient
// approximation.  f12 and f13 are temporaries.
// The quotient is returned in ret0.

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
__divdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fma.s1 f11 = farg0, f10, f0
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fma.s1 f13 = f12, f12, f0
(p6)	fma.s1 f10 = f12, f10, f10
	;;
(p6)	fma.s1 f11 = f13, f11, f11
(p6)	fma.s1 f12 = f13, f13, f0
(p6)	fma.s1 f10 = f13, f10, f10
	;;
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fma.s1 f10 = f12, f10, f10
	;;
	// f8 = dividend - divisor * quotient, then fold it back in.
(p6)	fnma.s1 f8 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f8, f10, f11
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f8 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f8
	br.ret.sptk rp
	;;
	.endp __divdi3
#endif

#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
// to get more than the 64 bits of precision that we need for DImode.
//
// Must use max precision for the reciprocal computations to get 64 bits of
// precision.
//
// r32/f8 holds the dividend.  r33/f9 holds the divisor.
// f10 holds the reciprocal approximation, then the quotient.  f11 holds
// the quotient approximation.  f12 and f13 are temporaries.
// The remainder is returned in ret0.

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
__moddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fma.s1 f11 = farg0, f10, f0
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fma.s1 f13 = f12, f12, f0
(p6)	fma.s1 f10 = f12, f10, f10
	;;
(p6)	fma.s1 f11 = f13, f11, f11
(p6)	fma.s1 f12 = f13, f13, f0
(p6)	fma.s1 f10 = f13, f10, f10
	;;
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fma.s1 f10 = f12, f10, f10
	;;
	// f12 is used here (not f8) because the dividend in f8 is still
	// needed for the remainder computation below.
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Renormalize.
	fcvt.xf f10 = f10
	;;
	// Compute remainder.
	fnma.s1 f8 = f10, f9, f8
	;;
	// Round remainder to an integer.
	fcvt.fx.trunc.s1 f8 = f8
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f8
	br.ret.sptk rp
	;;
	.endp __moddi3
#endif

#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
// to get more than the 64 bits of precision that we need for DImode.
//
// Must use max precision for the reciprocal computations to get 64 bits of
// precision.
//
// r32/f8 holds the dividend.  r33/f9 holds the divisor.
// f10 holds the reciprocal approximation.  f11 holds the quotient
// approximation.  f12 and f13 are temporaries.
// The quotient is returned in ret0.

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
__udivdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fma.s1 f11 = farg0, f10, f0
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fma.s1 f13 = f12, f12, f0
(p6)	fma.s1 f10 = f12, f10, f10
	;;
(p6)	fma.s1 f11 = f13, f11, f11
(p6)	fma.s1 f12 = f13, f13, f0
(p6)	fma.s1 f10 = f13, f10, f10
	;;
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fma.s1 f10 = f12, f10, f10
	;;
	// f8 = dividend - divisor * quotient, then fold it back in.
(p6)	fnma.s1 f8 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f8, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f8 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f8
	br.ret.sptk rp
	;;
	.endp __udivdi3
#endif

#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
// to get more than the 64 bits of precision that we need for DImode.
//
// Must use max precision for the reciprocal computations to get 64 bits of
// precision.
//
// r32/f8 holds the dividend.  r33/f9 holds the divisor.
// f10 holds the reciprocal approximation, then the quotient.  f11 holds
// the quotient approximation.  f12 and f13 are temporaries.
// The remainder is returned in ret0.

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
__umoddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Convert the inputs to FP, to avoid FP software assist faults.
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fma.s1 f11 = farg0, f10, f0
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fma.s1 f13 = f12, f12, f0
(p6)	fma.s1 f10 = f12, f10, f10
	;;
(p6)	fma.s1 f11 = f13, f11, f11
(p6)	fma.s1 f12 = f13, f13, f0
(p6)	fma.s1 f10 = f13, f10, f10
	;;
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fma.s1 f10 = f12, f10, f10
	;;
	// f12 is used here (not f8) because the dividend in f8 is still
	// needed for the remainder computation below.
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Renormalize.
	fcvt.xuf.s1 f10 = f10
	;;
	// Compute remainder.
	fnma.s1 f8 = f10, f9, f8
	;;
	// Round remainder to an integer.
	fcvt.fxu.trunc.s1 f8 = f8
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f8
	br.ret.sptk rp
	;;
	.endp __umoddi3
#endif

#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
// to get more than the 32 bits of precision that we need for SImode.
//
// ??? This is currently not used.  It needs to be fixed to be more like the
// above DImode routines.
//
// ??? Check to see if the error is less than .5ulp.  We may need
// some adjustment code to get precise enough results.
//
// ??? Should probably use max precision for the reciprocal computations.
//
// NOTE(review): frcpa writes p6 but nothing below is predicated on it,
// unlike the DImode routines.  Confirm the behaviour wanted when p6 is
// clear before putting this routine into use.
//
// r32/f8 holds the dividend.  r33/f9 holds the divisor.
// f10 holds the value 2.0.  f11 holds the reciprocal approximation.
// f12 is a temporary.
// The quotient is returned in ret0.

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
__divsi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers and convert them to FP.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	// Reciprocal approximation, and the constant 2.0 = 1.0 + 1.0.
	frcpa f11, p6 = f8, f9
	fadd f10 = f1, f1
	;;
	// Two Newton-Raphson steps: recip = recip * (2.0 - divisor * recip).
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// quotient = dividend * recip, truncated to an integer.
	fmpy f8 = f8, f11
	;;
	fcvt.fx.trunc f8 = f8
	;;
	getf.sig ret0 = f8
	br.ret.sptk rp
	;;
	.endp __divsi3
#endif

#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
// to get more than the 32 bits of precision that we need for SImode.
//
// ??? This is currently not used.  It needs to be fixed to be more like the
// above DImode routines.
//
// ??? Check to see if the error is less than .5ulp.  We may need
// some adjustment code to get precise enough results.
//
// ??? Should probably use max precision for the reciprocal computations.
//
// NOTE(review): frcpa writes p6 but nothing below is predicated on it,
// unlike the DImode routines.  Confirm the behaviour wanted when p6 is
// clear before putting this routine into use.
//
// r32/f8 holds the dividend.  r33/f9 holds the divisor.
// f10 holds the value 2.0, and later the quotient.  f11 holds the
// reciprocal approximation.  f12 is a temporary.
// The remainder is returned in ret0.

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
__modsi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers and convert them to FP.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	// Reciprocal approximation, and the constant 2.0 = 1.0 + 1.0.
	frcpa f11, p6 = f8, f9
	fadd f10 = f1, f1
	;;
	// Two Newton-Raphson steps: recip = recip * (2.0 - divisor * recip).
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// quotient = dividend * recip, truncated and converted back to FP.
	fmpy f10 = f8, f11
	;;
	fcvt.fx.trunc f10 = f10
	;;
	fcvt.xf f10 = f10
	;;
	// remainder = dividend - quotient * divisor.
	fnma f8 = f10, f9, f8
	;;
	fcvt.fx f8 = f8
	;;
	// Return the remainder in ret0, like every other routine in this
	// file.  (This used to be written to r32, the first input register,
	// which left the return register unset.)
	getf.sig ret0 = f8
	br.ret.sptk rp
	;;
	.endp __modsi3
#endif

#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
// to get more than the 32 bits of precision that we need for SImode.
//
// ??? This is currently not used.  It needs to be fixed to be more like the
// above DImode routines.
//
// ??? Check to see if the error is less than .5ulp.  We may need
// some adjustment code to get precise enough results.
//
// ??? Should probably use max precision for the reciprocal computations.
//
// NOTE(review): frcpa writes p6 but nothing below is predicated on it, and
// the setf.sig results are passed to frcpa without the fcvt.xuf step that
// __udivdi3 uses "to avoid FP software-assist faults".  Confirm both before
// putting this routine into use.
//
// r32/f8 holds the dividend.  r33/f9 holds the divisor.
// f10 holds the value 2.0.  f11 holds the reciprocal approximation.
// f12 is a temporary.
// The quotient is returned in ret0.
//
// This is the same as divsi3, except that we don't need fcvt instructions
// before the frcpa.

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
__udivsi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Reciprocal approximation, and the constant 2.0 = 1.0 + 1.0.
	frcpa f11, p6 = f8, f9
	fadd f10 = f1, f1
	;;
	// Two Newton-Raphson steps: recip = recip * (2.0 - divisor * recip).
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// quotient = dividend * recip, truncated to an unsigned integer.
	fmpy f8 = f8, f11
	;;
	fcvt.fxu.trunc f8 = f8
	;;
	getf.sig ret0 = f8
	br.ret.sptk rp
	;;
	.endp __udivsi3
#endif

#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
// to get more than the 32 bits of precision that we need for SImode.
//
// ??? This is currently not used.  It needs to be fixed to be more like the
// above DImode routines.
//
// ??? Check to see if the error is less than .5ulp.  We may need
// some adjustment code to get precise enough results.
//
// ??? Should probably use max precision for the reciprocal computations.
//
// NOTE(review): frcpa writes p6 but nothing below is predicated on it, and
// the setf.sig results are passed to frcpa without the fcvt.xuf step that
// __umoddi3 uses "to avoid FP software assist faults".  Confirm both before
// putting this routine into use.
//
// r32/f8 holds the dividend.  r33/f9 holds the divisor.
// f10 holds the value 2.0, and later the quotient.  f11 holds the
// reciprocal approximation.  f12 is a temporary.
// The remainder is returned in ret0.
//
// This is the same as modsi3, except that we don't need fcvt instructions
// before the frcpa.

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
__umodsi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Reciprocal approximation, and the constant 2.0 = 1.0 + 1.0.
	frcpa f11, p6 = f8, f9
	fadd f10 = f1, f1
	;;
	// Two Newton-Raphson steps: recip = recip * (2.0 - divisor * recip).
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// quotient = dividend * recip, truncated and converted back to FP.
	fmpy f10 = f8, f11
	;;
	fcvt.fxu.trunc f10 = f10
	;;
	fcvt.xuf f10 = f10
	;;
	// remainder = dividend - quotient * divisor.
	fnma f8 = f10, f9, f8
	;;
	fcvt.fxu f8 = f8
	;;
	// Return the remainder in ret0, like every other routine in this
	// file.  (This used to be written to r32, the first input register,
	// which left the return register unset.)
	getf.sig ret0 = f8
	br.ret.sptk rp
	;;
	.endp __umodsi3
#endif

#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
//
// Save area layout written by this routine:
//   save_area +  0: stack_pointer (in1)
//   save_area +  8: ar.bsp
//   save_area + 16: ar.rnat
//   save_area + 24: ar.pfs as returned by alloc

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	alloc r18=ar.pfs,2,0,0,0
	// Store the stack pointer; in0 now points at save_area + 8.
	st8 [in0]=in1,8
	mov r19=ar.rsc
	;;
	flushrs
	// Keep only bits 2-4 of ar.rsc, clearing its low two bits.
	and r19=0x1c,r19
	mov ar.pfs=r18
	;;
	mov ar.rsc=r19
	mov r16=ar.bsp
	// r2 = save_area + 24 (in0 was already advanced by 8).
	adds r2=16,in0
	;;
	mov r17=ar.rnat
	st8 [in0]=r16,8
	// Set the low two bits of ar.rsc again before writing it back.
	or r19=0x3,r19
	;;
	st8 [in0]=r17
	mov ar.rsc=r19
	st8 [r2]=r18
	mov ar.pfs=r18
	br.ret.sptk.few rp
	;;
	.endp __ia64_save_stack_nonlocal
#endif

#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *fp, void *target_label, void *save_area,
//			     void *static_chain);
//
// Reloads r12, ar.bspstore, ar.rnat and ar.pfs from save_area (offsets
// 0, 8, 16 and 24 respectively), then returns to target_label with fp
// copied to r7 and static_chain copied to r15.

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	alloc r20=ar.pfs,4,0,0,0
	mov r19=ar.rsc
	// r2 walks offsets 8 and 24; in2 walks offsets 0 and 16.
	adds r2=8,in2
	ld8 r12=[in2],16
	// Make the final br.ret transfer to target_label.
	mov.ret.sptk rp = r33, .L0
	;;
	flushrs
	ld8 r16=[r2],16
	// Keep only bits 2-4 of ar.rsc, clearing its low two bits.
	and r19=0x1c,r19
	ld8 r17=[in2]
	;;
	ld8 r18=[r2]
	mov ar.rsc=r19
	;;
	mov ar.bspstore=r16
	;;
	mov ar.rnat=r17
	mov ar.pfs=r18
	// Set the low two bits of ar.rsc again before writing it back.
	or r19=0x3,r19
	;;
	loadrs
	invala
	mov r7=r32
.L0:	{
	mov ar.rsc=r19
	mov r15=r35
	br.ret.sptk.few rp
	}
	;;
	.endp __ia64_nonlocal_goto
#endif

#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)
//
// Reloads r12, ar.bspstore, ar.rnat and ar.pfs from save_area (offsets
// 0, 8, 16 and 24 respectively) and returns to the caller.
//
// NOTE(review): alloc declares 4 input registers although the prototype
// takes one argument, and the .L0 label is not referenced in this routine;
// both look like leftovers from nonlocal_goto -- confirm before changing.

	.text
	.align 16
	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
	alloc r20=ar.pfs,4,0,0,0
	mov r19=ar.rsc
	// r2 walks offsets 8 and 24; in0 walks offsets 0 and 16.
	adds r2=8,in0
	ld8 r12=[in0],16
	;;
	flushrs
	ld8 r16=[r2],16
	// Keep only bits 2-4 of ar.rsc, clearing its low two bits.
	and r19=0x1c,r19
	ld8 r17=[in0]
	;;
	ld8 r18=[r2]
	mov ar.rsc=r19
	;;
	mov ar.bspstore=r16
	;;
	mov ar.rnat=r17
	mov ar.pfs=r18
	// Set the low two bits of ar.rsc again before writing it back.
	or r19=0x3,r19
	;;
	loadrs
	invala
.L0:	{
	mov ar.rsc=r19
	br.ret.sptk.few rp
	}
	;;
	.endp __ia64_restore_stack_nonlocal
#endif