1 files changed, 47 insertions, 31 deletions
diff --git a/sysdeps/libm-i387/s_cbrt.S b/sysdeps/libm-i387/s_cbrt.S
index 4b9a2b1..3f6a017 100644
--- a/sysdeps/libm-i387/s_cbrt.S
+++ b/sysdeps/libm-i387/s_cbrt.S
@@ -28,34 +28,36 @@
 #endif
 
         .align ALIGNARG(4)
-        ASM_TYPE_DIRECTIVE(f1,@object)
-f1:	.double 0.354895765043919860
-	ASM_SIZE_DIRECTIVE(f1)
-        ASM_TYPE_DIRECTIVE(f2,@object)
-f2:	.double 1.50819193781584896
-	ASM_SIZE_DIRECTIVE(f2)
-        ASM_TYPE_DIRECTIVE(f3,@object)
-f3:	.double -2.11499494167371287
-	ASM_SIZE_DIRECTIVE(f3)
-        ASM_TYPE_DIRECTIVE(f4,@object)
-f4:	.double 2.44693122563534430
-	ASM_SIZE_DIRECTIVE(f4)
-        ASM_TYPE_DIRECTIVE(f5,@object)
-f5:	.double -1.83469277483613086
-	ASM_SIZE_DIRECTIVE(f5)
-        ASM_TYPE_DIRECTIVE(f6,@object)
-f6:	.double 0.784932344976639262
-	ASM_SIZE_DIRECTIVE(f6)
         ASM_TYPE_DIRECTIVE(f7,@object)
 f7:	.double -0.145263899385486377
 	ASM_SIZE_DIRECTIVE(f7)
+        ASM_TYPE_DIRECTIVE(f6,@object)
+f6:	.double 0.784932344976639262
+	ASM_SIZE_DIRECTIVE(f6)
+        ASM_TYPE_DIRECTIVE(f5,@object)
+f5:	.double -1.83469277483613086
+	ASM_SIZE_DIRECTIVE(f5)
+        ASM_TYPE_DIRECTIVE(f4,@object)
+f4:	.double 2.44693122563534430
+	ASM_SIZE_DIRECTIVE(f4)
+        ASM_TYPE_DIRECTIVE(f3,@object)
+f3:	.double -2.11499494167371287
+	ASM_SIZE_DIRECTIVE(f3)
+        ASM_TYPE_DIRECTIVE(f2,@object)
+f2:	.double 1.50819193781584896
+	ASM_SIZE_DIRECTIVE(f2)
+        ASM_TYPE_DIRECTIVE(f1,@object)
+f1:	.double 0.354895765043919860
+	ASM_SIZE_DIRECTIVE(f1)
 
-#define CBRT2 1.2599210498948731648
-#define SQR_CBRT2 1.5874010519681994748
+#define CBRT2		1.2599210498948731648	/* cube root of 2 */
+#define ONE_CBRT2	0.793700525984099737355196796584	/* 1 / CBRT2, written out so the table needs no `1.0 / CBRT2' expression */
+#define SQR_CBRT2	1.5874010519681994748	/* CBRT2 squared */
+#define ONE_SQR_CBRT2	0.629960524947436582364439673883	/* 1 / SQR_CBRT2, written out so the table needs no `1.0 / SQR_CBRT2' expression */
 
 	ASM_TYPE_DIRECTIVE(factor,@object)
-factor:	.double 1.0 / SQR_CBRT2
-	.double 1.0 / CBRT2
+factor:	.double ONE_SQR_CBRT2
+	.double ONE_CBRT2
 	.double 1.0
 	.double CBRT2
 	.double SQR_CBRT2
@@ -67,10 +69,10 @@ two54:  .byte 0, 0, 0, 0, 0, 0, 0x50, 0x43
 
 #ifdef PIC
 #define MO(op) op##@GOTOFF(%ebx)
-#define MOX(op,x,f) op##@GOTOFF(%ebx,x,f)
+#define MOX(op,x) op##@GOTOFF(%ebx,x,1)	/* PIC: op addressed relative to %ebx, plus x; scale is 1, so x must already be a byte offset */
 #else
 #define MO(op) op
-#define MOX(op,x,f) op(,x,f)
+#define MOX(op,x) op(x)	/* non-PIC: op plus x, with x used as a base register holding a byte offset (no scaling) */
 #endif
 
 	.text
@@ -102,8 +104,13 @@ ENTRY(__cbrt)
 #endif
 	fmull	MO(two54)
 	movl	$-54, %ecx
+#ifdef PIC	/* PIC build: stack slots sit 4 bytes higher -- presumably because %ebx was pushed earlier (it is popped before ret); confirm against the prologue */
+	fstpl	8(%esp)		/* store the value scaled by two54 as a double at 8(%esp) */
+	movl	12(%esp), %eax	/* reload the high 32 bits of that double */
+#else
 	fstpl	4(%esp)
 	movl	8(%esp), %eax
+#endif
 	movl	%eax, %edx
 	andl	$0x7fffffff, %eax
 
@@ -123,11 +130,16 @@ ENTRY(__cbrt)
 #endif
 	fabs
 
-	/* The following code has two track:
+	/* The following code has two tracks:
 	    a) compute the normalized cbrt value
 	    b) compute xe/3 and xe%3
 	   The right track computes the value for b) and this is done
-	   in an optimized way by avoiding division.  */
+	   in an optimized way by avoiding division.
+
+	   But why two tracks at all?  Very easy: efficiency.  Some FP
+	   instructions can overlap with a certain number of integer (and
+	   FP) instructions.  So we get (except for the imull) all
+	   instructions for free.  */
 
 	fld	%st(0)			/* xm : xm */
 
@@ -161,20 +173,24 @@ ENTRY(__cbrt)
 	fadd	%st(0)			/* 2*t2 : t2+2*xm : u : xm */
 			subl	%edx, %ecx
 	faddp	%st, %st(3)		/* t2+2*xm : u : 2*t2+xm */
+			shll	$3, %ecx	/* %ecx *= 8: MOX no longer scales, so turn the table index into a byte offset (8 bytes per .double) */
 	fmulp				/* u*(t2+2*xm) : 2*t2+xm */
 	fdivp	%st, %st(1)		/* u*(t2+2*xm)/(2*t2+xm) */
-	fmull	MOX(16+factor,%ecx,8)	/* u*(t2+2*xm)/(2*t2+xm)*FACT */
+	fmull	MOX(16+factor,%ecx)	/* u*(t2+2*xm)/(2*t2+xm)*FACT */
 	pushl	%eax
 	fildl	(%esp)			/* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
 	fxch				/* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
-	popl	%eax
 	fscale				/* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
-	fstp	%st(1)
+	popl	%edx			/* discard the temporary pushed for fildl; %eax is reloaded just below */
 #ifdef PIC
+	movl	12(%esp), %eax		/* high 32 bits of the double at 8(%esp); saved %ebx is still on the stack, hence 12 rather than 8 */
 	popl	%ebx
+#else
+	movl	8(%esp), %eax		/* high 32 bits of the double at 4(%esp) */
 #endif
-	testl	$0x80000000, 8(%esp)
-	jz	4f
+	testl	%eax, %eax		/* sign flag := bit 31 of that word */
+	fstp	%st(1)			/* drop the fscale exponent operand, keeping the result in %st(0); placed between testl and jns, so jns relies on it leaving EFLAGS untouched */
+	jns	4f			/* bit 31 clear: skip the fchs */
 	fchs
 4:	ret