Diffstat (limited to 'gcc')
 -rw-r--r--  gcc/ChangeLog                | 13
 -rw-r--r--  gcc/config/sh/lib1funcs.asm  | 57
 2 files changed, 43 insertions, 27 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 5b70058..541c41a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,16 @@
+Tue Jun 25 20:59:56 2002  J"orn Rennecke <joern.rennecke@superh.com>
+
+	* config/sh/lib1funcs.asm (udivdi3): Make first divide step
+	produce a 32 bit result before normalization, then normalize with a
+	left shift.  Compute approximative error of 2nd reciprocal
+	approximation in 2's complement.  Fix mask generation from upper
+	longword of second divide stage result.
+	For large divisor, fix shift count used to truncate first stage
+	divide result; make decision if to adjust upwards based on comparison
+	of higher parts of normalized values.
+	(udivdi): Likewise.  Undo normalization of result for large divisor
+	case.
+
 2002-06-25  David S. Miller  <davem@redhat.com>
 
 	* config/sparc/sparc.md: Change \\{t,n} to \{t,n}.
diff --git a/gcc/config/sh/lib1funcs.asm b/gcc/config/sh/lib1funcs.asm
index 2364db9..1021c9b 100644
--- a/gcc/config/sh/lib1funcs.asm
+++ b/gcc/config/sh/lib1funcs.asm
@@ -1501,33 +1501,33 @@ GLOBAL(udivdi3):
 	shlri r6,32,r7
 	bgt/u r9,r63,tr0 // large_divisor
 	mmulfx.w r5,r4,r4
-	shlri r2,32,r19
-	addi r20,14-1,r0
+	shlri r2,32+14,r19
+	addi r22,-31,r0
 	msub.w r1,r4,r1

 	mulu.l r1,r7,r4
 	addi r1,-3,r5
 	mulu.l r5,r19,r5
+	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
 	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000
 			 - or, as the case may be, %0000000000000000 000.11111111111,
 			   still */
 	muls.l r1,r4,r4 /* leaving at least one sign bit.  */
-	shlrd r5,r0,r8
-	mulu.l r8,r3,r5
+	mulu.l r5,r3,r8
 	mshalds.l r1,r21,r1
 	shari r4,26,r4
-	shlli r5,32,r5
-	sub r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
-	sub r2,r5,r2
+	shlld r8,r0,r8
+	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+	sub r2,r8,r2
 	/* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */
 	shlri r2,22,r21
 	mulu.l r21,r1,r21
+	shlld r5,r0,r8
 	addi r20,30-22,r0
-	shlli r8,32,r8
 	shlrd r21,r0,r21
 	mulu.l r21,r3,r5
 	add r8,r21,r8
-	mcmpeq.l r21,r63,r21 // See Note 1
+	mcmpgt.l r21,r63,r21 // See Note 1
 	addi r20,30,r0
 	mshfhi.l r63,r21,r21
 	sub r2,r5,r2
@@ -1555,14 +1555,15 @@ LOCAL(large_divisor):
 	mulu.l r1,r7,r4
 	addi r1,-3,r5
 	mulu.l r5,r8,r5
+	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
 	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000
 			 - or, as the case may be, %0000000000000000 000.11111111111,
 			   still */
 	muls.l r1,r4,r4 /* leaving at least one sign bit.  */
-	shlri r5,14-1+32,r8
+	shlri r5,14-1,r8
 	mulu.l r8,r7,r5
 	mshalds.l r1,r21,r1
 	shari r4,26,r4
-	sub r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
 	sub r25,r5,r25
 	/* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */
@@ -1575,11 +1576,11 @@ LOCAL(large_divisor):
 	add r8,r21,r8
 	shlld r2,r0,r2
 	sub r25,r5,r25
-	mextr4 r2,r25,r2
-	bgtu/u r6,r2,tr0 // no_lo_adj
+	bgtu/u r7,r25,tr0 // no_lo_adj
 	addi r8,1,r8
-	sub r2,r6,r2
+	sub r25,r7,r25
 LOCAL(no_lo_adj):
+	mextr4 r2,r25,r2
 	/* large_divisor: only needs a few adjustments.
 	   */
 	mulu.l r8,r6,r5
@@ -1647,22 +1648,22 @@ GLOBAL(umoddi3):
 	shlri r6,32,r7
 	bgt/u r9,r63,tr0 // large_divisor
 	mmulfx.w r5,r4,r4
-	shlri r2,32,r19
-	addi r20,14-1,r0
+	shlri r2,32+14,r19
+	addi r22,-31,r0
 	msub.w r1,r4,r1

 	mulu.l r1,r7,r4
 	addi r1,-3,r5
 	mulu.l r5,r19,r5
+	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
 	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000
 			 - or, as the case may be, %0000000000000000 000.11111111111,
 			   still */
 	muls.l r1,r4,r4 /* leaving at least one sign bit.  */
-	shlrd r5,r0,r8
-	mulu.l r8,r3,r5
+	mulu.l r5,r3,r5
 	mshalds.l r1,r21,r1
 	shari r4,26,r4
-	shlli r5,32,r5
-	sub r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+	shlld r5,r0,r5
+	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
 	sub r2,r5,r2
 	/* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */
@@ -1672,7 +1673,7 @@ GLOBAL(umoddi3):
 	/* bubble */ /* could test r3 here to check for divide by zero.  */
 	shlrd r21,r0,r21
 	mulu.l r21,r3,r5
-	mcmpeq.l r21,r63,r21 // See Note 1
+	mcmpgt.l r21,r63,r21 // See Note 1
 	addi r20,30,r0
 	mshfhi.l r63,r21,r21
 	sub r2,r5,r2
@@ -1700,14 +1701,15 @@ LOCAL(large_divisor):
 	mulu.l r1,r7,r4
 	addi r1,-3,r5
 	mulu.l r5,r8,r5
+	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
 	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000
 			 - or, as the case may be, %0000000000000000 000.11111111111,
 			   still */
 	muls.l r1,r4,r4 /* leaving at least one sign bit.  */
-	shlri r5,14-1+32,r8
+	shlri r5,14-1,r8
 	mulu.l r8,r7,r5
 	mshalds.l r1,r21,r1
 	shari r4,26,r4
-	sub r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
 	sub r25,r5,r25
 	/* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */
@@ -1720,19 +1722,20 @@ LOCAL(large_divisor):
 	add r8,r21,r8
 	shlld r2,r0,r2
 	sub r25,r5,r25
-	mextr4 r2,r25,r2
-	bgtu/u r6,r2,tr0 // no_lo_adj
+	bgtu/u r7,r25,tr0 // no_lo_adj
 	addi r8,1,r8
-	sub r2,r6,r2
+	sub r25,r7,r25
 LOCAL(no_lo_adj):
+	mextr4 r2,r25,r2
 	/* large_divisor: only needs a few adjustments.
 	   */
 	mulu.l r8,r6,r5
 	ptabs r18,tr0
-	add r2,r3,r7
+	add r2,r6,r7
 	cmpgtu r5,r2,r8
 	cmvne r8,r7,r2
 	sub r2,r5,r2
+	shlrd r2,r22,r2
 	blink tr0,r63
 /* Note 1: To shift the result of the second divide stage so that the result
    always fits into 32 bits, yet we still reduce the rest sufficiently
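The ChangeLog item "Compute approximative error of 2nd reciprocal approximation in 2's complement" refers to the Newton-Raphson refinement that the mmulfx.w/msub.w sequences above apply to a fixed-point reciprocal estimate: each round forms the signed error of the current estimate and folds it back in, the classic update x' = x * (2 - d*x). The following is only a rough C model of that idea under an assumed Q32.32 format; recip_q32, the linear first guess and the iteration count are illustrative and do not mirror the 16-bit fixed-point steps the assembly uses.

#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 u128;     /* GCC extension, used only for wide products */

/* d_fx: divisor in Q32.32 fixed point, normalized so 0.5 <= d < 1.0.
   Returns an approximation of 1/d, also in Q32.32 (value in (1.0, 2.0]).  */
static uint64_t recip_q32(uint64_t d_fx)
{
    const uint64_t one = 1ULL << 32;    /* 1.0 in Q32.32 */
    uint64_t x = 3 * one - 2 * d_fx;    /* crude linear first guess of 1/d */
    int i;

    for (i = 0; i < 4; i++) {
        /* dx is d*x in Q32.32; its signed distance from 1.0 is the
           approximation error, folded back in via x' = x * (2 - d*x).  */
        uint64_t dx = (uint64_t)(((u128)d_fx * x) >> 32);
        x = (uint64_t)(((u128)x * (2 * one - dx)) >> 32);
    }
    return x;
}

int main(void)
{
    uint64_t d_fx = 0xB0000000ULL;      /* 0.6875 in Q32.32 */
    uint64_t x = recip_q32(d_fx);
    uint64_t prod = (uint64_t)(((u128)d_fx * x) >> 32);

    /* prod should land just below 1ULL << 32, i.e. just below 1.0.  */
    printf("1/d ~= %#llx, d*(1/d) ~= %#llx, 1.0 == %#llx\n",
           (unsigned long long)x, (unsigned long long)prod,
           (unsigned long long)(1ULL << 32));
    return 0;
}

Each iteration roughly squares the relative error, so even the crude 3 - 2d first guess reaches the full 32 fractional bits after a handful of rounds; keeping the sign of that error term correct appears to be what the "2's complement" wording in the ChangeLog is about.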

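The other recurring idea is that the reciprocal estimate is deliberately kept at or below the true value ("Negate to make sure r1 ends up <= 1/r2" above), so a quotient digit derived from it can only be too small; the fix-up around LOCAL(no_lo_adj) is therefore always an upward adjustment, decided by comparing the remaining rest against the divisor, which is the comparison the patch corrects for the large-divisor path. Below is a hypothetical plain-C sketch of one such divide step; divide_step, the floor((2^63-1)/d) reciprocal and the constants in main are invented for illustration and are far cruder than the fixed-point values the assembly actually computes.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 u128;     /* GCC extension, used only for wide products */

/* One divide step: rest / d with d normalized (bit 31 set) and
   rest < d * 2^32, so the quotient digit fits in 32 bits.
   recip is a 32-bit UNDER-estimate of 2^63 / d, so q never overshoots
   and any correction is an upward one.  */
static uint32_t divide_step(uint64_t rest, uint32_t d, uint32_t recip,
                            uint64_t *rest_out)
{
    uint64_t q = (uint64_t)(((u128)rest * recip) >> 63);
    uint64_t rem = rest - q * (uint64_t)d;

    /* The assembly keeps the estimate within one of the true digit and
       decides the single +1 by one comparison (the no_lo_adj branch);
       with this cruder recip a short loop covers the slightly larger error.  */
    while (rem >= d) {
        q++;
        rem -= d;
    }
    *rest_out = rem;
    return (uint32_t)q;
}

int main(void)
{
    uint32_t d = 0x9e3779b9u;                       /* normalized divisor */
    uint32_t recip = (uint32_t)(((1ULL << 63) - 1) / d);
    uint64_t rest = 0x0123456789abcdefULL;          /* < d << 32 */
    uint64_t rem;
    uint32_t q = divide_step(rest, d, recip, &rem);

    assert(q == rest / d && rem == rest % d);
    printf("digit %#x, rest %#llx\n", (unsigned)q, (unsigned long long)rem);
    return 0;
}

Because the assembly bounds the estimation error to a single unit, one compare plus a conditional add replaces the sketch's loop; the looser reciprocal here merely needs the same correction applied a couple more times.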