3 files changed, 33 insertions, 15 deletions
diff --git a/ChangeLog b/ChangeLog
index 25691cd..26429c6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2010-02-24  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/i386/i686/multiarch/memset-sse2-rep.S: Remove redundant
+	punpcklbw.
+	Use unsigned conditional jumps.
+	(128bytesormore_nt): Renamed to ...
+	(128bytesormore_endof_L1): This.
+	Use add instead of lea if possible.
+	Correct unwind info.
+	* sysdeps/i386/i686/multiarch/memset-sse2.S: Remove redundant
+	punpcklbw.
+	Use unsigned conditional jumps.
+	Use add instead of lea if possible.
+	Correct unwind info.
+
 2010-02-24  Ulrich Drepper  <drepper@redhat.com>
 
 	[BZ #11319]
diff --git a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
index 84afffe..f9a0b13 100644
--- a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
+++ b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
@@ -243,7 +243,6 @@ L(32bytesormore):
 	pxor	%xmm0, %xmm0
 #else
 	movd	%eax, %xmm0
-	punpcklbw %xmm0, %xmm0
 	pshufd	$0, %xmm0, %xmm0
 #endif
 	testl	$0xf, %edx
@@ -261,7 +260,7 @@ L(not_aligned_16):
 	ALIGN (4)
 L(aligned_16):
 	cmp	$128, %ecx
-	jge	L(128bytesormore)
+	jae	L(128bytesormore)
 
 L(aligned_16_less128bytes):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
@@ -293,7 +292,7 @@ L(128bytesormore):
  * fast string will prefetch and combine data efficiently.
  */
 	cmp	%edi, %ecx
-	jae	L(128bytesormore_nt)
+	jae	L(128bytesormore_endof_L1)
 	subl	$128, %ecx
 L(128bytesormore_normal):
 	sub	$128, %ecx
@@ -306,7 +305,7 @@ L(128bytesormore_normal):
 	movdqa	%xmm0, 0x60(%edx)
 	movdqa	%xmm0, 0x70(%edx)
 	lea	128(%edx), %edx
-	jl	L(128bytesless_normal)
+	jb	L(128bytesless_normal)
 
 
 	sub	$128, %ecx
@@ -319,15 +318,16 @@ L(128bytesormore_normal):
 	movdqa	%xmm0, 0x60(%edx)
 	movdqa	%xmm0, 0x70(%edx)
 	lea	128(%edx), %edx
-	jge	L(128bytesormore_normal)
+	jae	L(128bytesormore_normal)
 
 L(128bytesless_normal):
 	POP (%edi)
-	lea	128(%ecx), %ecx
+	add	$128, %ecx
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
 
+	CFI_PUSH (%edi)
 	ALIGN (4)
-L(128bytesormore_nt):
+L(128bytesormore_endof_L1):
 	mov	%edx, %edi
 	mov	%ecx, %edx
 	shr	$2, %ecx
diff --git a/sysdeps/i386/i686/multiarch/memset-sse2.S b/sysdeps/i386/i686/multiarch/memset-sse2.S
index b2b9791..92ad601 100644
--- a/sysdeps/i386/i686/multiarch/memset-sse2.S
+++ b/sysdeps/i386/i686/multiarch/memset-sse2.S
@@ -243,7 +243,6 @@ L(32bytesormore):
 	pxor	%xmm0, %xmm0
 #else
 	movd	%eax, %xmm0
-	punpcklbw %xmm0, %xmm0
 	pshufd	$0, %xmm0, %xmm0
 #endif
 	testl	$0xf, %edx
@@ -261,7 +260,7 @@ L(not_aligned_16):
 	ALIGN (4)
 L(aligned_16):
 	cmp	$128, %ecx
-	jge	L(128bytesormore)
+	jae	L(128bytesormore)
 
 L(aligned_16_less128bytes):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
@@ -287,14 +286,17 @@ L(128bytesormore):
 
 #ifdef DATA_CACHE_SIZE
 	POP (%ebx)
+# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
 	cmp	$DATA_CACHE_SIZE, %ecx
 #else
 # ifdef SHARED
+#  define RESTORE_EBX_STATE
 	call	__i686.get_pc_thunk.bx
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
 # else
 	POP (%ebx)
+#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
 	cmp	__x86_data_cache_size, %ecx
 # endif
 #endif
@@ -312,7 +314,7 @@ L(128bytesormore_normal):
 	movdqa	%xmm0, 0x60(%edx)
 	movdqa	%xmm0, 0x70(%edx)
 	lea	128(%edx), %edx
-	jl	L(128bytesless_normal)
+	jb	L(128bytesless_normal)
 
 
 	sub	$128, %ecx
@@ -325,10 +327,10 @@ L(128bytesormore_normal):
 	movdqa	%xmm0, 0x60(%edx)
 	movdqa	%xmm0, 0x70(%edx)
 	lea	128(%edx), %edx
-	jge	L(128bytesormore_normal)
+	jae	L(128bytesormore_normal)
 
 L(128bytesless_normal):
-	lea	128(%ecx), %ecx
+	add	$128, %ecx
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
 
 	ALIGN (4)
@@ -346,11 +348,12 @@ L(128bytes_L2_normal):
 	movaps	%xmm0, 0x70(%edx)
 	add	$128, %edx
 	cmp	$128, %ecx
-	jge	L(128bytes_L2_normal)
+	jae	L(128bytes_L2_normal)
 
 L(128bytesless_L2_normal):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
 
+	RESTORE_EBX_STATE
 L(128bytesormore_nt_start):
 	sub	%ebx, %ecx
 	ALIGN (4)
@@ -368,7 +371,7 @@ L(128bytesormore_shared_cache_loop):
 	movdqa	%xmm0, 0x70(%edx)
 	add	$0x80, %edx
 	cmp	$0x80, %ebx
-	jge	L(128bytesormore_shared_cache_loop)
+	jae	L(128bytesormore_shared_cache_loop)
 	cmp	$0x80, %ecx
 	jb	L(shared_cache_loop_end)
 	ALIGN (4)
@@ -384,7 +387,7 @@ L(128bytesormore_nt):
 	movntdq	%xmm0, 0x70(%edx)
 	add	$0x80, %edx
 	cmp	$0x80, %ecx
-	jge	L(128bytesormore_nt)
+	jae	L(128bytesormore_nt)
 	sfence
 L(shared_cache_loop_end):
 #if defined DATA_CACHE_SIZE || !defined SHARED