path: root/newlib/libc/machine/i386/memset.S
blob: 83b25567b7f2a8baffab427bdc45915e62e67007
/*
 * ====================================================
 * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
 *
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */

	#include "i386mach.h"

	.global SYM (memset)
	SOTYPE_FUNCTION(memset)

SYM (memset):

#ifdef __iamcu__
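/* The IAMCU ABI passes dest, the fill value and the count in eax, edx
   and ecx, so a single 'rep stosb' does the whole job: point edi at
   dest, put the fill byte in al, store, and hand dest back in eax.  */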
	pushl edi
	movl eax,edi
	movzbl dl,eax
	mov edi,edx
	rep stosb
	mov edx,eax
	popl edi
#else
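/* Standard i386 path: the arguments are on the stack, so set up a frame
   and load dest into edi, the fill byte into eax and the count into ecx.  */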
	pushl ebp
	movl esp,ebp
	pushl edi
	movl 8(ebp),edi
	movzbl 12(ebp),eax
	movl 16(ebp),ecx
	cld

#ifndef __OPTIMIZE_SIZE__
/* Requests of 16 bytes or less won't benefit from the 'rep stosl' loop.  */
	cmpl $16,ecx
	jbe .L19
	testl $7,edi
	je .L10

/* It turns out that 8-byte aligned 'rep stosl' outperforms
   4-byte aligned on some x86 platforms.  */
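
/* Store single bytes, re-testing after each one, until edi reaches an
   8-byte boundary; at most 7 stores are needed, so the loop is unrolled.  */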
	movb al,(edi)
	incl edi
	decl ecx
	testl $7,edi
	je .L10

	movb al,(edi)
	incl edi
	decl ecx
	testl $7,edi
	je .L10

	movb al,(edi)
	incl edi
	decl ecx
	testl $7,edi
	je .L10

	movb al,(edi)
	incl edi
	decl ecx
	testl $7,edi
	je .L10

	movb al,(edi)
	incl edi
	decl ecx
	testl $7,edi
	je .L10

	movb al,(edi)
	incl edi
	decl ecx
	testl $7,edi
	je .L10

	movb al,(edi)
	incl edi
	decl ecx

/* At this point, ecx>8 and edi%8==0.  */
.L10:
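/* Replicate the fill byte into all four bytes of eax so that each
   'stosl' below writes four copies at once.  */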
	movb al,ah
	movl eax,edx
	sall $16,edx
	orl edx,eax

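/* Store ecx/4 dwords with 'rep stosl'; the low two bits of the count
   are kept in edx and written as trailing bytes at .L19.  */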
	movl ecx,edx
	shrl $2,ecx
	andl $3,edx
	rep
	stosl
	movl edx,ecx
#endif /* not __OPTIMIZE_SIZE__ */

.L19:
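/* Store the remaining bytes one at a time.  This also handles the whole
   request for short transfers and for size-optimized builds.  */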
	rep
	stosb

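/* memset returns the original dest pointer, still available at 8(ebp).  */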
	movl 8(ebp),eax

	leal -4(ebp),esp
	popl edi
	leave
#endif
	ret