Commit | Line | Data |
---|---|---|
1da177e4 | 1 | /* Copyright 2002 Andi Kleen */ |
038b0a6d | 2 | |
8d379dad | 3 | #include <linux/linkage.h> |
8d379dad | 4 | #include <asm/cpufeature.h> |
f3b6eaf0 | 5 | #include <asm/dwarf2.h> |
101068c1 | 6 | #include <asm/alternative-asm.h> |
8d379dad | 7 | |
e0bc8d17 BP | 8 | /* |
e0bc8d17 BP | 9 | * We build a jump to memcpy_orig by default which gets NOPped out on |
e0bc8d17 BP | 10 | * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which |
e0bc8d17 BP | 11 | * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs |
e0bc8d17 BP | 12 | * to a jmp to memcpy_erms which does the REP; MOVSB mem copy. |
e0bc8d17 BP | 13 | */ |
e0bc8d17 BP | 14 | |
e0bc8d17 BP | 15 | .weak memcpy |
e0bc8d17 BP | 16 | |
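The comment above describes boot-time patching: ALTERNATIVE_2 emits a jmp to memcpy_orig by default, NOPs it out when X86_FEATURE_REP_GOOD is set, and rewrites it into a jmp to memcpy_erms when X86_FEATURE_ERMS is set. Conceptually this amounts to picking one of three implementations per CPU. A minimal user-space sketch of that idea (the feature probes and the `memcpy_impl` pointer are hypothetical stand-ins; the kernel patches the instruction bytes in place at boot rather than calling through a pointer):

```c
#include <stddef.h>

/* Hypothetical feature probes standing in for the X86_FEATURE_* bits. */
extern int cpu_has_rep_good(void);
extern int cpu_has_erms(void);

void *memcpy_orig(void *dst, const void *src, size_t n);
void *memcpy_rep_movsq(void *dst, const void *src, size_t n);
void *memcpy_erms(void *dst, const void *src, size_t n);

/* Chosen once at startup; the kernel instead patches the jmp/NOPs in place. */
void *(*memcpy_impl)(void *, const void *, size_t);

void pick_memcpy(void)
{
    if (cpu_has_erms())           /* REP MOVSB is fast: use it for everything */
        memcpy_impl = memcpy_erms;
    else if (cpu_has_rep_good())  /* REP MOVSQ is fast: qwords plus byte tail */
        memcpy_impl = memcpy_rep_movsq;
    else                          /* fall back to the unrolled copy */
        memcpy_impl = memcpy_orig;
}
```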
1da177e4 LT | 17 | /* |
1da177e4 LT | 18 | * memcpy - Copy a memory block. |
1da177e4 LT | 19 | * |
f3b6eaf0 IM | 20 | * Input: |
f3b6eaf0 IM | 21 | * rdi destination |
f3b6eaf0 IM | 22 | * rsi source |
f3b6eaf0 IM | 23 | * rdx count |
f3b6eaf0 IM | 24 | * |
1da177e4 LT | 25 | * Output: |
1da177e4 LT | 26 | * rax original destination |
f3b6eaf0 | 27 | */ |
e0bc8d17 BP | 28 | ENTRY(__memcpy) |
e0bc8d17 BP | 29 | ENTRY(memcpy) |
e0bc8d17 BP | 30 | ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ |
e0bc8d17 BP | 31 | "jmp memcpy_erms", X86_FEATURE_ERMS |
1da177e4 | 32 | |
f3b6eaf0 | 33 | movq %rdi, %rax |
2ab56091 JB | 34 | movq %rdx, %rcx |
2ab56091 JB | 35 | shrq $3, %rcx |
f3b6eaf0 | 36 | andl $7, %edx |
8d379dad | 37 | rep movsq |
f3b6eaf0 | 38 | movl %edx, %ecx |
8d379dad JB | 39 | rep movsb |
8d379dad JB | 40 | ret |
e0bc8d17 BP | 41 | ENDPROC(memcpy) |
e0bc8d17 BP | 42 | ENDPROC(__memcpy) |
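The REP_GOOD body above copies count >> 3 qwords with `rep movsq`, then the remaining count & 7 bytes with `rep movsb`, and returns the original destination in rax. The same arithmetic in C (a sketch with the function name made up for illustration; plain loops stand in for the string instructions, which have no alignment requirements):

```c
#include <stddef.h>
#include <stdint.h>

void *memcpy_rep_good_sketch(void *dst, const void *src, size_t count)
{
    uint64_t *d8 = dst;
    const uint64_t *s8 = src;
    size_t nq = count >> 3;      /* shrq $3, %rcx: number of qwords */
    size_t nb = count & 7;       /* andl $7, %edx: leftover bytes   */

    while (nq--)                 /* rep movsq */
        *d8++ = *s8++;

    unsigned char *d1 = (unsigned char *)d8;
    const unsigned char *s1 = (const unsigned char *)s8;
    while (nb--)                 /* rep movsb */
        *d1++ = *s1++;

    return dst;                  /* rax = original destination */
}
```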
8d379dad | 43 | |
101068c1 | 44 | /* |
e0bc8d17 BP | 45 | * memcpy_erms() - enhanced fast string memcpy. This is faster and |
e0bc8d17 BP | 46 | * simpler than memcpy. Use memcpy_erms when possible. |
101068c1 | 47 | */ |
e0bc8d17 | 48 | ENTRY(memcpy_erms) |
101068c1 | 49 | movq %rdi, %rax |
2ab56091 | 50 | movq %rdx, %rcx |
101068c1 FY | 51 | rep movsb |
101068c1 FY | 52 | ret |
e0bc8d17 | 53 | ENDPROC(memcpy_erms) |
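On ERMS hardware a single `rep movsb` is fast at every size, so the whole function is one instruction plus the return-value setup. A user-space equivalent using GCC/Clang extended inline asm (a sketch, x86-64 only; the function name is illustrative):

```c
#include <stddef.h>

void *memcpy_erms_sketch(void *dst, const void *src, size_t count)
{
    void *ret = dst;
    /* rep movsb copies %rcx bytes from (%rsi) to (%rdi). */
    asm volatile("rep movsb"
                 : "+D"(dst), "+S"(src), "+c"(count)
                 :
                 : "memory");
    return ret;
}
```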
393f203f | 54 | |
e0bc8d17 | 55 | ENTRY(memcpy_orig) |
8d379dad | 56 | CFI_STARTPROC |
59daa706 | 57 | movq %rdi, %rax |
7bcd3f34 | 58 | |
2ab56091 | 59 | cmpq $0x20, %rdx |
59daa706 | 60 | jb .Lhandle_tail |
7bcd3f34 | 61 | |
f3b6eaf0 | 62 | /* |
9de4966a | 63 | * We check whether a memory false dependence could occur, |
59daa706 | 64 | * then jump to the corresponding copy mode. |
f3b6eaf0 | 65 | */ |
59daa706 ML | 66 | cmp %dil, %sil |
59daa706 ML | 67 | jl .Lcopy_backward |
2ab56091 | 68 | subq $0x20, %rdx |
59daa706 ML | 69 | .Lcopy_forward_loop: |
59daa706 ML | 70 | subq $0x20, %rdx |
7bcd3f34 | 71 | |
f3b6eaf0 | 72 | /* |
59daa706 | 73 | * Move in blocks of 4x8 bytes: |
f3b6eaf0 | 74 | */ |
59daa706 ML | 75 | movq 0*8(%rsi), %r8 |
59daa706 ML | 76 | movq 1*8(%rsi), %r9 |
59daa706 ML | 77 | movq 2*8(%rsi), %r10 |
59daa706 ML | 78 | movq 3*8(%rsi), %r11 |
59daa706 ML | 79 | leaq 4*8(%rsi), %rsi |
59daa706 ML | 80 | |
59daa706 ML | 81 | movq %r8, 0*8(%rdi) |
59daa706 ML | 82 | movq %r9, 1*8(%rdi) |
59daa706 ML | 83 | movq %r10, 2*8(%rdi) |
59daa706 ML | 84 | movq %r11, 3*8(%rdi) |
59daa706 ML | 85 | leaq 4*8(%rdi), %rdi |
59daa706 ML | 86 | jae .Lcopy_forward_loop |
2ab56091 | 87 | addl $0x20, %edx |
59daa706 ML | 88 | jmp .Lhandle_tail |
59daa706 ML | 89 | |
59daa706 ML | 90 | .Lcopy_backward: |
59daa706 ML | 91 | /* |
59daa706 ML | 92 | * Calculate copy position to tail. |
59daa706 ML | 93 | */ |
59daa706 ML | 94 | addq %rdx, %rsi |
59daa706 ML | 95 | addq %rdx, %rdi |
59daa706 ML | 96 | subq $0x20, %rdx |
59daa706 ML | 97 | /* |
59daa706 ML | 98 | * At most 3 ALU operations in one cycle, |
d50ba368 | 99 | * so append NOPs in the same 16-byte chunk. |
59daa706 ML | 100 | */ |
59daa706 ML | 101 | .p2align 4 |
59daa706 ML | 102 | .Lcopy_backward_loop: |
59daa706 ML | 103 | subq $0x20, %rdx |
59daa706 ML | 104 | movq -1*8(%rsi), %r8 |
59daa706 ML | 105 | movq -2*8(%rsi), %r9 |
59daa706 ML | 106 | movq -3*8(%rsi), %r10 |
59daa706 ML | 107 | movq -4*8(%rsi), %r11 |
59daa706 ML | 108 | leaq -4*8(%rsi), %rsi |
59daa706 ML | 109 | movq %r8, -1*8(%rdi) |
59daa706 ML | 110 | movq %r9, -2*8(%rdi) |
59daa706 ML | 111 | movq %r10, -3*8(%rdi) |
59daa706 ML | 112 | movq %r11, -4*8(%rdi) |
59daa706 ML | 113 | leaq -4*8(%rdi), %rdi |
59daa706 ML | 114 | jae .Lcopy_backward_loop |
7bcd3f34 | 115 | |
59daa706 ML | 116 | /* |
59daa706 ML | 117 | * Calculate copy position to head. |
59daa706 ML | 118 | */ |
2ab56091 | 119 | addl $0x20, %edx |
59daa706 ML | 120 | subq %rdx, %rsi |
59daa706 ML | 121 | subq %rdx, %rdi |
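A C rendering of the block-copy stage of memcpy_orig (a sketch; `copy_blocks` is an illustrative name, not a kernel symbol). Two details of the asm are worth spelling out: the direction test is a signed compare of only the low address bytes (cmp %dil, %sil), a cheap heuristic for store-to-load false dependences, and %rdx is biased down by 0x20 before the loop so the in-loop subq's flags double as the loop condition (restored by the addl $0x20 afterwards). Plain C uses an explicit block count instead, and per-qword memcpy calls stand in for the movq instructions so unaligned accesses stay well defined:

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Copy n bytes in 32-byte blocks, returning the tail length (< 0x20).
 * Direction mirrors "cmp %dil, %sil; jl .Lcopy_backward": a signed
 * compare of the low 8 bits of source and destination addresses. */
static size_t copy_blocks(unsigned char *d, const unsigned char *s, size_t n)
{
    size_t i, blocks = n >> 5;
    uint64_t r8, r9, r10, r11;               /* the four block registers */

    if ((int8_t)(uintptr_t)s < (int8_t)(uintptr_t)d) {
        for (i = blocks; i-- > 0; ) {        /* .Lcopy_backward_loop */
            memcpy(&r8,  s + i*32 + 24, 8);  /* movq -1*8(%rsi), %r8 */
            memcpy(&r9,  s + i*32 + 16, 8);
            memcpy(&r10, s + i*32 +  8, 8);
            memcpy(&r11, s + i*32 +  0, 8);
            memcpy(d + i*32 + 24, &r8,  8);
            memcpy(d + i*32 + 16, &r9,  8);
            memcpy(d + i*32 +  8, &r10, 8);
            memcpy(d + i*32 +  0, &r11, 8);
        }
    } else {
        for (i = 0; i < blocks; i++) {       /* .Lcopy_forward_loop */
            memcpy(&r8,  s + i*32 +  0, 8);  /* 4 loads, then 4 stores */
            memcpy(&r9,  s + i*32 +  8, 8);
            memcpy(&r10, s + i*32 + 16, 8);
            memcpy(&r11, s + i*32 + 24, 8);
            memcpy(d + i*32 +  0, &r8,  8);
            memcpy(d + i*32 +  8, &r9,  8);
            memcpy(d + i*32 + 16, &r10, 8);
            memcpy(d + i*32 + 24, &r11, 8);
        }
    }
    return n & 0x1f;             /* remainder goes to .Lhandle_tail */
}
```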
7bcd3f34 | 122 | .Lhandle_tail: |
2ab56091 | 123 | cmpl $16, %edx |
59daa706 | 124 | jb .Lless_16bytes |
f3b6eaf0 | 125 | |
59daa706 ML | 126 | /* |
59daa706 ML | 127 | * Move data from 16 bytes to 31 bytes. |
59daa706 ML | 128 | */ |
59daa706 ML | 129 | movq 0*8(%rsi), %r8 |
59daa706 ML | 130 | movq 1*8(%rsi), %r9 |
59daa706 ML | 131 | movq -2*8(%rsi, %rdx), %r10 |
59daa706 ML | 132 | movq -1*8(%rsi, %rdx), %r11 |
59daa706 ML | 133 | movq %r8, 0*8(%rdi) |
59daa706 ML | 134 | movq %r9, 1*8(%rdi) |
59daa706 ML | 135 | movq %r10, -2*8(%rdi, %rdx) |
59daa706 ML | 136 | movq %r11, -1*8(%rdi, %rdx) |
59daa706 ML | 137 | retq |
7bcd3f34 | 138 | .p2align 4 |
59daa706 | 139 | .Lless_16bytes: |
2ab56091 | 140 | cmpl $8, %edx |
59daa706 ML | 141 | jb .Lless_8bytes |
59daa706 ML | 142 | /* |
59daa706 ML | 143 | * Move data from 8 bytes to 15 bytes. |
59daa706 ML | 144 | */ |
59daa706 ML | 145 | movq 0*8(%rsi), %r8 |
59daa706 ML | 146 | movq -1*8(%rsi, %rdx), %r9 |
59daa706 ML | 147 | movq %r8, 0*8(%rdi) |
59daa706 ML | 148 | movq %r9, -1*8(%rdi, %rdx) |
59daa706 ML | 149 | retq |
59daa706 ML | 150 | .p2align 4 |
59daa706 ML | 151 | .Lless_8bytes: |
2ab56091 | 152 | cmpl $4, %edx |
59daa706 | 153 | jb .Lless_3bytes |
f3b6eaf0 | 154 | |
59daa706 ML | 155 | /* |
59daa706 ML | 156 | * Move data from 4 bytes to 7 bytes. |
59daa706 ML | 157 | */ |
59daa706 ML | 158 | movl (%rsi), %ecx |
59daa706 ML | 159 | movl -4(%rsi, %rdx), %r8d |
59daa706 ML | 160 | movl %ecx, (%rdi) |
59daa706 ML | 161 | movl %r8d, -4(%rdi, %rdx) |
59daa706 ML | 162 | retq |
7bcd3f34 | 163 | .p2align 4 |
59daa706 | 164 | .Lless_3bytes: |
9d8e2277 JB | 165 | subl $1, %edx |
9d8e2277 JB | 166 | jb .Lend |
59daa706 ML | 167 | /* |
59daa706 ML | 168 | * Move data from 1 byte to 3 bytes. |
59daa706 ML | 169 | */ |
9d8e2277 JB | 170 | movzbl (%rsi), %ecx |
9d8e2277 JB | 171 | jz .Lstore_1byte |
9d8e2277 JB | 172 | movzbq 1(%rsi), %r8 |
9d8e2277 JB | 173 | movzbq (%rsi, %rdx), %r9 |
9d8e2277 JB | 174 | movb %r8b, 1(%rdi) |
9d8e2277 JB | 175 | movb %r9b, (%rdi, %rdx) |
9d8e2277 JB | 176 | .Lstore_1byte: |
9d8e2277 JB | 177 | movb %cl, (%rdi) |
7bcd3f34 | 178 | |
f3b6eaf0 | 179 | .Lend: |
59daa706 | 180 | retq |
8d379dad | 181 | CFI_ENDPROC |
e0bc8d17 | 182 | ENDPROC(memcpy_orig) |
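All four tail brackets in .Lhandle_tail use one trick: load the first k and the last k bytes of the remainder before storing any of them, so two possibly-overlapping moves cover any length in the bracket without a byte loop. A sketch of the 16-31 and 8-15 brackets (`copy_tail` is an illustrative name; the 4-7 and 1-3 byte brackets repeat the pattern at narrower widths):

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Copy a tail of n bytes (8 <= n <= 31) with overlapping moves, as
 * .Lhandle_tail does: all loads happen before any store, so the head
 * and tail ranges may overlap within dst. */
static void copy_tail(unsigned char *d, const unsigned char *s, size_t n)
{
    if (n >= 16) {                    /* 16..31 bytes: two 16-byte moves */
        uint64_t a, b, c, e;
        memcpy(&a, s, 8);             /* movq 0*8(%rsi), %r8        */
        memcpy(&b, s + 8, 8);         /* movq 1*8(%rsi), %r9        */
        memcpy(&c, s + n - 16, 8);    /* movq -2*8(%rsi,%rdx), %r10 */
        memcpy(&e, s + n - 8, 8);     /* movq -1*8(%rsi,%rdx), %r11 */
        memcpy(d, &a, 8);
        memcpy(d + 8, &b, 8);
        memcpy(d + n - 16, &c, 8);
        memcpy(d + n - 8, &e, 8);
    } else {                          /* 8..15 bytes: two 8-byte moves */
        uint64_t a, b;
        memcpy(&a, s, 8);             /* movq 0*8(%rsi), %r8       */
        memcpy(&b, s + n - 8, 8);     /* movq -1*8(%rsi,%rdx), %r9 */
        memcpy(d, &a, 8);
        memcpy(d + n - 8, &b, 8);
    }
}
```

Because every load precedes every store, the overlap between the head and tail moves is harmless. In outline, memcpy_orig is then: if n >= 0x20, run the block-copy stage and keep the sub-0x20 remainder; dispatch on that remainder to one of these tail brackets.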