/* NGmemcpy.S: Niagara optimized memcpy.
 *
 * Copyright (C) 2006, 2007 David S. Miller ([email protected])
 */

/* Build-time knobs: every macro below has a default but may be
 * pre-defined by an including wrapper (e.g. a copy_to/from_user
 * variant) to redirect loads/stores through fault-handling ASIs.
 */

#ifdef __KERNEL__
#include <asm/asi.h>
#include <asm/thread_info.h>
#define GLOBAL_SPARE	%g7
/* Restore the %asi register from the current thread's saved value. */
#define RESTORE_ASI(TMP)	\
	ldub	[%g6 + TI_CURRENT_DS], TMP;  \
	wr	TMP, 0x0, %asi;
#else
#define GLOBAL_SPARE	%g5
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_PNF, %asi
#endif

#ifdef __sparc_v9__
#define SAVE_AMOUNT	128
#else
#define SAVE_AMOUNT	64
#endif

/* ASI used by the block-init stores in the main copy loops. */
#ifndef STORE_ASI
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif

/* EX_LD/EX_ST wrap loads/stores so user-copy variants can attach
 * exception-table entries; the plain memcpy uses them as no-ops.
 */
#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest)	type [addr], dest
#else
#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
#endif
#endif

/* 16-byte twin load; dest0/dest1 must be an even/odd register pair. */
#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1)	\
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

/* Cache-line-initializing store (see comment above label 2: in the
 * function body); falls back to a plain stx when simulating.
 */
#ifndef STORE_INIT
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_INIT(src,addr)	stxa src, [addr] %asi
#else
#define STORE_INIT(src,addr)	stx src, [addr + 0x00]
#endif
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NGmemcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
	.align		64
/* void *FUNC_NAME(void *dst, const void *src, size_t len)
 *
 * In:   %i0 = dst, %i1 = src, %i2 = len
 * Out:  %o0 = dst (returned via "restore EX_RETVAL(%i0), %g0, %o0")
 *
 * Uses %i3/%i4/%i5, %g1-%g3, GLOBAL_SPARE and %o1-%o7 as scratch.
 * The block-copy loops write through STORE_ASI (init stores), so a
 * membar #Sync is issued at 60: before any normal stores follow.
 */
	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %i0=dst, %i1=src, %i2=len */
	PREAMBLE
	save		%sp, -SAVE_AMOUNT, %sp
	srlx		%i2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5	! trap if any len bits above bit 30 are set
	mov		%i0, %o0
	cmp		%i2, 0
	be,pn		%XCC, 85f
	 or		%o0, %i1, %i3
	cmp		%i2, 16
	blu,a,pn	%XCC, 80f
	 or		%i3, %i2, %i3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 */
	cmp		%i2, (2 * 64)
	blu,pt		%XCC, 70f
	 andcc		%i3, 0x7, %g0

	/* %o0:	dst
	 * %i1:	src
	 * %i2:	len  (known to be >= 128)
	 *
	 * The block copy loops will use %i4/%i5,%g2/%g3 as
	 * temporaries while copying the data.
	 */

	LOAD(prefetch, %i1, #one_read)
	wr		%g0, STORE_ASI, %asi

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %i4
	be,pt		%XCC, 2f
	 sub		%i4, 64, %i4
	sub		%g0, %i4, %i4	! bytes to align dst
	sub		%i2, %i4, %i2
1:	subcc		%i4, 1, %i4
	EX_LD(LOAD(ldub, %i1, %g1))
	EX_ST(STORE(stb, %g1, %o0))
	add		%i1, 1, %i1
	bne,pt		%XCC, 1b
	 add		%o0, 1, %o0

	/* If the source is on a 16-byte boundary we can do
	 * the direct block copy loop.  If it is 8-byte aligned
	 * we can do the 16-byte loads offset by -8 bytes and the
	 * init stores offset by one register.
	 *
	 * If the source is not even 8-byte aligned, we need to do
	 * shifting and masking (basically integer faligndata).
	 *
	 * The careful bit with init stores is that if we store
	 * to any part of the cache line we have to store the whole
	 * cacheline else we can end up with corrupt L2 cache line
	 * contents.  Since the loop works on 64-bytes of 64-byte
	 * aligned store data at a time, this is easy to ensure.
	 */
2:
	andcc		%i1, (16 - 1), %i4
	andn		%i2, (64 - 1), %g1	! block copy loop iterator
	be,pt		%XCC, 50f
	 sub		%i2, %g1, %i2		! final sub-block copy bytes

	cmp		%i4, 8
	be,pt		%XCC, 10f
	 sub		%i1, %i4, %i1

	/* Neither 8-byte nor 16-byte aligned, shift and mask.  */
	and		%i4, 0x7, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 3, GLOBAL_SPARE	! post-shift in bits
	mov		64, %i5
	EX_LD(LOAD_TWIN(%i1, %g2, %g3))
	sub		%i5, GLOBAL_SPARE, %i5		! pre-shift in bits
	mov		16, %o4
	mov		32, %o5
	mov		48, %o7
	mov		64, %i3

	bg,pn		%XCC, 9f
	 nop

/* Merge three 64-bit words so that WORD1/WORD2 receive the
 * source bytes realigned by PRE_SHIFT/POST_SHIFT bit counts.
 */
#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP)	\
	sllx		WORD1, POST_SHIFT, WORD1; \
	srlx		WORD2, PRE_SHIFT, TMP; \
	sllx		WORD2, POST_SHIFT, WORD2; \
	or		WORD1, TMP, WORD1; \
	srlx		WORD3, PRE_SHIFT, TMP; \
	or		WORD2, TMP, WORD2;

8:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3))
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g2, %o0 + 0x00))
	EX_ST(STORE_INIT(%g3, %o0 + 0x08))

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3))
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x10))
	EX_ST(STORE_INIT(%o3, %o0 + 0x18))

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g2, %o0 + 0x20))
	EX_ST(STORE_INIT(%g3, %o0 + 0x28))

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3))
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x30))
	EX_ST(STORE_INIT(%o3, %o0 + 0x38))

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 8b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1

9:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3))
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g3, %o0 + 0x00))
	EX_ST(STORE_INIT(%o2, %o0 + 0x08))

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3))
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x10))
	EX_ST(STORE_INIT(%g2, %o0 + 0x18))

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g3, %o0 + 0x20))
	EX_ST(STORE_INIT(%o2, %o0 + 0x28))

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3))
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x30))
	EX_ST(STORE_INIT(%g2, %o0 + 0x38))

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 9b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1

10:	/* Destination is 64-byte aligned, source was only 8-byte
	 * aligned but it has been subtracted by 8 and we perform
	 * one twin load ahead, then add 8 back into source when
	 * we finish the loop.
	 */
	EX_LD(LOAD_TWIN(%i1, %o4, %o5))
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o5, %o0 + 0x00))	! initializes cache line
	EX_ST(STORE_INIT(%o2, %o0 + 0x08))
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5))
	EX_ST(STORE_INIT(%o3, %o0 + 0x10))
	EX_ST(STORE_INIT(%o4, %o0 + 0x18))
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3))
	EX_ST(STORE_INIT(%o5, %o0 + 0x20))
	EX_ST(STORE_INIT(%o2, %o0 + 0x28))
	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5))
	add	%i1, 64, %i1
	EX_ST(STORE_INIT(%o3, %o0 + 0x30))
	EX_ST(STORE_INIT(%o4, %o0 + 0x38))
	subcc	%g1, 64, %g1
	bne,pt	%XCC, 1b
	 add	%o0, 64, %o0

	ba,pt	%XCC, 60f
	 add	%i1, 0x8, %i1

50:	/* Destination is 64-byte aligned, and source is 16-byte
	 * aligned.
	 */
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5))
	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o4, %o0 + 0x00))	! initializes cache line
	EX_ST(STORE_INIT(%o5, %o0 + 0x08))
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5))
	EX_ST(STORE_INIT(%o2, %o0 + 0x10))
	EX_ST(STORE_INIT(%o3, %o0 + 0x18))
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3))
	add	%i1, 64, %i1
	EX_ST(STORE_INIT(%o4, %o0 + 0x20))
	EX_ST(STORE_INIT(%o5, %o0 + 0x28))
	EX_ST(STORE_INIT(%o2, %o0 + 0x30))
	EX_ST(STORE_INIT(%o3, %o0 + 0x38))
	subcc	%g1, 64, %g1
	bne,pt	%XCC, 1b
	 add	%o0, 64, %o0
	/* fall through */

60:
	membar		#Sync

	/* %i2 contains any final bytes still needed to be copied
	 * over. If anything is left, we copy it one byte at a time.
	 */
	RESTORE_ASI(%i3)
	brz,pt		%i2, 85f
	 sub		%o0, %i1, %i3
	ba,a,pt		%XCC, 90f

	.align		64
70: /* 16 < len <= 64 */
	bne,pn		%XCC, 75f
	 sub		%o0, %i1, %i3

72:
	andn		%i2, 0xf, %i4
	and		%i2, 0xf, %i2
1:	subcc		%i4, 0x10, %i4
	EX_LD(LOAD(ldx, %i1, %o4))
	add		%i1, 0x08, %i1
	EX_LD(LOAD(ldx, %i1, %g1))
	sub		%i1, 0x08, %i1
	EX_ST(STORE(stx, %o4, %i1 + %i3))
	add		%i1, 0x8, %i1
	EX_ST(STORE(stx, %g1, %i1 + %i3))
	bgu,pt		%XCC, 1b
	 add		%i1, 0x8, %i1
73:	andcc		%i2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x8, %i2
	EX_LD(LOAD(ldx, %i1, %o4))
	EX_ST(STORE(stx, %o4, %i1 + %i3))
	add		%i1, 0x8, %i1
1:	andcc		%i2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x4, %i2
	EX_LD(LOAD(lduw, %i1, %i5))
	EX_ST(STORE(stw, %i5, %i1 + %i3))
	add		%i1, 0x4, %i1
1:	cmp		%i2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%i2, %g1, %i2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %i1, %i5))
	EX_ST(STORE(stb, %i5, %i1 + %i3))
	bgu,pt		%icc, 1b
	 add		%i1, 1, %i1

2:	add		%i1, %i3, %o0
	andcc		%i1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%i2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

8:	mov		64, %i3
	andn		%i1, 0x7, %i1
	EX_LD(LOAD(ldx, %i1, %g2))
	sub		%i3, %g1, %i3
	andn		%i2, 0x7, %i4
	sllx		%g2, %g1, %g2
1:	add		%i1, 0x8, %i1
	EX_LD(LOAD(ldx, %i1, %g3))
	subcc		%i4, 0x8, %i4
	srlx		%g3, %i3, %i5
	or		%i5, %g2, %i5
	EX_ST(STORE(stx, %i5, %o0))
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%i2, 0x7, %i2
	be,pn		%icc, 85f
	 add		%i1, %g1, %i1
	ba,pt		%xcc, 90f
	 sub		%o0, %i1, %i3

	.align		64
80: /* 0 < len <= 16 */
	andcc		%i3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %i1, %i3

1:
	subcc		%i2, 4, %i2
	EX_LD(LOAD(lduw, %i1, %g1))
	EX_ST(STORE(stw, %g1, %i1 + %i3))
	bgu,pt		%XCC, 1b
	 add		%i1, 4, %i1

85:	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	.align		32
90:
	subcc		%i2, 1, %i2
	EX_LD(LOAD(ldub, %i1, %g1))
	EX_ST(STORE(stb, %g1, %i1 + %i3))
	bgu,pt		%XCC, 90b
	 add		%i1, 1, %i1
	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	.size		FUNC_NAME, .-FUNC_NAME