/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */
18 | ||
19 | #define LD(x,y) " movq 8*("#x")(%1), %%mm"#y" ;\n" | |
20 | #define ST(x,y) " movq %%mm"#y", 8*("#x")(%1) ;\n" | |
21 | #define XO1(x,y) " pxor 8*("#x")(%2), %%mm"#y" ;\n" | |
22 | #define XO2(x,y) " pxor 8*("#x")(%3), %%mm"#y" ;\n" | |
23 | #define XO3(x,y) " pxor 8*("#x")(%4), %%mm"#y" ;\n" | |
24 | #define XO4(x,y) " pxor 8*("#x")(%5), %%mm"#y" ;\n" | |
25 | ||
26 | #include <asm/i387.h> | |
27 | ||
28 | static void | |
29 | xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | |
30 | { | |
31 | unsigned long lines = bytes >> 7; | |
32 | ||
33 | kernel_fpu_begin(); | |
34 | ||
35 | __asm__ __volatile__ ( | |
36 | #undef BLOCK | |
37 | #define BLOCK(i) \ | |
38 | LD(i,0) \ | |
39 | LD(i+1,1) \ | |
40 | LD(i+2,2) \ | |
41 | LD(i+3,3) \ | |
42 | XO1(i,0) \ | |
43 | ST(i,0) \ | |
44 | XO1(i+1,1) \ | |
45 | ST(i+1,1) \ | |
46 | XO1(i+2,2) \ | |
47 | ST(i+2,2) \ | |
48 | XO1(i+3,3) \ | |
49 | ST(i+3,3) | |
50 | ||
51 | " .align 32 ;\n" | |
52 | " 1: ;\n" | |
53 | ||
54 | BLOCK(0) | |
55 | BLOCK(4) | |
56 | BLOCK(8) | |
57 | BLOCK(12) | |
58 | ||
59 | " addl $128, %1 ;\n" | |
60 | " addl $128, %2 ;\n" | |
61 | " decl %0 ;\n" | |
62 | " jnz 1b ;\n" | |
63 | : "+r" (lines), | |
64 | "+r" (p1), "+r" (p2) | |
65 | : | |
66 | : "memory"); | |
67 | ||
68 | kernel_fpu_end(); | |
69 | } | |
70 | ||
71 | static void | |
72 | xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
73 | unsigned long *p3) | |
74 | { | |
75 | unsigned long lines = bytes >> 7; | |
76 | ||
77 | kernel_fpu_begin(); | |
78 | ||
79 | __asm__ __volatile__ ( | |
80 | #undef BLOCK | |
81 | #define BLOCK(i) \ | |
82 | LD(i,0) \ | |
83 | LD(i+1,1) \ | |
84 | LD(i+2,2) \ | |
85 | LD(i+3,3) \ | |
86 | XO1(i,0) \ | |
87 | XO1(i+1,1) \ | |
88 | XO1(i+2,2) \ | |
89 | XO1(i+3,3) \ | |
90 | XO2(i,0) \ | |
91 | ST(i,0) \ | |
92 | XO2(i+1,1) \ | |
93 | ST(i+1,1) \ | |
94 | XO2(i+2,2) \ | |
95 | ST(i+2,2) \ | |
96 | XO2(i+3,3) \ | |
97 | ST(i+3,3) | |
98 | ||
99 | " .align 32 ;\n" | |
100 | " 1: ;\n" | |
101 | ||
102 | BLOCK(0) | |
103 | BLOCK(4) | |
104 | BLOCK(8) | |
105 | BLOCK(12) | |
106 | ||
107 | " addl $128, %1 ;\n" | |
108 | " addl $128, %2 ;\n" | |
109 | " addl $128, %3 ;\n" | |
110 | " decl %0 ;\n" | |
111 | " jnz 1b ;\n" | |
112 | : "+r" (lines), | |
113 | "+r" (p1), "+r" (p2), "+r" (p3) | |
114 | : | |
115 | : "memory"); | |
116 | ||
117 | kernel_fpu_end(); | |
118 | } | |
119 | ||
120 | static void | |
121 | xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
122 | unsigned long *p3, unsigned long *p4) | |
123 | { | |
124 | unsigned long lines = bytes >> 7; | |
125 | ||
126 | kernel_fpu_begin(); | |
127 | ||
128 | __asm__ __volatile__ ( | |
129 | #undef BLOCK | |
130 | #define BLOCK(i) \ | |
131 | LD(i,0) \ | |
132 | LD(i+1,1) \ | |
133 | LD(i+2,2) \ | |
134 | LD(i+3,3) \ | |
135 | XO1(i,0) \ | |
136 | XO1(i+1,1) \ | |
137 | XO1(i+2,2) \ | |
138 | XO1(i+3,3) \ | |
139 | XO2(i,0) \ | |
140 | XO2(i+1,1) \ | |
141 | XO2(i+2,2) \ | |
142 | XO2(i+3,3) \ | |
143 | XO3(i,0) \ | |
144 | ST(i,0) \ | |
145 | XO3(i+1,1) \ | |
146 | ST(i+1,1) \ | |
147 | XO3(i+2,2) \ | |
148 | ST(i+2,2) \ | |
149 | XO3(i+3,3) \ | |
150 | ST(i+3,3) | |
151 | ||
152 | " .align 32 ;\n" | |
153 | " 1: ;\n" | |
154 | ||
155 | BLOCK(0) | |
156 | BLOCK(4) | |
157 | BLOCK(8) | |
158 | BLOCK(12) | |
159 | ||
160 | " addl $128, %1 ;\n" | |
161 | " addl $128, %2 ;\n" | |
162 | " addl $128, %3 ;\n" | |
163 | " addl $128, %4 ;\n" | |
164 | " decl %0 ;\n" | |
165 | " jnz 1b ;\n" | |
166 | : "+r" (lines), | |
167 | "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) | |
168 | : | |
169 | : "memory"); | |
170 | ||
171 | kernel_fpu_end(); | |
172 | } | |
173 | ||
174 | ||
175 | static void | |
176 | xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
177 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | |
178 | { | |
179 | unsigned long lines = bytes >> 7; | |
180 | ||
181 | kernel_fpu_begin(); | |
182 | ||
183 | /* Make sure GCC forgets anything it knows about p4 or p5, | |
184 | such that it won't pass to the asm volatile below a | |
185 | register that is shared with any other variable. That's | |
186 | because we modify p4 and p5 there, but we can't mark them | |
187 | as read/write, otherwise we'd overflow the 10-asm-operands | |
188 | limit of GCC < 3.1. */ | |
189 | __asm__ ("" : "+r" (p4), "+r" (p5)); | |
190 | ||
191 | __asm__ __volatile__ ( | |
192 | #undef BLOCK | |
193 | #define BLOCK(i) \ | |
194 | LD(i,0) \ | |
195 | LD(i+1,1) \ | |
196 | LD(i+2,2) \ | |
197 | LD(i+3,3) \ | |
198 | XO1(i,0) \ | |
199 | XO1(i+1,1) \ | |
200 | XO1(i+2,2) \ | |
201 | XO1(i+3,3) \ | |
202 | XO2(i,0) \ | |
203 | XO2(i+1,1) \ | |
204 | XO2(i+2,2) \ | |
205 | XO2(i+3,3) \ | |
206 | XO3(i,0) \ | |
207 | XO3(i+1,1) \ | |
208 | XO3(i+2,2) \ | |
209 | XO3(i+3,3) \ | |
210 | XO4(i,0) \ | |
211 | ST(i,0) \ | |
212 | XO4(i+1,1) \ | |
213 | ST(i+1,1) \ | |
214 | XO4(i+2,2) \ | |
215 | ST(i+2,2) \ | |
216 | XO4(i+3,3) \ | |
217 | ST(i+3,3) | |
218 | ||
219 | " .align 32 ;\n" | |
220 | " 1: ;\n" | |
221 | ||
222 | BLOCK(0) | |
223 | BLOCK(4) | |
224 | BLOCK(8) | |
225 | BLOCK(12) | |
226 | ||
227 | " addl $128, %1 ;\n" | |
228 | " addl $128, %2 ;\n" | |
229 | " addl $128, %3 ;\n" | |
230 | " addl $128, %4 ;\n" | |
231 | " addl $128, %5 ;\n" | |
232 | " decl %0 ;\n" | |
233 | " jnz 1b ;\n" | |
234 | : "+r" (lines), | |
235 | "+r" (p1), "+r" (p2), "+r" (p3) | |
236 | : "r" (p4), "r" (p5) | |
237 | : "memory"); | |
238 | ||
239 | /* p4 and p5 were modified, and now the variables are dead. | |
240 | Clobber them just to be sure nobody does something stupid | |
241 | like assuming they have some legal value. */ | |
242 | __asm__ ("" : "=r" (p4), "=r" (p5)); | |
243 | ||
244 | kernel_fpu_end(); | |
245 | } | |
246 | ||
247 | #undef LD | |
248 | #undef XO1 | |
249 | #undef XO2 | |
250 | #undef XO3 | |
251 | #undef XO4 | |
252 | #undef ST | |
253 | #undef BLOCK | |
254 | ||
255 | static void | |
256 | xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | |
257 | { | |
258 | unsigned long lines = bytes >> 6; | |
259 | ||
260 | kernel_fpu_begin(); | |
261 | ||
262 | __asm__ __volatile__ ( | |
263 | " .align 32 ;\n" | |
264 | " 1: ;\n" | |
265 | " movq (%1), %%mm0 ;\n" | |
266 | " movq 8(%1), %%mm1 ;\n" | |
267 | " pxor (%2), %%mm0 ;\n" | |
268 | " movq 16(%1), %%mm2 ;\n" | |
269 | " movq %%mm0, (%1) ;\n" | |
270 | " pxor 8(%2), %%mm1 ;\n" | |
271 | " movq 24(%1), %%mm3 ;\n" | |
272 | " movq %%mm1, 8(%1) ;\n" | |
273 | " pxor 16(%2), %%mm2 ;\n" | |
274 | " movq 32(%1), %%mm4 ;\n" | |
275 | " movq %%mm2, 16(%1) ;\n" | |
276 | " pxor 24(%2), %%mm3 ;\n" | |
277 | " movq 40(%1), %%mm5 ;\n" | |
278 | " movq %%mm3, 24(%1) ;\n" | |
279 | " pxor 32(%2), %%mm4 ;\n" | |
280 | " movq 48(%1), %%mm6 ;\n" | |
281 | " movq %%mm4, 32(%1) ;\n" | |
282 | " pxor 40(%2), %%mm5 ;\n" | |
283 | " movq 56(%1), %%mm7 ;\n" | |
284 | " movq %%mm5, 40(%1) ;\n" | |
285 | " pxor 48(%2), %%mm6 ;\n" | |
286 | " pxor 56(%2), %%mm7 ;\n" | |
287 | " movq %%mm6, 48(%1) ;\n" | |
288 | " movq %%mm7, 56(%1) ;\n" | |
289 | ||
290 | " addl $64, %1 ;\n" | |
291 | " addl $64, %2 ;\n" | |
292 | " decl %0 ;\n" | |
293 | " jnz 1b ;\n" | |
294 | : "+r" (lines), | |
295 | "+r" (p1), "+r" (p2) | |
296 | : | |
297 | : "memory"); | |
298 | ||
299 | kernel_fpu_end(); | |
300 | } | |
301 | ||
302 | static void | |
303 | xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
304 | unsigned long *p3) | |
305 | { | |
306 | unsigned long lines = bytes >> 6; | |
307 | ||
308 | kernel_fpu_begin(); | |
309 | ||
310 | __asm__ __volatile__ ( | |
311 | " .align 32,0x90 ;\n" | |
312 | " 1: ;\n" | |
313 | " movq (%1), %%mm0 ;\n" | |
314 | " movq 8(%1), %%mm1 ;\n" | |
315 | " pxor (%2), %%mm0 ;\n" | |
316 | " movq 16(%1), %%mm2 ;\n" | |
317 | " pxor 8(%2), %%mm1 ;\n" | |
318 | " pxor (%3), %%mm0 ;\n" | |
319 | " pxor 16(%2), %%mm2 ;\n" | |
320 | " movq %%mm0, (%1) ;\n" | |
321 | " pxor 8(%3), %%mm1 ;\n" | |
322 | " pxor 16(%3), %%mm2 ;\n" | |
323 | " movq 24(%1), %%mm3 ;\n" | |
324 | " movq %%mm1, 8(%1) ;\n" | |
325 | " movq 32(%1), %%mm4 ;\n" | |
326 | " movq 40(%1), %%mm5 ;\n" | |
327 | " pxor 24(%2), %%mm3 ;\n" | |
328 | " movq %%mm2, 16(%1) ;\n" | |
329 | " pxor 32(%2), %%mm4 ;\n" | |
330 | " pxor 24(%3), %%mm3 ;\n" | |
331 | " pxor 40(%2), %%mm5 ;\n" | |
332 | " movq %%mm3, 24(%1) ;\n" | |
333 | " pxor 32(%3), %%mm4 ;\n" | |
334 | " pxor 40(%3), %%mm5 ;\n" | |
335 | " movq 48(%1), %%mm6 ;\n" | |
336 | " movq %%mm4, 32(%1) ;\n" | |
337 | " movq 56(%1), %%mm7 ;\n" | |
338 | " pxor 48(%2), %%mm6 ;\n" | |
339 | " movq %%mm5, 40(%1) ;\n" | |
340 | " pxor 56(%2), %%mm7 ;\n" | |
341 | " pxor 48(%3), %%mm6 ;\n" | |
342 | " pxor 56(%3), %%mm7 ;\n" | |
343 | " movq %%mm6, 48(%1) ;\n" | |
344 | " movq %%mm7, 56(%1) ;\n" | |
345 | ||
346 | " addl $64, %1 ;\n" | |
347 | " addl $64, %2 ;\n" | |
348 | " addl $64, %3 ;\n" | |
349 | " decl %0 ;\n" | |
350 | " jnz 1b ;\n" | |
351 | : "+r" (lines), | |
352 | "+r" (p1), "+r" (p2), "+r" (p3) | |
353 | : | |
354 | : "memory" ); | |
355 | ||
356 | kernel_fpu_end(); | |
357 | } | |
358 | ||
359 | static void | |
360 | xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
361 | unsigned long *p3, unsigned long *p4) | |
362 | { | |
363 | unsigned long lines = bytes >> 6; | |
364 | ||
365 | kernel_fpu_begin(); | |
366 | ||
367 | __asm__ __volatile__ ( | |
368 | " .align 32,0x90 ;\n" | |
369 | " 1: ;\n" | |
370 | " movq (%1), %%mm0 ;\n" | |
371 | " movq 8(%1), %%mm1 ;\n" | |
372 | " pxor (%2), %%mm0 ;\n" | |
373 | " movq 16(%1), %%mm2 ;\n" | |
374 | " pxor 8(%2), %%mm1 ;\n" | |
375 | " pxor (%3), %%mm0 ;\n" | |
376 | " pxor 16(%2), %%mm2 ;\n" | |
377 | " pxor 8(%3), %%mm1 ;\n" | |
378 | " pxor (%4), %%mm0 ;\n" | |
379 | " movq 24(%1), %%mm3 ;\n" | |
380 | " pxor 16(%3), %%mm2 ;\n" | |
381 | " pxor 8(%4), %%mm1 ;\n" | |
382 | " movq %%mm0, (%1) ;\n" | |
383 | " movq 32(%1), %%mm4 ;\n" | |
384 | " pxor 24(%2), %%mm3 ;\n" | |
385 | " pxor 16(%4), %%mm2 ;\n" | |
386 | " movq %%mm1, 8(%1) ;\n" | |
387 | " movq 40(%1), %%mm5 ;\n" | |
388 | " pxor 32(%2), %%mm4 ;\n" | |
389 | " pxor 24(%3), %%mm3 ;\n" | |
390 | " movq %%mm2, 16(%1) ;\n" | |
391 | " pxor 40(%2), %%mm5 ;\n" | |
392 | " pxor 32(%3), %%mm4 ;\n" | |
393 | " pxor 24(%4), %%mm3 ;\n" | |
394 | " movq %%mm3, 24(%1) ;\n" | |
395 | " movq 56(%1), %%mm7 ;\n" | |
396 | " movq 48(%1), %%mm6 ;\n" | |
397 | " pxor 40(%3), %%mm5 ;\n" | |
398 | " pxor 32(%4), %%mm4 ;\n" | |
399 | " pxor 48(%2), %%mm6 ;\n" | |
400 | " movq %%mm4, 32(%1) ;\n" | |
401 | " pxor 56(%2), %%mm7 ;\n" | |
402 | " pxor 40(%4), %%mm5 ;\n" | |
403 | " pxor 48(%3), %%mm6 ;\n" | |
404 | " pxor 56(%3), %%mm7 ;\n" | |
405 | " movq %%mm5, 40(%1) ;\n" | |
406 | " pxor 48(%4), %%mm6 ;\n" | |
407 | " pxor 56(%4), %%mm7 ;\n" | |
408 | " movq %%mm6, 48(%1) ;\n" | |
409 | " movq %%mm7, 56(%1) ;\n" | |
410 | ||
411 | " addl $64, %1 ;\n" | |
412 | " addl $64, %2 ;\n" | |
413 | " addl $64, %3 ;\n" | |
414 | " addl $64, %4 ;\n" | |
415 | " decl %0 ;\n" | |
416 | " jnz 1b ;\n" | |
417 | : "+r" (lines), | |
418 | "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) | |
419 | : | |
420 | : "memory"); | |
421 | ||
422 | kernel_fpu_end(); | |
423 | } | |
424 | ||
425 | static void | |
426 | xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
427 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | |
428 | { | |
429 | unsigned long lines = bytes >> 6; | |
430 | ||
431 | kernel_fpu_begin(); | |
432 | ||
433 | /* Make sure GCC forgets anything it knows about p4 or p5, | |
434 | such that it won't pass to the asm volatile below a | |
435 | register that is shared with any other variable. That's | |
436 | because we modify p4 and p5 there, but we can't mark them | |
437 | as read/write, otherwise we'd overflow the 10-asm-operands | |
438 | limit of GCC < 3.1. */ | |
439 | __asm__ ("" : "+r" (p4), "+r" (p5)); | |
440 | ||
441 | __asm__ __volatile__ ( | |
442 | " .align 32,0x90 ;\n" | |
443 | " 1: ;\n" | |
444 | " movq (%1), %%mm0 ;\n" | |
445 | " movq 8(%1), %%mm1 ;\n" | |
446 | " pxor (%2), %%mm0 ;\n" | |
447 | " pxor 8(%2), %%mm1 ;\n" | |
448 | " movq 16(%1), %%mm2 ;\n" | |
449 | " pxor (%3), %%mm0 ;\n" | |
450 | " pxor 8(%3), %%mm1 ;\n" | |
451 | " pxor 16(%2), %%mm2 ;\n" | |
452 | " pxor (%4), %%mm0 ;\n" | |
453 | " pxor 8(%4), %%mm1 ;\n" | |
454 | " pxor 16(%3), %%mm2 ;\n" | |
455 | " movq 24(%1), %%mm3 ;\n" | |
456 | " pxor (%5), %%mm0 ;\n" | |
457 | " pxor 8(%5), %%mm1 ;\n" | |
458 | " movq %%mm0, (%1) ;\n" | |
459 | " pxor 16(%4), %%mm2 ;\n" | |
460 | " pxor 24(%2), %%mm3 ;\n" | |
461 | " movq %%mm1, 8(%1) ;\n" | |
462 | " pxor 16(%5), %%mm2 ;\n" | |
463 | " pxor 24(%3), %%mm3 ;\n" | |
464 | " movq 32(%1), %%mm4 ;\n" | |
465 | " movq %%mm2, 16(%1) ;\n" | |
466 | " pxor 24(%4), %%mm3 ;\n" | |
467 | " pxor 32(%2), %%mm4 ;\n" | |
468 | " movq 40(%1), %%mm5 ;\n" | |
469 | " pxor 24(%5), %%mm3 ;\n" | |
470 | " pxor 32(%3), %%mm4 ;\n" | |
471 | " pxor 40(%2), %%mm5 ;\n" | |
472 | " movq %%mm3, 24(%1) ;\n" | |
473 | " pxor 32(%4), %%mm4 ;\n" | |
474 | " pxor 40(%3), %%mm5 ;\n" | |
475 | " movq 48(%1), %%mm6 ;\n" | |
476 | " movq 56(%1), %%mm7 ;\n" | |
477 | " pxor 32(%5), %%mm4 ;\n" | |
478 | " pxor 40(%4), %%mm5 ;\n" | |
479 | " pxor 48(%2), %%mm6 ;\n" | |
480 | " pxor 56(%2), %%mm7 ;\n" | |
481 | " movq %%mm4, 32(%1) ;\n" | |
482 | " pxor 48(%3), %%mm6 ;\n" | |
483 | " pxor 56(%3), %%mm7 ;\n" | |
484 | " pxor 40(%5), %%mm5 ;\n" | |
485 | " pxor 48(%4), %%mm6 ;\n" | |
486 | " pxor 56(%4), %%mm7 ;\n" | |
487 | " movq %%mm5, 40(%1) ;\n" | |
488 | " pxor 48(%5), %%mm6 ;\n" | |
489 | " pxor 56(%5), %%mm7 ;\n" | |
490 | " movq %%mm6, 48(%1) ;\n" | |
491 | " movq %%mm7, 56(%1) ;\n" | |
492 | ||
493 | " addl $64, %1 ;\n" | |
494 | " addl $64, %2 ;\n" | |
495 | " addl $64, %3 ;\n" | |
496 | " addl $64, %4 ;\n" | |
497 | " addl $64, %5 ;\n" | |
498 | " decl %0 ;\n" | |
499 | " jnz 1b ;\n" | |
500 | : "+r" (lines), | |
501 | "+r" (p1), "+r" (p2), "+r" (p3) | |
502 | : "r" (p4), "r" (p5) | |
503 | : "memory"); | |
504 | ||
505 | /* p4 and p5 were modified, and now the variables are dead. | |
506 | Clobber them just to be sure nobody does something stupid | |
507 | like assuming they have some legal value. */ | |
508 | __asm__ ("" : "=r" (p4), "=r" (p5)); | |
509 | ||
510 | kernel_fpu_end(); | |
511 | } | |
512 | ||
513 | static struct xor_block_template xor_block_pII_mmx = { | |
514 | .name = "pII_mmx", | |
515 | .do_2 = xor_pII_mmx_2, | |
516 | .do_3 = xor_pII_mmx_3, | |
517 | .do_4 = xor_pII_mmx_4, | |
518 | .do_5 = xor_pII_mmx_5, | |
519 | }; | |
520 | ||
521 | static struct xor_block_template xor_block_p5_mmx = { | |
522 | .name = "p5_mmx", | |
523 | .do_2 = xor_p5_mmx_2, | |
524 | .do_3 = xor_p5_mmx_3, | |
525 | .do_4 = xor_p5_mmx_4, | |
526 | .do_5 = xor_p5_mmx_5, | |
527 | }; | |
528 | ||
529 | /* | |
530 | * Cache avoiding checksumming functions utilizing KNI instructions | |
531 | * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) | |
532 | */ | |
533 | ||
534 | #define XMMS_SAVE do { \ | |
535 | preempt_disable(); \ | |
4bb0d3ec ZA |
536 | cr0 = read_cr0(); \ |
537 | clts(); \ | |
1da177e4 | 538 | __asm__ __volatile__ ( \ |
4bb0d3ec ZA |
539 | "movups %%xmm0,(%0) ;\n\t" \ |
540 | "movups %%xmm1,0x10(%0) ;\n\t" \ | |
541 | "movups %%xmm2,0x20(%0) ;\n\t" \ | |
542 | "movups %%xmm3,0x30(%0) ;\n\t" \ | |
543 | : \ | |
1da177e4 LT |
544 | : "r" (xmm_save) \ |
545 | : "memory"); \ | |
546 | } while(0) | |
547 | ||
548 | #define XMMS_RESTORE do { \ | |
549 | __asm__ __volatile__ ( \ | |
550 | "sfence ;\n\t" \ | |
4bb0d3ec ZA |
551 | "movups (%0),%%xmm0 ;\n\t" \ |
552 | "movups 0x10(%0),%%xmm1 ;\n\t" \ | |
553 | "movups 0x20(%0),%%xmm2 ;\n\t" \ | |
554 | "movups 0x30(%0),%%xmm3 ;\n\t" \ | |
1da177e4 | 555 | : \ |
4bb0d3ec | 556 | : "r" (xmm_save) \ |
1da177e4 | 557 | : "memory"); \ |
4bb0d3ec | 558 | write_cr0(cr0); \ |
1da177e4 LT |
559 | preempt_enable(); \ |
560 | } while(0) | |
561 | ||
562 | #define ALIGN16 __attribute__((aligned(16))) | |
563 | ||
564 | #define OFFS(x) "16*("#x")" | |
565 | #define PF_OFFS(x) "256+16*("#x")" | |
566 | #define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n" | |
567 | #define LD(x,y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n" | |
568 | #define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n" | |
569 | #define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n" | |
570 | #define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n" | |
571 | #define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n" | |
572 | #define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n" | |
573 | #define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n" | |
574 | #define XO1(x,y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n" | |
575 | #define XO2(x,y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n" | |
576 | #define XO3(x,y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n" | |
577 | #define XO4(x,y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n" | |
578 | #define XO5(x,y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n" | |
579 | ||
580 | ||
581 | static void | |
582 | xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | |
583 | { | |
584 | unsigned long lines = bytes >> 8; | |
585 | char xmm_save[16*4] ALIGN16; | |
586 | int cr0; | |
587 | ||
588 | XMMS_SAVE; | |
589 | ||
590 | __asm__ __volatile__ ( | |
591 | #undef BLOCK | |
592 | #define BLOCK(i) \ | |
593 | LD(i,0) \ | |
594 | LD(i+1,1) \ | |
595 | PF1(i) \ | |
596 | PF1(i+2) \ | |
597 | LD(i+2,2) \ | |
598 | LD(i+3,3) \ | |
599 | PF0(i+4) \ | |
600 | PF0(i+6) \ | |
601 | XO1(i,0) \ | |
602 | XO1(i+1,1) \ | |
603 | XO1(i+2,2) \ | |
604 | XO1(i+3,3) \ | |
605 | ST(i,0) \ | |
606 | ST(i+1,1) \ | |
607 | ST(i+2,2) \ | |
608 | ST(i+3,3) \ | |
609 | ||
610 | ||
611 | PF0(0) | |
612 | PF0(2) | |
613 | ||
614 | " .align 32 ;\n" | |
615 | " 1: ;\n" | |
616 | ||
617 | BLOCK(0) | |
618 | BLOCK(4) | |
619 | BLOCK(8) | |
620 | BLOCK(12) | |
621 | ||
622 | " addl $256, %1 ;\n" | |
623 | " addl $256, %2 ;\n" | |
624 | " decl %0 ;\n" | |
625 | " jnz 1b ;\n" | |
626 | : "+r" (lines), | |
627 | "+r" (p1), "+r" (p2) | |
628 | : | |
629 | : "memory"); | |
630 | ||
631 | XMMS_RESTORE; | |
632 | } | |
633 | ||
634 | static void | |
635 | xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
636 | unsigned long *p3) | |
637 | { | |
638 | unsigned long lines = bytes >> 8; | |
639 | char xmm_save[16*4] ALIGN16; | |
640 | int cr0; | |
641 | ||
642 | XMMS_SAVE; | |
643 | ||
644 | __asm__ __volatile__ ( | |
645 | #undef BLOCK | |
646 | #define BLOCK(i) \ | |
647 | PF1(i) \ | |
648 | PF1(i+2) \ | |
649 | LD(i,0) \ | |
650 | LD(i+1,1) \ | |
651 | LD(i+2,2) \ | |
652 | LD(i+3,3) \ | |
653 | PF2(i) \ | |
654 | PF2(i+2) \ | |
655 | PF0(i+4) \ | |
656 | PF0(i+6) \ | |
657 | XO1(i,0) \ | |
658 | XO1(i+1,1) \ | |
659 | XO1(i+2,2) \ | |
660 | XO1(i+3,3) \ | |
661 | XO2(i,0) \ | |
662 | XO2(i+1,1) \ | |
663 | XO2(i+2,2) \ | |
664 | XO2(i+3,3) \ | |
665 | ST(i,0) \ | |
666 | ST(i+1,1) \ | |
667 | ST(i+2,2) \ | |
668 | ST(i+3,3) \ | |
669 | ||
670 | ||
671 | PF0(0) | |
672 | PF0(2) | |
673 | ||
674 | " .align 32 ;\n" | |
675 | " 1: ;\n" | |
676 | ||
677 | BLOCK(0) | |
678 | BLOCK(4) | |
679 | BLOCK(8) | |
680 | BLOCK(12) | |
681 | ||
682 | " addl $256, %1 ;\n" | |
683 | " addl $256, %2 ;\n" | |
684 | " addl $256, %3 ;\n" | |
685 | " decl %0 ;\n" | |
686 | " jnz 1b ;\n" | |
687 | : "+r" (lines), | |
688 | "+r" (p1), "+r"(p2), "+r"(p3) | |
689 | : | |
690 | : "memory" ); | |
691 | ||
692 | XMMS_RESTORE; | |
693 | } | |
694 | ||
695 | static void | |
696 | xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
697 | unsigned long *p3, unsigned long *p4) | |
698 | { | |
699 | unsigned long lines = bytes >> 8; | |
700 | char xmm_save[16*4] ALIGN16; | |
701 | int cr0; | |
702 | ||
703 | XMMS_SAVE; | |
704 | ||
705 | __asm__ __volatile__ ( | |
706 | #undef BLOCK | |
707 | #define BLOCK(i) \ | |
708 | PF1(i) \ | |
709 | PF1(i+2) \ | |
710 | LD(i,0) \ | |
711 | LD(i+1,1) \ | |
712 | LD(i+2,2) \ | |
713 | LD(i+3,3) \ | |
714 | PF2(i) \ | |
715 | PF2(i+2) \ | |
716 | XO1(i,0) \ | |
717 | XO1(i+1,1) \ | |
718 | XO1(i+2,2) \ | |
719 | XO1(i+3,3) \ | |
720 | PF3(i) \ | |
721 | PF3(i+2) \ | |
722 | PF0(i+4) \ | |
723 | PF0(i+6) \ | |
724 | XO2(i,0) \ | |
725 | XO2(i+1,1) \ | |
726 | XO2(i+2,2) \ | |
727 | XO2(i+3,3) \ | |
728 | XO3(i,0) \ | |
729 | XO3(i+1,1) \ | |
730 | XO3(i+2,2) \ | |
731 | XO3(i+3,3) \ | |
732 | ST(i,0) \ | |
733 | ST(i+1,1) \ | |
734 | ST(i+2,2) \ | |
735 | ST(i+3,3) \ | |
736 | ||
737 | ||
738 | PF0(0) | |
739 | PF0(2) | |
740 | ||
741 | " .align 32 ;\n" | |
742 | " 1: ;\n" | |
743 | ||
744 | BLOCK(0) | |
745 | BLOCK(4) | |
746 | BLOCK(8) | |
747 | BLOCK(12) | |
748 | ||
749 | " addl $256, %1 ;\n" | |
750 | " addl $256, %2 ;\n" | |
751 | " addl $256, %3 ;\n" | |
752 | " addl $256, %4 ;\n" | |
753 | " decl %0 ;\n" | |
754 | " jnz 1b ;\n" | |
755 | : "+r" (lines), | |
756 | "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) | |
757 | : | |
758 | : "memory" ); | |
759 | ||
760 | XMMS_RESTORE; | |
761 | } | |
762 | ||
763 | static void | |
764 | xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
765 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | |
766 | { | |
767 | unsigned long lines = bytes >> 8; | |
768 | char xmm_save[16*4] ALIGN16; | |
769 | int cr0; | |
770 | ||
771 | XMMS_SAVE; | |
772 | ||
773 | /* Make sure GCC forgets anything it knows about p4 or p5, | |
774 | such that it won't pass to the asm volatile below a | |
775 | register that is shared with any other variable. That's | |
776 | because we modify p4 and p5 there, but we can't mark them | |
777 | as read/write, otherwise we'd overflow the 10-asm-operands | |
778 | limit of GCC < 3.1. */ | |
779 | __asm__ ("" : "+r" (p4), "+r" (p5)); | |
780 | ||
781 | __asm__ __volatile__ ( | |
782 | #undef BLOCK | |
783 | #define BLOCK(i) \ | |
784 | PF1(i) \ | |
785 | PF1(i+2) \ | |
786 | LD(i,0) \ | |
787 | LD(i+1,1) \ | |
788 | LD(i+2,2) \ | |
789 | LD(i+3,3) \ | |
790 | PF2(i) \ | |
791 | PF2(i+2) \ | |
792 | XO1(i,0) \ | |
793 | XO1(i+1,1) \ | |
794 | XO1(i+2,2) \ | |
795 | XO1(i+3,3) \ | |
796 | PF3(i) \ | |
797 | PF3(i+2) \ | |
798 | XO2(i,0) \ | |
799 | XO2(i+1,1) \ | |
800 | XO2(i+2,2) \ | |
801 | XO2(i+3,3) \ | |
802 | PF4(i) \ | |
803 | PF4(i+2) \ | |
804 | PF0(i+4) \ | |
805 | PF0(i+6) \ | |
806 | XO3(i,0) \ | |
807 | XO3(i+1,1) \ | |
808 | XO3(i+2,2) \ | |
809 | XO3(i+3,3) \ | |
810 | XO4(i,0) \ | |
811 | XO4(i+1,1) \ | |
812 | XO4(i+2,2) \ | |
813 | XO4(i+3,3) \ | |
814 | ST(i,0) \ | |
815 | ST(i+1,1) \ | |
816 | ST(i+2,2) \ | |
817 | ST(i+3,3) \ | |
818 | ||
819 | ||
820 | PF0(0) | |
821 | PF0(2) | |
822 | ||
823 | " .align 32 ;\n" | |
824 | " 1: ;\n" | |
825 | ||
826 | BLOCK(0) | |
827 | BLOCK(4) | |
828 | BLOCK(8) | |
829 | BLOCK(12) | |
830 | ||
831 | " addl $256, %1 ;\n" | |
832 | " addl $256, %2 ;\n" | |
833 | " addl $256, %3 ;\n" | |
834 | " addl $256, %4 ;\n" | |
835 | " addl $256, %5 ;\n" | |
836 | " decl %0 ;\n" | |
837 | " jnz 1b ;\n" | |
838 | : "+r" (lines), | |
839 | "+r" (p1), "+r" (p2), "+r" (p3) | |
840 | : "r" (p4), "r" (p5) | |
841 | : "memory"); | |
842 | ||
843 | /* p4 and p5 were modified, and now the variables are dead. | |
844 | Clobber them just to be sure nobody does something stupid | |
845 | like assuming they have some legal value. */ | |
846 | __asm__ ("" : "=r" (p4), "=r" (p5)); | |
847 | ||
848 | XMMS_RESTORE; | |
849 | } | |
850 | ||
851 | static struct xor_block_template xor_block_pIII_sse = { | |
852 | .name = "pIII_sse", | |
853 | .do_2 = xor_sse_2, | |
854 | .do_3 = xor_sse_3, | |
855 | .do_4 = xor_sse_4, | |
856 | .do_5 = xor_sse_5, | |
857 | }; | |
858 | ||
859 | /* Also try the generic routines. */ | |
860 | #include <asm-generic/xor.h> | |
861 | ||
862 | #undef XOR_TRY_TEMPLATES | |
863 | #define XOR_TRY_TEMPLATES \ | |
864 | do { \ | |
865 | xor_speed(&xor_block_8regs); \ | |
866 | xor_speed(&xor_block_8regs_p); \ | |
867 | xor_speed(&xor_block_32regs); \ | |
868 | xor_speed(&xor_block_32regs_p); \ | |
869 | if (cpu_has_xmm) \ | |
870 | xor_speed(&xor_block_pIII_sse); \ | |
871 | if (cpu_has_mmx) { \ | |
872 | xor_speed(&xor_block_pII_mmx); \ | |
873 | xor_speed(&xor_block_p5_mmx); \ | |
874 | } \ | |
875 | } while (0) | |
876 | ||
877 | /* We force the use of the SSE xor block because it can write around L2. | |
878 | We may also be able to load into the L1 only depending on how the cpu | |
879 | deals with a load to a line that is being prefetched. */ | |
880 | #define XOR_SELECT_TEMPLATE(FASTEST) \ | |
881 | (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) |